mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 20:58:11 +00:00
ci: add coverage and ruff (#1383)
* add coverage calculation and push Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * new codecov version and usage of token Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * enable ruff formatter instead of black and isort Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * apply ruff lint fixes Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * apply ruff unsafe fixes Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add removed imports Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * runs 1 on linter issues Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * finalize linter fixes Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * Update pyproject.toml Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> --------- Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
@@ -383,7 +383,7 @@
|
||||
"\n",
|
||||
"print(f\"Downloading {url}...\")\n",
|
||||
"buf = BytesIO(requests.get(url).content)\n",
|
||||
"print(f\"Parsing zip file, splitting into XML sections, and exporting to files...\")\n",
|
||||
"print(\"Parsing zip file, splitting into XML sections, and exporting to files...\")\n",
|
||||
"with zipfile.ZipFile(buf) as zf:\n",
|
||||
" res = zf.testzip()\n",
|
||||
" if res:\n",
|
||||
@@ -544,7 +544,7 @@
|
||||
"source": [
|
||||
"doc = backend.convert()\n",
|
||||
"\n",
|
||||
"claims_sec = [item for item in doc.texts if item.text == \"CLAIMS\"][0]\n",
|
||||
"claims_sec = next(item for item in doc.texts if item.text == \"CLAIMS\")\n",
|
||||
"print(f'Patent \"{doc.texts[0].text}\" has {len(claims_sec.children)} claims')"
|
||||
]
|
||||
},
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
import yaml
|
||||
from docling_core.types.doc import ImageRefMode
|
||||
@@ -11,7 +11,6 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
|
||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@@ -3,7 +3,6 @@ import logging
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import (
|
||||
AcceleratorDevice,
|
||||
@@ -11,9 +10,6 @@ from docling.datamodel.pipeline_options import (
|
||||
PdfPipelineOptions,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.models.ocr_mac_model import OcrMacOptions
|
||||
from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
|
||||
from docling.models.tesseract_ocr_model import TesseractOcrOptions
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -3,8 +3,8 @@
|
||||
# It does not run the actual formula understanding model.
|
||||
|
||||
import logging
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
from docling_core.types.doc import DocItemLabel, DoclingDocument, NodeItem, TextItem
|
||||
|
||||
@@ -49,7 +49,6 @@ class ExampleFormulaUnderstandingEnrichmentModel(BaseItemAndImageEnrichmentModel
|
||||
|
||||
# How the pipeline can be extended.
|
||||
class ExampleFormulaUnderstandingPipeline(StandardPdfPipeline):
|
||||
|
||||
def __init__(self, pipeline_options: ExampleFormulaUnderstandingPipelineOptions):
|
||||
super().__init__(pipeline_options)
|
||||
self.pipeline_options: ExampleFormulaUnderstandingPipelineOptions
|
||||
@@ -85,7 +84,7 @@ def main():
|
||||
)
|
||||
}
|
||||
)
|
||||
result = doc_converter.convert(input_doc_path)
|
||||
doc_converter.convert(input_doc_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -3,8 +3,9 @@
|
||||
# It does not run the actual picture classifier model.
|
||||
|
||||
import logging
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterable
|
||||
from typing import Any
|
||||
|
||||
from docling_core.types.doc import (
|
||||
DoclingDocument,
|
||||
|
||||
@@ -4,7 +4,7 @@ from pathlib import Path
|
||||
|
||||
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
|
||||
|
||||
from docling.datamodel.base_models import FigureElement, InputFormat, Table
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
|
||||
@@ -51,7 +51,6 @@ def main():
|
||||
page_segments,
|
||||
page,
|
||||
) in generate_multimodal_pages(conv_res):
|
||||
|
||||
dpi = page._default_image_scale * 72
|
||||
|
||||
rows.append(
|
||||
@@ -81,10 +80,10 @@ def main():
|
||||
)
|
||||
|
||||
# Generate one parquet from all documents
|
||||
df = pd.json_normalize(rows)
|
||||
df_result = pd.json_normalize(rows)
|
||||
now = datetime.datetime.now()
|
||||
output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
|
||||
df.to_parquet(output_filename)
|
||||
df_result.to_parquet(output_filename)
|
||||
|
||||
end_time = time.time() - start_time
|
||||
|
||||
|
||||
@@ -32,12 +32,12 @@ def main():
|
||||
print(table_df.to_markdown())
|
||||
|
||||
# Save the table as csv
|
||||
element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
|
||||
element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.csv"
|
||||
_log.info(f"Saving CSV table to {element_csv_filename}")
|
||||
table_df.to_csv(element_csv_filename)
|
||||
|
||||
# Save the table as html
|
||||
element_html_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.html"
|
||||
element_html_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.html"
|
||||
_log.info(f"Saving HTML table to {element_html_filename}")
|
||||
with element_html_filename.open("w") as fp:
|
||||
fp.write(table.export_to_html(doc=conv_res.document))
|
||||
|
||||
@@ -1,14 +1,9 @@
|
||||
from pathlib import Path
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
OcrMacOptions,
|
||||
PdfPipelineOptions,
|
||||
RapidOcrOptions,
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
|
||||
@@ -153,10 +153,10 @@
|
||||
"source": [
|
||||
"for i, chunk in enumerate(chunk_iter):\n",
|
||||
" print(f\"=== {i} ===\")\n",
|
||||
" print(f\"chunk.text:\\n{repr(f'{chunk.text[:300]}…')}\")\n",
|
||||
" print(f\"chunk.text:\\n{f'{chunk.text[:300]}…'!r}\")\n",
|
||||
"\n",
|
||||
" enriched_text = chunker.serialize(chunk=chunk)\n",
|
||||
" print(f\"chunker.serialize(chunk):\\n{repr(f'{enriched_text[:300]}…')}\")\n",
|
||||
" print(f\"chunker.serialize(chunk):\\n{f'{enriched_text[:300]}…'!r}\")\n",
|
||||
"\n",
|
||||
" print()"
|
||||
]
|
||||
@@ -353,11 +353,11 @@
|
||||
"for i, chunk in enumerate(chunks):\n",
|
||||
" print(f\"=== {i} ===\")\n",
|
||||
" txt_tokens = len(tokenizer.tokenize(chunk.text))\n",
|
||||
" print(f\"chunk.text ({txt_tokens} tokens):\\n{repr(chunk.text)}\")\n",
|
||||
" print(f\"chunk.text ({txt_tokens} tokens):\\n{chunk.text!r}\")\n",
|
||||
"\n",
|
||||
" ser_txt = chunker.serialize(chunk=chunk)\n",
|
||||
" ser_tokens = len(tokenizer.tokenize(ser_txt))\n",
|
||||
" print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{repr(ser_txt)}\")\n",
|
||||
" print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{ser_txt!r}\")\n",
|
||||
"\n",
|
||||
" print()"
|
||||
]
|
||||
|
||||
@@ -2,17 +2,14 @@ import json
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
from docling_core.types.doc import DocItemLabel, ImageRefMode
|
||||
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
|
||||
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import (
|
||||
AcceleratorDevice,
|
||||
VlmPipelineOptions,
|
||||
granite_vision_vlm_conversion_options,
|
||||
smoldocling_vlm_conversion_options,
|
||||
smoldocling_vlm_mlx_conversion_options,
|
||||
)
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.pipeline.vlm_pipeline import VlmPipeline
|
||||
|
||||
@@ -39,9 +36,6 @@ pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
|
||||
## Alternative VLM models:
|
||||
# pipeline_options.vlm_options = granite_vision_vlm_conversion_options
|
||||
|
||||
from docling_core.types.doc import DocItemLabel, ImageRefMode
|
||||
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
|
||||
|
||||
## Set up pipeline for PDF or image inputs
|
||||
converter = DocumentConverter(
|
||||
format_options={
|
||||
@@ -62,7 +56,7 @@ out_path.mkdir(parents=True, exist_ok=True)
|
||||
for source in sources:
|
||||
start_time = time.time()
|
||||
print("================================================")
|
||||
print("Processing... {}".format(source))
|
||||
print(f"Processing... {source}")
|
||||
print("================================================")
|
||||
print("")
|
||||
|
||||
@@ -77,7 +71,7 @@ for source in sources:
|
||||
print(page.predictions.vlm_response.text)
|
||||
|
||||
res.document.save_as_html(
|
||||
filename=Path("{}/{}.html".format(out_path, res.input.file.stem)),
|
||||
filename=Path(f"{out_path}/{res.input.file.stem}.html"),
|
||||
image_mode=ImageRefMode.REFERENCED,
|
||||
labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
|
||||
)
|
||||
|
||||
@@ -144,7 +144,7 @@
|
||||
"for pic in doc.pictures[:5]:\n",
|
||||
" html_item = (\n",
|
||||
" f\"<h3>Picture <code>{pic.self_ref}</code></h3>\"\n",
|
||||
" f'<img src=\"{str(pic.image.uri)}\" /><br />'\n",
|
||||
" f'<img src=\"{pic.image.uri!s}\" /><br />'\n",
|
||||
" f\"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />\"\n",
|
||||
" )\n",
|
||||
" for annotation in pic.annotations:\n",
|
||||
@@ -252,7 +252,7 @@
|
||||
"for pic in doc.pictures[:5]:\n",
|
||||
" html_item = (\n",
|
||||
" f\"<h3>Picture <code>{pic.self_ref}</code></h3>\"\n",
|
||||
" f'<img src=\"{str(pic.image.uri)}\" /><br />'\n",
|
||||
" f'<img src=\"{pic.image.uri!s}\" /><br />'\n",
|
||||
" f\"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />\"\n",
|
||||
" )\n",
|
||||
" for annotation in pic.annotations:\n",
|
||||
|
||||
@@ -283,7 +283,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -369,7 +369,7 @@
|
||||
" new_index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)\n",
|
||||
" try:\n",
|
||||
" index_client.delete_index(index_name)\n",
|
||||
" except:\n",
|
||||
" except Exception:\n",
|
||||
" pass\n",
|
||||
"\n",
|
||||
" index_client.create_or_update_index(new_index)\n",
|
||||
@@ -487,7 +487,7 @@
|
||||
"\n",
|
||||
" all_succeeded = all(r.succeeded for r in resp)\n",
|
||||
" console.print(\n",
|
||||
" f\"Uploaded batch {i} -> {i+len(subset)}; all_succeeded: {all_succeeded}, \"\n",
|
||||
" f\"Uploaded batch {i} -> {i + len(subset)}; all_succeeded: {all_succeeded}, \"\n",
|
||||
" f\"first_doc_status_code: {resp[0].status_code}\"\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
@@ -807,10 +807,12 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from typing import Optional\n",
|
||||
"\n",
|
||||
"from azure.search.documents.models import VectorizableTextQuery\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def generate_chat_response(prompt: str, system_message: str = None):\n",
|
||||
"def generate_chat_response(prompt: str, system_message: Optional[str] = None):\n",
|
||||
" \"\"\"\n",
|
||||
" Generates a single-turn chat response using Azure OpenAI Chat.\n",
|
||||
" If you need multi-turn conversation or follow-up queries, you'll have to\n",
|
||||
|
||||
@@ -351,7 +351,7 @@
|
||||
"for source in sources:\n",
|
||||
" if EXPORT_TYPE == ExportType.DOC_CHUNKS:\n",
|
||||
" doc_chunk = DocChunk.model_validate(source.meta[\"dl_meta\"])\n",
|
||||
" print(f\"- text: {repr(doc_chunk.text)}\")\n",
|
||||
" print(f\"- text: {doc_chunk.text!r}\")\n",
|
||||
" if doc_chunk.meta.origin:\n",
|
||||
" print(f\" file: {doc_chunk.meta.origin.filename}\")\n",
|
||||
" if doc_chunk.meta.headings:\n",
|
||||
|
||||
@@ -341,7 +341,7 @@
|
||||
"print(f\"Question:\\n{resp_dict['input']}\\n\\nAnswer:\\n{clipped_answer}\")\n",
|
||||
"for i, doc in enumerate(resp_dict[\"context\"]):\n",
|
||||
" print()\n",
|
||||
" print(f\"Source {i+1}:\")\n",
|
||||
" print(f\"Source {i + 1}:\")\n",
|
||||
" print(f\" text: {json.dumps(clip_text(doc.page_content, threshold=350))}\")\n",
|
||||
" for key in doc.metadata:\n",
|
||||
" if key != \"pk\":\n",
|
||||
|
||||
@@ -59,7 +59,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"id": "u076oUSF_YUG"
|
||||
@@ -72,12 +72,11 @@
|
||||
"%pip install rich\n",
|
||||
"%pip install torch\n",
|
||||
"\n",
|
||||
"import logging\n",
|
||||
"import warnings\n",
|
||||
"\n",
|
||||
"warnings.filterwarnings(\"ignore\")\n",
|
||||
"\n",
|
||||
"import logging\n",
|
||||
"\n",
|
||||
"# Suppress Weaviate client logs\n",
|
||||
"logging.getLogger(\"weaviate\").setLevel(logging.ERROR)"
|
||||
]
|
||||
@@ -119,7 +118,7 @@
|
||||
" device = torch.device(\"mps\")\n",
|
||||
" print(\"MPS GPU is enabled.\")\n",
|
||||
"else:\n",
|
||||
" raise EnvironmentError(\n",
|
||||
" raise OSError(\n",
|
||||
" \"No GPU or MPS device found. Please check your environment and ensure GPU or MPS support is configured.\"\n",
|
||||
" )"
|
||||
]
|
||||
@@ -226,7 +225,6 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from docling.datamodel.document import ConversionResult\n",
|
||||
"from docling.document_converter import DocumentConverter\n",
|
||||
"\n",
|
||||
"# Instantiate the doc converter\n",
|
||||
@@ -345,7 +343,7 @@
|
||||
"\n",
|
||||
" openai_api_key = os.getenv(openai_api_key_var)\n",
|
||||
" if not openai_api_key:\n",
|
||||
" raise EnvironmentError(\n",
|
||||
" raise OSError(\n",
|
||||
" f\"Environment variable '{openai_api_key_var}' is not set. \"\n",
|
||||
" \"Please define it before running this script.\"\n",
|
||||
" )"
|
||||
@@ -387,7 +385,6 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import weaviate.classes.config as wc\n",
|
||||
"from weaviate.classes.config import DataType, Property\n",
|
||||
"\n",
|
||||
"# Define the collection name\n",
|
||||
"collection_name = \"docling\"\n",
|
||||
|
||||
@@ -25,9 +25,7 @@ def main():
|
||||
document = mdb.convert()
|
||||
|
||||
out_path = Path("scratch")
|
||||
print(
|
||||
f"Document {path} converted." f"\nSaved markdown output to: {str(out_path)}"
|
||||
)
|
||||
print(f"Document {path} converted.\nSaved markdown output to: {out_path!s}")
|
||||
|
||||
# Export Docling document format to markdowndoc:
|
||||
fn = os.path.basename(path)
|
||||
|
||||
@@ -1,13 +1,10 @@
|
||||
from pathlib import Path
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import (
|
||||
AcceleratorDevice,
|
||||
AcceleratorOptions,
|
||||
PdfPipelineOptions,
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
@@ -63,7 +63,7 @@ def main():
|
||||
out_path = Path("scratch")
|
||||
print(
|
||||
f"Document {res.input.file.name} converted."
|
||||
f"\nSaved markdown output to: {str(out_path)}"
|
||||
f"\nSaved markdown output to: {out_path!s}"
|
||||
)
|
||||
_log.debug(res.document._export_to_indented_text(max_text_len=16))
|
||||
# Export Docling document format to markdowndoc:
|
||||
|
||||
@@ -4,7 +4,6 @@ from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import (
|
||||
PdfPipelineOptions,
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
|
||||
@@ -2,9 +2,9 @@ import logging
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem, TextItem
|
||||
from docling_core.types.doc import ImageRefMode, TableItem, TextItem
|
||||
|
||||
from docling.datamodel.base_models import FigureElement, InputFormat, Table
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
@@ -15,7 +15,6 @@ IMAGE_RESOLUTION_SCALE = 2.0
|
||||
|
||||
# FIXME: put in your favorite translation code ....
|
||||
def translate(text: str, src: str = "en", dest: str = "de"):
|
||||
|
||||
_log.warning("!!! IMPLEMENT HERE YOUR FAVORITE TRANSLATION CODE!!!")
|
||||
# from googletrans import Translator
|
||||
|
||||
@@ -52,10 +51,9 @@ def main():
|
||||
}
|
||||
)
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
conv_res = doc_converter.convert(input_doc_path)
|
||||
conv_doc = conv_res.document
|
||||
doc_filename = conv_res.input.file
|
||||
|
||||
# Save markdown with embedded pictures in original text
|
||||
md_filename = output_dir / f"{doc_filename}-with-images-orig.md"
|
||||
|
||||
@@ -432,7 +432,7 @@
|
||||
"\n",
|
||||
"for i, doc in enumerate(resp_dict[\"context\"][:]):\n",
|
||||
" image_by_page = {}\n",
|
||||
" print(f\"Source {i+1}:\")\n",
|
||||
" print(f\"Source {i + 1}:\")\n",
|
||||
" print(f\" text: {json.dumps(clip_text(doc.page_content, threshold=350))}\")\n",
|
||||
" meta = DocMeta.model_validate(doc.metadata[\"dl_meta\"])\n",
|
||||
"\n",
|
||||
|
||||
@@ -10,7 +10,6 @@ from docling.datamodel.pipeline_options import (
|
||||
ApiVlmOptions,
|
||||
ResponseFormat,
|
||||
VlmPipelineOptions,
|
||||
granite_vision_vlm_ollama_conversion_options,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.pipeline.vlm_pipeline import VlmPipeline
|
||||
|
||||
Reference in New Issue
Block a user