diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index e04e2803..19e8c1e1 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -28,7 +28,7 @@ jobs: run: | for file in docs/examples/*.py; do # Skip batch_convert.py - if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment).py ]]; then + if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models).py ]]; then echo "Skipping $file" continue fi diff --git a/docling/backend/docling_parse_v2_backend.py b/docling/backend/docling_parse_v2_backend.py index 27a368f9..c905203e 100644 --- a/docling/backend/docling_parse_v2_backend.py +++ b/docling/backend/docling_parse_v2_backend.py @@ -11,7 +11,7 @@ from PIL import Image, ImageDraw from pypdfium2 import PdfPage from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend -from docling.datamodel.base_models import Cell, Size +from docling.datamodel.base_models import Cell, Size, TextDirection if TYPE_CHECKING: from docling.datamodel.document import InputDocument @@ -97,6 +97,7 @@ class DoclingParseV2PageBackend(PdfPageBackend): y0 = cell_data[cells_header.index("y0")] x1 = cell_data[cells_header.index("x1")] y1 = cell_data[cells_header.index("y1")] + ltr = cell_data[cells_header.index("left_to_right")] if x1 < x0: x0, x1 = x1, x0 @@ -116,6 +117,11 @@ class DoclingParseV2PageBackend(PdfPageBackend): t=y1 * page_size.height / parser_height, coord_origin=CoordOrigin.BOTTOMLEFT, ).to_top_left_origin(page_size.height), + text_direction=( + TextDirection.LEFT_TO_RIGHT + if ltr + else TextDirection.RIGHT_TO_LEFT + ), ) ) cell_counter += 1 diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index d1e7ce3a..6bd5b61d 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -120,10 +120,16 @@ class ErrorItem(BaseModel): 
error_message: str +class TextDirection(str, Enum): + LEFT_TO_RIGHT = "left_to_right" + RIGHT_TO_LEFT = "right_to_left" + + class Cell(BaseModel): id: int text: str bbox: BoundingBox + text_direction: TextDirection = TextDirection.LEFT_TO_RIGHT class OcrCell(Cell): diff --git a/docling/models/ds_glm_model.py b/docling/models/ds_glm_model.py index 6f7de07a..013b89f5 100644 --- a/docling/models/ds_glm_model.py +++ b/docling/models/ds_glm_model.py @@ -264,6 +264,7 @@ class GlmModel: glm_doc = self.model.apply_on_doc(ds_doc_dict) docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental + 1 == 1 # DEBUG code: def draw_clusters_and_cells(ds_document, page_no, show: bool = False): diff --git a/docling/models/page_assemble_model.py b/docling/models/page_assemble_model.py index 4acf8c95..38b23215 100644 --- a/docling/models/page_assemble_model.py +++ b/docling/models/page_assemble_model.py @@ -11,6 +11,7 @@ from docling.datamodel.base_models import ( Page, PageElement, Table, + TextDirection, TextElement, ) from docling.datamodel.document import ConversionResult @@ -75,12 +76,23 @@ class PageAssembleModel(BasePageModel): for cluster in page.predictions.layout.clusters: # _log.info("Cluster label seen:", cluster.label) if cluster.label in LayoutModel.TEXT_ELEM_LABELS: + textlines = [] + + dominant_text_direction = TextDirection.LEFT_TO_RIGHT + + # Naive code: dominant text direction == direction of first cell. 
+ for cell in cluster.cells: + dominant_text_direction = cell.text_direction + break + + for cell in cluster.cells: + text = cell.text.replace("\x02", "-").strip() + if text: + # if dominant_text_direction == TextDirection.RIGHT_TO_LEFT: + # textlines.insert(0, text) # Prepend RTL text + # else: + textlines.append(text) # Append LTR text - textlines = [ - cell.text.replace("\x02", "-").strip() - for cell in cluster.cells - if len(cell.text.strip()) > 0 - ] text = self.sanitize_text(textlines) text_el = TextElement( label=cluster.label, diff --git a/docs/examples/batch_convert.py b/docs/examples/batch_convert.py index f6ad92bd..4e00e353 100644 --- a/docs/examples/batch_convert.py +++ b/docs/examples/batch_convert.py @@ -49,6 +49,8 @@ def export_documents( with (output_dir / f"{doc_filename}.md").open("w") as fp: fp.write(conv_res.document.export_to_markdown()) + conv_res.document.save_as_html(output_dir / f"{doc_filename}.html") + # Export Docling document format to text: with (output_dir / f"{doc_filename}.txt").open("w") as fp: fp.write(conv_res.document.export_to_markdown(strict_text=True)) @@ -103,10 +105,13 @@ def main(): logging.basicConfig(level=logging.INFO) input_doc_paths = [ - Path("./tests/data/2206.01062.pdf"), - Path("./tests/data/2203.01017v2.pdf"), - Path("./tests/data/2305.03393v1.pdf"), - Path("./tests/data/redp5110_sampled.pdf"), + Path("./tests/data/pdf/right_to_left_01.pdf"), + Path("./tests/data/pdf/right_to_left_02.pdf"), + Path("./tests/data/pdf/right_to_left_03.pdf"), + Path("./tests/data/pdf/2206.01062.pdf"), + Path("./tests/data/pdf/2203.01017v2.pdf"), + Path("./tests/data/pdf/2305.03393v1.pdf"), + Path("./tests/data/pdf/redp5110_sampled.pdf"), ] # buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read()) diff --git a/docs/examples/rapidocr_with_custom_models.py b/docs/examples/rapidocr_with_custom_models.py new file mode 100644 index 00000000..e6dd3963 --- /dev/null +++ b/docs/examples/rapidocr_with_custom_models.py @@ -0,0 
+1,58 @@ +import os + +from huggingface_hub import snapshot_download + +from docling.datamodel.pipeline_options import PdfPipelineOptions, RapidOcrOptions +from docling.document_converter import ( + ConversionResult, + DocumentConverter, + InputFormat, + PdfFormatOption, +) + + +def main(): + # Source document to convert + source = "https://arxiv.org/pdf/2408.09869v4" + + # Download RapidOCR models from HuggingFace + print("Downloading RapidOCR models") + download_path = snapshot_download(repo_id="SWHL/RapidOCR") + + # Setup RapidOcrOptions for English detection + det_model_path = os.path.join( + download_path, "PP-OCRv4", "en_PP-OCRv3_det_infer.onnx" + ) + rec_model_path = os.path.join( + download_path, "PP-OCRv4", "ch_PP-OCRv4_rec_server_infer.onnx" + ) + cls_model_path = os.path.join( + download_path, "PP-OCRv3", "ch_ppocr_mobile_v2.0_cls_train.onnx" + ) + ocr_options = RapidOcrOptions( + det_model_path=det_model_path, + rec_model_path=rec_model_path, + cls_model_path=cls_model_path, + ) + + pipeline_options = PdfPipelineOptions( + ocr_options=ocr_options, + ) + + # Convert the document + converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=pipeline_options, + ), + }, + ) + + conversion_result: ConversionResult = converter.convert(source=source) + doc = conversion_result.document + md = doc.export_to_markdown() + print(md) + + +if __name__ == "__main__": + main() diff --git a/mkdocs.yml b/mkdocs.yml index 0fcc2ca4..abb93a27 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -77,6 +77,7 @@ nav: - "Multimodal export": examples/export_multimodal.py - "Force full page OCR": examples/full_page_ocr.py - "Automatic OCR language detection with tesseract": examples/tesseract_lang_detection.py + - "RapidOCR with custom OCR models": examples/rapidocr_with_custom_models.py - "Accelerator options": examples/run_with_accelerator.py - "Simple translation": examples/translate.py - examples/backend_xml_rag.ipynb diff --git 
a/poetry.lock b/poetry.lock index b8d8072c..dd475ff3 100644 --- a/poetry.lock +++ b/poetry.lock @@ -927,45 +927,23 @@ name = "docling-parse" version = "3.2.0" description = "Simple package to extract text with coordinates from programmatic PDFs" optional = false -python-versions = "<4.0,>=3.9" -files = [ - {file = "docling_parse-3.2.0-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:16a43ab04f13f93602baf2b35dd96d4f2ac646732270e329d00ad6e1c714b013"}, - {file = "docling_parse-3.2.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:5fd77a9bb0cee57e9bee9de2f417c83ed50e49145c5ddb736801b68e004471e1"}, - {file = "docling_parse-3.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:87dc9071aa9fabd586c36599d983cda55f9eab0ba7618009f52f8f537598a70c"}, - {file = "docling_parse-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5dc044ebad99808dd7a82fa2cf5beaf90f744522225bf6dce62200c810817c31"}, - {file = "docling_parse-3.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:e5048626ca165b24a35f417a58d3c4b3f936bc9d2271fc633153a98e788548cf"}, - {file = "docling_parse-3.2.0-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:f3616a3fb3dacc87307b6794dc81e6aec09f59afbb42f487b0100f39571e7442"}, - {file = "docling_parse-3.2.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:9df9d892d8206cc2a0091bb1e2e3ce6f3ab881342980c53bc7f4bf7c831bf057"}, - {file = "docling_parse-3.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5e90e9c2b21e732b3e1af697abb5e7bca9c23a3be8e25a6cc4d92221d526953c"}, - {file = "docling_parse-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:71595399283ecacb255cf1768d1840177f7c33aedd58195530505f9aa8cd5a24"}, - {file = "docling_parse-3.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:0f2a7801d6fa843cd3243aab38cd944e2f4ff386fb5e6fb7b27be1dfa69845c7"}, - {file = "docling_parse-3.2.0-cp312-cp312-macosx_13_0_x86_64.whl", hash = 
"sha256:a83458692c607a393b2f5858d47509342274920553775db5c8d0072ad6aaa0fa"}, - {file = "docling_parse-3.2.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:2d17bf3dffbc2fb565c5fa8347ae7715fc091f4f94228b4ece4f8ab5c3fb428a"}, - {file = "docling_parse-3.2.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f54881ebcd87384b29f7a6475b308034a9ecba0dfa85dd1d2569ef59e2f37e97"}, - {file = "docling_parse-3.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:27de3b28946a615f16e6824bede965f8df5f3b2552e17560415922c79fa8546f"}, - {file = "docling_parse-3.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:20265d7d51b3f4a3bb03e3de694f395d0403bde92915eb5df32f4d67adf93477"}, - {file = "docling_parse-3.2.0-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:feecbef5b8593dcd9a35ceb06b29feb879515b5b900bcaa5d9be0c7a3a0ca599"}, - {file = "docling_parse-3.2.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:6fc65d623e80d8d63f4a6f542408b7f88c0dc8b2842c2523858536d4607d33d5"}, - {file = "docling_parse-3.2.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c06851f88b5edaa7115871100608b9d68bc804b28434b90e57879e178098aed2"}, - {file = "docling_parse-3.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a2fdb3993f7affc73ce19f1202a2f28f3d8cf1716163d9e978ee9c834312c31"}, - {file = "docling_parse-3.2.0-cp313-cp313-win_amd64.whl", hash = "sha256:1b9e32989ff58e0bac85d6cffb3a523dd2373d350b26d46e1f8daff0110595fa"}, - {file = "docling_parse-3.2.0-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:ab9ee7a74585b69a1b81f34e71d77332f85e013b5d8b6477fd9bbec779f239cf"}, - {file = "docling_parse-3.2.0-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:ecb90d4b29f179b9afbe643c943f24b9eb065c8bf0af739bee1330e75973fcea"}, - {file = "docling_parse-3.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b7c0f7d8f7e103305a3c4bc1ae00faca9a8b16a2a3e34042cff1b9e13f922d9"}, - {file = 
"docling_parse-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8687ed6c3a299a46646e369f0bd072286372ecede54c8916b40acc6be6347f1a"}, - {file = "docling_parse-3.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:491589373188c2683fbaa0d366e4806258bc6bb8ab8d1d596144516662d92e90"}, - {file = "docling_parse-3.2.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:0d73b0f920bd0940ea9daefb8c9d02aad86f62422c4e320e66dae74f07ef9888"}, - {file = "docling_parse-3.2.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:68bcf3dcd80afbbce128e7cb2d319842a6fa0b719ad61e4c6872ca32c5063a88"}, - {file = "docling_parse-3.2.0.tar.gz", hash = "sha256:d45e34860a2f845d1726d87af77e8deff17e0c6fb876707a7cf390492d408c2f"}, -] +python-versions = "^3.9" +files = [] +develop = false [package.dependencies] -docling-core = ">=2.14.0,<3.0.0" -pillow = ">=10.4.0,<11.0.0" -pydantic = ">=2.10.5,<3.0.0" +docling-core = "^2.14.0" +pillow = "^10.4.0" +pydantic = "^2.10.5" pywin32 = {version = ">=305", markers = "sys_platform == \"win32\""} tabulate = ">=0.9.0,<1.0.0" +[package.source] +type = "git" +url = "https://github.com/DS4SD/docling-parse.git" +reference = "93e281576e740345d0161ad5da1b1fff815df8e4" +resolved_reference = "93e281576e740345d0161ad5da1b1fff815df8e4" + [[package]] name = "docutils" version = "0.21.2" @@ -7837,4 +7815,4 @@ tesserocr = ["tesserocr"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "907c7cef6722358ac30193f07f9cc15684daf1b75b6c400104e87f3b22137632" +content-hash = "dcab39f8f6cc4a2b24e25774c147dce5eb9da775309d69a9304b72c47725b021" diff --git a/pyproject.toml b/pyproject.toml index 4baf50a7..821b1dc0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,8 @@ pydantic = "^2.0.0" docling-core = {extras = ["chunking"], version = "^2.17.0"} docling-ibm-models = "^3.3.0" deepsearch-glm = "^1.0.0" -docling-parse = "^3.1.0" +# docling-parse = "^3.1.0" +docling-parse = { git = "https://github.com/DS4SD/docling-parse.git", rev = 
"93e281576e740345d0161ad5da1b1fff815df8e4"} filetype = "^1.2.0" pypdfium2 = "^4.30.0" pydantic-settings = "^2.3.0" diff --git a/tests/data/test_01.asciidoc b/tests/data/asciidoc/test_01.asciidoc similarity index 100% rename from tests/data/test_01.asciidoc rename to tests/data/asciidoc/test_01.asciidoc diff --git a/tests/data/test_02.asciidoc b/tests/data/asciidoc/test_02.asciidoc similarity index 100% rename from tests/data/test_02.asciidoc rename to tests/data/asciidoc/test_02.asciidoc diff --git a/tests/data/2203.01017v2.pdf b/tests/data/pdf/2203.01017v2.pdf similarity index 100% rename from tests/data/2203.01017v2.pdf rename to tests/data/pdf/2203.01017v2.pdf diff --git a/tests/data/2206.01062.pdf b/tests/data/pdf/2206.01062.pdf similarity index 100% rename from tests/data/2206.01062.pdf rename to tests/data/pdf/2206.01062.pdf diff --git a/tests/data/2305.03393v1-pg9.pdf b/tests/data/pdf/2305.03393v1-pg9.pdf similarity index 100% rename from tests/data/2305.03393v1-pg9.pdf rename to tests/data/pdf/2305.03393v1-pg9.pdf diff --git a/tests/data/2305.03393v1.pdf b/tests/data/pdf/2305.03393v1.pdf similarity index 100% rename from tests/data/2305.03393v1.pdf rename to tests/data/pdf/2305.03393v1.pdf diff --git a/tests/data/amt_handbook_sample.pdf b/tests/data/pdf/amt_handbook_sample.pdf similarity index 100% rename from tests/data/amt_handbook_sample.pdf rename to tests/data/pdf/amt_handbook_sample.pdf diff --git a/tests/data/code_and_formula.pdf b/tests/data/pdf/code_and_formula.pdf similarity index 100% rename from tests/data/code_and_formula.pdf rename to tests/data/pdf/code_and_formula.pdf diff --git a/tests/data/picture_classification.pdf b/tests/data/pdf/picture_classification.pdf similarity index 100% rename from tests/data/picture_classification.pdf rename to tests/data/pdf/picture_classification.pdf diff --git a/tests/data/redp5110_sampled.pdf b/tests/data/pdf/redp5110_sampled.pdf similarity index 100% rename from tests/data/redp5110_sampled.pdf rename to 
tests/data/pdf/redp5110_sampled.pdf diff --git a/tests/data/pdf/right_to_left_01.pdf b/tests/data/pdf/right_to_left_01.pdf new file mode 100644 index 00000000..2d9bc2f6 Binary files /dev/null and b/tests/data/pdf/right_to_left_01.pdf differ diff --git a/tests/data/pdf/right_to_left_02.pdf b/tests/data/pdf/right_to_left_02.pdf new file mode 100644 index 00000000..e722b71d Binary files /dev/null and b/tests/data/pdf/right_to_left_02.pdf differ diff --git a/tests/data/pdf/right_to_left_03.pdf b/tests/data/pdf/right_to_left_03.pdf new file mode 100644 index 00000000..af329a9c Binary files /dev/null and b/tests/data/pdf/right_to_left_03.pdf differ diff --git a/tests/test_backend_asciidoc.py b/tests/test_backend_asciidoc.py index e4fae312..4574a228 100644 --- a/tests/test_backend_asciidoc.py +++ b/tests/test_backend_asciidoc.py @@ -20,7 +20,7 @@ def _get_backend(fname): def test_asciidocs_examples(): - fnames = sorted(glob.glob("./tests/data/*.asciidoc")) + fnames = sorted(glob.glob("./tests/data/asciidoc/*.asciidoc")) for fname in fnames: print(f"reading {fname}") diff --git a/tests/test_backend_docling_parse.py b/tests/test_backend_docling_parse.py index 66e7771d..3c214791 100644 --- a/tests/test_backend_docling_parse.py +++ b/tests/test_backend_docling_parse.py @@ -13,7 +13,7 @@ from docling.datamodel.document import InputDocument @pytest.fixture def test_doc_path(): - return Path("./tests/data/2206.01062.pdf") + return Path("./tests/data/pdf/2206.01062.pdf") def _get_backend(pdf_doc): @@ -28,7 +28,7 @@ def _get_backend(pdf_doc): def test_text_cell_counts(): - pdf_doc = Path("./tests/data/redp5110_sampled.pdf") + pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf") doc_backend = _get_backend(pdf_doc) diff --git a/tests/test_backend_docling_parse_v2.py b/tests/test_backend_docling_parse_v2.py index 087272bf..d69b1101 100644 --- a/tests/test_backend_docling_parse_v2.py +++ b/tests/test_backend_docling_parse_v2.py @@ -12,7 +12,7 @@ from docling.datamodel.document 
import InputDocument @pytest.fixture def test_doc_path(): - return Path("./tests/data/2206.01062.pdf") + return Path("./tests/data/pdf/2206.01062.pdf") def _get_backend(pdf_doc): @@ -26,8 +26,19 @@ def _get_backend(pdf_doc): return doc_backend +def test_cell_ordering(): + pdf_doc = Path("tests/data/pdf/right_to_left_01.pdf") + doc_backend = _get_backend(pdf_doc) + + for page_index in range(0, doc_backend.page_count()): + page_backend: DoclingParseV2PageBackend = doc_backend.load_page(page_index) + cells = list(page_backend.get_text_cells()) + + 1 == 1 + + def test_text_cell_counts(): - pdf_doc = Path("./tests/data/redp5110_sampled.pdf") + pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf") doc_backend = _get_backend(pdf_doc) diff --git a/tests/test_backend_pdfium.py b/tests/test_backend_pdfium.py index b2a77dcd..10a2b9e7 100644 --- a/tests/test_backend_pdfium.py +++ b/tests/test_backend_pdfium.py @@ -13,7 +13,7 @@ from docling.datamodel.document import InputDocument @pytest.fixture def test_doc_path(): - return Path("./tests/data/2206.01062.pdf") + return Path("./tests/data/pdf/2206.01062.pdf") def _get_backend(pdf_doc): @@ -28,7 +28,7 @@ def _get_backend(pdf_doc): def test_text_cell_counts(): - pdf_doc = Path("./tests/data/redp5110_sampled.pdf") + pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf") doc_backend = _get_backend(pdf_doc) diff --git a/tests/test_cli.py b/tests/test_cli.py index 71d14457..4364df8b 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -18,7 +18,7 @@ def test_cli_version(): def test_cli_convert(tmp_path): - source = "./tests/data/2305.03393v1-pg9.pdf" + source = "./tests/data/pdf/2305.03393v1-pg9.pdf" output = tmp_path / "out" output.mkdir() result = runner.invoke(app, [source, "--output", str(output)]) diff --git a/tests/test_code_formula.py b/tests/test_code_formula.py index 34a2c278..ac7a1587 100644 --- a/tests/test_code_formula.py +++ b/tests/test_code_formula.py @@ -36,7 +36,7 @@ def get_converter(): def 
test_code_and_formula_conversion(): - pdf_path = Path("tests/data/code_and_formula.pdf") + pdf_path = Path("tests/data/pdf/code_and_formula.pdf") converter = get_converter() print(f"converting {pdf_path}") diff --git a/tests/test_document_picture_classifier.py b/tests/test_document_picture_classifier.py index 0ad87e96..6ca54d63 100644 --- a/tests/test_document_picture_classifier.py +++ b/tests/test_document_picture_classifier.py @@ -37,7 +37,7 @@ def get_converter(): def test_picture_classifier(): - pdf_path = Path("tests/data/picture_classification.pdf") + pdf_path = Path("tests/data/pdf/picture_classification.pdf") converter = get_converter() print(f"converting {pdf_path}") diff --git a/tests/test_e2e_conversion.py b/tests/test_e2e_conversion.py index 0c572595..d2215d61 100644 --- a/tests/test_e2e_conversion.py +++ b/tests/test_e2e_conversion.py @@ -15,7 +15,7 @@ GENERATE_V2 = False def get_pdf_paths(): # Define the directory you want to search - directory = Path("./tests/data") + directory = Path("./tests/data/pdf/") # List all PDF files in the directory and its subdirectories pdf_files = sorted(directory.rglob("*.pdf")) diff --git a/tests/test_input_doc.py b/tests/test_input_doc.py index efecb81e..c21b6c43 100644 --- a/tests/test_input_doc.py +++ b/tests/test_input_doc.py @@ -9,7 +9,7 @@ from docling.datamodel.settings import DocumentLimits def test_in_doc_from_valid_path(): - test_doc_path = Path("./tests/data/2206.01062.pdf") + test_doc_path = Path("./tests/data/pdf/2206.01062.pdf") doc = _make_input_doc(test_doc_path) assert doc.valid == True @@ -24,7 +24,7 @@ def test_in_doc_from_invalid_path(): def test_in_doc_from_valid_buf(): - buf = BytesIO(Path("./tests/data/2206.01062.pdf").open("rb").read()) + buf = BytesIO(Path("./tests/data/pdf/2206.01062.pdf").open("rb").read()) stream = DocumentStream(name="my_doc.pdf", stream=buf) doc = _make_input_doc_from_stream(stream) @@ -41,7 +41,7 @@ def test_in_doc_from_invalid_buf(): def test_in_doc_with_page_range(): - 
test_doc_path = Path("./tests/data/2206.01062.pdf") + test_doc_path = Path("./tests/data/pdf/2206.01062.pdf") limits = DocumentLimits() limits.page_range = (1, 10) @@ -81,10 +81,10 @@ def test_guess_format(tmp_path): temp_dir.mkdir() # Valid PDF - buf = BytesIO(Path("./tests/data/2206.01062.pdf").open("rb").read()) + buf = BytesIO(Path("./tests/data/pdf/2206.01062.pdf").open("rb").read()) stream = DocumentStream(name="my_doc.pdf", stream=buf) assert dci._guess_format(stream) == InputFormat.PDF - doc_path = Path("./tests/data/2206.01062.pdf") + doc_path = Path("./tests/data/pdf/2206.01062.pdf") assert dci._guess_format(doc_path) == InputFormat.PDF # Valid MS Office diff --git a/tests/test_interfaces.py b/tests/test_interfaces.py index 23bc3345..1978bc74 100644 --- a/tests/test_interfaces.py +++ b/tests/test_interfaces.py @@ -15,7 +15,7 @@ GENERATE = False def get_pdf_path(): - pdf_path = Path("./tests/data/2305.03393v1-pg9.pdf") + pdf_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf") return pdf_path diff --git a/tests/test_invalid_input.py b/tests/test_invalid_input.py index f40d79e4..68716cba 100644 --- a/tests/test_invalid_input.py +++ b/tests/test_invalid_input.py @@ -9,7 +9,7 @@ from docling.document_converter import ConversionError, DocumentConverter def get_pdf_path(): - pdf_path = Path("./tests/data/2305.03393v1-pg9.pdf") + pdf_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf") return pdf_path diff --git a/tests/test_legacy_format_transform.py b/tests/test_legacy_format_transform.py index 28800edd..215253d2 100644 --- a/tests/test_legacy_format_transform.py +++ b/tests/test_legacy_format_transform.py @@ -16,7 +16,7 @@ def test_doc_paths(): Path("tests/data/docx/lorem_ipsum.docx"), Path("tests/data/pptx/powerpoint_sample.pptx"), Path("tests/data/2305.03393v1-pg9-img.png"), - Path("tests/data/2206.01062.pdf"), + Path("tests/data/pdf/2206.01062.pdf"), ] diff --git a/tests/test_options.py b/tests/test_options.py index 1dd3bbc8..c8701a1b 100644 --- 
a/tests/test_options.py +++ b/tests/test_options.py @@ -17,7 +17,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption @pytest.fixture def test_doc_path(): - return Path("./tests/data/2206.01062.pdf") + return Path("./tests/data/pdf/2206.01062.pdf") def get_converters_with_table_options(): diff --git a/tests/verify_utils.py b/tests/verify_utils.py index c444266b..e179a40e 100644 --- a/tests/verify_utils.py +++ b/tests/verify_utils.py @@ -249,7 +249,13 @@ def verify_conversion_result_v1( doc_pred_dt = doc_result.legacy_document.export_to_document_tokens() engine_suffix = "" if ocr_engine is None else f".{ocr_engine}" + gt_subpath = input_path.parent / "groundtruth" / "docling_v1" / input_path.name + if str(input_path.parent).endswith("pdf"): + gt_subpath = ( + input_path.parent.parent / "groundtruth" / "docling_v1" / input_path.name + ) + pages_path = gt_subpath.with_suffix(f"{engine_suffix}.pages.json") json_path = gt_subpath.with_suffix(f"{engine_suffix}.json") md_path = gt_subpath.with_suffix(f"{engine_suffix}.md") @@ -325,7 +331,13 @@ def verify_conversion_result_v2( doc_pred_dt = doc_result.document.export_to_document_tokens() engine_suffix = "" if ocr_engine is None else f".{ocr_engine}" + gt_subpath = input_path.parent / "groundtruth" / "docling_v2" / input_path.name + if str(input_path.parent).endswith("pdf"): + gt_subpath = ( + input_path.parent.parent / "groundtruth" / "docling_v2" / input_path.name + ) + pages_path = gt_subpath.with_suffix(f"{engine_suffix}.pages.json") json_path = gt_subpath.with_suffix(f"{engine_suffix}.json") md_path = gt_subpath.with_suffix(f"{engine_suffix}.md")