feat: introducing docling_backend (#26)

Uses our own docling_parse to reliably get PDF cells. To get page images, this backend uses pypdfium2. Signed-off-by: Maxim Lysak <mly@zurich.ibm.com> Co-authored-by: Maxim Lysak <mly@zurich.ibm.com>
2025-12-08 20:58:11 +00:00 · 2024-08-07 16:22:36 +02:00
parent 62ba4aaf31
commit b8f5e38a8c
4 changed files with 203 additions and 6 deletions
--- a/examples/convert.py
+++ b/examples/convert.py
@@ -4,7 +4,8 @@ import time
 from pathlib import Path
 from typing import Iterable

-from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+# from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.base_models import ConversionStatus, PipelineOptions
 from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
 from docling.document_converter import DocumentConverter
@@ -54,11 +55,12 @@ def main():
    artifacts_path = DocumentConverter.download_models_hf()

    pipeline_options = PipelineOptions(do_table_structure=True)
-    # use text cells predicted from table structure model, instead of matching with pdf cells
-    pipeline_options.table_structure_options.do_cell_matching = False
+    pipeline_options.table_structure_options.do_cell_matching = True

    doc_converter = DocumentConverter(
-        artifacts_path=artifacts_path, pipeline_options=pipeline_options
+        artifacts_path=artifacts_path,
+        pipeline_options=pipeline_options,
+        pdf_backend=DoclingParseDocumentBackend,
    )

    input = DocumentConversionInput.from_paths(input_doc_paths)