feat: introducing docling_backend (#26)

Adds a PDF backend that uses our own docling_parse to reliably get PDF cells.
To get page images, this backend uses pypdfium2.

Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>
Co-authored-by: Maxim Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maxim Lysak
2024-08-07 16:22:36 +02:00
committed by GitHub
parent 62ba4aaf31
commit b8f5e38a8c
4 changed files with 203 additions and 6 deletions

View File

@@ -4,7 +4,8 @@ import time
from pathlib import Path
from typing import Iterable
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
# from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
from docling.document_converter import DocumentConverter
@@ -54,11 +55,12 @@ def main():
artifacts_path = DocumentConverter.download_models_hf()
pipeline_options = PipelineOptions(do_table_structure=True)
# do_cell_matching=False uses the text cells predicted by the table structure model; True matches them with the PDF cells instead
pipeline_options.table_structure_options.do_cell_matching = False
pipeline_options.table_structure_options.do_cell_matching = True
doc_converter = DocumentConverter(
artifacts_path=artifacts_path, pipeline_options=pipeline_options
artifacts_path=artifacts_path,
pipeline_options=pipeline_options,
pdf_backend=DoclingParseDocumentBackend,
)
input = DocumentConversionInput.from_paths(input_doc_paths)