Fundamental refactoring for multi-format support

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2024-10-01 16:27:22 +02:00
parent cd06d89c2a
commit 1fa7cd9855
34 changed files with 2102 additions and 365 deletions

Binary file not shown.

1311
tests/data/wiki_duck.html Normal file

File diff suppressed because one or more lines are too long

BIN
tests/data/word_sample.docx Normal file

Binary file not shown.

View File

@@ -2,9 +2,9 @@ from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import PipelineOptions
from docling.datamodel.base_models import PdfPipelineOptions
from docling.datamodel.document import ConversionResult
from docling.document_converter import DocumentConverter
from docling.pdf_document_converter import PdfDocumentConverter
from .verify_utils import verify_conversion_result
@@ -23,12 +23,12 @@ def get_pdf_paths():
def get_converter():
pipeline_options = PipelineOptions()
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
converter = DocumentConverter(
converter = PdfDocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=DoclingParseDocumentBackend,
)

View File

@@ -5,9 +5,9 @@ import pytest
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import DocumentStream, PipelineOptions
from docling.datamodel.base_models import DocumentStream, PdfPipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.document_converter import DocumentConverter
from docling.pdf_document_converter import PdfDocumentConverter
from .verify_utils import verify_conversion_result
@@ -21,12 +21,12 @@ def get_pdf_path():
@pytest.fixture
def converter():
pipeline_options = PipelineOptions()
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
converter = DocumentConverter(
converter = PdfDocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=DoclingParseDocumentBackend,
)
@@ -34,7 +34,7 @@ def converter():
return converter
def test_convert_single(converter: DocumentConverter):
def test_convert_single(converter: PdfDocumentConverter):
pdf_path = get_pdf_path()
print(f"converting {pdf_path}")
@@ -43,7 +43,7 @@ def test_convert_single(converter: DocumentConverter):
verify_conversion_result(input_path=pdf_path, doc_result=doc_result)
def test_batch_path(converter: DocumentConverter):
def test_batch_path(converter: PdfDocumentConverter):
pdf_path = get_pdf_path()
print(f"converting {pdf_path}")
@@ -55,7 +55,7 @@ def test_batch_path(converter: DocumentConverter):
verify_conversion_result(input_path=pdf_path, doc_result=doc_result)
def test_batch_bytes(converter: DocumentConverter):
def test_batch_bytes(converter: PdfDocumentConverter):
pdf_path = get_pdf_path()
print(f"converting {pdf_path}")