mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-11 14:18:30 +00:00
Update examples and test cases
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -1,9 +1,10 @@
|
||||
from pathlib import Path
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
from .verify_utils import verify_conversion_result
|
||||
|
||||
@@ -22,14 +23,17 @@ def get_pdf_paths():
|
||||
|
||||
def get_converter():
|
||||
|
||||
pipeline_options = PipelineOptions()
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
pipeline_options.do_ocr = False
|
||||
pipeline_options.do_table_structure = True
|
||||
pipeline_options.table_structure_options.do_cell_matching = True
|
||||
|
||||
converter = DocumentConverter(
|
||||
pipeline_options=pipeline_options,
|
||||
pdf_backend=DoclingParseDocumentBackend,
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
pipeline_options=pipeline_options, backend=DoclingParseDocumentBackend
|
||||
)
|
||||
}
|
||||
)
|
||||
|
||||
return converter
|
||||
|
||||
@@ -4,10 +4,10 @@ from pathlib import Path
|
||||
import pytest
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.datamodel.base_models import DocumentStream
|
||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
from .verify_utils import verify_conversion_result
|
||||
|
||||
@@ -21,14 +21,17 @@ def get_pdf_path():
|
||||
@pytest.fixture
|
||||
def converter():
|
||||
|
||||
pipeline_options = PipelineOptions()
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
pipeline_options.do_ocr = False
|
||||
pipeline_options.do_table_structure = True
|
||||
pipeline_options.table_structure_options.do_cell_matching = True
|
||||
|
||||
converter = DocumentConverter(
|
||||
pipeline_options=pipeline_options,
|
||||
pdf_backend=DoclingParseDocumentBackend,
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
pipeline_options=pipeline_options, backend=DoclingParseDocumentBackend
|
||||
)
|
||||
}
|
||||
)
|
||||
|
||||
return converter
|
||||
@@ -61,7 +64,7 @@ def test_batch_bytes(converter: DocumentConverter):
|
||||
print(f"converting {pdf_path}")
|
||||
|
||||
buf = BytesIO(pdf_path.open("rb").read())
|
||||
docs = [DocumentStream(filename=pdf_path.name, stream=buf)]
|
||||
docs = [DocumentStream(name=pdf_path.name, stream=buf)]
|
||||
conv_input = DocumentConversionInput.from_streams(docs)
|
||||
|
||||
results = converter.convert(conv_input)
|
||||
|
||||
@@ -3,10 +3,10 @@ from pathlib import Path
|
||||
import pytest
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus
|
||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import PipelineOptions, TableFormerMode
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@@ -17,15 +17,19 @@ def test_doc_path():
|
||||
def get_converters_with_table_options():
|
||||
for cell_matching in [True, False]:
|
||||
for mode in [TableFormerMode.FAST, TableFormerMode.ACCURATE]:
|
||||
pipeline_options = PipelineOptions()
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
pipeline_options.do_ocr = False
|
||||
pipeline_options.do_table_structure = True
|
||||
pipeline_options.table_structure_options.do_cell_matching = cell_matching
|
||||
pipeline_options.table_structure_options.mode = mode
|
||||
|
||||
converter = DocumentConverter(
|
||||
pipeline_options=pipeline_options,
|
||||
pdf_backend=DoclingParseDocumentBackend,
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
pipeline_options=pipeline_options,
|
||||
backend=DoclingParseDocumentBackend,
|
||||
)
|
||||
}
|
||||
)
|
||||
|
||||
yield converter
|
||||
|
||||
Reference in New Issue
Block a user