Merge from simplify-conv-api

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2024-10-11 15:57:08 +02:00
22 changed files with 286 additions and 380 deletions

View File

@@ -48,7 +48,7 @@ def test_e2e_conversions():
for pdf_path in pdf_paths:
print(f"converting {pdf_path}")
- doc_result: ConversionResult = converter.convert_single(pdf_path)
+ doc_result: ConversionResult = converter.convert(pdf_path)
verify_conversion_result_v1(
input_path=pdf_path, doc_result=doc_result, generate=GENERATE_V1

View File

@@ -89,7 +89,7 @@ def test_e2e_conversions():
for pdf_path in pdf_paths:
print(f"converting {pdf_path}")
- doc_result: ConversionResult = converter.convert_single(pdf_path)
+ doc_result: ConversionResult = converter.convert(pdf_path)
# Save conversions
# save_output(pdf_path, doc_result, None)

View File

@@ -5,7 +5,6 @@ import pytest
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import DocumentStream, InputFormat
- from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
@@ -37,39 +36,24 @@ def converter():
return converter
- def test_convert_single(converter: DocumentConverter):
+ def test_convert_path(converter: DocumentConverter):
pdf_path = get_pdf_path()
print(f"converting {pdf_path}")
- doc_result: ConversionResult = converter.convert_single(pdf_path)
+ doc_result = converter.convert(pdf_path)
verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)
- def test_batch_path(converter: DocumentConverter):
- pdf_path = get_pdf_path()
- print(f"converting {pdf_path}")
- conv_input = DocumentConversionInput.from_paths([pdf_path])
- results = converter.convert_batch(conv_input)
- for doc_result in results:
- verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
- verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)
- def test_batch_bytes(converter: DocumentConverter):
+ def test_convert_stream(converter: DocumentConverter):
pdf_path = get_pdf_path()
print(f"converting {pdf_path}")
buf = BytesIO(pdf_path.open("rb").read())
- docs = [DocumentStream(name=pdf_path.name, stream=buf)]
- conv_input = DocumentConversionInput.from_streams(docs)
+ stream = DocumentStream(name=pdf_path.name, stream=buf)
- results = converter.convert_batch(conv_input)
- for doc_result in results:
- verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
- verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)
+ doc_result = converter.convert(stream)
+ verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
+ verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)

View File

@@ -39,6 +39,6 @@ def test_e2e_conversions(test_doc_path):
for converter in get_converters_with_table_options():
print(f"converting {test_doc_path}")
- doc_result: ConversionResult = converter.convert_single(test_doc_path)
+ doc_result: ConversionResult = converter.convert(test_doc_path)
assert doc_result.status == ConversionStatus.SUCCESS

View File

@@ -1,4 +1,5 @@
import json
+ import warnings
from pathlib import Path
from typing import List
@@ -234,8 +235,10 @@ def verify_conversion_result_v1(
doc_pred_pages: List[Page] = doc_result.pages
doc_pred: DsDocument = doc_result.legacy_output
- doc_pred_md = doc_result.render_as_markdown()
- doc_pred_dt = doc_result.render_as_doctags()
+ with warnings.catch_warnings():
+ warnings.simplefilter("ignore", DeprecationWarning)
+ doc_pred_md = doc_result.render_as_markdown()
+ doc_pred_dt = doc_result.render_as_doctags()
engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"
gt_subpath = input_path.parent / "groundtruth" / "docling_v1" / input_path.name