Mirror of https://github.com/DS4SD/docling.git (synced 2025-12-11 14:18:30 +00:00)
feat!: simplify conversion API (#139)
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
@@ -3,7 +3,7 @@ from pathlib import Path
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
|
||||
@@ -48,7 +48,7 @@ def test_e2e_conversions():
|
||||
for pdf_path in pdf_paths:
|
||||
print(f"converting {pdf_path}")
|
||||
|
||||
doc_result: ConversionResult = converter.convert_single(pdf_path)
|
||||
doc_result: ConversionResult = converter.convert(pdf_path)
|
||||
|
||||
verify_conversion_result_v1(
|
||||
input_path=pdf_path, doc_result=doc_result, generate=GENERATE_V1
|
||||
|
||||
@@ -8,7 +8,6 @@ from docling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
OcrOptions,
|
||||
PdfPipelineOptions,
|
||||
PipelineOptions,
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
@@ -90,7 +89,7 @@ def test_e2e_conversions():
|
||||
for pdf_path in pdf_paths:
|
||||
print(f"converting {pdf_path}")
|
||||
|
||||
doc_result: ConversionResult = converter.convert_single(pdf_path)
|
||||
doc_result: ConversionResult = converter.convert(pdf_path)
|
||||
|
||||
# Save conversions
|
||||
# save_output(pdf_path, doc_result, None)
|
||||
|
||||
@@ -5,8 +5,7 @@ import pytest
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
|
||||
@@ -37,39 +36,24 @@ def converter():
|
||||
return converter
|
||||
|
||||
|
||||
def test_convert_single(converter: DocumentConverter):
|
||||
def test_convert_path(converter: DocumentConverter):
|
||||
|
||||
pdf_path = get_pdf_path()
|
||||
print(f"converting {pdf_path}")
|
||||
|
||||
doc_result: ConversionResult = converter.convert_single(pdf_path)
|
||||
doc_result = converter.convert(pdf_path)
|
||||
verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
|
||||
verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)
|
||||
|
||||
|
||||
def test_batch_path(converter: DocumentConverter):
|
||||
|
||||
pdf_path = get_pdf_path()
|
||||
print(f"converting {pdf_path}")
|
||||
|
||||
conv_input = DocumentConversionInput.from_paths([pdf_path])
|
||||
|
||||
results = converter.convert_batch(conv_input)
|
||||
for doc_result in results:
|
||||
verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
|
||||
verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)
|
||||
|
||||
|
||||
def test_batch_bytes(converter: DocumentConverter):
|
||||
def test_convert_stream(converter: DocumentConverter):
|
||||
|
||||
pdf_path = get_pdf_path()
|
||||
print(f"converting {pdf_path}")
|
||||
|
||||
buf = BytesIO(pdf_path.open("rb").read())
|
||||
docs = [DocumentStream(name=pdf_path.name, stream=buf)]
|
||||
conv_input = DocumentConversionInput.from_streams(docs)
|
||||
stream = DocumentStream(name=pdf_path.name, stream=buf)
|
||||
|
||||
results = converter.convert_batch(conv_input)
|
||||
for doc_result in results:
|
||||
verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
|
||||
verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)
|
||||
doc_result = converter.convert(stream)
|
||||
verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
|
||||
verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)
|
||||
|
||||
@@ -39,6 +39,6 @@ def test_e2e_conversions(test_doc_path):
|
||||
for converter in get_converters_with_table_options():
|
||||
print(f"converting {test_doc_path}")
|
||||
|
||||
doc_result: ConversionResult = converter.convert_single(test_doc_path)
|
||||
doc_result: ConversionResult = converter.convert(test_doc_path)
|
||||
|
||||
assert doc_result.status == ConversionStatus.SUCCESS
|
||||
|
||||
Reference in New Issue
Block a user