From 8777b759ae601d8e051c6a97bc4b1cf52072ddee Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Fri, 11 Oct 2024 14:31:07 +0200 Subject: [PATCH] feat!: simplify conversion API Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- docling/cli/main.py | 7 +- docling/datamodel/document.py | 36 ++------- docling/document_converter.py | 124 +++++++++++++------------------ docling/models/ds_glm_model.py | 4 +- examples/batch_convert.py | 11 +-- examples/custom_convert.py | 21 +----- examples/export_figures.py | 63 +++++----------- examples/export_multimodal.py | 94 ++++++++++------------- examples/export_tables.py | 58 +++++---------- examples/minimal.py | 2 +- examples/run_with_formats.py | 6 +- tests/test_e2e_conversion.py | 4 +- tests/test_e2e_ocr_conversion.py | 3 +- tests/test_interfaces.py | 32 ++------ tests/test_options.py | 2 +- 15 files changed, 164 insertions(+), 303 deletions(-) diff --git a/docling/cli/main.py b/docling/cli/main.py index b925e796..03701ad3 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -13,7 +13,7 @@ from docling_core.utils.file import resolve_file_source from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.datamodel.base_models import ConversionStatus, InputFormat -from docling.datamodel.document import ConversionResult, DocumentConversionInput +from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( EasyOcrOptions, PdfPipelineOptions, @@ -231,12 +231,9 @@ def convert( } ) - # Define input files - input = DocumentConversionInput.from_paths(input_doc_paths) - start_time = time.time() - conv_results = doc_converter.convert_batch(input) + conv_results = doc_converter.convert_all(input_doc_paths) output.mkdir(parents=True, exist_ok=True) export_documents( diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index ede4e328..615acfac 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -19,6 +19,7 @@ from docling_core.types.experimental import ( DocItemLabel, DoclingDocument, ) +from docling_core.utils.file import resolve_file_source from pydantic import BaseModel from typing_extensions import deprecated @@ -162,8 +163,7 @@ class DocumentFormat(str, Enum): V1 = "v1" -@deprecated("Use `ConversionResult` instead.") -class ConvertedDocument(BaseModel): +class ConversionResult(BaseModel): input: InputDocument status: ConversionStatus = ConversionStatus.PENDING # failure, success @@ -457,20 +457,16 @@ class ConvertedDocument(BaseModel): yield element, cropped_im -class ConversionResult(ConvertedDocument): - pass +class _DocumentConversionInput(BaseModel): - -class DocumentConversionInput(BaseModel): - - _path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None + path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]] limits: Optional[DocumentLimits] = DocumentLimits() def docs( self, format_options: Dict[InputFormat, "FormatOption"] ) -> Iterable[InputDocument]: - - for obj in self._path_or_stream_iterator: + for item in self.path_or_stream_iterator: + obj = resolve_file_source(item) if isinstance(item, str) else item format = self._guess_format(obj) if format not in format_options.keys(): _log.debug( @@ -496,6 +492,8 @@ class DocumentConversionInput(BaseModel): limits=self.limits, backend=backend, ) + else: + raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}") def _guess_format(self, obj): content = None @@ -531,21 +529,3 @@ class DocumentConversionInput(BaseModel): return "text/html" return None - - @classmethod - def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None): - paths = [Path(p) for p in paths] - - doc_input = cls(limits=limits) - doc_input._path_or_stream_iterator = paths - - return doc_input - - @classmethod - def from_streams( - cls, streams: Iterable[DocumentStream], limits: Optional[DocumentLimits] = None - ): - doc_input = cls(limits=limits) - doc_input._path_or_stream_iterator = streams - - return doc_input diff --git a/docling/document_converter.py b/docling/document_converter.py index dc919883..f354d58b 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -1,34 +1,24 @@ import logging -import tempfile +import sys import time from pathlib import Path from typing import Dict, Iterable, List, Optional, Type -import requests -from pydantic import ( - AnyHttpUrl, - BaseModel, - ConfigDict, - TypeAdapter, - ValidationError, - field_validator, - model_validator, -) -from typing_extensions import deprecated +from pydantic import BaseModel, ConfigDict, model_validator, validate_call from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.html_backend import HTMLDocumentBackend from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend from docling.backend.msword_backend import MsWordDocumentBackend -from docling.datamodel.base_models import ConversionStatus, InputFormat +from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat from docling.datamodel.document import ( ConversionResult, - DocumentConversionInput, InputDocument, + _DocumentConversionInput, ) from docling.datamodel.pipeline_options import PipelineOptions -from docling.datamodel.settings import settings +from docling.datamodel.settings import DocumentLimits, settings from docling.pipeline.base_model_pipeline import AbstractModelPipeline from docling.pipeline.simple_model_pipeline import SimpleModelPipeline from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline @@ -119,16 +109,56 @@ class DocumentConverter: Type[AbstractModelPipeline], AbstractModelPipeline ] = {} - @deprecated("Use convert_batch instead.") - def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]: - yield from self.convert_batch(input=input) + @validate_call(config=ConfigDict(strict=True)) + def convert( + self, + source: Path | str | DocumentStream, # TODO review naming + raises_on_error: bool = True, + max_num_pages: int = sys.maxsize, + max_file_size: int = sys.maxsize, + ) -> ConversionResult: - def convert_batch( - self, input: DocumentConversionInput, raise_on_error: bool = False + all_res = self.convert_all( + source=[source], + raises_on_error=raises_on_error, + max_num_pages=max_num_pages, + max_file_size=max_file_size, + ) + return next(all_res) + + @validate_call(config=ConfigDict(strict=True)) + def convert_all( + self, + source: Iterable[Path | str | DocumentStream], # TODO review naming + raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error + max_num_pages: int = sys.maxsize, + max_file_size: int = sys.maxsize, ) -> Iterable[ConversionResult]: + limits = DocumentLimits( + max_num_pages=max_num_pages, + max_file_size=max_file_size, + ) + conv_input = _DocumentConversionInput( + path_or_stream_iterator=source, + limit=limits, + ) + conv_res_iter = self._convert(conv_input) + for conv_res in conv_res_iter: + if raises_on_error and conv_res.status not in { + ConversionStatus.SUCCESS, + ConversionStatus.PARTIAL_SUCCESS, + }: + raise RuntimeError( + f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}" + ) + else: + yield conv_res + def _convert( + self, conv_input: _DocumentConversionInput + ) -> Iterable[ConversionResult]: for input_batch in chunkify( - input.docs(self.format_to_options), + conv_input.docs(self.format_to_options), settings.perf.doc_batch_size, # pass format_options ): _log.info(f"Going to convert document batch...") @@ -143,58 +173,6 @@ class DocumentConverter: if item is not None: yield item - def convert_single( - self, source: Path | AnyHttpUrl | str, raise_on_error: bool = False - ) -> ConversionResult: - """Convert a single document. - - Args: - source (Path | AnyHttpUrl | str): The PDF input source. Can be a path or URL. - - Raises: - ValueError: If source is of unexpected type. - RuntimeError: If conversion fails. - - Returns: - ConversionResult: The conversion result object. - """ - with tempfile.TemporaryDirectory() as temp_dir: - try: - http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source) - res = requests.get(http_url, stream=True) - res.raise_for_status() - fname = None - # try to get filename from response header - if cont_disp := res.headers.get("Content-Disposition"): - for par in cont_disp.strip().split(";"): - # currently only handling directive "filename" (not "*filename") - if (split := par.split("=")) and split[0].strip() == "filename": - fname = "=".join(split[1:]).strip().strip("'\"") or None - break - # otherwise, use name from URL: - if fname is None: - fname = Path(http_url.path).name or self._default_download_filename - local_path = Path(temp_dir) / fname - with open(local_path, "wb") as f: - for chunk in res.iter_content(chunk_size=1024): # using 1-KB chunks - f.write(chunk) - except ValidationError: - try: - local_path = TypeAdapter(Path).validate_python(source) - except ValidationError: - raise ValueError( - f"Unexpected file path type encountered: {type(source)}" - ) - conv_inp = DocumentConversionInput.from_paths(paths=[local_path]) - conv_res_iter = self.convert_batch(conv_inp) - conv_res: ConversionResult = next(conv_res_iter) - if conv_res.status not in { - ConversionStatus.SUCCESS, - ConversionStatus.PARTIAL_SUCCESS, - }: - raise RuntimeError(f"Conversion failed with status: {conv_res.status}") - return conv_res - def _get_pipeline(self, doc: InputDocument) -> Optional[AbstractModelPipeline]: fopt = self.format_to_options.get(doc.format) diff --git a/docling/models/ds_glm_model.py b/docling/models/ds_glm_model.py index 5fa35af1..e44f493a 100644 --- a/docling/models/ds_glm_model.py +++ b/docling/models/ds_glm_model.py @@ -14,13 +14,15 @@ from docling_core.types import Ref from docling_core.types.experimental import BoundingBox, CoordOrigin from docling_core.types.experimental.document import DoclingDocument from PIL import ImageDraw -from pydantic import BaseModel +from pydantic import BaseModel, ConfigDict from docling.datamodel.base_models import Cluster from docling.datamodel.document import ConversionResult class GlmOptions(BaseModel): + model_config = ConfigDict(protected_namespaces=()) + create_legacy_output: bool = True model_names: str = "" # e.g. "language;term;reference" diff --git a/examples/batch_convert.py b/examples/batch_convert.py index e54193f0..0cf2d650 100644 --- a/examples/batch_convert.py +++ b/examples/batch_convert.py @@ -7,7 +7,7 @@ from typing import Iterable import yaml from docling.datamodel.base_models import ConversionStatus -from docling.datamodel.document import ConversionResult, DocumentConversionInput +from docling.datamodel.document import ConversionResult from docling.document_converter import DocumentConverter _log = logging.getLogger(__name__) @@ -125,18 +125,19 @@ def main(): doc_converter = DocumentConverter() - input = DocumentConversionInput.from_paths(input_doc_paths) - start_time = time.time() - conv_results = doc_converter.convert_batch(input) + conv_results = doc_converter.convert_all( + input_doc_paths, + raises_on_error=False, # to let conversion run through all and examine results at the end + ) success_count, partial_success_count, failure_count = export_documents( conv_results, output_dir=Path("./scratch") ) end_time = time.time() - start_time - _log.info(f"All documents were converted in {end_time:.2f} seconds.") + _log.info(f"Document conversion complete in {end_time:.2f} seconds.") if failure_count > 0: raise RuntimeError( diff --git a/examples/custom_convert.py b/examples/custom_convert.py index 0805837b..70d86520 100644 --- a/examples/custom_convert.py +++ b/examples/custom_convert.py @@ -5,7 +5,7 @@ from pathlib import Path from typing import Iterable from docling.datamodel.base_models import ConversionStatus, InputFormat -from docling.datamodel.document import ConversionResult, DocumentConversionInput +from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( PdfPipelineOptions, TesseractCliOcrOptions, @@ -65,9 +65,7 @@ def export_documents( def main(): logging.basicConfig(level=logging.INFO) - input_doc_paths = [ - Path("./tests/data/2206.01062.pdf"), - ] + input_doc_path = Path("./tests/data/2206.01062.pdf") ########################################################################### @@ -152,24 +150,13 @@ def main(): ########################################################################### - # Define input files - input = DocumentConversionInput.from_paths(input_doc_paths) - start_time = time.time() - conv_results = doc_converter.convert_batch(input) - success_count, failure_count = export_documents( - conv_results, output_dir=Path("./scratch") - ) + conv_result = doc_converter.convert(input_doc_path) end_time = time.time() - start_time - _log.info(f"All documents were converted in {end_time:.2f} seconds.") - - if failure_count > 0: - raise RuntimeError( - f"The example failed converting {failure_count} on {len(input_doc_paths)}." - ) + _log.info(f"Document converted in {end_time:.2f} seconds.") if __name__ == "__main__": diff --git a/examples/export_figures.py b/examples/export_figures.py index 23f1bd20..4fa4dc58 100644 --- a/examples/export_figures.py +++ b/examples/export_figures.py @@ -2,13 +2,7 @@ import logging import time from pathlib import Path -from docling.datamodel.base_models import ( - ConversionStatus, - FigureElement, - InputFormat, - Table, -) -from docling.datamodel.document import DocumentConversionInput +from docling.datamodel.base_models import FigureElement, InputFormat, Table from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.document_converter import DocumentConverter, PdfFormatOption @@ -20,13 +14,9 @@ IMAGE_RESOLUTION_SCALE = 2.0 def main(): logging.basicConfig(level=logging.INFO) - input_doc_paths = [ - Path("./tests/data/2206.01062.pdf"), - ] + input_doc_path = Path("./tests/data/2206.01062.pdf") output_dir = Path("./scratch") - input_files = DocumentConversionInput.from_paths(input_doc_paths) - # Important: For operating with page images, we must keep them, otherwise the DocumentConverter # will destroy them for cleaning up memory. # This is done by setting AssembleOptions.images_scale, which also defines the scale of images. @@ -42,46 +32,29 @@ def main(): start_time = time.time() - conv_results = doc_converter.convert_batch(input_files) + conv_res = doc_converter.convert(input_doc_path) - success_count = 0 - failure_count = 0 output_dir.mkdir(parents=True, exist_ok=True) - for conv_res in conv_results: - if conv_res.status != ConversionStatus.SUCCESS: - _log.info(f"Document {conv_res.input.file} failed to convert.") - failure_count += 1 - continue + doc_filename = conv_res.input.file.stem - doc_filename = conv_res.input.file.stem + # Export page images + for page in conv_res.pages: + page_no = page.page_no + 1 + page_image_filename = output_dir / f"{doc_filename}-{page_no}.png" + with page_image_filename.open("wb") as fp: + page.image.save(fp, format="PNG") - # Export page images - for page in conv_res.pages: - page_no = page.page_no + 1 - page_image_filename = output_dir / f"{doc_filename}-{page_no}.png" - with page_image_filename.open("wb") as fp: - page.image.save(fp, format="PNG") - - # Export figures and tables - for element, image in conv_res.render_element_images( - element_types=(FigureElement, Table) - ): - element_image_filename = ( - output_dir / f"{doc_filename}-element-{element.id}.png" - ) - with element_image_filename.open("wb") as fp: - image.save(fp, "PNG") - - success_count += 1 + # Export figures and tables + for element, image in conv_res.render_element_images( + element_types=(FigureElement, Table) + ): + element_image_filename = output_dir / f"{doc_filename}-element-{element.id}.png" + with element_image_filename.open("wb") as fp: + image.save(fp, "PNG") end_time = time.time() - start_time - _log.info(f"All documents were converted in {end_time:.2f} seconds.") - - if failure_count > 0: - raise RuntimeError( - f"The example failed converting {failure_count} on {len(input_doc_paths)}." - ) + _log.info(f"Document converted and figures exported in {end_time:.2f} seconds.") if __name__ == "__main__": diff --git a/examples/export_multimodal.py b/examples/export_multimodal.py index 11dd3f41..af569131 100644 --- a/examples/export_multimodal.py +++ b/examples/export_multimodal.py @@ -5,8 +5,7 @@ from pathlib import Path import pandas as pd -from docling.datamodel.base_models import ConversionStatus, InputFormat -from docling.datamodel.document import DocumentConversionInput +from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.document_converter import DocumentConverter, PdfFormatOption from docling.utils.export import generate_multimodal_pages @@ -19,13 +18,9 @@ IMAGE_RESOLUTION_SCALE = 2.0 def main(): logging.basicConfig(level=logging.INFO) - input_doc_paths = [ - Path("./tests/data/2206.01062.pdf"), - ] + input_doc_path = Path("./tests/data/2206.01062.pdf") output_dir = Path("./scratch") - input_files = DocumentConversionInput.from_paths(input_doc_paths) - # Important: For operating with page images, we must keep them, otherwise the DocumentConverter # will destroy them for cleaning up memory. # This is done by setting AssembleOptions.images_scale, which also defines the scale of images. @@ -41,53 +36,45 @@ def main(): start_time = time.time() - converted_docs = doc_converter.convert_batch(input_files) + conv_res = doc_converter.convert(input_doc_path) - success_count = 0 - failure_count = 0 output_dir.mkdir(parents=True, exist_ok=True) - for doc in converted_docs: - if doc.status != ConversionStatus.SUCCESS: - _log.info(f"Document {doc.input.file} failed to convert.") - failure_count += 1 - continue - rows = [] - for ( - content_text, - content_md, - content_dt, - page_cells, - page_segments, - page, - ) in generate_multimodal_pages(doc): + rows = [] + for ( + content_text, + content_md, + content_dt, + page_cells, + page_segments, + page, + ) in generate_multimodal_pages(conv_res): - dpi = page._default_image_scale * 72 + dpi = page._default_image_scale * 72 - rows.append( - { - "document": doc.input.file.name, - "hash": doc.input.document_hash, - "page_hash": page.page_hash, - "image": { - "width": page.image.width, - "height": page.image.height, - "bytes": page.image.tobytes(), - }, - "cells": page_cells, - "contents": content_text, - "contents_md": content_md, - "contents_dt": content_dt, - "segments": page_segments, - "extra": { - "page_num": page.page_no + 1, - "width_in_points": page.size.width, - "height_in_points": page.size.height, - "dpi": dpi, - }, - } - ) - success_count += 1 + rows.append( + { + "document": conv_res.input.file.name, + "hash": conv_res.input.document_hash, + "page_hash": page.page_hash, + "image": { + "width": page.image.width, + "height": page.image.height, + "bytes": page.image.tobytes(), + }, + "cells": page_cells, + "contents": content_text, + "contents_md": content_md, + "contents_dt": content_dt, + "segments": page_segments, + "extra": { + "page_num": page.page_no + 1, + "width_in_points": page.size.width, + "height_in_points": page.size.height, + "dpi": dpi, + }, + } + ) # Generate one parquet from all documents df = pd.json_normalize(rows) @@ -97,12 +84,9 @@ def main(): end_time = time.time() - start_time - _log.info(f"All documents were converted in {end_time:.2f} seconds.") - - if failure_count > 0: - raise RuntimeError( - f"The example failed converting {failure_count} on {len(input_doc_paths)}." - ) + _log.info( + f"Document converted and multimodal pages generated in {end_time:.2f} seconds." + ) # This block demonstrates how the file can be opened with the HF datasets library # from datasets import Dataset diff --git a/examples/export_tables.py b/examples/export_tables.py index 720e8c67..79a3333d 100644 --- a/examples/export_tables.py +++ b/examples/export_tables.py @@ -4,8 +4,6 @@ from pathlib import Path import pandas as pd -from docling.datamodel.base_models import ConversionStatus -from docling.datamodel.document import DocumentConversionInput from docling.document_converter import DocumentConverter _log = logging.getLogger(__name__) @@ -14,59 +12,39 @@ _log = logging.getLogger(__name__) def main(): logging.basicConfig(level=logging.INFO) - input_doc_paths = [ - Path("./tests/data/2206.01062.pdf"), - ] + input_doc_path = Path("./tests/data/2206.01062.pdf") output_dir = Path("./scratch") - input_files = DocumentConversionInput.from_paths(input_doc_paths) - doc_converter = DocumentConverter() start_time = time.time() - conv_results = doc_converter.convert_batch(input_files) + conv_res = doc_converter.convert(input_doc_path) - success_count = 0 - failure_count = 0 output_dir.mkdir(parents=True, exist_ok=True) - for conv_res in conv_results: - if conv_res.status != ConversionStatus.SUCCESS: - _log.info(f"Document {conv_res.input.file} failed to convert.") - failure_count += 1 - continue - doc_filename = conv_res.input.file.stem + doc_filename = conv_res.input.file.stem - # Export tables - for table_ix, table in enumerate(conv_res.legacy_output.tables): - table_df: pd.DataFrame = table.export_to_dataframe() - print(f"## Table {table_ix}") - print(table_df.to_markdown()) + # Export tables + for table_ix, table in enumerate(conv_res.legacy_output.tables): + table_df: pd.DataFrame = table.export_to_dataframe() + print(f"## Table {table_ix}") + print(table_df.to_markdown()) - # Save the table as csv - element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv" - _log.info(f"Saving CSV table to {element_csv_filename}") - table_df.to_csv(element_csv_filename) + # Save the table as csv + element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv" + _log.info(f"Saving CSV table to {element_csv_filename}") + table_df.to_csv(element_csv_filename) - # Save the table as html - element_html_filename = ( - output_dir / f"{doc_filename}-table-{table_ix+1}.html" - ) - _log.info(f"Saving HTML table to {element_html_filename}") - with element_html_filename.open("w") as fp: - fp.write(table.export_to_html()) - - success_count += 1 + # Save the table as html + element_html_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.html" + _log.info(f"Saving HTML table to {element_html_filename}") + with element_html_filename.open("w") as fp: + fp.write(table.export_to_html()) end_time = time.time() - start_time - _log.info(f"All documents were converted in {end_time:.2f} seconds.") - - if failure_count > 0: - raise RuntimeError( - f"The example failed converting {failure_count} on {len(input_doc_paths)}." - ) + _log.info(f"Document converted and tables exported in {end_time:.2f} seconds.") if __name__ == "__main__": diff --git a/examples/minimal.py b/examples/minimal.py index fb84cca4..55cdfc46 100644 --- a/examples/minimal.py +++ b/examples/minimal.py @@ -2,7 +2,7 @@ from docling.document_converter import DocumentConverter source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL converter = DocumentConverter() -result = converter.convert_single(source) +result = converter.convert(source) print(result.output.export_to_markdown()) # output: ## Docling Technical Report [...]" # if the legacy output is needed, use this version # print(result.render_as_markdown_v1()) # output: ## Docling Technical Report [...]" diff --git a/examples/run_with_formats.py b/examples/run_with_formats.py index f086bae2..37bb1b1a 100644 --- a/examples/run_with_formats.py +++ b/examples/run_with_formats.py @@ -6,7 +6,6 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.msword_backend import MsWordDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.datamodel.base_models import InputFormat -from docling.datamodel.document import DocumentConversionInput from docling.document_converter import ( DocumentConverter, FormatOption, @@ -28,7 +27,6 @@ input_paths = [ Path("tests/data/2206.01062.pdf"), # Path("tests/data/2305.03393v1-pg9-img.png"), ] -input = DocumentConversionInput.from_paths(input_paths) ## for defaults use: # doc_converter = DocumentConverter() @@ -52,12 +50,12 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal }, ) -conv_results = doc_converter.convert_batch(input) +conv_results = doc_converter.convert_all(input_paths) for res in conv_results: out_path = Path("./scratch") print( - f"Document {res.input.file.name} converted with status {res.status}." + f"Document {res.input.file.name} converted." f"\nSaved markdown output to: {str(out_path)}" ) # print(res.experimental.export_to_markdown()) diff --git a/tests/test_e2e_conversion.py b/tests/test_e2e_conversion.py index d7432a10..c18a7a5b 100644 --- a/tests/test_e2e_conversion.py +++ b/tests/test_e2e_conversion.py @@ -3,7 +3,7 @@ from pathlib import Path from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ConversionResult -from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions +from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.document_converter import DocumentConverter, PdfFormatOption from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2 @@ -48,7 +48,7 @@ def test_e2e_conversions(): for pdf_path in pdf_paths: print(f"converting {pdf_path}") - doc_result: ConversionResult = converter.convert_single(pdf_path) + doc_result: ConversionResult = converter.convert(pdf_path) verify_conversion_result_v1( input_path=pdf_path, doc_result=doc_result, generate=GENERATE_V1 diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py index ee7f3931..86c22554 100644 --- a/tests/test_e2e_ocr_conversion.py +++ b/tests/test_e2e_ocr_conversion.py @@ -8,7 +8,6 @@ from docling.datamodel.pipeline_options import ( EasyOcrOptions, OcrOptions, PdfPipelineOptions, - PipelineOptions, TesseractCliOcrOptions, TesseractOcrOptions, ) @@ -90,7 +89,7 @@ def test_e2e_conversions(): for pdf_path in pdf_paths: print(f"converting {pdf_path}") - doc_result: ConversionResult = converter.convert_single(pdf_path) + doc_result: ConversionResult = converter.convert(pdf_path) # Save conversions # save_output(pdf_path, doc_result, None) diff --git a/tests/test_interfaces.py b/tests/test_interfaces.py index 80f5ea4e..9ef3d131 100644 --- a/tests/test_interfaces.py +++ b/tests/test_interfaces.py @@ -5,8 +5,7 @@ import pytest from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.datamodel.base_models import DocumentStream, InputFormat -from docling.datamodel.document import ConversionResult, DocumentConversionInput -from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions +from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.document_converter import DocumentConverter, PdfFormatOption from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2 @@ -37,39 +36,24 @@ def converter(): return converter -def test_convert_single(converter: DocumentConverter): +def test_convert_path(converter: DocumentConverter): pdf_path = get_pdf_path() print(f"converting {pdf_path}") - doc_result: ConversionResult = converter.convert_single(pdf_path) + doc_result = converter.convert(pdf_path) verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result) verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result) -def test_batch_path(converter: DocumentConverter): - - pdf_path = get_pdf_path() - print(f"converting {pdf_path}") - - conv_input = DocumentConversionInput.from_paths([pdf_path]) - - results = converter.convert_batch(conv_input) - for doc_result in results: - verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result) - verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result) - - -def test_batch_bytes(converter: DocumentConverter): +def test_convert_stream(converter: DocumentConverter): pdf_path = get_pdf_path() print(f"converting {pdf_path}") buf = BytesIO(pdf_path.open("rb").read()) - docs = [DocumentStream(name=pdf_path.name, stream=buf)] - conv_input = DocumentConversionInput.from_streams(docs) + stream = DocumentStream(name=pdf_path.name, stream=buf) - results = converter.convert_batch(conv_input) - for doc_result in results: - verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result) - verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result) + doc_result = converter.convert(stream) + verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result) + verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result) diff --git a/tests/test_options.py b/tests/test_options.py index 8b35811b..ad6c7a45 100644 --- a/tests/test_options.py +++ b/tests/test_options.py @@ -39,6 +39,6 @@ def test_e2e_conversions(test_doc_path): for converter in get_converters_with_table_options(): print(f"converting {test_doc_path}") - doc_result: ConversionResult = converter.convert_single(test_doc_path) + doc_result: ConversionResult = converter.convert(test_doc_path) assert doc_result.status == ConversionStatus.SUCCESS