From c6e1471e0237a71ec057309f2ba66b1ecebca365 Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Fri, 11 Oct 2024 12:58:59 +0200 Subject: [PATCH 1/4] use options objects Signed-off-by: Michele Dolfi --- docling/datamodel/pipeline_options.py | 15 ---- docling/models/layout_model.py | 8 +- docling/models/page_assemble_model.py | 12 ++- docling/models/page_preprocessing_model.py | 11 ++- docling/models/table_structure_model.py | 16 ++-- .../pipeline/standard_pdf_model_pipeline.py | 76 +++++++++++-------- tests/verify_utils.py | 7 +- 7 files changed, 78 insertions(+), 67 deletions(-) diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 4be6fcec..10d5b14f 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -72,19 +72,4 @@ class PdfPipelineOptions(PipelineOptions): Field(EasyOcrOptions(), discriminator="kind") ) - keep_page_images: Annotated[ - bool, - Field( - deprecated="`keep_page_images` is depreacted, set the value of `images_scale` instead" - ), - ] = False # False: page images are removed in the assemble step images_scale: Optional[float] = None # if set, the scale for generated images - - @model_validator(mode="after") - def set_page_images_from_deprecated(self) -> "PdfPipelineOptions": - with warnings.catch_warnings(): - warnings.simplefilter("ignore", DeprecationWarning) - default_scale = 1.0 - if self.keep_page_images and self.images_scale is None: - self.images_scale = default_scale - return self diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py index 10b7f196..1240b456 100644 --- a/docling/models/layout_model.py +++ b/docling/models/layout_model.py @@ -2,6 +2,7 @@ import copy import logging import random import time +from pathlib import Path from typing import Iterable, List from docling_core.types.experimental import CoordOrigin @@ -43,11 +44,8 @@ class LayoutModel(AbstractPageModel): FIGURE_LABEL = DocItemLabel.PICTURE FORMULA_LABEL = DocItemLabel.FORMULA - def __init__(self, config): - self.config = config - self.layout_predictor = LayoutPredictor( - config["artifacts_path"] - ) # TODO temporary + def __init__(self, artifacts_path: Path): + self.layout_predictor = LayoutPredictor(artifacts_path) # TODO temporary def postprocess(self, clusters: List[Cluster], cells: List[Cell], page_height): MIN_INTERSECTION = 0.2 diff --git a/docling/models/page_assemble_model.py b/docling/models/page_assemble_model.py index 28f93c12..44743339 100644 --- a/docling/models/page_assemble_model.py +++ b/docling/models/page_assemble_model.py @@ -2,6 +2,8 @@ import logging import re from typing import Iterable, List +from pydantic import BaseModel + from docling.datamodel.base_models import ( AssembledUnit, FigureElement, @@ -16,9 +18,13 @@ from docling.models.layout_model import LayoutModel _log = logging.getLogger(__name__) +class PageAssembleOptions(BaseModel): + keep_images: bool = False + + class PageAssembleModel(AbstractPageModel): - def __init__(self, config): - self.config = config + def __init__(self, options: PageAssembleOptions): + self.options = options def sanitize_text(self, lines): if len(lines) <= 1: @@ -147,7 +153,7 @@ class PageAssembleModel(AbstractPageModel): ) # Remove page images (can be disabled) - if self.config["images_scale"] is None: + if not self.options.keep_images: page._image_cache = {} # Unload backend diff --git a/docling/models/page_preprocessing_model.py b/docling/models/page_preprocessing_model.py index 3683123c..438114d8 100644 --- 
a/docling/models/page_preprocessing_model.py +++ b/docling/models/page_preprocessing_model.py @@ -1,14 +1,19 @@ from typing import Iterable from PIL import ImageDraw +from pydantic import BaseModel from docling.datamodel.base_models import Page from docling.models.abstract_model import AbstractPageModel +class PagePreprocessingOptions(BaseModel): + images_scale: float + + class PagePreprocessingModel(AbstractPageModel): - def __init__(self, config): - self.config = config + def __init__(self, options: PagePreprocessingOptions): + self.options = options def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: for page in page_batch: @@ -23,7 +28,7 @@ class PagePreprocessingModel(AbstractPageModel): scale=1.0 ) # puts the page image on the image cache at default scale - images_scale = self.config["images_scale"] + images_scale = self.options.images_scale # user requested scales if images_scale is not None: page._default_image_scale = images_scale diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py index 4a00e55d..4433930a 100644 --- a/docling/models/table_structure_model.py +++ b/docling/models/table_structure_model.py @@ -10,19 +10,21 @@ from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredic from PIL import ImageDraw from docling.datamodel.base_models import Page, Table, TableStructurePrediction -from docling.datamodel.pipeline_options import TableFormerMode +from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions from docling.models.abstract_model import AbstractPageModel class TableStructureModel(AbstractPageModel): - def __init__(self, config): - self.config = config - self.do_cell_matching = config["do_cell_matching"] - self.mode = config["mode"] + def __init__( + self, enabled: bool, artifacts_path: Path, options: TableStructureOptions + ): + self.options = options + self.do_cell_matching = self.options.do_cell_matching + self.mode = self.options.mode - self.enabled = config["enabled"] + self.enabled = enabled if self.enabled: - artifacts_path: Path = config["artifacts_path"] + artifacts_path: Path = artifacts_path if self.mode == TableFormerMode.ACCURATE: artifacts_path = artifacts_path / "fat" diff --git a/docling/pipeline/standard_pdf_model_pipeline.py b/docling/pipeline/standard_pdf_model_pipeline.py index dba1f3dc..c50a5552 100644 --- a/docling/pipeline/standard_pdf_model_pipeline.py +++ b/docling/pipeline/standard_pdf_model_pipeline.py @@ -16,8 +16,11 @@ from docling.models.base_ocr_model import BaseOcrModel from docling.models.ds_glm_model import GlmModel from docling.models.easyocr_model import EasyOcrModel from docling.models.layout_model import LayoutModel -from docling.models.page_assemble_model import PageAssembleModel -from docling.models.page_preprocessing_model import PagePreprocessingModel +from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions +from docling.models.page_preprocessing_model import ( + PagePreprocessingModel, + PagePreprocessingOptions, +) from docling.models.table_structure_model import TableStructureModel from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel from docling.models.tesseract_ocr_model import TesseractOcrModel @@ -32,6 +35,7 @@ class StandardPdfModelPipeline(PaginatedModelPipeline): def __init__(self, pipeline_options: PdfPipelineOptions): super().__init__(pipeline_options) + self.pipeline_options: PdfPipelineOptions if not pipeline_options.artifacts_path: artifacts_path = 
self.download_models_hf() @@ -39,48 +43,38 @@ class StandardPdfModelPipeline(PaginatedModelPipeline): self.artifacts_path = Path(artifacts_path) self.glm_model = GlmModel(config={}) - ocr_model: BaseOcrModel - if isinstance(pipeline_options.ocr_options, EasyOcrOptions): - ocr_model = EasyOcrModel( - enabled=pipeline_options.do_ocr, - options=pipeline_options.ocr_options, - ) - elif isinstance(pipeline_options.ocr_options, TesseractCliOcrOptions): - ocr_model = TesseractOcrCliModel( - enabled=pipeline_options.do_ocr, - options=pipeline_options.ocr_options, - ) - elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions): - ocr_model = TesseractOcrModel( - enabled=pipeline_options.do_ocr, - options=pipeline_options.ocr_options, - ) - else: + if ocr_model := self.get_ocr_model() is None: raise RuntimeError( f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}." ) self.model_pipe = [ + # Pre-processing PagePreprocessingModel( - config={"images_scale": pipeline_options.images_scale} + options=PagePreprocessingOptions( + images_scale=pipeline_options.images_scale + ) ), + # OCR ocr_model, + # Layout model LayoutModel( - config={ - "artifacts_path": artifacts_path - / StandardPdfModelPipeline._layout_model_path - } + artifacts_path=artifacts_path + / StandardPdfModelPipeline._layout_model_path ), + # Table structure model TableStructureModel( - config={ - "artifacts_path": artifacts_path - / StandardPdfModelPipeline._table_model_path, - "enabled": pipeline_options.do_table_structure, - "do_cell_matching": pipeline_options.table_structure_options.do_cell_matching, - "mode": pipeline_options.table_structure_options.mode, - } + enabled=pipeline_options.do_table_structure, + artifacts_path=artifacts_path + / StandardPdfModelPipeline._table_model_path, + options=pipeline_options.table_structure_options, + ), + # Page assemble + PageAssembleModel( + options=PageAssembleOptions( + keep_images=pipeline_options.images_scale is not None + ) ), - PageAssembleModel(config={"images_scale": pipeline_options.images_scale}), ] self.enrichment_pipe = [ @@ -102,6 +96,24 @@ class StandardPdfModelPipeline(PaginatedModelPipeline): return Path(download_path) + def get_ocr_model(self) -> Optional[BaseOcrModel]: + if isinstance(self.pipeline_options.ocr_options, EasyOcrOptions): + return EasyOcrModel( + enabled=self.pipeline_options.do_ocr, + options=self.pipeline_options.ocr_options, + ) + elif isinstance(self.pipeline_options.ocr_options, TesseractCliOcrOptions): + return TesseractOcrCliModel( + enabled=self.pipeline_options.do_ocr, + options=self.pipeline_options.ocr_options, + ) + elif isinstance(self.pipeline_options.ocr_options, TesseractOcrOptions): + return TesseractOcrModel( + enabled=self.pipeline_options.do_ocr, + options=self.pipeline_options.ocr_options, + ) + return None + def initialize_page(self, doc: InputDocument, page: Page) -> Page: page._backend = doc._backend.load_page(page.page_no) page.size = page._backend.get_size() diff --git a/tests/verify_utils.py b/tests/verify_utils.py index fb888bf8..186f2d9b 100644 --- a/tests/verify_utils.py +++ b/tests/verify_utils.py @@ -1,4 +1,5 @@ import json +import warnings from pathlib import Path from typing import List @@ -235,8 +236,10 @@ def verify_conversion_result_v1( doc_pred_pages: List[Page] = doc_result.pages doc_pred: DsDocument = doc_result.legacy_output - doc_pred_md = doc_result.render_as_markdown() - doc_pred_dt = doc_result.render_as_doctags() + with warnings.catch_warnings(): + warnings.simplefilter("ignore", 
DeprecationWarning) + doc_pred_md = doc_result.render_as_markdown() + doc_pred_dt = doc_result.render_as_doctags() engine_suffix = "" if ocr_engine is None else f".{ocr_engine}" gt_subpath = input_path.parent / "groundtruth" / "docling_v1" / input_path.name From 94b5e1532dcca7a8b716400b84d536fd8b1a9ef2 Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Fri, 11 Oct 2024 13:03:38 +0200 Subject: [PATCH 2/4] add GlmOptions Signed-off-by: Michele Dolfi --- docling/models/ds_glm_model.py | 21 ++++++++++--------- .../pipeline/standard_pdf_model_pipeline.py | 6 ++++-- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/docling/models/ds_glm_model.py b/docling/models/ds_glm_model.py index 529b12ce..5fa35af1 100644 --- a/docling/models/ds_glm_model.py +++ b/docling/models/ds_glm_model.py @@ -14,23 +14,24 @@ from docling_core.types import Ref from docling_core.types.experimental import BoundingBox, CoordOrigin from docling_core.types.experimental.document import DoclingDocument from PIL import ImageDraw +from pydantic import BaseModel from docling.datamodel.base_models import Cluster from docling.datamodel.document import ConversionResult -class GlmModel: - def __init__(self, config): - self.config = config - self.create_legacy_output = config.get("create_legacy_output", True) +class GlmOptions(BaseModel): + create_legacy_output: bool = True + model_names: str = "" # e.g. "language;term;reference" + + +class GlmModel: + def __init__(self, options: GlmOptions): + self.options = options + self.create_legacy_output = self.options.create_legacy_output - self.model_names = self.config.get( - "model_names", "" - ) # "language;term;reference" load_pretrained_nlp_models() - # model = init_nlp_model(model_names="language;term;reference") - model = init_nlp_model(model_names=self.model_names) - self.model = model + self.model = init_nlp_model(model_names=self.options.model_names) def __call__( self, conv_res: ConversionResult diff --git a/docling/pipeline/standard_pdf_model_pipeline.py b/docling/pipeline/standard_pdf_model_pipeline.py index 3ec4c17e..cba8609b 100644 --- a/docling/pipeline/standard_pdf_model_pipeline.py +++ b/docling/pipeline/standard_pdf_model_pipeline.py @@ -13,7 +13,7 @@ from docling.datamodel.pipeline_options import ( TesseractOcrOptions, ) from docling.models.base_ocr_model import BaseOcrModel -from docling.models.ds_glm_model import GlmModel +from docling.models.ds_glm_model import GlmModel, GlmOptions from docling.models.easyocr_model import EasyOcrModel from docling.models.layout_model import LayoutModel from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions @@ -42,7 +42,9 @@ class StandardPdfModelPipeline(PaginatedModelPipeline): self.artifacts_path = Path(artifacts_path) self.glm_model = GlmModel( - config={"create_legacy_output": pipeline_options.create_legacy_output} + options=GlmOptions( + create_legacy_output=pipeline_options.create_legacy_output + ) ) if ocr_model := self.get_ocr_model() is None: From 753f67a434960cbc41cffa429e194e5903ef3b91 Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Fri, 11 Oct 2024 13:06:32 +0200 Subject: [PATCH 3/4] fixes Signed-off-by: Michele Dolfi --- docling/models/page_preprocessing_model.py | 4 ++-- docling/pipeline/standard_pdf_model_pipeline.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docling/models/page_preprocessing_model.py b/docling/models/page_preprocessing_model.py index 438114d8..52f92129 100644 --- a/docling/models/page_preprocessing_model.py +++ 
b/docling/models/page_preprocessing_model.py @@ -1,4 +1,4 @@ -from typing import Iterable +from typing import Iterable, Optional from PIL import ImageDraw from pydantic import BaseModel @@ -8,7 +8,7 @@ from docling.models.abstract_model import AbstractPageModel class PagePreprocessingOptions(BaseModel): - images_scale: float + images_scale: Optional[float] class PagePreprocessingModel(AbstractPageModel): diff --git a/docling/pipeline/standard_pdf_model_pipeline.py b/docling/pipeline/standard_pdf_model_pipeline.py index cba8609b..53558579 100644 --- a/docling/pipeline/standard_pdf_model_pipeline.py +++ b/docling/pipeline/standard_pdf_model_pipeline.py @@ -47,7 +47,7 @@ class StandardPdfModelPipeline(PaginatedModelPipeline): ) ) - if ocr_model := self.get_ocr_model() is None: + if (ocr_model := self.get_ocr_model()) is None: raise RuntimeError( f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}." ) From 136f16e85a8290b4d39905f6f1e6e6d6b372257e Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Fri, 11 Oct 2024 14:52:37 +0200 Subject: [PATCH 4/4] feat!: simplify conversion API (#139) Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- docling/cli/main.py | 7 +- docling/datamodel/document.py | 36 ++------- docling/document_converter.py | 124 +++++++++++++------------------ docling/models/ds_glm_model.py | 4 +- examples/batch_convert.py | 11 +-- examples/custom_convert.py | 21 +----- examples/export_figures.py | 63 +++++----------- examples/export_multimodal.py | 94 ++++++++++------------- examples/export_tables.py | 58 +++++---------- examples/minimal.py | 2 +- examples/run_with_formats.py | 6 +- tests/test_e2e_conversion.py | 4 +- tests/test_e2e_ocr_conversion.py | 3 +- tests/test_interfaces.py | 32 ++------ tests/test_options.py | 2 +- 15 files changed, 164 insertions(+), 303 deletions(-) diff --git a/docling/cli/main.py b/docling/cli/main.py index b925e796..03701ad3 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -13,7 +13,7 @@ from docling_core.utils.file import resolve_file_source from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.datamodel.base_models import ConversionStatus, InputFormat -from docling.datamodel.document import ConversionResult, DocumentConversionInput +from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( EasyOcrOptions, PdfPipelineOptions, @@ -231,12 +231,9 @@ def convert( } ) - # Define input files - input = DocumentConversionInput.from_paths(input_doc_paths) - start_time = time.time() - conv_results = doc_converter.convert_batch(input) + conv_results = doc_converter.convert_all(input_doc_paths) output.mkdir(parents=True, exist_ok=True) export_documents( diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index ede4e328..615acfac 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -19,6 +19,7 @@ from docling_core.types.experimental import ( DocItemLabel, DoclingDocument, ) +from docling_core.utils.file import resolve_file_source from pydantic import BaseModel from typing_extensions import deprecated @@ -162,8 +163,7 @@ class DocumentFormat(str, Enum): V1 = "v1" -@deprecated("Use `ConversionResult` instead.") -class ConvertedDocument(BaseModel): +class ConversionResult(BaseModel): input: InputDocument status: ConversionStatus = 
ConversionStatus.PENDING # failure, success @@ -457,20 +457,16 @@ class ConvertedDocument(BaseModel): yield element, cropped_im -class ConversionResult(ConvertedDocument): - pass +class _DocumentConversionInput(BaseModel): - -class DocumentConversionInput(BaseModel): - - _path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None + path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]] limits: Optional[DocumentLimits] = DocumentLimits() def docs( self, format_options: Dict[InputFormat, "FormatOption"] ) -> Iterable[InputDocument]: - - for obj in self._path_or_stream_iterator: + for item in self.path_or_stream_iterator: + obj = resolve_file_source(item) if isinstance(item, str) else item format = self._guess_format(obj) if format not in format_options.keys(): _log.debug( @@ -496,6 +492,8 @@ class DocumentConversionInput(BaseModel): limits=self.limits, backend=backend, ) + else: + raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}") def _guess_format(self, obj): content = None @@ -531,21 +529,3 @@ class DocumentConversionInput(BaseModel): return "text/html" return None - - @classmethod - def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None): - paths = [Path(p) for p in paths] - - doc_input = cls(limits=limits) - doc_input._path_or_stream_iterator = paths - - return doc_input - - @classmethod - def from_streams( - cls, streams: Iterable[DocumentStream], limits: Optional[DocumentLimits] = None - ): - doc_input = cls(limits=limits) - doc_input._path_or_stream_iterator = streams - - return doc_input diff --git a/docling/document_converter.py b/docling/document_converter.py index dc919883..f354d58b 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -1,34 +1,24 @@ import logging -import tempfile +import sys import time from pathlib import Path from typing import Dict, Iterable, List, Optional, Type -import requests -from pydantic import ( - AnyHttpUrl, - BaseModel, - ConfigDict, - TypeAdapter, - ValidationError, - field_validator, - model_validator, -) -from typing_extensions import deprecated +from pydantic import BaseModel, ConfigDict, model_validator, validate_call from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.html_backend import HTMLDocumentBackend from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend from docling.backend.msword_backend import MsWordDocumentBackend -from docling.datamodel.base_models import ConversionStatus, InputFormat +from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat from docling.datamodel.document import ( ConversionResult, - DocumentConversionInput, InputDocument, + _DocumentConversionInput, ) from docling.datamodel.pipeline_options import PipelineOptions -from docling.datamodel.settings import settings +from docling.datamodel.settings import DocumentLimits, settings from docling.pipeline.base_model_pipeline import AbstractModelPipeline from docling.pipeline.simple_model_pipeline import SimpleModelPipeline from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline @@ -119,16 +109,56 @@ class DocumentConverter: Type[AbstractModelPipeline], AbstractModelPipeline ] = {} - @deprecated("Use convert_batch instead.") - def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]: - yield from self.convert_batch(input=input) + 
@validate_call(config=ConfigDict(strict=True)) + def convert( + self, + source: Path | str | DocumentStream, # TODO review naming + raises_on_error: bool = True, + max_num_pages: int = sys.maxsize, + max_file_size: int = sys.maxsize, + ) -> ConversionResult: - def convert_batch( - self, input: DocumentConversionInput, raise_on_error: bool = False + all_res = self.convert_all( + source=[source], + raises_on_error=raises_on_error, + max_num_pages=max_num_pages, + max_file_size=max_file_size, + ) + return next(all_res) + + @validate_call(config=ConfigDict(strict=True)) + def convert_all( + self, + source: Iterable[Path | str | DocumentStream], # TODO review naming + raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error + max_num_pages: int = sys.maxsize, + max_file_size: int = sys.maxsize, ) -> Iterable[ConversionResult]: + limits = DocumentLimits( + max_num_pages=max_num_pages, + max_file_size=max_file_size, + ) + conv_input = _DocumentConversionInput( + path_or_stream_iterator=source, + limits=limits, + ) + conv_res_iter = self._convert(conv_input) + for conv_res in conv_res_iter: + if raises_on_error and conv_res.status not in { + ConversionStatus.SUCCESS, + ConversionStatus.PARTIAL_SUCCESS, + }: + raise RuntimeError( + f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}" + ) + else: + yield conv_res + def _convert( + self, conv_input: _DocumentConversionInput + ) -> Iterable[ConversionResult]: for input_batch in chunkify( - input.docs(self.format_to_options), + conv_input.docs(self.format_to_options), settings.perf.doc_batch_size, # pass format_options ): _log.info(f"Going to convert document batch...") @@ -143,58 +173,6 @@ class DocumentConverter: if item is not None: yield item - def convert_single( - self, source: Path | AnyHttpUrl | str, raise_on_error: bool = False - ) -> ConversionResult: - """Convert a single document. - - Args: - source (Path | AnyHttpUrl | str): The PDF input source. Can be a path or URL. - - Raises: - ValueError: If source is of unexpected type. - RuntimeError: If conversion fails. - - Returns: - ConversionResult: The conversion result object. 
- """ - with tempfile.TemporaryDirectory() as temp_dir: - try: - http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source) - res = requests.get(http_url, stream=True) - res.raise_for_status() - fname = None - # try to get filename from response header - if cont_disp := res.headers.get("Content-Disposition"): - for par in cont_disp.strip().split(";"): - # currently only handling directive "filename" (not "*filename") - if (split := par.split("=")) and split[0].strip() == "filename": - fname = "=".join(split[1:]).strip().strip("'\"") or None - break - # otherwise, use name from URL: - if fname is None: - fname = Path(http_url.path).name or self._default_download_filename - local_path = Path(temp_dir) / fname - with open(local_path, "wb") as f: - for chunk in res.iter_content(chunk_size=1024): # using 1-KB chunks - f.write(chunk) - except ValidationError: - try: - local_path = TypeAdapter(Path).validate_python(source) - except ValidationError: - raise ValueError( - f"Unexpected file path type encountered: {type(source)}" - ) - conv_inp = DocumentConversionInput.from_paths(paths=[local_path]) - conv_res_iter = self.convert_batch(conv_inp) - conv_res: ConversionResult = next(conv_res_iter) - if conv_res.status not in { - ConversionStatus.SUCCESS, - ConversionStatus.PARTIAL_SUCCESS, - }: - raise RuntimeError(f"Conversion failed with status: {conv_res.status}") - return conv_res - def _get_pipeline(self, doc: InputDocument) -> Optional[AbstractModelPipeline]: fopt = self.format_to_options.get(doc.format) diff --git a/docling/models/ds_glm_model.py b/docling/models/ds_glm_model.py index 5fa35af1..e44f493a 100644 --- a/docling/models/ds_glm_model.py +++ b/docling/models/ds_glm_model.py @@ -14,13 +14,15 @@ from docling_core.types import Ref from docling_core.types.experimental import BoundingBox, CoordOrigin from docling_core.types.experimental.document import DoclingDocument from PIL import ImageDraw -from pydantic import BaseModel +from pydantic import BaseModel, ConfigDict from docling.datamodel.base_models import Cluster from docling.datamodel.document import ConversionResult class GlmOptions(BaseModel): + model_config = ConfigDict(protected_namespaces=()) + create_legacy_output: bool = True model_names: str = "" # e.g. 
"language;term;reference" diff --git a/examples/batch_convert.py b/examples/batch_convert.py index e54193f0..0cf2d650 100644 --- a/examples/batch_convert.py +++ b/examples/batch_convert.py @@ -7,7 +7,7 @@ from typing import Iterable import yaml from docling.datamodel.base_models import ConversionStatus -from docling.datamodel.document import ConversionResult, DocumentConversionInput +from docling.datamodel.document import ConversionResult from docling.document_converter import DocumentConverter _log = logging.getLogger(__name__) @@ -125,18 +125,19 @@ def main(): doc_converter = DocumentConverter() - input = DocumentConversionInput.from_paths(input_doc_paths) - start_time = time.time() - conv_results = doc_converter.convert_batch(input) + conv_results = doc_converter.convert_all( + input_doc_paths, + raises_on_error=False, # to let conversion run through all and examine results at the end + ) success_count, partial_success_count, failure_count = export_documents( conv_results, output_dir=Path("./scratch") ) end_time = time.time() - start_time - _log.info(f"All documents were converted in {end_time:.2f} seconds.") + _log.info(f"Document conversion complete in {end_time:.2f} seconds.") if failure_count > 0: raise RuntimeError( diff --git a/examples/custom_convert.py b/examples/custom_convert.py index 0805837b..70d86520 100644 --- a/examples/custom_convert.py +++ b/examples/custom_convert.py @@ -5,7 +5,7 @@ from pathlib import Path from typing import Iterable from docling.datamodel.base_models import ConversionStatus, InputFormat -from docling.datamodel.document import ConversionResult, DocumentConversionInput +from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( PdfPipelineOptions, TesseractCliOcrOptions, @@ -65,9 +65,7 @@ def export_documents( def main(): logging.basicConfig(level=logging.INFO) - input_doc_paths = [ - Path("./tests/data/2206.01062.pdf"), - ] + input_doc_path = Path("./tests/data/2206.01062.pdf") ########################################################################### @@ -152,24 +150,13 @@ def main(): ########################################################################### - # Define input files - input = DocumentConversionInput.from_paths(input_doc_paths) - start_time = time.time() - conv_results = doc_converter.convert_batch(input) - success_count, failure_count = export_documents( - conv_results, output_dir=Path("./scratch") - ) + conv_result = doc_converter.convert(input_doc_path) end_time = time.time() - start_time - _log.info(f"All documents were converted in {end_time:.2f} seconds.") - - if failure_count > 0: - raise RuntimeError( - f"The example failed converting {failure_count} on {len(input_doc_paths)}." 
- ) + _log.info(f"Document converted in {end_time:.2f} seconds.") if __name__ == "__main__": diff --git a/examples/export_figures.py b/examples/export_figures.py index 23f1bd20..4fa4dc58 100644 --- a/examples/export_figures.py +++ b/examples/export_figures.py @@ -2,13 +2,7 @@ import logging import time from pathlib import Path -from docling.datamodel.base_models import ( - ConversionStatus, - FigureElement, - InputFormat, - Table, -) -from docling.datamodel.document import DocumentConversionInput +from docling.datamodel.base_models import FigureElement, InputFormat, Table from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.document_converter import DocumentConverter, PdfFormatOption @@ -20,13 +14,9 @@ IMAGE_RESOLUTION_SCALE = 2.0 def main(): logging.basicConfig(level=logging.INFO) - input_doc_paths = [ - Path("./tests/data/2206.01062.pdf"), - ] + input_doc_path = Path("./tests/data/2206.01062.pdf") output_dir = Path("./scratch") - input_files = DocumentConversionInput.from_paths(input_doc_paths) - # Important: For operating with page images, we must keep them, otherwise the DocumentConverter # will destroy them for cleaning up memory. # This is done by setting AssembleOptions.images_scale, which also defines the scale of images. @@ -42,46 +32,29 @@ def main(): start_time = time.time() - conv_results = doc_converter.convert_batch(input_files) + conv_res = doc_converter.convert(input_doc_path) - success_count = 0 - failure_count = 0 output_dir.mkdir(parents=True, exist_ok=True) - for conv_res in conv_results: - if conv_res.status != ConversionStatus.SUCCESS: - _log.info(f"Document {conv_res.input.file} failed to convert.") - failure_count += 1 - continue + doc_filename = conv_res.input.file.stem - doc_filename = conv_res.input.file.stem + # Export page images + for page in conv_res.pages: + page_no = page.page_no + 1 + page_image_filename = output_dir / f"{doc_filename}-{page_no}.png" + with page_image_filename.open("wb") as fp: + page.image.save(fp, format="PNG") - # Export page images - for page in conv_res.pages: - page_no = page.page_no + 1 - page_image_filename = output_dir / f"{doc_filename}-{page_no}.png" - with page_image_filename.open("wb") as fp: - page.image.save(fp, format="PNG") - - # Export figures and tables - for element, image in conv_res.render_element_images( - element_types=(FigureElement, Table) - ): - element_image_filename = ( - output_dir / f"{doc_filename}-element-{element.id}.png" - ) - with element_image_filename.open("wb") as fp: - image.save(fp, "PNG") - - success_count += 1 + # Export figures and tables + for element, image in conv_res.render_element_images( + element_types=(FigureElement, Table) + ): + element_image_filename = output_dir / f"{doc_filename}-element-{element.id}.png" + with element_image_filename.open("wb") as fp: + image.save(fp, "PNG") end_time = time.time() - start_time - _log.info(f"All documents were converted in {end_time:.2f} seconds.") - - if failure_count > 0: - raise RuntimeError( - f"The example failed converting {failure_count} on {len(input_doc_paths)}." 
- ) + _log.info(f"Document converted and figures exported in {end_time:.2f} seconds.") if __name__ == "__main__": diff --git a/examples/export_multimodal.py b/examples/export_multimodal.py index 11dd3f41..af569131 100644 --- a/examples/export_multimodal.py +++ b/examples/export_multimodal.py @@ -5,8 +5,7 @@ from pathlib import Path import pandas as pd -from docling.datamodel.base_models import ConversionStatus, InputFormat -from docling.datamodel.document import DocumentConversionInput +from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.document_converter import DocumentConverter, PdfFormatOption from docling.utils.export import generate_multimodal_pages @@ -19,13 +18,9 @@ IMAGE_RESOLUTION_SCALE = 2.0 def main(): logging.basicConfig(level=logging.INFO) - input_doc_paths = [ - Path("./tests/data/2206.01062.pdf"), - ] + input_doc_path = Path("./tests/data/2206.01062.pdf") output_dir = Path("./scratch") - input_files = DocumentConversionInput.from_paths(input_doc_paths) - # Important: For operating with page images, we must keep them, otherwise the DocumentConverter # will destroy them for cleaning up memory. # This is done by setting AssembleOptions.images_scale, which also defines the scale of images. @@ -41,53 +36,45 @@ def main(): start_time = time.time() - converted_docs = doc_converter.convert_batch(input_files) + conv_res = doc_converter.convert(input_doc_path) - success_count = 0 - failure_count = 0 output_dir.mkdir(parents=True, exist_ok=True) - for doc in converted_docs: - if doc.status != ConversionStatus.SUCCESS: - _log.info(f"Document {doc.input.file} failed to convert.") - failure_count += 1 - continue - rows = [] - for ( - content_text, - content_md, - content_dt, - page_cells, - page_segments, - page, - ) in generate_multimodal_pages(doc): + rows = [] + for ( + content_text, + content_md, + content_dt, + page_cells, + page_segments, + page, + ) in generate_multimodal_pages(conv_res): - dpi = page._default_image_scale * 72 + dpi = page._default_image_scale * 72 - rows.append( - { - "document": doc.input.file.name, - "hash": doc.input.document_hash, - "page_hash": page.page_hash, - "image": { - "width": page.image.width, - "height": page.image.height, - "bytes": page.image.tobytes(), - }, - "cells": page_cells, - "contents": content_text, - "contents_md": content_md, - "contents_dt": content_dt, - "segments": page_segments, - "extra": { - "page_num": page.page_no + 1, - "width_in_points": page.size.width, - "height_in_points": page.size.height, - "dpi": dpi, - }, - } - ) - success_count += 1 + rows.append( + { + "document": conv_res.input.file.name, + "hash": conv_res.input.document_hash, + "page_hash": page.page_hash, + "image": { + "width": page.image.width, + "height": page.image.height, + "bytes": page.image.tobytes(), + }, + "cells": page_cells, + "contents": content_text, + "contents_md": content_md, + "contents_dt": content_dt, + "segments": page_segments, + "extra": { + "page_num": page.page_no + 1, + "width_in_points": page.size.width, + "height_in_points": page.size.height, + "dpi": dpi, + }, + } + ) # Generate one parquet from all documents df = pd.json_normalize(rows) @@ -97,12 +84,9 @@ def main(): end_time = time.time() - start_time - _log.info(f"All documents were converted in {end_time:.2f} seconds.") - - if failure_count > 0: - raise RuntimeError( - f"The example failed converting {failure_count} on {len(input_doc_paths)}." 
- ) + _log.info( + f"Document converted and multimodal pages generated in {end_time:.2f} seconds." + ) # This block demonstrates how the file can be opened with the HF datasets library # from datasets import Dataset diff --git a/examples/export_tables.py b/examples/export_tables.py index 720e8c67..79a3333d 100644 --- a/examples/export_tables.py +++ b/examples/export_tables.py @@ -4,8 +4,6 @@ from pathlib import Path import pandas as pd -from docling.datamodel.base_models import ConversionStatus -from docling.datamodel.document import DocumentConversionInput from docling.document_converter import DocumentConverter _log = logging.getLogger(__name__) @@ -14,59 +12,39 @@ _log = logging.getLogger(__name__) def main(): logging.basicConfig(level=logging.INFO) - input_doc_paths = [ - Path("./tests/data/2206.01062.pdf"), - ] + input_doc_path = Path("./tests/data/2206.01062.pdf") output_dir = Path("./scratch") - input_files = DocumentConversionInput.from_paths(input_doc_paths) - doc_converter = DocumentConverter() start_time = time.time() - conv_results = doc_converter.convert_batch(input_files) + conv_res = doc_converter.convert(input_doc_path) - success_count = 0 - failure_count = 0 output_dir.mkdir(parents=True, exist_ok=True) - for conv_res in conv_results: - if conv_res.status != ConversionStatus.SUCCESS: - _log.info(f"Document {conv_res.input.file} failed to convert.") - failure_count += 1 - continue - doc_filename = conv_res.input.file.stem + doc_filename = conv_res.input.file.stem - # Export tables - for table_ix, table in enumerate(conv_res.legacy_output.tables): - table_df: pd.DataFrame = table.export_to_dataframe() - print(f"## Table {table_ix}") - print(table_df.to_markdown()) + # Export tables + for table_ix, table in enumerate(conv_res.legacy_output.tables): + table_df: pd.DataFrame = table.export_to_dataframe() + print(f"## Table {table_ix}") + print(table_df.to_markdown()) - # Save the table as csv - element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv" - _log.info(f"Saving CSV table to {element_csv_filename}") - table_df.to_csv(element_csv_filename) + # Save the table as csv + element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv" + _log.info(f"Saving CSV table to {element_csv_filename}") + table_df.to_csv(element_csv_filename) - # Save the table as html - element_html_filename = ( - output_dir / f"{doc_filename}-table-{table_ix+1}.html" - ) - _log.info(f"Saving HTML table to {element_html_filename}") - with element_html_filename.open("w") as fp: - fp.write(table.export_to_html()) - - success_count += 1 + # Save the table as html + element_html_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.html" + _log.info(f"Saving HTML table to {element_html_filename}") + with element_html_filename.open("w") as fp: + fp.write(table.export_to_html()) end_time = time.time() - start_time - _log.info(f"All documents were converted in {end_time:.2f} seconds.") - - if failure_count > 0: - raise RuntimeError( - f"The example failed converting {failure_count} on {len(input_doc_paths)}." 
- ) + _log.info(f"Document converted and tables exported in {end_time:.2f} seconds.") if __name__ == "__main__": diff --git a/examples/minimal.py b/examples/minimal.py index fb84cca4..55cdfc46 100644 --- a/examples/minimal.py +++ b/examples/minimal.py @@ -2,7 +2,7 @@ from docling.document_converter import DocumentConverter source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL converter = DocumentConverter() -result = converter.convert_single(source) +result = converter.convert(source) print(result.output.export_to_markdown()) # output: ## Docling Technical Report [...]" # if the legacy output is needed, use this version # print(result.render_as_markdown_v1()) # output: ## Docling Technical Report [...]" diff --git a/examples/run_with_formats.py b/examples/run_with_formats.py index f086bae2..37bb1b1a 100644 --- a/examples/run_with_formats.py +++ b/examples/run_with_formats.py @@ -6,7 +6,6 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.msword_backend import MsWordDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.datamodel.base_models import InputFormat -from docling.datamodel.document import DocumentConversionInput from docling.document_converter import ( DocumentConverter, FormatOption, @@ -28,7 +27,6 @@ input_paths = [ Path("tests/data/2206.01062.pdf"), # Path("tests/data/2305.03393v1-pg9-img.png"), ] -input = DocumentConversionInput.from_paths(input_paths) ## for defaults use: # doc_converter = DocumentConverter() @@ -52,12 +50,12 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal }, ) -conv_results = doc_converter.convert_batch(input) +conv_results = doc_converter.convert_all(input_paths) for res in conv_results: out_path = Path("./scratch") print( - f"Document {res.input.file.name} converted with status {res.status}." + f"Document {res.input.file.name} converted." 
f"\nSaved markdown output to: {str(out_path)}" ) # print(res.experimental.export_to_markdown()) diff --git a/tests/test_e2e_conversion.py b/tests/test_e2e_conversion.py index d7432a10..c18a7a5b 100644 --- a/tests/test_e2e_conversion.py +++ b/tests/test_e2e_conversion.py @@ -3,7 +3,7 @@ from pathlib import Path from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ConversionResult -from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions +from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.document_converter import DocumentConverter, PdfFormatOption from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2 @@ -48,7 +48,7 @@ def test_e2e_conversions(): for pdf_path in pdf_paths: print(f"converting {pdf_path}") - doc_result: ConversionResult = converter.convert_single(pdf_path) + doc_result: ConversionResult = converter.convert(pdf_path) verify_conversion_result_v1( input_path=pdf_path, doc_result=doc_result, generate=GENERATE_V1 diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py index ee7f3931..86c22554 100644 --- a/tests/test_e2e_ocr_conversion.py +++ b/tests/test_e2e_ocr_conversion.py @@ -8,7 +8,6 @@ from docling.datamodel.pipeline_options import ( EasyOcrOptions, OcrOptions, PdfPipelineOptions, - PipelineOptions, TesseractCliOcrOptions, TesseractOcrOptions, ) @@ -90,7 +89,7 @@ def test_e2e_conversions(): for pdf_path in pdf_paths: print(f"converting {pdf_path}") - doc_result: ConversionResult = converter.convert_single(pdf_path) + doc_result: ConversionResult = converter.convert(pdf_path) # Save conversions # save_output(pdf_path, doc_result, None) diff --git a/tests/test_interfaces.py b/tests/test_interfaces.py index 80f5ea4e..9ef3d131 100644 --- a/tests/test_interfaces.py +++ b/tests/test_interfaces.py @@ -5,8 +5,7 @@ import pytest from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.datamodel.base_models import DocumentStream, InputFormat -from docling.datamodel.document import ConversionResult, DocumentConversionInput -from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions +from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.document_converter import DocumentConverter, PdfFormatOption from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2 @@ -37,39 +36,24 @@ def converter(): return converter -def test_convert_single(converter: DocumentConverter): +def test_convert_path(converter: DocumentConverter): pdf_path = get_pdf_path() print(f"converting {pdf_path}") - doc_result: ConversionResult = converter.convert_single(pdf_path) + doc_result = converter.convert(pdf_path) verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result) verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result) -def test_batch_path(converter: DocumentConverter): - - pdf_path = get_pdf_path() - print(f"converting {pdf_path}") - - conv_input = DocumentConversionInput.from_paths([pdf_path]) - - results = converter.convert_batch(conv_input) - for doc_result in results: - verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result) - verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result) - - -def test_batch_bytes(converter: DocumentConverter): +def test_convert_stream(converter: DocumentConverter): pdf_path = get_pdf_path() 
print(f"converting {pdf_path}") buf = BytesIO(pdf_path.open("rb").read()) - docs = [DocumentStream(name=pdf_path.name, stream=buf)] - conv_input = DocumentConversionInput.from_streams(docs) + stream = DocumentStream(name=pdf_path.name, stream=buf) - results = converter.convert_batch(conv_input) - for doc_result in results: - verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result) - verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result) + doc_result = converter.convert(stream) + verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result) + verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result) diff --git a/tests/test_options.py b/tests/test_options.py index 8b35811b..ad6c7a45 100644 --- a/tests/test_options.py +++ b/tests/test_options.py @@ -39,6 +39,6 @@ def test_e2e_conversions(test_doc_path): for converter in get_converters_with_table_options(): print(f"converting {test_doc_path}") - doc_result: ConversionResult = converter.convert_single(test_doc_path) + doc_result: ConversionResult = converter.convert(test_doc_path) assert doc_result.status == ConversionStatus.SUCCESS