feat!: Docling v2 (#117)

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Co-authored-by: Maxim Lysak <mly@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Commit: 7d3be0edeb (parent: d504432c1e)
Author: Christoph Auer
Date: 2024-10-16 21:02:03 +02:00
Committed by: GitHub

144 changed files with 15180 additions and 3828 deletions
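
For context, a minimal sketch of the new v2 conversion API introduced by the diff below. This is illustrative and not part of the commit; the input file name is hypothetical.

    from docling.document_converter import DocumentConverter

    converter = DocumentConverter()  # all supported input formats allowed by default

    # v2 replaces v1's convert_single(): convert() takes the source directly
    # (a path, URL string, or DocumentStream) and returns one ConversionResult.
    result = converter.convert("report.pdf")  # "report.pdf" is a hypothetical input
    print(result.status)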


@@ -1,84 +1,179 @@
-import functools
 import logging
-import tempfile
+import sys
 import time
-import traceback
+from functools import partial
 from pathlib import Path
-from typing import Iterable, Optional, Type, Union
+from typing import Dict, Iterable, Iterator, List, Optional, Type

-import requests
-from PIL import ImageDraw
-from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
+from pydantic import BaseModel, ConfigDict, model_validator, validate_call

-from docling.backend.abstract_backend import PdfDocumentBackend
-from docling.datamodel.base_models import (
-    AssembledUnit,
-    AssembleOptions,
-    ConversionStatus,
-    DoclingComponentType,
-    ErrorItem,
-    Page,
-)
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.html_backend import HTMLDocumentBackend
+from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
+from docling.backend.msword_backend import MsWordDocumentBackend
+from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
 from docling.datamodel.document import (
     ConversionResult,
-    DocumentConversionInput,
     InputDocument,
+    _DocumentConversionInput,
 )
 from docling.datamodel.pipeline_options import PipelineOptions
-from docling.datamodel.settings import settings
-from docling.models.ds_glm_model import GlmModel
-from docling.models.page_assemble_model import PageAssembleModel
-from docling.pipeline.base_model_pipeline import BaseModelPipeline
-from docling.pipeline.standard_model_pipeline import StandardModelPipeline
-from docling.utils.utils import chunkify, create_hash
+from docling.datamodel.settings import DocumentLimits, settings
+from docling.pipeline.base_pipeline import BasePipeline
+from docling.pipeline.simple_pipeline import SimplePipeline
+from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
+from docling.utils.utils import chunkify

 _log = logging.getLogger(__name__)
+
+class FormatOption(BaseModel):
+    pipeline_cls: Type[BasePipeline]
+    pipeline_options: Optional[PipelineOptions] = None
+    backend: Type[AbstractDocumentBackend]
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    @model_validator(mode="after")
+    def set_optional_field_default(self) -> "FormatOption":
+        if self.pipeline_options is None:
+            self.pipeline_options = self.pipeline_cls.get_default_options()
+        return self
+
+
+class WordFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend
+
+
+class PowerpointFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
+
+
+class HTMLFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
+
+
+class PdfFormatOption(FormatOption):
+    pipeline_cls: Type = StandardPdfPipeline
+    backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
+
+
+class ImageFormatOption(FormatOption):
+    pipeline_cls: Type = StandardPdfPipeline
+    backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
+
+
+_format_to_default_options = {
+    InputFormat.DOCX: FormatOption(
+        pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
+    ),
+    InputFormat.PPTX: FormatOption(
+        pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
+    ),
+    InputFormat.HTML: FormatOption(
+        pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
+    ),
+    InputFormat.IMAGE: FormatOption(
+        pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
+    ),
+    InputFormat.PDF: FormatOption(
+        pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
+    ),
+}
+
 class DocumentConverter:
-    _default_download_filename = "file.pdf"
+    _default_download_filename = "file"

     def __init__(
         self,
-        artifacts_path: Optional[Union[Path, str]] = None,
-        pipeline_options: PipelineOptions = PipelineOptions(),
-        pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
-        pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
-        assemble_options: AssembleOptions = AssembleOptions(),
+        allowed_formats: Optional[List[InputFormat]] = None,
+        format_options: Optional[Dict[InputFormat, FormatOption]] = None,
     ):
-        if not artifacts_path:
-            artifacts_path = self.download_models_hf()
+        self.allowed_formats = allowed_formats
+        self.format_to_options = format_options

-        artifacts_path = Path(artifacts_path)
+        if self.allowed_formats is None:
+            # if self.format_to_options is not None:
+            #     self.allowed_formats = self.format_to_options.keys()
+            # else:
+            self.allowed_formats = [e for e in InputFormat]  # all formats

-        self.model_pipeline = pipeline_cls(
-            artifacts_path=artifacts_path, pipeline_options=pipeline_options
-        )
+        if self.format_to_options is None:
+            self.format_to_options = _format_to_default_options
+        else:
+            for f in self.allowed_formats:
+                if f not in self.format_to_options.keys():
+                    _log.debug(f"Requested format {f} will use default options.")
+                    self.format_to_options[f] = _format_to_default_options[f]
+
+            remove_keys = []
+            for f in self.format_to_options.keys():
+                if f not in self.allowed_formats:
+                    remove_keys.append(f)
+
+            for f in remove_keys:
+                self.format_to_options.pop(f)
+
+        self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
+
+    @validate_call(config=ConfigDict(strict=True))
+    def convert(
+        self,
+        source: Path | str | DocumentStream,  # TODO review naming
+        raises_on_error: bool = True,
+        max_num_pages: int = sys.maxsize,
+        max_file_size: int = sys.maxsize,
+    ) -> ConversionResult:
+        all_res = self.convert_all(
+            source=[source],
+            raises_on_error=raises_on_error,
+            max_num_pages=max_num_pages,
+            max_file_size=max_file_size,
+        )
+        return next(all_res)

-        self.page_assemble_model = PageAssembleModel(config={})
-        self.glm_model = GlmModel(config={})
-        self.pdf_backend = pdf_backend
-        self.assemble_options = assemble_options

-    @staticmethod
-    def download_models_hf(
-        local_dir: Optional[Path] = None, force: bool = False
-    ) -> Path:
-        from huggingface_hub import snapshot_download

-        download_path = snapshot_download(
-            repo_id="ds4sd/docling-models",
-            force_download=force,
-            local_dir=local_dir,
-            revision="v2.0.0",
-        )
+    @validate_call(config=ConfigDict(strict=True))
+    def convert_all(
+        self,
+        source: Iterable[Path | str | DocumentStream],  # TODO review naming
+        raises_on_error: bool = True,  # True: raises on first conversion error; False: does not raise on conv error
+        max_num_pages: int = sys.maxsize,
+        max_file_size: int = sys.maxsize,
+    ) -> Iterator[ConversionResult]:
+        limits = DocumentLimits(
+            max_num_pages=max_num_pages,
+            max_file_size=max_file_size,
+        )
+        conv_input = _DocumentConversionInput(
+            path_or_stream_iterator=source,
+            limit=limits,
+        )
+        conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
+        for conv_res in conv_res_iter:
+            if raises_on_error and conv_res.status not in {
+                ConversionStatus.SUCCESS,
+                ConversionStatus.PARTIAL_SUCCESS,
+            }:
+                raise RuntimeError(
+                    f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
+                )
+            else:
+                yield conv_res

-        return Path(download_path)

-    def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]:
+    def _convert(
+        self, conv_input: _DocumentConversionInput, raises_on_error: bool
+    ) -> Iterator[ConversionResult]:
+        assert self.format_to_options is not None

         for input_batch in chunkify(
-            input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size
+            conv_input.docs(self.format_to_options),
+            settings.perf.doc_batch_size,  # pass format_options
         ):
             _log.info(f"Going to convert document batch...")

             # parallel processing only within input_batch
@@ -87,211 +182,79 @@ class DocumentConverter:
             # ) as pool:
             #     yield from pool.map(self.process_document, input_batch)

-            # Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
-            yield from map(self._process_document, input_batch)
+            # Note: PDF backends are not thread-safe, thread pool usage was disabled.
+            for item in map(
+                partial(self._process_document, raises_on_error=raises_on_error),
+                input_batch,
+            ):
+                if item is not None:
+                    yield item
-    def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult:
-        """Convert a single document.
+    def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]:
+        assert self.format_to_options is not None

-        Args:
-            source (Path | AnyHttpUrl | str): The PDF input source. Can be a path or URL.
+        fopt = self.format_to_options.get(doc.format)

-        Raises:
-            ValueError: If source is of unexpected type.
-            RuntimeError: If conversion fails.
+        if fopt is None:
+            raise RuntimeError(f"Could not get pipeline for document {doc.file}")
+        else:
+            pipeline_class = fopt.pipeline_cls
+            pipeline_options = fopt.pipeline_options

-        Returns:
-            ConversionResult: The conversion result object.
-        """
-        with tempfile.TemporaryDirectory() as temp_dir:
-            try:
-                http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
-                res = requests.get(http_url, stream=True)
-                res.raise_for_status()
-                fname = None
-                # try to get filename from response header
-                if cont_disp := res.headers.get("Content-Disposition"):
-                    for par in cont_disp.strip().split(";"):
-                        # currently only handling directive "filename" (not "*filename")
-                        if (split := par.split("=")) and split[0].strip() == "filename":
-                            fname = "=".join(split[1:]).strip().strip("'\"") or None
-                            break
-                # otherwise, use name from URL:
-                if fname is None:
-                    fname = Path(http_url.path).name or self._default_download_filename
-                local_path = Path(temp_dir) / fname
-                with open(local_path, "wb") as f:
-                    for chunk in res.iter_content(chunk_size=1024):  # using 1-KB chunks
-                        f.write(chunk)
-            except ValidationError:
-                try:
-                    local_path = TypeAdapter(Path).validate_python(source)
-                except ValidationError:
-                    raise ValueError(
-                        f"Unexpected file path type encountered: {type(source)}"
-                    )
-            conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
-            conv_res_iter = self.convert(conv_inp)
-            conv_res: ConversionResult = next(conv_res_iter)
-            if conv_res.status not in {
-                ConversionStatus.SUCCESS,
-                ConversionStatus.PARTIAL_SUCCESS,
-            }:
-                raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
-        return conv_res

-    def _process_document(self, in_doc: InputDocument) -> ConversionResult:
-        start_doc_time = time.time()
-        conv_res = ConversionResult(input=in_doc)

-        _log.info(f"Processing document {in_doc.file.name}")

-        if not in_doc.valid:
-            conv_res.status = ConversionStatus.FAILURE
-            return conv_res

-        for i in range(0, in_doc.page_count):
-            conv_res.pages.append(Page(page_no=i))

-        all_assembled_pages = []

-        try:
-            # Iterate batches of pages (page_batch_size) in the doc
-            for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
-                start_pb_time = time.time()
-                # Pipeline

-                # 1. Initialise the page resources
-                init_pages = map(
-                    functools.partial(self._initialize_page, in_doc), page_batch
-                )

-                # 2. Populate page image
-                pages_with_images = map(
-                    functools.partial(self._populate_page_images, in_doc), init_pages
-                )

-                # 3. Populate programmatic page cells
-                pages_with_cells = map(
-                    functools.partial(self._parse_page_cells, in_doc),
-                    pages_with_images,
-                )

-                # 4. Run pipeline stages
-                pipeline_pages = self.model_pipeline.apply(pages_with_cells)

-                # 5. Assemble page elements (per page)
-                assembled_pages = self.page_assemble_model(pipeline_pages)

-                # exhaust assembled_pages
-                for assembled_page in assembled_pages:
-                    # Free up mem resources before moving on with next batch

-                    # Remove page images (can be disabled)
-                    if self.assemble_options.images_scale is None:
-                        assembled_page._image_cache = {}

-                    # Unload backend
-                    assembled_page._backend.unload()

-                    all_assembled_pages.append(assembled_page)

-                end_pb_time = time.time() - start_pb_time
-                _log.info(f"Finished converting page batch time={end_pb_time:.3f}")

-            conv_res.pages = all_assembled_pages
-            self._assemble_doc(conv_res)

-            status = ConversionStatus.SUCCESS
-            for page in conv_res.pages:
-                if not page._backend.is_valid():
-                    conv_res.errors.append(
-                        ErrorItem(
-                            component_type=DoclingComponentType.PDF_BACKEND,
-                            module_name=type(page._backend).__name__,
-                            error_message=f"Page {page.page_no} failed to parse.",
-                        )
-                    )
-                    status = ConversionStatus.PARTIAL_SUCCESS
-            conv_res.status = status

-        except Exception as e:
-            conv_res.status = ConversionStatus.FAILURE
-            trace = "\n".join(traceback.format_exception(e))
-            _log.info(
-                f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
-                f"{trace}"
-            )
+        assert pipeline_options is not None
+        # TODO this will ignore if different options have been defined for the same pipeline class.
+        if (
+            pipeline_class not in self.initialized_pipelines
+            or self.initialized_pipelines[pipeline_class].pipeline_options
+            != pipeline_options
+        ):
+            self.initialized_pipelines[pipeline_class] = pipeline_class(
+                pipeline_options=pipeline_options
+            )
+        return self.initialized_pipelines[pipeline_class]
-        finally:
-            # Always unload the PDF backend, even in case of failure
-            if in_doc._backend:
-                in_doc._backend.unload()
+    def _process_document(
+        self, in_doc: InputDocument, raises_on_error: bool
+    ) -> Optional[ConversionResult]:
+        assert self.allowed_formats is not None
+        assert in_doc.format in self.allowed_formats
+
+        start_doc_time = time.time()
+
+        conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)

         end_doc_time = time.time() - start_doc_time
         _log.info(
-            f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
+            f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds."
         )

         return conv_res

-    # Initialise and load resources for a page, before downstream steps (populate images, cells, ...)
-    def _initialize_page(self, doc: InputDocument, page: Page) -> Page:
-        page._backend = doc._backend.load_page(page.page_no)
-        page.size = page._backend.get_size()
-        page.page_hash = create_hash(doc.document_hash + ":" + str(page.page_no))
+    def _execute_pipeline(
+        self, in_doc: InputDocument, raises_on_error: bool
+    ) -> ConversionResult:
+        if in_doc.valid:
+            pipeline = self._get_pipeline(in_doc)
+            if pipeline is None:  # Can't find a default pipeline. Should this raise?
+                if raises_on_error:
+                    raise RuntimeError(
+                        f"No pipeline could be initialized for {in_doc.file}."
+                    )
+                else:
+                    conv_res = ConversionResult(input=in_doc)
+                    conv_res.status = ConversionStatus.FAILURE
+                    return conv_res

-        return page
+            conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)

-    # Generate the page image and store it in the page object
-    def _populate_page_images(self, doc: InputDocument, page: Page) -> Page:
-        # default scale
-        page.get_image(
-            scale=1.0
-        )  # puts the page image on the image cache at default scale
+        else:
+            if raises_on_error:
+                raise RuntimeError(f"Input document {in_doc.file} is not valid.")

-        # user requested scales
-        if self.assemble_options.images_scale is not None:
-            page._default_image_scale = self.assemble_options.images_scale
-            page.get_image(
-                scale=self.assemble_options.images_scale
-            )  # this will trigger storing the image in the internal cache
+            else:
+                # invalid doc or not of desired format
+                conv_res = ConversionResult(input=in_doc)
+                conv_res.status = ConversionStatus.FAILURE
+                # TODO add error log why it failed.

-        return page

-    # Extract and populate the page cells and store it in the page object
-    def _parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
-        page.cells = page._backend.get_text_cells()

-        # DEBUG code:
-        def draw_text_boxes(image, cells):
-            draw = ImageDraw.Draw(image)
-            for c in cells:
-                x0, y0, x1, y1 = c.bbox.as_tuple()
-                draw.rectangle([(x0, y0), (x1, y1)], outline="red")
-            image.show()

-        # draw_text_boxes(page.get_image(scale=1.0), cells)

-        return page

-    def _assemble_doc(self, conv_res: ConversionResult):
-        all_elements = []
-        all_headers = []
-        all_body = []

-        for p in conv_res.pages:
-            for el in p.assembled.body:
-                all_body.append(el)
-            for el in p.assembled.headers:
-                all_headers.append(el)
-            for el in p.assembled.elements:
-                all_elements.append(el)

-        conv_res.assembled = AssembledUnit(
-            elements=all_elements, headers=all_headers, body=all_body
-        )

-        conv_res.output = self.glm_model(conv_res)
+        return conv_res
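
For reference, a minimal usage sketch of the reworked converter above. It assumes docling v2 as introduced by this commit; the input file names are illustrative, not part of the diff.

    from docling.datamodel.base_models import ConversionStatus, InputFormat
    from docling.document_converter import (
        DocumentConverter,
        PdfFormatOption,
        WordFormatOption,
    )

    # Restrict the converter to PDF and DOCX and pin a pipeline/backend per format.
    converter = DocumentConverter(
        allowed_formats=[InputFormat.PDF, InputFormat.DOCX],
        format_options={
            InputFormat.PDF: PdfFormatOption(),    # StandardPdfPipeline + DoclingParseDocumentBackend
            InputFormat.DOCX: WordFormatOption(),  # SimplePipeline + MsWordDocumentBackend
        },
    )

    # convert_all() yields ConversionResult objects lazily, one per document.
    for res in converter.convert_all(["a.pdf", "b.docx"], raises_on_error=False):
        if res.status == ConversionStatus.SUCCESS:
            print(res.input.file, "converted")
        else:
            print(res.input.file, "failed with", res.status)

Note that convert_all() is a generator: with raises_on_error=False a failing document is reported through its ConversionResult.status instead of aborting the whole batch, matching the _convert/_process_document logic in the diff.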