feat: add simplified single-doc conversion

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
2025-07-26 20:14:47 +00:00 · 2024-07-25 18:09:16 +02:00 · 2024-07-25 18:09:16 +02:00 · e5a3bec356
commit e5a3bec356
parent 3eca8b8485
1 changed files with 51 additions and 0 deletions
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@ -1,11 +1,16 @@
 import cgi
 import functools
 import logging
 import tempfile
 import time
 import traceback
 from pathlib import Path
 from typing import Iterable, Optional, Type, Union
 from urllib.request import urlopen, urlretrieve
 from docling_core.types import Document
 from PIL import ImageDraw
 from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
 from docling.backend.abstract_backend import PdfDocumentBackend
 from docling.datamodel.base_models import (
@ -32,6 +37,7 @@ _log = logging.getLogger(__name__)
 class DocumentConverter:
    # Relative location of the layout model artifacts.
    # NOTE(review): how this path is resolved is not visible in this hunk — confirm.
    _layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
    # Relative location of the table model artifacts.
    # NOTE(review): how this path is resolved is not visible in this hunk — confirm.
    _table_model_path = "model_artifacts/tableformer"
    # Fallback file name used by convert_single() for a downloaded document when
    # no name can be derived from the HTTP response or the URL.
    _default_download_filename = "file.pdf"
    def __init__(
        self,
@ -80,6 +86,51 @@ class DocumentConverter:
            # Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
            yield from map(self.process_document, input_batch)
    def convert_single(self, source: Path | AnyHttpUrl | str) -> Document:
        """Convert a single document.

        If ``source`` validates as an HTTP(S) URL, its content is downloaded into a
        temporary directory and converted from there; otherwise ``source`` is
        interpreted as a local file path. The temporary directory is removed once
        the conversion has run.

        Args:
            source (Path | AnyHttpUrl | str): The PDF input source. Can be a path or URL.

        Raises:
            ValueError: If source is of unexpected type.
            RuntimeError: If conversion fails or yields no result.

        Returns:
            Document: The converted document object.
        """
        with tempfile.TemporaryDirectory() as temp_dir:
            try:
                http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
            except ValidationError:
                # Not an HTTP(S) URL: fall back to treating the source as a path.
                try:
                    local_path = TypeAdapter(Path).validate_python(source)
                except ValidationError as err:
                    raise ValueError(
                        f"Unexpected file path type encountered: {type(source)}"
                    ) from err
            else:
                with urlopen(str(source)) as resp:
                    # The response headers object is an email-style message, so it
                    # can extract the Content-Disposition filename itself (no need
                    # for the `cgi` module, which was removed in Python 3.13).
                    header_filename = resp.info().get_filename()
                    content = resp.read()

                # Keep only the final path component. Both the URL path (which
                # starts with "/") and a server-supplied filename may contain
                # directory parts; joining them as-is would place the download
                # outside of temp_dir.
                candidate = header_filename or http_url.path or ""
                filename = Path(candidate).name
                if filename in {"", ".", ".."}:
                    filename = self._default_download_filename

                local_path = Path(temp_dir) / filename
                with open(local_path, "wb") as f:
                    f.write(content)

            conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
            converted_docs_iter = self.convert(conv_inp)
            # Use a default so an empty result surfaces as the documented
            # RuntimeError instead of a bare StopIteration.
            converted_doc: Optional[ConvertedDocument] = next(
                converted_docs_iter, None
            )
        if converted_doc is None:
            raise RuntimeError(f"Conversion produced no result for: {source}")
        if converted_doc.status not in {
            ConversionStatus.SUCCESS,
            ConversionStatus.SUCCESS_WITH_ERRORS,
        }:
            raise RuntimeError(f"Conversion failed with status: {converted_doc.status}")
        doc = converted_doc.to_ds_document()
        return doc
    def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
        start_doc_time = time.time()
        converted_doc = ConvertedDocument(input=in_doc)