mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
feat: add simplified single-doc conversion
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
parent
3eca8b8485
commit
e5a3bec356
@ -1,11 +1,16 @@
|
|||||||
|
import cgi
|
||||||
import functools
|
import functools
|
||||||
import logging
|
import logging
|
||||||
|
import tempfile
|
||||||
import time
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, Optional, Type, Union
|
from typing import Iterable, Optional, Type, Union
|
||||||
|
from urllib.request import urlopen, urlretrieve
|
||||||
|
|
||||||
|
from docling_core.types import Document
|
||||||
from PIL import ImageDraw
|
from PIL import ImageDraw
|
||||||
|
from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
|
||||||
|
|
||||||
from docling.backend.abstract_backend import PdfDocumentBackend
|
from docling.backend.abstract_backend import PdfDocumentBackend
|
||||||
from docling.datamodel.base_models import (
|
from docling.datamodel.base_models import (
|
||||||
@ -32,6 +37,7 @@ _log = logging.getLogger(__name__)
|
|||||||
class DocumentConverter:
|
class DocumentConverter:
|
||||||
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
|
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
|
||||||
_table_model_path = "model_artifacts/tableformer"
|
_table_model_path = "model_artifacts/tableformer"
|
||||||
|
_default_download_filename = "file.pdf"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@ -80,6 +86,51 @@ class DocumentConverter:
|
|||||||
# Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
|
# Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
|
||||||
yield from map(self.process_document, input_batch)
|
yield from map(self.process_document, input_batch)
|
||||||
|
|
||||||
|
def convert_single(self, source: Union[Path, AnyHttpUrl, str]) -> Document:
    """Convert a single document.

    If ``source`` validates as an HTTP(S) URL, its content is downloaded into
    a temporary directory and converted from there; otherwise ``source`` is
    treated as a local file path.

    Args:
        source (Path | AnyHttpUrl | str): The PDF input source. Can be a path or URL.

    Raises:
        ValueError: If source is of unexpected type.
        RuntimeError: If conversion fails or yields no result.

    Returns:
        Document: The converted document object.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        try:
            http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
        except ValidationError:
            # Not a URL: fall back to interpreting the source as a local path.
            try:
                local_path = TypeAdapter(Path).validate_python(source)
            except ValidationError as exc:
                raise ValueError(
                    f"Unexpected file path type encountered: {type(source)}"
                ) from exc
        else:
            with urlopen(str(source)) as resp:
                # The response headers object is an email.message.Message, so
                # it can parse the Content-Disposition filename itself; this
                # avoids the `cgi` module, which was removed in Python 3.13.
                header_filename = resp.info().get_filename()
                content = resp.read()

            # Keep only the final path component. A URL path starts with "/"
            # and a server-supplied filename may contain directories or "..";
            # joining either unmodified could place the file outside temp_dir.
            candidate = header_filename or http_url.path or ""
            filename = Path(candidate).name
            if filename in {"", ".."}:
                filename = self._default_download_filename

            local_path = Path(temp_dir) / filename
            with open(local_path, "wb") as f:
                f.write(content)

        conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
        converted_docs_iter = self.convert(conv_inp)

        # Use a default so an empty result surfaces as the documented
        # RuntimeError instead of a bare StopIteration.
        converted_doc: Optional[ConvertedDocument] = next(converted_docs_iter, None)
        if converted_doc is None:
            raise RuntimeError("Conversion failed: no document was produced")

        if converted_doc.status not in {
            ConversionStatus.SUCCESS,
            ConversionStatus.SUCCESS_WITH_ERRORS,
        }:
            raise RuntimeError(f"Conversion failed with status: {converted_doc.status}")

        # Materialise the result while the temporary directory still exists.
        return converted_doc.to_ds_document()
|
||||||
|
|
||||||
def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
|
def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
|
||||||
start_doc_time = time.time()
|
start_doc_time = time.time()
|
||||||
converted_doc = ConvertedDocument(input=in_doc)
|
converted_doc = ConvertedDocument(input=in_doc)
|
||||||
|
Loading…
Reference in New Issue
Block a user