mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-11 22:28:31 +00:00
feat(backend): add generic options support and HTML image handling modes (#2011)
* feat: add backend options support to document backends Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * feat: enhance document backends with generic backend options and improve HTML image handling Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * Refactor tests for declarativebackend Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * fix(HTML): improve image caption handling and ensure backend options are set correctly Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * fix: enhance HTML backend image handling and add support for local file paths Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore: Add ground truth data for test data Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * fix(HTML): skip loading SVG files in image data handling Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * refactor(html): simplify backend options and address gaps Backend options for DeclarativeDocumentBackend classes and only when necessary. Refactor caption parsing in 'img' elements and remove dummy text. Replace deprecated annotations from Typing library with native types. 
Replace typing annotations according to pydantic guidelines. Add some documentation with pydantic annotations. Fix diff issue with test files. Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * tests(html): add tests and fix bugs Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * refactor(html): refactor backend options Move backend option classes to their own module within the datamodel package. Rename 'source_location' to 'source_uri' in HTMLBackendOptions. Rename 'image_fetch' to 'fetch_images' in HTMLBackendOptions. Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * refactor(markdown): create a class for the markdown backend options Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> --------- Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
This commit is contained in:
@@ -1,10 +1,12 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Set, Union
|
||||
from typing import TYPE_CHECKING, Union
|
||||
|
||||
from docling_core.types.doc import DoclingDocument
|
||||
|
||||
from docling.datamodel.backend_options import BackendOptions, DeclarativeBackendOptions
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
@@ -35,7 +37,7 @@ class AbstractDocumentBackend(ABC):
|
||||
|
||||
@classmethod
|
||||
@abstractmethod
|
||||
def supported_formats(cls) -> Set["InputFormat"]:
|
||||
def supported_formats(cls) -> set["InputFormat"]:
|
||||
pass
|
||||
|
||||
|
||||
@@ -58,6 +60,20 @@ class DeclarativeDocumentBackend(AbstractDocumentBackend):
|
||||
straight without a recognition pipeline.
|
||||
"""
|
||||
|
||||
@abstractmethod
def __init__(
    self,
    in_doc: "InputDocument",
    path_or_stream: Union[BytesIO, Path],
    options: Union[BackendOptions, None] = None,
) -> None:
    """Initialise the backend and remember its options.

    Args:
        in_doc: The input document this backend is created for.
        path_or_stream: Location or in-memory stream of the document content.
        options: Backend options. When omitted (or None), a new
            ``DeclarativeBackendOptions`` instance is created for this backend.
    """
    super().__init__(in_doc, path_or_stream)
    # Build the fallback per call: a default expression in the signature is
    # evaluated only once, so every backend created without explicit options
    # would share (and could mutate) the very same options object.
    self.options: BackendOptions = (
        options if options is not None else DeclarativeBackendOptions()
    )
|
||||
|
||||
@abstractmethod
def convert(self) -> DoclingDocument:
    """Convert the input into a ``DoclingDocument``.

    Abstract: concrete backends must provide the implementation.
    """
    ...
|
||||
|
||||
@classmethod
def get_default_options(cls) -> BackendOptions:
    """Return a new ``DeclarativeBackendOptions`` instance with default values."""
    default_options = DeclarativeBackendOptions()
    return default_options
|
||||
|
||||
@@ -2,7 +2,7 @@ import logging
|
||||
import re
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Final, Set, Union
|
||||
from typing import Final, Union
|
||||
|
||||
from docling_core.types.doc import (
|
||||
DocItemLabel,
|
||||
@@ -27,7 +27,7 @@ DEFAULT_IMAGE_HEIGHT: Final = 128
|
||||
|
||||
|
||||
class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
|
||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
|
||||
self.path_or_stream = path_or_stream
|
||||
@@ -58,7 +58,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
return
|
||||
|
||||
@classmethod
|
||||
def supported_formats(cls) -> Set[InputFormat]:
|
||||
def supported_formats(cls) -> set[InputFormat]:
|
||||
return {InputFormat.ASCIIDOC}
|
||||
|
||||
def convert(self) -> DoclingDocument:
|
||||
|
||||
@@ -1,13 +1,16 @@
|
||||
import base64
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import traceback
|
||||
import warnings
|
||||
from contextlib import contextmanager
|
||||
from copy import deepcopy
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Final, Optional, Union, cast
|
||||
from urllib.parse import urljoin
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
|
||||
from bs4.element import PreformattedString
|
||||
from docling_core.types.doc import (
|
||||
@@ -17,6 +20,7 @@ from docling_core.types.doc import (
|
||||
DocumentOrigin,
|
||||
GroupItem,
|
||||
GroupLabel,
|
||||
PictureItem,
|
||||
RefItem,
|
||||
RichTableCell,
|
||||
TableCell,
|
||||
@@ -24,13 +28,18 @@ from docling_core.types.doc import (
|
||||
TableItem,
|
||||
TextItem,
|
||||
)
|
||||
from docling_core.types.doc.document import ContentLayer, Formatting, Script
|
||||
from docling_core.types.doc.document import ContentLayer, Formatting, ImageRef, Script
|
||||
from PIL import Image, UnidentifiedImageError
|
||||
from pydantic import AnyUrl, BaseModel, ValidationError
|
||||
from typing_extensions import override
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.backend.abstract_backend import (
|
||||
DeclarativeDocumentBackend,
|
||||
)
|
||||
from docling.datamodel.backend_options import HTMLBackendOptions
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
from docling.exceptions import OperationNotAllowed
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@@ -43,6 +52,7 @@ _BLOCK_TAGS: Final = {
|
||||
"details",
|
||||
"figure",
|
||||
"footer",
|
||||
"img",
|
||||
"h1",
|
||||
"h2",
|
||||
"h3",
|
||||
@@ -186,11 +196,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self,
|
||||
in_doc: InputDocument,
|
||||
path_or_stream: Union[BytesIO, Path],
|
||||
original_url: Optional[AnyUrl] = None,
|
||||
options: HTMLBackendOptions = HTMLBackendOptions(),
|
||||
):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
super().__init__(in_doc, path_or_stream, options)
|
||||
self.soup: Optional[Tag] = None
|
||||
self.path_or_stream = path_or_stream
|
||||
self.path_or_stream: Union[BytesIO, Path] = path_or_stream
|
||||
self.base_path: Optional[str] = str(options.source_uri)
|
||||
|
||||
# Initialize the parents for the hierarchy
|
||||
self.max_levels = 10
|
||||
@@ -200,7 +211,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
for i in range(self.max_levels):
|
||||
self.parents[i] = None
|
||||
self.hyperlink: Union[AnyUrl, Path, None] = None
|
||||
self.original_url = original_url
|
||||
self.format_tags: list[str] = []
|
||||
|
||||
try:
|
||||
@@ -236,6 +246,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
def supported_formats(cls) -> set[InputFormat]:
|
||||
return {InputFormat.HTML}
|
||||
|
||||
@classmethod
@override
def get_default_options(cls) -> HTMLBackendOptions:
    """Return a new ``HTMLBackendOptions`` instance with default values."""
    default_options = HTMLBackendOptions()
    return default_options
|
||||
|
||||
@override
|
||||
def convert(self) -> DoclingDocument:
|
||||
_log.debug("Starting HTML conversion...")
|
||||
@@ -261,7 +276,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
content_layer=ContentLayer.FURNITURE,
|
||||
)
|
||||
# remove script and style tags
|
||||
for tag in self.soup(["script", "style"]):
|
||||
for tag in self.soup(["script", "noscript", "style"]):
|
||||
tag.decompose()
|
||||
# remove any hidden tag
|
||||
for tag in self.soup(hidden=True):
|
||||
@@ -291,6 +306,28 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self._walk(content, doc)
|
||||
return doc
|
||||
|
||||
@staticmethod
|
||||
def _is_remote_url(value: str) -> bool:
|
||||
parsed = urlparse(value)
|
||||
return parsed.scheme in {"http", "https", "ftp", "s3", "gs"}
|
||||
|
||||
def _resolve_relative_path(self, loc: str) -> str:
    """Resolve a location found in the HTML against ``self.base_path``.

    Without a base path the location is returned unchanged. Otherwise:

    - protocol-relative locations (``//host/...``) get an ``https:`` prefix;
    - locations starting with ``http://``, ``https://``, ``data:`` or
      ``file://`` are kept as they are;
    - with a remote base path, the location is joined with ``urljoin``;
    - with a local base path, the location is resolved against the directory
      of the base path, unless it carries a scheme of its own.

    Args:
        loc: The location (e.g. an ``href`` or ``src`` value) to resolve.

    Returns:
        The resolved location as a string.
    """
    abs_loc = loc

    if self.base_path:
        if loc.startswith("//"):
            # Protocol-relative URL - default to https
            abs_loc = "https:" + loc
        elif not loc.startswith(("http://", "https://", "data:", "file://")):
            if HTMLDocumentBackend._is_remote_url(self.base_path):  # remote fetch
                abs_loc = urljoin(self.base_path, loc)
            elif len(urlparse(loc).scheme) <= 1:  # local fetch
                # For local files, resolve relative to the HTML file location.
                # Locations with a scheme of their own (mailto:, tel:, ftp://,
                # upper-case HTTP://, ...) are already absolute and must not be
                # appended to a directory. One-letter "schemes" are still
                # treated as paths, since they are what a Windows drive letter
                # parses to.
                abs_loc = str(Path(self.base_path).parent / loc)

    _log.debug(f"Resolved location {loc} to {abs_loc}")
    return abs_loc
|
||||
|
||||
@staticmethod
|
||||
def group_cell_elements(
|
||||
group_name: str,
|
||||
@@ -520,7 +557,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
if name == "img":
|
||||
flush_buffer()
|
||||
im_ref3 = self._emit_image(node, doc)
|
||||
added_refs.append(im_ref3)
|
||||
if im_ref3:
|
||||
added_refs.append(im_ref3)
|
||||
elif name in _FORMAT_TAG_MAP:
|
||||
with self._use_format([name]):
|
||||
wk = self._walk(node, doc)
|
||||
@@ -669,8 +707,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
else:
|
||||
if isinstance(this_href, str) and this_href:
|
||||
old_hyperlink = self.hyperlink
|
||||
if self.original_url is not None:
|
||||
this_href = urljoin(str(self.original_url), str(this_href))
|
||||
this_href = self._resolve_relative_path(this_href)
|
||||
# ugly fix for relative links since pydantic does not support them.
|
||||
try:
|
||||
new_hyperlink = AnyUrl(this_href)
|
||||
@@ -837,7 +874,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
for img_tag in tag("img"):
|
||||
if isinstance(img_tag, Tag):
|
||||
im_ref = self._emit_image(img_tag, doc)
|
||||
added_ref.append(im_ref)
|
||||
if im_ref:
|
||||
added_ref.append(im_ref)
|
||||
return added_ref
|
||||
|
||||
def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem:
|
||||
@@ -1003,7 +1041,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
img_tag = tag.find("img")
|
||||
if isinstance(img_tag, Tag):
|
||||
im_ref = self._emit_image(img_tag, doc)
|
||||
added_refs.append(im_ref)
|
||||
if im_ref is not None:
|
||||
added_refs.append(im_ref)
|
||||
|
||||
elif tag_name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
|
||||
heading_refs = self._handle_heading(tag, doc)
|
||||
@@ -1061,7 +1100,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
for img_tag in tag("img"):
|
||||
if isinstance(img_tag, Tag):
|
||||
im_ref2 = self._emit_image(tag, doc)
|
||||
added_refs.append(im_ref2)
|
||||
if im_ref2 is not None:
|
||||
added_refs.append(im_ref2)
|
||||
|
||||
elif tag_name in {"pre"}:
|
||||
# handle monospace code snippets (pre).
|
||||
@@ -1092,10 +1132,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self._walk(tag, doc)
|
||||
return added_refs
|
||||
|
||||
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> RefItem:
|
||||
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> Optional[RefItem]:
|
||||
figure = img_tag.find_parent("figure")
|
||||
caption: AnnotatedTextList = AnnotatedTextList()
|
||||
|
||||
parent = self.parents[self.level]
|
||||
|
||||
# check if the figure has a link - this is HACK:
|
||||
def get_img_hyperlink(img_tag):
|
||||
this_parent = img_tag.parent
|
||||
@@ -1106,9 +1148,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
return None
|
||||
|
||||
if img_hyperlink := get_img_hyperlink(img_tag):
|
||||
caption.append(
|
||||
AnnotatedText(text="Image Hyperlink.", hyperlink=img_hyperlink)
|
||||
)
|
||||
img_text = img_tag.get("alt") or ""
|
||||
caption.append(AnnotatedText(text=img_text, hyperlink=img_hyperlink))
|
||||
|
||||
if isinstance(figure, Tag):
|
||||
caption_tag = figure.find("figcaption", recursive=False)
|
||||
@@ -1135,13 +1176,78 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
hyperlink=caption_anno_text.hyperlink,
|
||||
)
|
||||
|
||||
src_loc: str = self._get_attr_as_string(img_tag, "src")
|
||||
if not cast(HTMLBackendOptions, self.options).fetch_images or not src_loc:
|
||||
# Do not fetch the image, just add a placeholder
|
||||
placeholder: PictureItem = doc.add_picture(
|
||||
caption=caption_item,
|
||||
parent=parent,
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
return placeholder.get_ref()
|
||||
|
||||
src_loc = self._resolve_relative_path(src_loc)
|
||||
img_ref = self._create_image_ref(src_loc)
|
||||
|
||||
docling_pic = doc.add_picture(
|
||||
image=img_ref,
|
||||
caption=caption_item,
|
||||
parent=self.parents[self.level],
|
||||
parent=parent,
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
return docling_pic.get_ref()
|
||||
|
||||
def _create_image_ref(self, src_url: str) -> Optional[ImageRef]:
    """Load the image at ``src_url`` and wrap it into an ``ImageRef``.

    This is best effort: when loading or decoding fails with one of the
    handled exceptions, a warning is issued and None is returned.

    Args:
        src_url: Resolved location of the image (remote URL, data URI or
            local path).

    Returns:
        The image reference, or None when no image data could be obtained.
    """
    try:
        img_data = self._load_image_data(src_url)
        if img_data:
            img = Image.open(BytesIO(img_data))
            return ImageRef.from_pil(img, dpi=int(img.info.get("dpi", (72,))[0]))
    except (
        # RequestException is the base of HTTPError and also covers
        # connection failures and timeouts, which should not abort the
        # conversion either.
        requests.RequestException,
        ValidationError,
        UnidentifiedImageError,
        OperationNotAllowed,
        TypeError,
        ValueError,
    ) as e:
        warnings.warn(f"Could not process an image from {src_url}: {e}")

    return None

def _load_image_data(self, src_loc: str, timeout: float = 30.0) -> Optional[bytes]:
    """Return the raw bytes of the image at ``src_loc``.

    Args:
        src_loc: Remote URL, ``data:`` URI, ``file://`` URI or local path.
        timeout: Seconds to wait for a remote server before giving up.

    Returns:
        The image bytes, or None for locations ending in ``.svg``, which
        are skipped.

    Raises:
        OperationNotAllowed: If the location is remote (or local) and the
            corresponding ``enable_remote_fetch`` (or ``enable_local_fetch``)
            option is not set.
        requests.RequestException: If the remote download fails or times out.
        ValueError: If the local file does not exist or is not readable.
    """
    if src_loc.lower().endswith(".svg"):
        _log.debug(f"Skipping SVG file: {src_loc}")
        return None

    if HTMLDocumentBackend._is_remote_url(src_loc):
        if not self.options.enable_remote_fetch:
            raise OperationNotAllowed(
                "Fetching remote resources is only allowed when set explicitly. "
                "Set options.enable_remote_fetch=True."
            )
        # A timeout keeps an unresponsive server from blocking the conversion
        # forever; the context manager releases the connection in all cases.
        with requests.get(src_loc, timeout=timeout) as response:
            response.raise_for_status()
            return response.content
    elif src_loc.startswith("data:"):
        data = re.sub(r"^data:image/.+;base64,", "", src_loc)
        return base64.b64decode(data)

    if src_loc.startswith("file://"):
        src_loc = src_loc[7:]

    if not self.options.enable_local_fetch:
        raise OperationNotAllowed(
            "Fetching local resources is only allowed when set explicitly. "
            "Set options.enable_local_fetch=True."
        )
    # add check that file exists and can read
    if os.path.isfile(src_loc) and os.access(src_loc, os.R_OK):
        with open(src_loc, "rb") as f:
            return f.read()
    else:
        raise ValueError("File does not exist or it is not readable.")
|
||||
|
||||
@staticmethod
|
||||
def get_text(item: PageElement) -> str:
|
||||
"""Concatenate all child strings of a PageElement.
|
||||
@@ -1238,3 +1344,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
|
||||
return int_spans
|
||||
|
||||
@staticmethod
|
||||
def _get_attr_as_string(tag: Tag, attr: str, default: str = "") -> str:
|
||||
"""Get attribute value as string, handling list values."""
|
||||
value = tag.get(attr)
|
||||
if not value:
|
||||
return default
|
||||
|
||||
return value[0] if isinstance(value, list) else value
|
||||
|
||||
@@ -24,10 +24,16 @@ from docling_core.types.doc import (
|
||||
from docling_core.types.doc.document import Formatting
|
||||
from marko import Markdown
|
||||
from pydantic import AnyUrl, BaseModel, Field, TypeAdapter
|
||||
from typing_extensions import Annotated
|
||||
from typing_extensions import Annotated, override
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.backend.abstract_backend import (
|
||||
DeclarativeDocumentBackend,
|
||||
)
|
||||
from docling.backend.html_backend import HTMLDocumentBackend
|
||||
from docling.datamodel.backend_options import (
|
||||
HTMLBackendOptions,
|
||||
MarkdownBackendOptions,
|
||||
)
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
@@ -88,8 +94,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
return shortened_text
|
||||
|
||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
@override
|
||||
def __init__(
|
||||
self,
|
||||
in_doc: InputDocument,
|
||||
path_or_stream: Union[BytesIO, Path],
|
||||
options: MarkdownBackendOptions = MarkdownBackendOptions(),
|
||||
):
|
||||
super().__init__(in_doc, path_or_stream, options)
|
||||
|
||||
_log.debug("Starting MarkdownDocumentBackend...")
|
||||
|
||||
@@ -580,9 +592,12 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
format=InputFormat.HTML,
|
||||
backend=html_backend_cls,
|
||||
filename=self.file.name,
|
||||
backend_options=self.options,
|
||||
)
|
||||
html_backend_obj = html_backend_cls(
|
||||
in_doc=in_doc, path_or_stream=stream
|
||||
in_doc=in_doc,
|
||||
path_or_stream=stream,
|
||||
options=cast(HTMLBackendOptions, self.options),
|
||||
)
|
||||
doc = html_backend_obj.convert()
|
||||
else:
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Set, Union
|
||||
from typing import Union
|
||||
|
||||
from docling_core.types.doc import (
|
||||
BoundingBox,
|
||||
@@ -80,7 +80,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
self.path_or_stream = None
|
||||
|
||||
@classmethod
|
||||
def supported_formats(cls) -> Set[InputFormat]:
|
||||
def supported_formats(cls) -> set[InputFormat]:
|
||||
return {InputFormat.PPTX}
|
||||
|
||||
def convert(self) -> DoclingDocument:
|
||||
|
||||
@@ -3,7 +3,7 @@ import re
|
||||
from copy import deepcopy
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, List, Optional, Union
|
||||
from typing import Any, Callable, Optional, Union
|
||||
|
||||
from docling_core.types.doc import (
|
||||
DocItemLabel,
|
||||
@@ -69,7 +69,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.numbered_headers: dict[int, int] = {}
|
||||
self.equation_bookends: str = "<eq>{EQ}</eq>"
|
||||
# Track processed textbox elements to avoid duplication
|
||||
self.processed_textbox_elements: List[int] = []
|
||||
self.processed_textbox_elements: list[int] = []
|
||||
self.docx_to_pdf_converter: Optional[Callable] = None
|
||||
self.docx_to_pdf_converter_init = False
|
||||
self.display_drawingml_warning = True
|
||||
@@ -726,8 +726,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
textbox_elements: list,
|
||||
docx_obj: DocxDocument,
|
||||
doc: DoclingDocument,
|
||||
) -> List[RefItem]:
|
||||
elem_ref: List[RefItem] = []
|
||||
) -> list[RefItem]:
|
||||
elem_ref: list[RefItem] = []
|
||||
"""Process textbox content and add it to the document structure."""
|
||||
level = self._get_level()
|
||||
# Create a textbox group to contain all text from the textbox
|
||||
@@ -856,8 +856,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
element: BaseOxmlElement,
|
||||
docx_obj: DocxDocument,
|
||||
doc: DoclingDocument,
|
||||
) -> List[RefItem]:
|
||||
elem_ref: List[RefItem] = []
|
||||
) -> list[RefItem]:
|
||||
elem_ref: list[RefItem] = []
|
||||
paragraph = Paragraph(element, docx_obj)
|
||||
paragraph_elements = self._get_paragraph_elements(paragraph)
|
||||
text, equations = self._handle_equations_in_text(
|
||||
@@ -1032,8 +1032,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
curr_level: Optional[int],
|
||||
text: str,
|
||||
is_numbered_style: bool = False,
|
||||
) -> List[RefItem]:
|
||||
elem_ref: List[RefItem] = []
|
||||
) -> list[RefItem]:
|
||||
elem_ref: list[RefItem] = []
|
||||
level = self._get_level()
|
||||
if isinstance(curr_level, int):
|
||||
if curr_level > level:
|
||||
@@ -1102,8 +1102,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
marker: str,
|
||||
enumerated: bool,
|
||||
level: int,
|
||||
) -> List[RefItem]:
|
||||
elem_ref: List[RefItem] = []
|
||||
) -> list[RefItem]:
|
||||
elem_ref: list[RefItem] = []
|
||||
# This should not happen by construction
|
||||
if not isinstance(self.parents[level], ListGroup):
|
||||
return elem_ref
|
||||
@@ -1148,8 +1148,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
ilevel: int,
|
||||
elements: list,
|
||||
is_numbered: bool = False,
|
||||
) -> List[RefItem]:
|
||||
elem_ref: List[RefItem] = []
|
||||
) -> list[RefItem]:
|
||||
elem_ref: list[RefItem] = []
|
||||
# this method is always called with is_numbered. Numbered lists should be properly addressed.
|
||||
if not elements:
|
||||
return elem_ref
|
||||
@@ -1244,8 +1244,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
element: BaseOxmlElement,
|
||||
docx_obj: DocxDocument,
|
||||
doc: DoclingDocument,
|
||||
) -> List[RefItem]:
|
||||
elem_ref: List[RefItem] = []
|
||||
) -> list[RefItem]:
|
||||
elem_ref: list[RefItem] = []
|
||||
table: Table = Table(element, docx_obj)
|
||||
num_rows = len(table.rows)
|
||||
num_cols = len(table.columns)
|
||||
@@ -1299,13 +1299,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
else:
|
||||
text = text.replace("<eq>", "$").replace("</eq>", "$")
|
||||
|
||||
provs_in_cell: List[RefItem] = []
|
||||
provs_in_cell: list[RefItem] = []
|
||||
_, provs_in_cell = self._walk_linear(cell._element, docx_obj, doc)
|
||||
ref_for_rich_cell = provs_in_cell[0]
|
||||
rich_table_cell = False
|
||||
|
||||
def group_cell_elements(
|
||||
group_name: str, doc: DoclingDocument, provs_in_cell: List[RefItem]
|
||||
group_name: str, doc: DoclingDocument, provs_in_cell: list[RefItem]
|
||||
) -> RefItem:
|
||||
group_element = doc.add_group(
|
||||
label=GroupLabel.UNSPECIFIED,
|
||||
@@ -1379,7 +1379,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
def _handle_pictures(
|
||||
self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
|
||||
) -> List[RefItem]:
|
||||
) -> list[RefItem]:
|
||||
def get_docx_image(drawing_blip: Any) -> Optional[bytes]:
|
||||
image_data: Optional[bytes] = None
|
||||
rId = drawing_blip[0].get(
|
||||
@@ -1391,7 +1391,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
image_data = image_part.blob # Get the binary image data
|
||||
return image_data
|
||||
|
||||
elem_ref: List[RefItem] = []
|
||||
elem_ref: list[RefItem] = []
|
||||
level = self._get_level()
|
||||
# Open the BytesIO object with PIL to create an Image
|
||||
image_data: Optional[bytes] = get_docx_image(drawing_blip)
|
||||
|
||||
53
docling/datamodel/backend_options.py
Normal file
53
docling/datamodel/backend_options.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from pathlib import PurePath
|
||||
from typing import Annotated, Literal, Optional, Union
|
||||
|
||||
from pydantic import AnyUrl, BaseModel, Field
|
||||
|
||||
|
||||
class BaseBackendOptions(BaseModel):
    """Common options for all declarative document backends."""

    # Both fetch switches default to False.
    enable_remote_fetch: bool = Field(
        default=False,
        description="Enable remote resource fetching.",
    )
    enable_local_fetch: bool = Field(
        default=False,
        description="Enable local resource fetching.",
    )
|
||||
|
||||
|
||||
class DeclarativeBackendOptions(BaseBackendOptions):
    """Default backend options for a declarative document backend."""

    # Fixed tag of this options class; excluded from dumps and from repr.
    kind: Literal["declarative"] = Field(
        default="declarative",
        exclude=True,
        repr=False,
    )
|
||||
|
||||
|
||||
class HTMLBackendOptions(BaseBackendOptions):
    """Options specific to the HTML backend.

    This class can be extended to include options specific to HTML processing.
    """

    # Fixed tag of this options class; excluded from dumps and from repr.
    kind: Literal["html"] = Field(
        default="html",
        exclude=True,
        repr=False,
    )
    # Images are not fetched unless this is switched on.
    fetch_images: bool = Field(
        default=False,
        description=(
            "Whether the backend should access remote or local resources to parse "
            "images in an HTML document."
        ),
    )
    # Optional origin of the document, either a URL or a path.
    source_uri: Optional[Union[AnyUrl, PurePath]] = Field(
        default=None,
        description=(
            "The URI that originates the HTML document. If provided, the backend "
            "will use it to resolve relative paths in the HTML document."
        ),
    )
|
||||
|
||||
|
||||
class MarkdownBackendOptions(HTMLBackendOptions):
    """Options specific to the Markdown backend."""

    # No fields of its own: everything is inherited from HTMLBackendOptions.
|
||||
|
||||
|
||||
# Union of the option classes, told apart by the value of their "kind" field.
BackendOptions = Annotated[
    Union[
        DeclarativeBackendOptions,
        HTMLBackendOptions,
    ],
    Field(discriminator="kind"),
]
|
||||
@@ -8,14 +8,12 @@ from io import BytesIO
|
||||
from pathlib import Path, PurePath
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Dict,
|
||||
List,
|
||||
Annotated,
|
||||
Literal,
|
||||
Optional,
|
||||
Set,
|
||||
Type,
|
||||
Union,
|
||||
cast,
|
||||
)
|
||||
|
||||
import filetype
|
||||
@@ -54,8 +52,10 @@ from typing_extensions import deprecated
|
||||
|
||||
from docling.backend.abstract_backend import (
|
||||
AbstractDocumentBackend,
|
||||
DeclarativeDocumentBackend,
|
||||
PaginatedDocumentBackend,
|
||||
)
|
||||
from docling.datamodel.backend_options import BackendOptions
|
||||
from docling.datamodel.base_models import (
|
||||
AssembledUnit,
|
||||
ConfidenceReport,
|
||||
@@ -74,6 +74,7 @@ from docling.utils.utils import create_file_hash
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from docling.datamodel.base_models import BaseFormatOption
|
||||
from docling.document_converter import FormatOption
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@@ -102,32 +103,58 @@ _EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")
|
||||
|
||||
|
||||
class InputDocument(BaseModel):
|
||||
file: PurePath
|
||||
document_hash: str # = None
|
||||
valid: bool = True
|
||||
limits: DocumentLimits = DocumentLimits()
|
||||
format: InputFormat # = None
|
||||
"""A document as an input of a Docling conversion."""
|
||||
|
||||
filesize: Optional[int] = None
|
||||
page_count: int = 0
|
||||
file: Annotated[
|
||||
PurePath, Field(description="A path representation the input document.")
|
||||
]
|
||||
document_hash: Annotated[
|
||||
str,
|
||||
Field(description="A stable hash of the path or stream of the input document."),
|
||||
]
|
||||
valid: bool = Field(True, description="Whether this is is a valid input document.")
|
||||
backend_options: Optional[BackendOptions] = Field(
|
||||
None, description="Custom options for declarative backends."
|
||||
)
|
||||
limits: DocumentLimits = Field(
|
||||
DocumentLimits(), description="Limits in the input document for the conversion."
|
||||
)
|
||||
format: Annotated[InputFormat, Field(description="The document format.")]
|
||||
|
||||
_backend: AbstractDocumentBackend # Internal PDF backend used
|
||||
filesize: Optional[int] = Field(
|
||||
None, description="Size of the input file, in bytes."
|
||||
)
|
||||
page_count: int = Field(0, description="Number of pages in the input document.")
|
||||
|
||||
_backend: AbstractDocumentBackend
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path_or_stream: Union[BytesIO, Path],
|
||||
format: InputFormat,
|
||||
backend: Type[AbstractDocumentBackend],
|
||||
backend_options: Optional[BackendOptions] = None,
|
||||
filename: Optional[str] = None,
|
||||
limits: Optional[DocumentLimits] = None,
|
||||
):
|
||||
) -> None:
|
||||
super().__init__(
|
||||
file="", document_hash="", format=InputFormat.PDF
|
||||
file="",
|
||||
document_hash="",
|
||||
format=InputFormat.PDF,
|
||||
backend_options=backend_options,
|
||||
) # initialize with dummy values
|
||||
|
||||
self.limits = limits or DocumentLimits()
|
||||
self.format = format
|
||||
|
||||
# check for backend incompatibilities
|
||||
if issubclass(backend, DeclarativeDocumentBackend) and backend_options:
|
||||
if not issubclass(
|
||||
type(backend_options), type(backend.get_default_options())
|
||||
):
|
||||
raise ValueError(
|
||||
"Incompatible types between backend and backend_options arguments."
|
||||
)
|
||||
|
||||
try:
|
||||
if isinstance(path_or_stream, Path):
|
||||
self.file = path_or_stream
|
||||
@@ -140,7 +167,8 @@ class InputDocument(BaseModel):
|
||||
|
||||
elif isinstance(path_or_stream, BytesIO):
|
||||
assert filename is not None, (
|
||||
"Can't construct InputDocument from stream without providing filename arg."
|
||||
"Can't construct InputDocument from stream without providing "
|
||||
"filename arg."
|
||||
)
|
||||
self.file = PurePath(filename)
|
||||
self.filesize = path_or_stream.getbuffer().nbytes
|
||||
@@ -175,7 +203,8 @@ class InputDocument(BaseModel):
|
||||
except RuntimeError as e:
|
||||
self.valid = False
|
||||
_log.exception(
|
||||
f"An unexpected error occurred while opening the document {self.file.name}",
|
||||
"An unexpected error occurred while opening the document "
|
||||
"f{self.file.name}",
|
||||
exc_info=e,
|
||||
)
|
||||
# raise
|
||||
@@ -185,7 +214,15 @@ class InputDocument(BaseModel):
|
||||
backend: Type[AbstractDocumentBackend],
|
||||
path_or_stream: Union[BytesIO, Path],
|
||||
) -> None:
|
||||
self._backend = backend(self, path_or_stream=path_or_stream)
|
||||
if issubclass(backend, DeclarativeDocumentBackend) and self.backend_options:
|
||||
self._backend = backend(
|
||||
self,
|
||||
path_or_stream=path_or_stream,
|
||||
options=self.backend_options,
|
||||
)
|
||||
else:
|
||||
self._backend = backend(self, path_or_stream=path_or_stream)
|
||||
|
||||
if not self._backend.is_valid():
|
||||
self.valid = False
|
||||
|
||||
@@ -199,11 +236,11 @@ class ConversionResult(BaseModel):
|
||||
input: InputDocument
|
||||
|
||||
status: ConversionStatus = ConversionStatus.PENDING # failure, success
|
||||
errors: List[ErrorItem] = [] # structure to keep errors
|
||||
errors: list[ErrorItem] = [] # structure to keep errors
|
||||
|
||||
pages: List[Page] = []
|
||||
pages: list[Page] = []
|
||||
assembled: AssembledUnit = AssembledUnit()
|
||||
timings: Dict[str, ProfilingItem] = {}
|
||||
timings: dict[str, ProfilingItem] = {}
|
||||
confidence: ConfidenceReport = Field(default_factory=ConfidenceReport)
|
||||
|
||||
document: DoclingDocument = _EMPTY_DOCLING_DOC
|
||||
@@ -222,7 +259,7 @@ class _DummyBackend(AbstractDocumentBackend):
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def supported_formats(cls) -> Set[InputFormat]:
|
||||
def supported_formats(cls) -> set[InputFormat]:
|
||||
return set()
|
||||
|
||||
@classmethod
|
||||
@@ -235,7 +272,7 @@ class _DummyBackend(AbstractDocumentBackend):
|
||||
|
||||
class _DocumentConversionInput(BaseModel):
|
||||
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
|
||||
headers: Optional[Dict[str, str]] = None
|
||||
headers: Optional[dict[str, str]] = None
|
||||
limits: Optional[DocumentLimits] = DocumentLimits()
|
||||
|
||||
def docs(
|
||||
@@ -250,33 +287,36 @@ class _DocumentConversionInput(BaseModel):
|
||||
)
|
||||
format = self._guess_format(obj)
|
||||
backend: Type[AbstractDocumentBackend]
|
||||
if format not in format_options.keys():
|
||||
backend_options: Optional[BackendOptions] = None
|
||||
if not format or format not in format_options:
|
||||
_log.error(
|
||||
f"Input document {obj.name} with format {format} does not match any allowed format: ({format_options.keys()})"
|
||||
f"Input document {obj.name} with format {format} does not match "
|
||||
f"any allowed format: ({format_options.keys()})"
|
||||
)
|
||||
backend = _DummyBackend
|
||||
else:
|
||||
backend = format_options[format].backend
|
||||
options = format_options[format]
|
||||
backend = options.backend
|
||||
if "backend_options" in options.model_fields_set:
|
||||
backend_options = cast("FormatOption", options).backend_options
|
||||
|
||||
path_or_stream: Union[BytesIO, Path]
|
||||
if isinstance(obj, Path):
|
||||
yield InputDocument(
|
||||
path_or_stream=obj,
|
||||
format=format, # type: ignore[arg-type]
|
||||
filename=obj.name,
|
||||
limits=self.limits,
|
||||
backend=backend,
|
||||
)
|
||||
path_or_stream = obj
|
||||
elif isinstance(obj, DocumentStream):
|
||||
yield InputDocument(
|
||||
path_or_stream=obj.stream,
|
||||
format=format, # type: ignore[arg-type]
|
||||
filename=obj.name,
|
||||
limits=self.limits,
|
||||
backend=backend,
|
||||
)
|
||||
path_or_stream = obj.stream
|
||||
else:
|
||||
raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
|
||||
|
||||
yield InputDocument(
|
||||
path_or_stream=path_or_stream,
|
||||
format=format, # type: ignore[arg-type]
|
||||
filename=obj.name,
|
||||
limits=self.limits,
|
||||
backend=backend,
|
||||
backend_options=backend_options,
|
||||
)
|
||||
|
||||
def _guess_format(self, obj: Union[Path, DocumentStream]) -> Optional[InputFormat]:
|
||||
content = b"" # empty binary blob
|
||||
formats: list[InputFormat] = []
|
||||
@@ -290,12 +330,13 @@ class _DocumentConversionInput(BaseModel):
|
||||
with obj.open("rb") as f:
|
||||
content = f.read(1024) # Read first 1KB
|
||||
if mime is not None and mime.lower() == "application/zip":
|
||||
mime_root = "application/vnd.openxmlformats-officedocument"
|
||||
if obj.suffixes[-1].lower() == ".xlsx":
|
||||
mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||
mime = mime_root + ".spreadsheetml.sheet"
|
||||
elif obj.suffixes[-1].lower() == ".docx":
|
||||
mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
mime = mime_root + ".wordprocessingml.document"
|
||||
elif obj.suffixes[-1].lower() == ".pptx":
|
||||
mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
||||
mime = mime_root + ".presentationml.presentation"
|
||||
|
||||
elif isinstance(obj, DocumentStream):
|
||||
content = obj.stream.read(8192)
|
||||
@@ -310,12 +351,13 @@ class _DocumentConversionInput(BaseModel):
|
||||
mime = _DocumentConversionInput._mime_from_extension(ext.lower())
|
||||
if mime is not None and mime.lower() == "application/zip":
|
||||
objname = obj.name.lower()
|
||||
mime_root = "application/vnd.openxmlformats-officedocument"
|
||||
if objname.endswith(".xlsx"):
|
||||
mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||
mime = mime_root + ".spreadsheetml.sheet"
|
||||
elif objname.endswith(".docx"):
|
||||
mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
mime = mime_root + ".wordprocessingml.document"
|
||||
elif objname.endswith(".pptx"):
|
||||
mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
||||
mime = mime_root + ".presentationml.presentation"
|
||||
|
||||
if mime is not None and mime.lower() == "application/gzip":
|
||||
if detected_mime := _DocumentConversionInput._detect_mets_gbs(obj):
|
||||
|
||||
@@ -9,11 +9,14 @@ from datetime import datetime
|
||||
from functools import partial
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple, Type, Union
|
||||
from typing import Optional, Type, Union
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
||||
from pydantic import ConfigDict, model_validator, validate_call
|
||||
from typing_extensions import Self
|
||||
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.abstract_backend import (
|
||||
AbstractDocumentBackend,
|
||||
)
|
||||
from docling.backend.asciidoc_backend import AsciiDocBackend
|
||||
from docling.backend.csv_backend import CsvDocumentBackend
|
||||
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
||||
@@ -28,6 +31,7 @@ from docling.backend.noop_backend import NoOpBackend
|
||||
from docling.backend.webvtt_backend import WebVTTDocumentBackend
|
||||
from docling.backend.xml.jats_backend import JatsDocumentBackend
|
||||
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
|
||||
from docling.datamodel.backend_options import BackendOptions, HTMLBackendOptions
|
||||
from docling.datamodel.base_models import (
|
||||
BaseFormatOption,
|
||||
ConversionStatus,
|
||||
@@ -61,11 +65,13 @@ _PIPELINE_CACHE_LOCK = threading.Lock()
|
||||
|
||||
class FormatOption(BaseFormatOption):
|
||||
pipeline_cls: Type[BasePipeline]
|
||||
backend_options: Optional[BackendOptions] = None
|
||||
|
||||
@model_validator(mode="after")
|
||||
def set_optional_field_default(self) -> "FormatOption":
|
||||
def set_optional_field_default(self) -> Self:
|
||||
if self.pipeline_options is None:
|
||||
self.pipeline_options = self.pipeline_cls.get_default_options()
|
||||
|
||||
return self
|
||||
|
||||
|
||||
@@ -92,6 +98,7 @@ class PowerpointFormatOption(FormatOption):
|
||||
class MarkdownFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
|
||||
backend_options: HTMLBackendOptions = HTMLBackendOptions()
|
||||
|
||||
|
||||
class AsciiDocFormatOption(FormatOption):
|
||||
@@ -102,6 +109,7 @@ class AsciiDocFormatOption(FormatOption):
|
||||
class HTMLFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
|
||||
backend_options: HTMLBackendOptions = HTMLBackendOptions()
|
||||
|
||||
|
||||
class PatentUsptoFormatOption(FormatOption):
|
||||
@@ -150,7 +158,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
|
||||
pipeline_cls=SimplePipeline, backend=AsciiDocBackend
|
||||
),
|
||||
InputFormat.HTML: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
|
||||
pipeline_cls=SimplePipeline,
|
||||
backend=HTMLDocumentBackend,
|
||||
backend_options=HTMLBackendOptions(),
|
||||
),
|
||||
InputFormat.XML_USPTO: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
|
||||
@@ -186,13 +196,13 @@ class DocumentConverter:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
allowed_formats: Optional[List[InputFormat]] = None,
|
||||
format_options: Optional[Dict[InputFormat, FormatOption]] = None,
|
||||
allowed_formats: Optional[list[InputFormat]] = None,
|
||||
format_options: Optional[dict[InputFormat, FormatOption]] = None,
|
||||
):
|
||||
self.allowed_formats = (
|
||||
allowed_formats if allowed_formats is not None else list(InputFormat)
|
||||
)
|
||||
self.format_to_options: Dict[InputFormat, FormatOption] = {
|
||||
self.format_to_options: dict[InputFormat, FormatOption] = {
|
||||
format: (
|
||||
_get_default_option(format=format)
|
||||
if (custom_option := (format_options or {}).get(format)) is None
|
||||
@@ -200,8 +210,8 @@ class DocumentConverter:
|
||||
)
|
||||
for format in self.allowed_formats
|
||||
}
|
||||
self.initialized_pipelines: Dict[
|
||||
Tuple[Type[BasePipeline], str], BasePipeline
|
||||
self.initialized_pipelines: dict[
|
||||
tuple[Type[BasePipeline], str], BasePipeline
|
||||
] = {}
|
||||
|
||||
def _get_initialized_pipelines(
|
||||
@@ -228,7 +238,7 @@ class DocumentConverter:
|
||||
def convert(
|
||||
self,
|
||||
source: Union[Path, str, DocumentStream], # TODO review naming
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
headers: Optional[dict[str, str]] = None,
|
||||
raises_on_error: bool = True,
|
||||
max_num_pages: int = sys.maxsize,
|
||||
max_file_size: int = sys.maxsize,
|
||||
@@ -248,7 +258,7 @@ class DocumentConverter:
|
||||
def convert_all(
|
||||
self,
|
||||
source: Iterable[Union[Path, str, DocumentStream]], # TODO review naming
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
headers: Optional[dict[str, str]] = None,
|
||||
raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
|
||||
max_num_pages: int = sys.maxsize,
|
||||
max_file_size: int = sys.maxsize,
|
||||
|
||||
@@ -8,9 +8,10 @@ from collections.abc import Iterable, Iterator
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from functools import partial
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple, Type, Union
|
||||
from typing import Optional, Type, Union
|
||||
|
||||
from pydantic import ConfigDict, model_validator, validate_call
|
||||
from typing_extensions import Self
|
||||
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
@@ -56,7 +57,7 @@ class ExtractionFormatOption(BaseFormatOption):
|
||||
pipeline_cls: Type[BaseExtractionPipeline]
|
||||
|
||||
@model_validator(mode="after")
|
||||
def set_optional_field_default(self) -> "ExtractionFormatOption":
|
||||
def set_optional_field_default(self) -> Self:
|
||||
if self.pipeline_options is None:
|
||||
# `get_default_options` comes from BaseExtractionPipeline
|
||||
self.pipeline_options = self.pipeline_cls.get_default_options() # type: ignore[assignment]
|
||||
@@ -70,7 +71,7 @@ def _get_default_extraction_option(fmt: InputFormat) -> ExtractionFormatOption:
|
||||
the VLM extractor. This duplication will be removed when we deduplicate
|
||||
the format registry between convert/extract.
|
||||
"""
|
||||
format_to_default_backend: Dict[InputFormat, Type[AbstractDocumentBackend]] = {
|
||||
format_to_default_backend: dict[InputFormat, Type[AbstractDocumentBackend]] = {
|
||||
InputFormat.IMAGE: PyPdfiumDocumentBackend,
|
||||
InputFormat.PDF: PyPdfiumDocumentBackend,
|
||||
}
|
||||
@@ -98,24 +99,24 @@ class DocumentExtractor:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
allowed_formats: Optional[List[InputFormat]] = None,
|
||||
allowed_formats: Optional[list[InputFormat]] = None,
|
||||
extraction_format_options: Optional[
|
||||
Dict[InputFormat, ExtractionFormatOption]
|
||||
dict[InputFormat, ExtractionFormatOption]
|
||||
] = None,
|
||||
) -> None:
|
||||
self.allowed_formats: List[InputFormat] = (
|
||||
self.allowed_formats: list[InputFormat] = (
|
||||
allowed_formats if allowed_formats is not None else list(InputFormat)
|
||||
)
|
||||
# Build per-format options with defaults, then apply any user overrides
|
||||
overrides = extraction_format_options or {}
|
||||
self.extraction_format_to_options: Dict[InputFormat, ExtractionFormatOption] = {
|
||||
self.extraction_format_to_options: dict[InputFormat, ExtractionFormatOption] = {
|
||||
fmt: overrides.get(fmt, _get_default_extraction_option(fmt))
|
||||
for fmt in self.allowed_formats
|
||||
}
|
||||
|
||||
# Cache pipelines by (class, options-hash)
|
||||
self._initialized_pipelines: Dict[
|
||||
Tuple[Type[BaseExtractionPipeline], str], BaseExtractionPipeline
|
||||
self._initialized_pipelines: dict[
|
||||
tuple[Type[BaseExtractionPipeline], str], BaseExtractionPipeline
|
||||
] = {}
|
||||
|
||||
# ---------------------------- Public API ---------------------------------
|
||||
@@ -125,7 +126,7 @@ class DocumentExtractor:
|
||||
self,
|
||||
source: Union[Path, str, DocumentStream],
|
||||
template: ExtractionTemplateType,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
headers: Optional[dict[str, str]] = None,
|
||||
raises_on_error: bool = True,
|
||||
max_num_pages: int = sys.maxsize,
|
||||
max_file_size: int = sys.maxsize,
|
||||
@@ -147,7 +148,7 @@ class DocumentExtractor:
|
||||
self,
|
||||
source: Iterable[Union[Path, str, DocumentStream]],
|
||||
template: ExtractionTemplateType,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
headers: Optional[dict[str, str]] = None,
|
||||
raises_on_error: bool = True,
|
||||
max_num_pages: int = sys.maxsize,
|
||||
max_file_size: int = sys.maxsize,
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
|
||||
from docling_core.types.doc import (
|
||||
DocItemLabel,
|
||||
@@ -48,8 +47,8 @@ class ReadingOrderModel:
|
||||
|
||||
def _assembled_to_readingorder_elements(
|
||||
self, conv_res: ConversionResult
|
||||
) -> List[ReadingOrderPageElement]:
|
||||
elements: List[ReadingOrderPageElement] = []
|
||||
) -> list[ReadingOrderPageElement]:
|
||||
elements: list[ReadingOrderPageElement] = []
|
||||
page_no_to_pages = {p.page_no: p for p in conv_res.pages}
|
||||
|
||||
for element in conv_res.assembled.elements:
|
||||
@@ -123,10 +122,10 @@ class ReadingOrderModel:
|
||||
def _readingorder_elements_to_docling_doc(
|
||||
self,
|
||||
conv_res: ConversionResult,
|
||||
ro_elements: List[ReadingOrderPageElement],
|
||||
el_to_captions_mapping: Dict[int, List[int]],
|
||||
el_to_footnotes_mapping: Dict[int, List[int]],
|
||||
el_merges_mapping: Dict[int, List[int]],
|
||||
ro_elements: list[ReadingOrderPageElement],
|
||||
el_to_captions_mapping: dict[int, list[int]],
|
||||
el_to_footnotes_mapping: dict[int, list[int]],
|
||||
el_merges_mapping: dict[int, list[int]],
|
||||
) -> DoclingDocument:
|
||||
id_to_elem = {
|
||||
RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem
|
||||
|
||||
@@ -2,7 +2,7 @@ import base64
|
||||
import json
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from typing import Dict, List, Optional
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
from PIL import Image
|
||||
@@ -19,7 +19,7 @@ def api_image_request(
|
||||
prompt: str,
|
||||
url: AnyUrl,
|
||||
timeout: float = 20,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
headers: Optional[dict[str, str]] = None,
|
||||
**params,
|
||||
) -> str:
|
||||
img_io = BytesIO()
|
||||
@@ -69,8 +69,8 @@ def api_image_request_streaming(
|
||||
url: AnyUrl,
|
||||
*,
|
||||
timeout: float = 20,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
generation_stoppers: List[GenerationStopper] = [],
|
||||
headers: Optional[dict[str, str]] = None,
|
||||
generation_stoppers: list[GenerationStopper] = [],
|
||||
**params,
|
||||
) -> str:
|
||||
"""
|
||||
|
||||
@@ -2,7 +2,6 @@ import bisect
|
||||
import logging
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from typing import Dict, List, Set, Tuple
|
||||
|
||||
from docling_core.types.doc import DocItemLabel, Size
|
||||
from docling_core.types.doc.page import TextCell
|
||||
@@ -39,7 +38,7 @@ class UnionFind:
|
||||
self.parent[root_y] = root_x
|
||||
self.rank[root_x] += 1
|
||||
|
||||
def get_groups(self) -> Dict[int, List[int]]:
|
||||
def get_groups(self) -> dict[int, list[int]]:
|
||||
"""Returns groups as {root: [elements]}."""
|
||||
groups = defaultdict(list)
|
||||
for elem in self.parent:
|
||||
@@ -50,13 +49,13 @@ class UnionFind:
|
||||
class SpatialClusterIndex:
|
||||
"""Efficient spatial indexing for clusters using R-tree and interval trees."""
|
||||
|
||||
def __init__(self, clusters: List[Cluster]):
|
||||
def __init__(self, clusters: list[Cluster]):
|
||||
p = index.Property()
|
||||
p.dimension = 2
|
||||
self.spatial_index = index.Index(properties=p)
|
||||
self.x_intervals = IntervalTree()
|
||||
self.y_intervals = IntervalTree()
|
||||
self.clusters_by_id: Dict[int, Cluster] = {}
|
||||
self.clusters_by_id: dict[int, Cluster] = {}
|
||||
|
||||
for cluster in clusters:
|
||||
self.add_cluster(cluster)
|
||||
@@ -72,7 +71,7 @@ class SpatialClusterIndex:
|
||||
self.spatial_index.delete(cluster.id, cluster.bbox.as_tuple())
|
||||
del self.clusters_by_id[cluster.id]
|
||||
|
||||
def find_candidates(self, bbox: BoundingBox) -> Set[int]:
|
||||
def find_candidates(self, bbox: BoundingBox) -> set[int]:
|
||||
"""Find potential overlapping cluster IDs using all indexes."""
|
||||
spatial = set(self.spatial_index.intersection(bbox.as_tuple()))
|
||||
x_candidates = self.x_intervals.find_containing(
|
||||
@@ -123,13 +122,13 @@ class IntervalTree:
|
||||
"""Memory-efficient interval tree for 1D overlap queries."""
|
||||
|
||||
def __init__(self):
|
||||
self.intervals: List[Interval] = [] # Sorted by min_val
|
||||
self.intervals: list[Interval] = [] # Sorted by min_val
|
||||
|
||||
def insert(self, min_val: float, max_val: float, id: int):
|
||||
interval = Interval(min_val, max_val, id)
|
||||
bisect.insort(self.intervals, interval)
|
||||
|
||||
def find_containing(self, point: float) -> Set[int]:
|
||||
def find_containing(self, point: float) -> set[int]:
|
||||
"""Find all intervals containing the point."""
|
||||
pos = bisect.bisect_left(self.intervals, point)
|
||||
result = set()
|
||||
@@ -196,7 +195,7 @@ class LayoutPostprocessor:
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self, page: Page, clusters: List[Cluster], options: LayoutOptions
|
||||
self, page: Page, clusters: list[Cluster], options: LayoutOptions
|
||||
) -> None:
|
||||
"""Initialize processor with page and clusters."""
|
||||
|
||||
@@ -219,7 +218,7 @@ class LayoutPostprocessor:
|
||||
[c for c in self.special_clusters if c.label in self.WRAPPER_TYPES]
|
||||
)
|
||||
|
||||
def postprocess(self) -> Tuple[List[Cluster], List[TextCell]]:
|
||||
def postprocess(self) -> tuple[list[Cluster], list[TextCell]]:
|
||||
"""Main processing pipeline."""
|
||||
self.regular_clusters = self._process_regular_clusters()
|
||||
self.special_clusters = self._process_special_clusters()
|
||||
@@ -254,7 +253,7 @@ class LayoutPostprocessor:
|
||||
|
||||
return final_clusters, self.cells
|
||||
|
||||
def _process_regular_clusters(self) -> List[Cluster]:
|
||||
def _process_regular_clusters(self) -> list[Cluster]:
|
||||
"""Process regular clusters with iterative refinement."""
|
||||
clusters = [
|
||||
c
|
||||
@@ -311,7 +310,7 @@ class LayoutPostprocessor:
|
||||
|
||||
return clusters
|
||||
|
||||
def _process_special_clusters(self) -> List[Cluster]:
|
||||
def _process_special_clusters(self) -> list[Cluster]:
|
||||
special_clusters = [
|
||||
c
|
||||
for c in self.special_clusters
|
||||
@@ -381,7 +380,7 @@ class LayoutPostprocessor:
|
||||
|
||||
return picture_clusters + wrapper_clusters
|
||||
|
||||
def _handle_cross_type_overlaps(self, special_clusters) -> List[Cluster]:
|
||||
def _handle_cross_type_overlaps(self, special_clusters) -> list[Cluster]:
|
||||
"""Handle overlaps between regular and wrapper clusters before child assignment.
|
||||
|
||||
In particular, KEY_VALUE_REGION proposals that are almost identical to a TABLE
|
||||
@@ -454,7 +453,7 @@ class LayoutPostprocessor:
|
||||
|
||||
def _select_best_cluster_from_group(
|
||||
self,
|
||||
group_clusters: List[Cluster],
|
||||
group_clusters: list[Cluster],
|
||||
params: dict,
|
||||
) -> Cluster:
|
||||
"""Select best cluster from a group of overlapping clusters based on all rules."""
|
||||
@@ -487,11 +486,11 @@ class LayoutPostprocessor:
|
||||
|
||||
def _remove_overlapping_clusters(
|
||||
self,
|
||||
clusters: List[Cluster],
|
||||
clusters: list[Cluster],
|
||||
cluster_type: str,
|
||||
overlap_threshold: float = 0.8,
|
||||
containment_threshold: float = 0.8,
|
||||
) -> List[Cluster]:
|
||||
) -> list[Cluster]:
|
||||
if not clusters:
|
||||
return []
|
||||
|
||||
@@ -544,7 +543,7 @@ class LayoutPostprocessor:
|
||||
|
||||
def _select_best_cluster(
|
||||
self,
|
||||
clusters: List[Cluster],
|
||||
clusters: list[Cluster],
|
||||
area_threshold: float,
|
||||
conf_threshold: float,
|
||||
) -> Cluster:
|
||||
@@ -572,7 +571,7 @@ class LayoutPostprocessor:
|
||||
|
||||
return current_best if current_best else clusters[0]
|
||||
|
||||
def _deduplicate_cells(self, cells: List[TextCell]) -> List[TextCell]:
|
||||
def _deduplicate_cells(self, cells: list[TextCell]) -> list[TextCell]:
|
||||
"""Ensure each cell appears only once, maintaining order of first appearance."""
|
||||
seen_ids = set()
|
||||
unique_cells = []
|
||||
@@ -583,8 +582,8 @@ class LayoutPostprocessor:
|
||||
return unique_cells
|
||||
|
||||
def _assign_cells_to_clusters(
|
||||
self, clusters: List[Cluster], min_overlap: float = 0.2
|
||||
) -> List[Cluster]:
|
||||
self, clusters: list[Cluster], min_overlap: float = 0.2
|
||||
) -> list[Cluster]:
|
||||
"""Assign cells to best overlapping cluster."""
|
||||
for cluster in clusters:
|
||||
cluster.cells = []
|
||||
@@ -616,7 +615,7 @@ class LayoutPostprocessor:
|
||||
|
||||
return clusters
|
||||
|
||||
def _find_unassigned_cells(self, clusters: List[Cluster]) -> List[TextCell]:
|
||||
def _find_unassigned_cells(self, clusters: list[Cluster]) -> list[TextCell]:
|
||||
"""Find cells not assigned to any cluster."""
|
||||
assigned = {cell.index for cluster in clusters for cell in cluster.cells}
|
||||
return [
|
||||
@@ -625,7 +624,7 @@ class LayoutPostprocessor:
|
||||
if cell.index not in assigned and cell.text.strip()
|
||||
]
|
||||
|
||||
def _adjust_cluster_bboxes(self, clusters: List[Cluster]) -> List[Cluster]:
|
||||
def _adjust_cluster_bboxes(self, clusters: list[Cluster]) -> list[Cluster]:
|
||||
"""Adjust cluster bounding boxes to contain their cells."""
|
||||
for cluster in clusters:
|
||||
if not cluster.cells:
|
||||
@@ -651,13 +650,13 @@ class LayoutPostprocessor:
|
||||
|
||||
return clusters
|
||||
|
||||
def _sort_cells(self, cells: List[TextCell]) -> List[TextCell]:
|
||||
def _sort_cells(self, cells: list[TextCell]) -> list[TextCell]:
|
||||
"""Sort cells in native reading order."""
|
||||
return sorted(cells, key=lambda c: (c.index))
|
||||
|
||||
def _sort_clusters(
|
||||
self, clusters: List[Cluster], mode: str = "id"
|
||||
) -> List[Cluster]:
|
||||
self, clusters: list[Cluster], mode: str = "id"
|
||||
) -> list[Cluster]:
|
||||
"""Sort clusters in reading order (top-to-bottom, left-to-right)."""
|
||||
if mode == "id": # sort in the order the cells are printed in the PDF.
|
||||
return sorted(
|
||||
|
||||
Reference in New Issue
Block a user