feat(backend): add generic options support and HTML image handling modes (#2011)

* feat: add backend options support to document backends

Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com>
Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* feat: enhance document backends with generic backend options and improve HTML image handling

Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com>
Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* Refactor tests for declarativebackend

Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com>
Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* fix(HTML): improve image caption handling and ensure backend options are set correctly

Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com>
Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* fix: enhance HTML backend image handling and add support for local file paths

Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com>
Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* chore: Add ground truth data for test data

Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com>
Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* fix(HTML): skip loading SVG files in image data handling

Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com>
Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* refactor(html): simplify backend options and address gaps

Apply backend options only to DeclarativeDocumentBackend classes, and only when necessary.
Refactor caption parsing in 'img' elements and remove dummy text.
Replace deprecated annotations from the typing library with native types.
Replace typing annotations according to pydantic guidelines.
Add some documentation with pydantic annotations.
Fix diff issue with test files.

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* tests(html): add tests and fix bugs

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* refactor(html): refactor backend options

Move backend option classes to their own module within the datamodel package.
Rename 'source_location' to 'source_uri' in HTMLBackendOptions.
Rename 'image_fetch' to 'fetch_images' in HTMLBackendOptions.

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* refactor(markdown): create a class for the markdown backend options

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

---------

Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com>
Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
This commit is contained in:
Legoshi
2025-10-21 12:52:17 +02:00
committed by GitHub
parent b66624bfff
commit a30e6a7614
31 changed files with 8088 additions and 7588 deletions

View File

@@ -1,10 +1,12 @@
from abc import ABC, abstractmethod
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Set, Union
from typing import TYPE_CHECKING, Union
from docling_core.types.doc import DoclingDocument
from docling.datamodel.backend_options import BackendOptions, DeclarativeBackendOptions
if TYPE_CHECKING:
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
@@ -35,7 +37,7 @@ class AbstractDocumentBackend(ABC):
@classmethod
@abstractmethod
def supported_formats(cls) -> Set["InputFormat"]:
def supported_formats(cls) -> set["InputFormat"]:
pass
@@ -58,6 +60,20 @@ class DeclarativeDocumentBackend(AbstractDocumentBackend):
straight without a recognition pipeline.
"""
@abstractmethod
def __init__(
    self,
    in_doc: "InputDocument",
    path_or_stream: Union[BytesIO, Path],
    options: Union[BackendOptions, None] = None,
) -> None:
    """Initialize the backend and store its options on ``self.options``.

    Args:
        in_doc: The input document; forwarded to the parent initializer.
        path_or_stream: Path or in-memory stream of the document; forwarded
            to the parent initializer.
        options: Backend options. When omitted (or ``None``), a fresh
            ``DeclarativeBackendOptions`` instance is created for this backend.
    """
    super().__init__(in_doc, path_or_stream)
    # Build the default here rather than in the signature: a default argument
    # is evaluated once at definition time, so a model instance placed there
    # would be one mutable object shared by every backend created without
    # explicit options.
    self.options: BackendOptions = (
        options if options is not None else DeclarativeBackendOptions()
    )
@abstractmethod
def convert(self) -> DoclingDocument:
    """Produce the ``DoclingDocument`` for this backend's input.

    Abstract: this base implementation does nothing and returns ``None``.
    """
    return None
@classmethod
def get_default_options(cls) -> BackendOptions:
    """Return a newly created ``DeclarativeBackendOptions`` with default values."""
    default_options = DeclarativeBackendOptions()
    return default_options

View File

@@ -2,7 +2,7 @@ import logging
import re
from io import BytesIO
from pathlib import Path
from typing import Final, Set, Union
from typing import Final, Union
from docling_core.types.doc import (
DocItemLabel,
@@ -27,7 +27,7 @@ DEFAULT_IMAGE_HEIGHT: Final = 128
class AsciiDocBackend(DeclarativeDocumentBackend):
def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
self.path_or_stream = path_or_stream
@@ -58,7 +58,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
return
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
def supported_formats(cls) -> set[InputFormat]:
return {InputFormat.ASCIIDOC}
def convert(self) -> DoclingDocument:

View File

@@ -1,13 +1,16 @@
import base64
import logging
import os
import re
import traceback
import warnings
from contextlib import contextmanager
from copy import deepcopy
from io import BytesIO
from pathlib import Path
from typing import Final, Optional, Union, cast
from urllib.parse import urljoin
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
from bs4.element import PreformattedString
from docling_core.types.doc import (
@@ -17,6 +20,7 @@ from docling_core.types.doc import (
DocumentOrigin,
GroupItem,
GroupLabel,
PictureItem,
RefItem,
RichTableCell,
TableCell,
@@ -24,13 +28,18 @@ from docling_core.types.doc import (
TableItem,
TextItem,
)
from docling_core.types.doc.document import ContentLayer, Formatting, Script
from docling_core.types.doc.document import ContentLayer, Formatting, ImageRef, Script
from PIL import Image, UnidentifiedImageError
from pydantic import AnyUrl, BaseModel, ValidationError
from typing_extensions import override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.backend.abstract_backend import (
DeclarativeDocumentBackend,
)
from docling.datamodel.backend_options import HTMLBackendOptions
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
from docling.exceptions import OperationNotAllowed
_log = logging.getLogger(__name__)
@@ -43,6 +52,7 @@ _BLOCK_TAGS: Final = {
"details",
"figure",
"footer",
"img",
"h1",
"h2",
"h3",
@@ -186,11 +196,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self,
in_doc: InputDocument,
path_or_stream: Union[BytesIO, Path],
original_url: Optional[AnyUrl] = None,
options: HTMLBackendOptions = HTMLBackendOptions(),
):
super().__init__(in_doc, path_or_stream)
super().__init__(in_doc, path_or_stream, options)
self.soup: Optional[Tag] = None
self.path_or_stream = path_or_stream
self.path_or_stream: Union[BytesIO, Path] = path_or_stream
self.base_path: Optional[str] = str(options.source_uri)
# Initialize the parents for the hierarchy
self.max_levels = 10
@@ -200,7 +211,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
for i in range(self.max_levels):
self.parents[i] = None
self.hyperlink: Union[AnyUrl, Path, None] = None
self.original_url = original_url
self.format_tags: list[str] = []
try:
@@ -236,6 +246,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def supported_formats(cls) -> set[InputFormat]:
return {InputFormat.HTML}
@classmethod
@override
def get_default_options(cls) -> HTMLBackendOptions:
    """Return a newly created ``HTMLBackendOptions`` with default values."""
    default_options = HTMLBackendOptions()
    return default_options
@override
def convert(self) -> DoclingDocument:
_log.debug("Starting HTML conversion...")
@@ -261,7 +276,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
content_layer=ContentLayer.FURNITURE,
)
# remove script and style tags
for tag in self.soup(["script", "style"]):
for tag in self.soup(["script", "noscript", "style"]):
tag.decompose()
# remove any hidden tag
for tag in self.soup(hidden=True):
@@ -291,6 +306,28 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self._walk(content, doc)
return doc
@staticmethod
def _is_remote_url(value: str) -> bool:
parsed = urlparse(value)
return parsed.scheme in {"http", "https", "ftp", "s3", "gs"}
def _resolve_relative_path(self, loc: str) -> str:
    """Turn a location found in the HTML into an absolute one when possible.

    Nothing is rewritten unless ``self.base_path`` is set. With a base path:
    protocol-relative locations (``//host/...``) get an ``https:`` prefix;
    locations starting with ``http://``, ``https://``, ``data:`` or
    ``file://`` are kept as they are; everything else is joined to the base,
    with ``urljoin`` for a remote base and relative to the base's parent
    directory otherwise.
    """
    untouched_prefixes = ("http://", "https://", "data:", "file://")
    base = self.base_path
    abs_loc = loc

    if base and loc.startswith("//"):
        # Protocol-relative URL - default to https
        abs_loc = "https:" + loc
    elif base and not loc.startswith(untouched_prefixes):
        if HTMLDocumentBackend._is_remote_url(base):
            # remote fetch
            abs_loc = urljoin(base, loc)
        else:
            # local fetch: resolve relative to the HTML file location
            abs_loc = str(Path(base).parent / loc)

    _log.debug(f"Resolved location {loc} to {abs_loc}")
    return abs_loc
@staticmethod
def group_cell_elements(
group_name: str,
@@ -520,6 +557,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if name == "img":
flush_buffer()
im_ref3 = self._emit_image(node, doc)
if im_ref3:
added_refs.append(im_ref3)
elif name in _FORMAT_TAG_MAP:
with self._use_format([name]):
@@ -669,8 +707,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
else:
if isinstance(this_href, str) and this_href:
old_hyperlink = self.hyperlink
if self.original_url is not None:
this_href = urljoin(str(self.original_url), str(this_href))
this_href = self._resolve_relative_path(this_href)
# ugly fix for relative links since pydantic does not support them.
try:
new_hyperlink = AnyUrl(this_href)
@@ -837,6 +874,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
for img_tag in tag("img"):
if isinstance(img_tag, Tag):
im_ref = self._emit_image(img_tag, doc)
if im_ref:
added_ref.append(im_ref)
return added_ref
@@ -1003,6 +1041,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
img_tag = tag.find("img")
if isinstance(img_tag, Tag):
im_ref = self._emit_image(img_tag, doc)
if im_ref is not None:
added_refs.append(im_ref)
elif tag_name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
@@ -1061,6 +1100,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
for img_tag in tag("img"):
if isinstance(img_tag, Tag):
im_ref2 = self._emit_image(tag, doc)
if im_ref2 is not None:
added_refs.append(im_ref2)
elif tag_name in {"pre"}:
@@ -1092,10 +1132,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self._walk(tag, doc)
return added_refs
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> RefItem:
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> Optional[RefItem]:
figure = img_tag.find_parent("figure")
caption: AnnotatedTextList = AnnotatedTextList()
parent = self.parents[self.level]
# check if the figure has a link - this is HACK:
def get_img_hyperlink(img_tag):
this_parent = img_tag.parent
@@ -1106,9 +1148,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
return None
if img_hyperlink := get_img_hyperlink(img_tag):
caption.append(
AnnotatedText(text="Image Hyperlink.", hyperlink=img_hyperlink)
)
img_text = img_tag.get("alt") or ""
caption.append(AnnotatedText(text=img_text, hyperlink=img_hyperlink))
if isinstance(figure, Tag):
caption_tag = figure.find("figcaption", recursive=False)
@@ -1135,13 +1176,78 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
hyperlink=caption_anno_text.hyperlink,
)
docling_pic = doc.add_picture(
src_loc: str = self._get_attr_as_string(img_tag, "src")
if not cast(HTMLBackendOptions, self.options).fetch_images or not src_loc:
# Do not fetch the image, just add a placeholder
placeholder: PictureItem = doc.add_picture(
caption=caption_item,
parent=self.parents[self.level],
parent=parent,
content_layer=self.content_layer,
)
return placeholder.get_ref()
src_loc = self._resolve_relative_path(src_loc)
img_ref = self._create_image_ref(src_loc)
docling_pic = doc.add_picture(
image=img_ref,
caption=caption_item,
parent=parent,
content_layer=self.content_layer,
)
return docling_pic.get_ref()
def _create_image_ref(self, src_url: str) -> Optional[ImageRef]:
    """Load the image at ``src_url`` and wrap it in an ``ImageRef``.

    This is best-effort: any expected failure while loading or decoding the
    image is reported with ``warnings.warn`` and results in ``None`` instead
    of an exception.

    Args:
        src_url: Location of the image, as accepted by ``_load_image_data``.

    Returns:
        The image reference, or ``None`` when no image data was obtained or
        the image could not be processed.
    """
    try:
        img_data = self._load_image_data(src_url)
        if img_data:
            img = Image.open(BytesIO(img_data))
            # Use the horizontal DPI stored in the image, falling back to 72.
            return ImageRef.from_pil(img, dpi=int(img.info.get("dpi", (72,))[0]))
    except (
        # RequestException is the base class of HTTPError and also covers
        # connection errors and timeouts, so one unreachable image cannot
        # abort the conversion of the whole document.
        requests.RequestException,
        ValidationError,
        UnidentifiedImageError,
        OperationNotAllowed,
        TypeError,
        ValueError,
    ) as e:
        warnings.warn(f"Could not process an image from {src_url}: {e}")
    return None
def _load_image_data(self, src_loc: str) -> Optional[bytes]:
    """Return the raw bytes of the image referenced by ``src_loc``.

    Args:
        src_loc: A remote URL, a ``data:`` URI, a ``file://`` URI or a local
            file path.

    Returns:
        The image bytes, or ``None`` when the location ends with ``.svg``
        (SVG files are skipped).

    Raises:
        OperationNotAllowed: If the location is remote and
            ``options.enable_remote_fetch`` is off, or it is local and
            ``options.enable_local_fetch`` is off.
        requests.RequestException: If the remote request fails, times out or
            returns an error status.
        ValueError: If the local file does not exist or is not readable.
    """
    if src_loc.lower().endswith(".svg"):
        _log.debug(f"Skipping SVG file: {src_loc}")
        return None

    if HTMLDocumentBackend._is_remote_url(src_loc):
        if not self.options.enable_remote_fetch:
            raise OperationNotAllowed(
                "Fetching remote resources is only allowed when set explicitly. "
                "Set options.enable_remote_fetch=True."
            )
        # Without a timeout, requests waits indefinitely on an unresponsive
        # host; bound the wait (in seconds) so a stalled server cannot hang
        # the conversion.
        response = requests.get(src_loc, stream=True, timeout=30)
        response.raise_for_status()
        return response.content
    elif src_loc.startswith("data:"):
        # Strip the "data:image/<type>;base64," header and decode the payload.
        data = re.sub(r"^data:image/.+;base64,", "", src_loc)
        return base64.b64decode(data)

    if src_loc.startswith("file://"):
        src_loc = src_loc[7:]

    if not self.options.enable_local_fetch:
        raise OperationNotAllowed(
            "Fetching local resources is only allowed when set explicitly. "
            "Set options.enable_local_fetch=True."
        )
    # Only read the file when it exists and is readable.
    if os.path.isfile(src_loc) and os.access(src_loc, os.R_OK):
        with open(src_loc, "rb") as f:
            return f.read()
    else:
        raise ValueError("File does not exist or it is not readable.")
@staticmethod
def get_text(item: PageElement) -> str:
"""Concatenate all child strings of a PageElement.
@@ -1238,3 +1344,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
)
return int_spans
@staticmethod
def _get_attr_as_string(tag: Tag, attr: str, default: str = "") -> str:
"""Get attribute value as string, handling list values."""
value = tag.get(attr)
if not value:
return default
return value[0] if isinstance(value, list) else value

View File

@@ -24,10 +24,16 @@ from docling_core.types.doc import (
from docling_core.types.doc.document import Formatting
from marko import Markdown
from pydantic import AnyUrl, BaseModel, Field, TypeAdapter
from typing_extensions import Annotated
from typing_extensions import Annotated, override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.backend.abstract_backend import (
DeclarativeDocumentBackend,
)
from docling.backend.html_backend import HTMLDocumentBackend
from docling.datamodel.backend_options import (
HTMLBackendOptions,
MarkdownBackendOptions,
)
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
@@ -88,8 +94,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
return shortened_text
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
@override
def __init__(
self,
in_doc: InputDocument,
path_or_stream: Union[BytesIO, Path],
options: MarkdownBackendOptions = MarkdownBackendOptions(),
):
super().__init__(in_doc, path_or_stream, options)
_log.debug("Starting MarkdownDocumentBackend...")
@@ -580,9 +592,12 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
format=InputFormat.HTML,
backend=html_backend_cls,
filename=self.file.name,
backend_options=self.options,
)
html_backend_obj = html_backend_cls(
in_doc=in_doc, path_or_stream=stream
in_doc=in_doc,
path_or_stream=stream,
options=cast(HTMLBackendOptions, self.options),
)
doc = html_backend_obj.convert()
else:

View File

@@ -1,7 +1,7 @@
import logging
from io import BytesIO
from pathlib import Path
from typing import Set, Union
from typing import Union
from docling_core.types.doc import (
BoundingBox,
@@ -80,7 +80,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
self.path_or_stream = None
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
def supported_formats(cls) -> set[InputFormat]:
return {InputFormat.PPTX}
def convert(self) -> DoclingDocument:

View File

@@ -3,7 +3,7 @@ import re
from copy import deepcopy
from io import BytesIO
from pathlib import Path
from typing import Any, Callable, List, Optional, Union
from typing import Any, Callable, Optional, Union
from docling_core.types.doc import (
DocItemLabel,
@@ -69,7 +69,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.numbered_headers: dict[int, int] = {}
self.equation_bookends: str = "<eq>{EQ}</eq>"
# Track processed textbox elements to avoid duplication
self.processed_textbox_elements: List[int] = []
self.processed_textbox_elements: list[int] = []
self.docx_to_pdf_converter: Optional[Callable] = None
self.docx_to_pdf_converter_init = False
self.display_drawingml_warning = True
@@ -726,8 +726,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
textbox_elements: list,
docx_obj: DocxDocument,
doc: DoclingDocument,
) -> List[RefItem]:
elem_ref: List[RefItem] = []
) -> list[RefItem]:
elem_ref: list[RefItem] = []
"""Process textbox content and add it to the document structure."""
level = self._get_level()
# Create a textbox group to contain all text from the textbox
@@ -856,8 +856,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
element: BaseOxmlElement,
docx_obj: DocxDocument,
doc: DoclingDocument,
) -> List[RefItem]:
elem_ref: List[RefItem] = []
) -> list[RefItem]:
elem_ref: list[RefItem] = []
paragraph = Paragraph(element, docx_obj)
paragraph_elements = self._get_paragraph_elements(paragraph)
text, equations = self._handle_equations_in_text(
@@ -1032,8 +1032,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
curr_level: Optional[int],
text: str,
is_numbered_style: bool = False,
) -> List[RefItem]:
elem_ref: List[RefItem] = []
) -> list[RefItem]:
elem_ref: list[RefItem] = []
level = self._get_level()
if isinstance(curr_level, int):
if curr_level > level:
@@ -1102,8 +1102,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
marker: str,
enumerated: bool,
level: int,
) -> List[RefItem]:
elem_ref: List[RefItem] = []
) -> list[RefItem]:
elem_ref: list[RefItem] = []
# This should not happen by construction
if not isinstance(self.parents[level], ListGroup):
return elem_ref
@@ -1148,8 +1148,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
ilevel: int,
elements: list,
is_numbered: bool = False,
) -> List[RefItem]:
elem_ref: List[RefItem] = []
) -> list[RefItem]:
elem_ref: list[RefItem] = []
# this method is always called with is_numbered. Numbered lists should be properly addressed.
if not elements:
return elem_ref
@@ -1244,8 +1244,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
element: BaseOxmlElement,
docx_obj: DocxDocument,
doc: DoclingDocument,
) -> List[RefItem]:
elem_ref: List[RefItem] = []
) -> list[RefItem]:
elem_ref: list[RefItem] = []
table: Table = Table(element, docx_obj)
num_rows = len(table.rows)
num_cols = len(table.columns)
@@ -1299,13 +1299,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
else:
text = text.replace("<eq>", "$").replace("</eq>", "$")
provs_in_cell: List[RefItem] = []
provs_in_cell: list[RefItem] = []
_, provs_in_cell = self._walk_linear(cell._element, docx_obj, doc)
ref_for_rich_cell = provs_in_cell[0]
rich_table_cell = False
def group_cell_elements(
group_name: str, doc: DoclingDocument, provs_in_cell: List[RefItem]
group_name: str, doc: DoclingDocument, provs_in_cell: list[RefItem]
) -> RefItem:
group_element = doc.add_group(
label=GroupLabel.UNSPECIFIED,
@@ -1379,7 +1379,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
def _handle_pictures(
self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
) -> List[RefItem]:
) -> list[RefItem]:
def get_docx_image(drawing_blip: Any) -> Optional[bytes]:
image_data: Optional[bytes] = None
rId = drawing_blip[0].get(
@@ -1391,7 +1391,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
image_data = image_part.blob # Get the binary image data
return image_data
elem_ref: List[RefItem] = []
elem_ref: list[RefItem] = []
level = self._get_level()
# Open the BytesIO object with PIL to create an Image
image_data: Optional[bytes] = get_docx_image(drawing_blip)

View File

@@ -0,0 +1,53 @@
from pathlib import PurePath
from typing import Annotated, Literal, Optional, Union
from pydantic import AnyUrl, BaseModel, Field
class BaseBackendOptions(BaseModel):
    """Common options for all declarative document backends."""

    # Both switches are off by default, so fetching has to be enabled explicitly.
    enable_remote_fetch: bool = Field(
        default=False,
        description="Enable remote resource fetching.",
    )
    enable_local_fetch: bool = Field(
        default=False,
        description="Enable local resource fetching.",
    )
class DeclarativeBackendOptions(BaseBackendOptions):
    """Default backend options for a declarative document backend."""

    # Discriminator tag for this options class; left out of dumps and repr.
    kind: Literal["declarative"] = Field(
        default="declarative",
        exclude=True,
        repr=False,
    )
class HTMLBackendOptions(BaseBackendOptions):
    """Options specific to the HTML backend.

    This class can be extended to include options specific to HTML processing.
    """

    # Discriminator tag for this options class; left out of dumps and repr.
    kind: Literal["html"] = Field(default="html", exclude=True, repr=False)

    # Image fetching is off by default.
    fetch_images: bool = Field(
        default=False,
        description=(
            "Whether the backend should access remote or local resources to parse "
            "images in an HTML document."
        ),
    )

    # May be a URL or a filesystem path; None when no source is given.
    source_uri: Union[AnyUrl, PurePath, None] = Field(
        default=None,
        description=(
            "The URI that originates the HTML document. If provided, the backend "
            "will use it to resolve relative paths in the HTML document."
        ),
    )
# Declares no fields of its own: everything, including the "html" kind tag,
# is inherited from HTMLBackendOptions.
class MarkdownBackendOptions(HTMLBackendOptions):
    """Options specific to the Markdown backend."""
# Union of the backend option models, discriminated by their ``kind`` field.
BackendOptions = Annotated[
    Union[DeclarativeBackendOptions, HTMLBackendOptions],
    Field(discriminator="kind"),
]

View File

@@ -8,14 +8,12 @@ from io import BytesIO
from pathlib import Path, PurePath
from typing import (
TYPE_CHECKING,
Any,
Dict,
List,
Annotated,
Literal,
Optional,
Set,
Type,
Union,
cast,
)
import filetype
@@ -54,8 +52,10 @@ from typing_extensions import deprecated
from docling.backend.abstract_backend import (
AbstractDocumentBackend,
DeclarativeDocumentBackend,
PaginatedDocumentBackend,
)
from docling.datamodel.backend_options import BackendOptions
from docling.datamodel.base_models import (
AssembledUnit,
ConfidenceReport,
@@ -74,6 +74,7 @@ from docling.utils.utils import create_file_hash
if TYPE_CHECKING:
from docling.datamodel.base_models import BaseFormatOption
from docling.document_converter import FormatOption
_log = logging.getLogger(__name__)
@@ -102,32 +103,58 @@ _EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")
class InputDocument(BaseModel):
file: PurePath
document_hash: str # = None
valid: bool = True
limits: DocumentLimits = DocumentLimits()
format: InputFormat # = None
"""A document as an input of a Docling conversion."""
filesize: Optional[int] = None
page_count: int = 0
file: Annotated[
PurePath, Field(description="A path representation the input document.")
]
document_hash: Annotated[
str,
Field(description="A stable hash of the path or stream of the input document."),
]
valid: bool = Field(True, description="Whether this is is a valid input document.")
backend_options: Optional[BackendOptions] = Field(
None, description="Custom options for declarative backends."
)
limits: DocumentLimits = Field(
DocumentLimits(), description="Limits in the input document for the conversion."
)
format: Annotated[InputFormat, Field(description="The document format.")]
_backend: AbstractDocumentBackend # Internal PDF backend used
filesize: Optional[int] = Field(
None, description="Size of the input file, in bytes."
)
page_count: int = Field(0, description="Number of pages in the input document.")
_backend: AbstractDocumentBackend
def __init__(
self,
path_or_stream: Union[BytesIO, Path],
format: InputFormat,
backend: Type[AbstractDocumentBackend],
backend_options: Optional[BackendOptions] = None,
filename: Optional[str] = None,
limits: Optional[DocumentLimits] = None,
):
) -> None:
super().__init__(
file="", document_hash="", format=InputFormat.PDF
file="",
document_hash="",
format=InputFormat.PDF,
backend_options=backend_options,
) # initialize with dummy values
self.limits = limits or DocumentLimits()
self.format = format
# check for backend incompatibilities
if issubclass(backend, DeclarativeDocumentBackend) and backend_options:
if not issubclass(
type(backend_options), type(backend.get_default_options())
):
raise ValueError(
"Incompatible types between backend and backend_options arguments."
)
try:
if isinstance(path_or_stream, Path):
self.file = path_or_stream
@@ -140,7 +167,8 @@ class InputDocument(BaseModel):
elif isinstance(path_or_stream, BytesIO):
assert filename is not None, (
"Can't construct InputDocument from stream without providing filename arg."
"Can't construct InputDocument from stream without providing "
"filename arg."
)
self.file = PurePath(filename)
self.filesize = path_or_stream.getbuffer().nbytes
@@ -175,7 +203,8 @@ class InputDocument(BaseModel):
except RuntimeError as e:
self.valid = False
_log.exception(
f"An unexpected error occurred while opening the document {self.file.name}",
"An unexpected error occurred while opening the document "
"f{self.file.name}",
exc_info=e,
)
# raise
@@ -185,7 +214,15 @@ class InputDocument(BaseModel):
backend: Type[AbstractDocumentBackend],
path_or_stream: Union[BytesIO, Path],
) -> None:
if issubclass(backend, DeclarativeDocumentBackend) and self.backend_options:
self._backend = backend(
self,
path_or_stream=path_or_stream,
options=self.backend_options,
)
else:
self._backend = backend(self, path_or_stream=path_or_stream)
if not self._backend.is_valid():
self.valid = False
@@ -199,11 +236,11 @@ class ConversionResult(BaseModel):
input: InputDocument
status: ConversionStatus = ConversionStatus.PENDING # failure, success
errors: List[ErrorItem] = [] # structure to keep errors
errors: list[ErrorItem] = [] # structure to keep errors
pages: List[Page] = []
pages: list[Page] = []
assembled: AssembledUnit = AssembledUnit()
timings: Dict[str, ProfilingItem] = {}
timings: dict[str, ProfilingItem] = {}
confidence: ConfidenceReport = Field(default_factory=ConfidenceReport)
document: DoclingDocument = _EMPTY_DOCLING_DOC
@@ -222,7 +259,7 @@ class _DummyBackend(AbstractDocumentBackend):
return False
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
def supported_formats(cls) -> set[InputFormat]:
return set()
@classmethod
@@ -235,7 +272,7 @@ class _DummyBackend(AbstractDocumentBackend):
class _DocumentConversionInput(BaseModel):
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
headers: Optional[Dict[str, str]] = None
headers: Optional[dict[str, str]] = None
limits: Optional[DocumentLimits] = DocumentLimits()
def docs(
@@ -250,33 +287,36 @@ class _DocumentConversionInput(BaseModel):
)
format = self._guess_format(obj)
backend: Type[AbstractDocumentBackend]
if format not in format_options.keys():
backend_options: Optional[BackendOptions] = None
if not format or format not in format_options:
_log.error(
f"Input document {obj.name} with format {format} does not match any allowed format: ({format_options.keys()})"
f"Input document {obj.name} with format {format} does not match "
f"any allowed format: ({format_options.keys()})"
)
backend = _DummyBackend
else:
backend = format_options[format].backend
options = format_options[format]
backend = options.backend
if "backend_options" in options.model_fields_set:
backend_options = cast("FormatOption", options).backend_options
path_or_stream: Union[BytesIO, Path]
if isinstance(obj, Path):
yield InputDocument(
path_or_stream=obj,
format=format, # type: ignore[arg-type]
filename=obj.name,
limits=self.limits,
backend=backend,
)
path_or_stream = obj
elif isinstance(obj, DocumentStream):
yield InputDocument(
path_or_stream=obj.stream,
format=format, # type: ignore[arg-type]
filename=obj.name,
limits=self.limits,
backend=backend,
)
path_or_stream = obj.stream
else:
raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
yield InputDocument(
path_or_stream=path_or_stream,
format=format, # type: ignore[arg-type]
filename=obj.name,
limits=self.limits,
backend=backend,
backend_options=backend_options,
)
def _guess_format(self, obj: Union[Path, DocumentStream]) -> Optional[InputFormat]:
content = b"" # empty binary blob
formats: list[InputFormat] = []
@@ -290,12 +330,13 @@ class _DocumentConversionInput(BaseModel):
with obj.open("rb") as f:
content = f.read(1024) # Read first 1KB
if mime is not None and mime.lower() == "application/zip":
mime_root = "application/vnd.openxmlformats-officedocument"
if obj.suffixes[-1].lower() == ".xlsx":
mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
mime = mime_root + ".spreadsheetml.sheet"
elif obj.suffixes[-1].lower() == ".docx":
mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
mime = mime_root + ".wordprocessingml.document"
elif obj.suffixes[-1].lower() == ".pptx":
mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
mime = mime_root + ".presentationml.presentation"
elif isinstance(obj, DocumentStream):
content = obj.stream.read(8192)
@@ -310,12 +351,13 @@ class _DocumentConversionInput(BaseModel):
mime = _DocumentConversionInput._mime_from_extension(ext.lower())
if mime is not None and mime.lower() == "application/zip":
objname = obj.name.lower()
mime_root = "application/vnd.openxmlformats-officedocument"
if objname.endswith(".xlsx"):
mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
mime = mime_root + ".spreadsheetml.sheet"
elif objname.endswith(".docx"):
mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
mime = mime_root + ".wordprocessingml.document"
elif objname.endswith(".pptx"):
mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
mime = mime_root + ".presentationml.presentation"
if mime is not None and mime.lower() == "application/gzip":
if detected_mime := _DocumentConversionInput._detect_mets_gbs(obj):

View File

@@ -9,11 +9,14 @@ from datetime import datetime
from functools import partial
from io import BytesIO
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Type, Union
from typing import Optional, Type, Union
from pydantic import BaseModel, ConfigDict, model_validator, validate_call
from pydantic import ConfigDict, model_validator, validate_call
from typing_extensions import Self
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.abstract_backend import (
AbstractDocumentBackend,
)
from docling.backend.asciidoc_backend import AsciiDocBackend
from docling.backend.csv_backend import CsvDocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
@@ -28,6 +31,7 @@ from docling.backend.noop_backend import NoOpBackend
from docling.backend.webvtt_backend import WebVTTDocumentBackend
from docling.backend.xml.jats_backend import JatsDocumentBackend
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
from docling.datamodel.backend_options import BackendOptions, HTMLBackendOptions
from docling.datamodel.base_models import (
BaseFormatOption,
ConversionStatus,
@@ -61,11 +65,13 @@ _PIPELINE_CACHE_LOCK = threading.Lock()
class FormatOption(BaseFormatOption):
pipeline_cls: Type[BasePipeline]
backend_options: Optional[BackendOptions] = None
@model_validator(mode="after")
def set_optional_field_default(self) -> "FormatOption":
def set_optional_field_default(self) -> Self:
if self.pipeline_options is None:
self.pipeline_options = self.pipeline_cls.get_default_options()
return self
@@ -92,6 +98,7 @@ class PowerpointFormatOption(FormatOption):
class MarkdownFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
backend_options: HTMLBackendOptions = HTMLBackendOptions()
class AsciiDocFormatOption(FormatOption):
@@ -102,6 +109,7 @@ class AsciiDocFormatOption(FormatOption):
class HTMLFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
backend_options: HTMLBackendOptions = HTMLBackendOptions()
class PatentUsptoFormatOption(FormatOption):
@@ -150,7 +158,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
pipeline_cls=SimplePipeline, backend=AsciiDocBackend
),
InputFormat.HTML: FormatOption(
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
pipeline_cls=SimplePipeline,
backend=HTMLDocumentBackend,
backend_options=HTMLBackendOptions(),
),
InputFormat.XML_USPTO: FormatOption(
pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
@@ -186,13 +196,13 @@ class DocumentConverter:
def __init__(
self,
allowed_formats: Optional[List[InputFormat]] = None,
format_options: Optional[Dict[InputFormat, FormatOption]] = None,
allowed_formats: Optional[list[InputFormat]] = None,
format_options: Optional[dict[InputFormat, FormatOption]] = None,
):
self.allowed_formats = (
allowed_formats if allowed_formats is not None else list(InputFormat)
)
self.format_to_options: Dict[InputFormat, FormatOption] = {
self.format_to_options: dict[InputFormat, FormatOption] = {
format: (
_get_default_option(format=format)
if (custom_option := (format_options or {}).get(format)) is None
@@ -200,8 +210,8 @@ class DocumentConverter:
)
for format in self.allowed_formats
}
self.initialized_pipelines: Dict[
Tuple[Type[BasePipeline], str], BasePipeline
self.initialized_pipelines: dict[
tuple[Type[BasePipeline], str], BasePipeline
] = {}
def _get_initialized_pipelines(
@@ -228,7 +238,7 @@ class DocumentConverter:
def convert(
self,
source: Union[Path, str, DocumentStream], # TODO review naming
headers: Optional[Dict[str, str]] = None,
headers: Optional[dict[str, str]] = None,
raises_on_error: bool = True,
max_num_pages: int = sys.maxsize,
max_file_size: int = sys.maxsize,
@@ -248,7 +258,7 @@ class DocumentConverter:
def convert_all(
self,
source: Iterable[Union[Path, str, DocumentStream]], # TODO review naming
headers: Optional[Dict[str, str]] = None,
headers: Optional[dict[str, str]] = None,
raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
max_num_pages: int = sys.maxsize,
max_file_size: int = sys.maxsize,

View File

@@ -8,9 +8,10 @@ from collections.abc import Iterable, Iterator
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Type, Union
from typing import Optional, Type, Union
from pydantic import ConfigDict, model_validator, validate_call
from typing_extensions import Self
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
@@ -56,7 +57,7 @@ class ExtractionFormatOption(BaseFormatOption):
pipeline_cls: Type[BaseExtractionPipeline]
@model_validator(mode="after")
def set_optional_field_default(self) -> "ExtractionFormatOption":
def set_optional_field_default(self) -> Self:
if self.pipeline_options is None:
# `get_default_options` comes from BaseExtractionPipeline
self.pipeline_options = self.pipeline_cls.get_default_options() # type: ignore[assignment]
@@ -70,7 +71,7 @@ def _get_default_extraction_option(fmt: InputFormat) -> ExtractionFormatOption:
the VLM extractor. This duplication will be removed when we deduplicate
the format registry between convert/extract.
"""
format_to_default_backend: Dict[InputFormat, Type[AbstractDocumentBackend]] = {
format_to_default_backend: dict[InputFormat, Type[AbstractDocumentBackend]] = {
InputFormat.IMAGE: PyPdfiumDocumentBackend,
InputFormat.PDF: PyPdfiumDocumentBackend,
}
@@ -98,24 +99,24 @@ class DocumentExtractor:
def __init__(
self,
allowed_formats: Optional[List[InputFormat]] = None,
allowed_formats: Optional[list[InputFormat]] = None,
extraction_format_options: Optional[
Dict[InputFormat, ExtractionFormatOption]
dict[InputFormat, ExtractionFormatOption]
] = None,
) -> None:
self.allowed_formats: List[InputFormat] = (
self.allowed_formats: list[InputFormat] = (
allowed_formats if allowed_formats is not None else list(InputFormat)
)
# Build per-format options with defaults, then apply any user overrides
overrides = extraction_format_options or {}
self.extraction_format_to_options: Dict[InputFormat, ExtractionFormatOption] = {
self.extraction_format_to_options: dict[InputFormat, ExtractionFormatOption] = {
fmt: overrides.get(fmt, _get_default_extraction_option(fmt))
for fmt in self.allowed_formats
}
# Cache pipelines by (class, options-hash)
self._initialized_pipelines: Dict[
Tuple[Type[BaseExtractionPipeline], str], BaseExtractionPipeline
self._initialized_pipelines: dict[
tuple[Type[BaseExtractionPipeline], str], BaseExtractionPipeline
] = {}
# ---------------------------- Public API ---------------------------------
@@ -125,7 +126,7 @@ class DocumentExtractor:
self,
source: Union[Path, str, DocumentStream],
template: ExtractionTemplateType,
headers: Optional[Dict[str, str]] = None,
headers: Optional[dict[str, str]] = None,
raises_on_error: bool = True,
max_num_pages: int = sys.maxsize,
max_file_size: int = sys.maxsize,
@@ -147,7 +148,7 @@ class DocumentExtractor:
self,
source: Iterable[Union[Path, str, DocumentStream]],
template: ExtractionTemplateType,
headers: Optional[Dict[str, str]] = None,
headers: Optional[dict[str, str]] = None,
raises_on_error: bool = True,
max_num_pages: int = sys.maxsize,
max_file_size: int = sys.maxsize,

View File

@@ -1,5 +1,4 @@
from pathlib import Path
from typing import Dict, List
from docling_core.types.doc import (
DocItemLabel,
@@ -48,8 +47,8 @@ class ReadingOrderModel:
def _assembled_to_readingorder_elements(
self, conv_res: ConversionResult
) -> List[ReadingOrderPageElement]:
elements: List[ReadingOrderPageElement] = []
) -> list[ReadingOrderPageElement]:
elements: list[ReadingOrderPageElement] = []
page_no_to_pages = {p.page_no: p for p in conv_res.pages}
for element in conv_res.assembled.elements:
@@ -123,10 +122,10 @@ class ReadingOrderModel:
def _readingorder_elements_to_docling_doc(
self,
conv_res: ConversionResult,
ro_elements: List[ReadingOrderPageElement],
el_to_captions_mapping: Dict[int, List[int]],
el_to_footnotes_mapping: Dict[int, List[int]],
el_merges_mapping: Dict[int, List[int]],
ro_elements: list[ReadingOrderPageElement],
el_to_captions_mapping: dict[int, list[int]],
el_to_footnotes_mapping: dict[int, list[int]],
el_merges_mapping: dict[int, list[int]],
) -> DoclingDocument:
id_to_elem = {
RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem

View File

@@ -2,7 +2,7 @@ import base64
import json
import logging
from io import BytesIO
from typing import Dict, List, Optional
from typing import Optional
import requests
from PIL import Image
@@ -19,7 +19,7 @@ def api_image_request(
prompt: str,
url: AnyUrl,
timeout: float = 20,
headers: Optional[Dict[str, str]] = None,
headers: Optional[dict[str, str]] = None,
**params,
) -> str:
img_io = BytesIO()
@@ -69,8 +69,8 @@ def api_image_request_streaming(
url: AnyUrl,
*,
timeout: float = 20,
headers: Optional[Dict[str, str]] = None,
generation_stoppers: List[GenerationStopper] = [],
headers: Optional[dict[str, str]] = None,
generation_stoppers: list[GenerationStopper] = [],
**params,
) -> str:
"""

View File

@@ -2,7 +2,6 @@ import bisect
import logging
import sys
from collections import defaultdict
from typing import Dict, List, Set, Tuple
from docling_core.types.doc import DocItemLabel, Size
from docling_core.types.doc.page import TextCell
@@ -39,7 +38,7 @@ class UnionFind:
self.parent[root_y] = root_x
self.rank[root_x] += 1
def get_groups(self) -> Dict[int, List[int]]:
def get_groups(self) -> dict[int, list[int]]:
"""Returns groups as {root: [elements]}."""
groups = defaultdict(list)
for elem in self.parent:
@@ -50,13 +49,13 @@ class UnionFind:
class SpatialClusterIndex:
"""Efficient spatial indexing for clusters using R-tree and interval trees."""
def __init__(self, clusters: List[Cluster]):
def __init__(self, clusters: list[Cluster]):
p = index.Property()
p.dimension = 2
self.spatial_index = index.Index(properties=p)
self.x_intervals = IntervalTree()
self.y_intervals = IntervalTree()
self.clusters_by_id: Dict[int, Cluster] = {}
self.clusters_by_id: dict[int, Cluster] = {}
for cluster in clusters:
self.add_cluster(cluster)
@@ -72,7 +71,7 @@ class SpatialClusterIndex:
self.spatial_index.delete(cluster.id, cluster.bbox.as_tuple())
del self.clusters_by_id[cluster.id]
def find_candidates(self, bbox: BoundingBox) -> Set[int]:
def find_candidates(self, bbox: BoundingBox) -> set[int]:
"""Find potential overlapping cluster IDs using all indexes."""
spatial = set(self.spatial_index.intersection(bbox.as_tuple()))
x_candidates = self.x_intervals.find_containing(
@@ -123,13 +122,13 @@ class IntervalTree:
"""Memory-efficient interval tree for 1D overlap queries."""
def __init__(self):
self.intervals: List[Interval] = [] # Sorted by min_val
self.intervals: list[Interval] = [] # Sorted by min_val
def insert(self, min_val: float, max_val: float, id: int):
interval = Interval(min_val, max_val, id)
bisect.insort(self.intervals, interval)
def find_containing(self, point: float) -> Set[int]:
def find_containing(self, point: float) -> set[int]:
"""Find all intervals containing the point."""
pos = bisect.bisect_left(self.intervals, point)
result = set()
@@ -196,7 +195,7 @@ class LayoutPostprocessor:
}
def __init__(
self, page: Page, clusters: List[Cluster], options: LayoutOptions
self, page: Page, clusters: list[Cluster], options: LayoutOptions
) -> None:
"""Initialize processor with page and clusters."""
@@ -219,7 +218,7 @@ class LayoutPostprocessor:
[c for c in self.special_clusters if c.label in self.WRAPPER_TYPES]
)
def postprocess(self) -> Tuple[List[Cluster], List[TextCell]]:
def postprocess(self) -> tuple[list[Cluster], list[TextCell]]:
"""Main processing pipeline."""
self.regular_clusters = self._process_regular_clusters()
self.special_clusters = self._process_special_clusters()
@@ -254,7 +253,7 @@ class LayoutPostprocessor:
return final_clusters, self.cells
def _process_regular_clusters(self) -> List[Cluster]:
def _process_regular_clusters(self) -> list[Cluster]:
"""Process regular clusters with iterative refinement."""
clusters = [
c
@@ -311,7 +310,7 @@ class LayoutPostprocessor:
return clusters
def _process_special_clusters(self) -> List[Cluster]:
def _process_special_clusters(self) -> list[Cluster]:
special_clusters = [
c
for c in self.special_clusters
@@ -381,7 +380,7 @@ class LayoutPostprocessor:
return picture_clusters + wrapper_clusters
def _handle_cross_type_overlaps(self, special_clusters) -> List[Cluster]:
def _handle_cross_type_overlaps(self, special_clusters) -> list[Cluster]:
"""Handle overlaps between regular and wrapper clusters before child assignment.
In particular, KEY_VALUE_REGION proposals that are almost identical to a TABLE
@@ -454,7 +453,7 @@ class LayoutPostprocessor:
def _select_best_cluster_from_group(
self,
group_clusters: List[Cluster],
group_clusters: list[Cluster],
params: dict,
) -> Cluster:
"""Select best cluster from a group of overlapping clusters based on all rules."""
@@ -487,11 +486,11 @@ class LayoutPostprocessor:
def _remove_overlapping_clusters(
self,
clusters: List[Cluster],
clusters: list[Cluster],
cluster_type: str,
overlap_threshold: float = 0.8,
containment_threshold: float = 0.8,
) -> List[Cluster]:
) -> list[Cluster]:
if not clusters:
return []
@@ -544,7 +543,7 @@ class LayoutPostprocessor:
def _select_best_cluster(
self,
clusters: List[Cluster],
clusters: list[Cluster],
area_threshold: float,
conf_threshold: float,
) -> Cluster:
@@ -572,7 +571,7 @@ class LayoutPostprocessor:
return current_best if current_best else clusters[0]
def _deduplicate_cells(self, cells: List[TextCell]) -> List[TextCell]:
def _deduplicate_cells(self, cells: list[TextCell]) -> list[TextCell]:
"""Ensure each cell appears only once, maintaining order of first appearance."""
seen_ids = set()
unique_cells = []
@@ -583,8 +582,8 @@ class LayoutPostprocessor:
return unique_cells
def _assign_cells_to_clusters(
self, clusters: List[Cluster], min_overlap: float = 0.2
) -> List[Cluster]:
self, clusters: list[Cluster], min_overlap: float = 0.2
) -> list[Cluster]:
"""Assign cells to best overlapping cluster."""
for cluster in clusters:
cluster.cells = []
@@ -616,7 +615,7 @@ class LayoutPostprocessor:
return clusters
def _find_unassigned_cells(self, clusters: List[Cluster]) -> List[TextCell]:
def _find_unassigned_cells(self, clusters: list[Cluster]) -> list[TextCell]:
"""Find cells not assigned to any cluster."""
assigned = {cell.index for cluster in clusters for cell in cluster.cells}
return [
@@ -625,7 +624,7 @@ class LayoutPostprocessor:
if cell.index not in assigned and cell.text.strip()
]
def _adjust_cluster_bboxes(self, clusters: List[Cluster]) -> List[Cluster]:
def _adjust_cluster_bboxes(self, clusters: list[Cluster]) -> list[Cluster]:
"""Adjust cluster bounding boxes to contain their cells."""
for cluster in clusters:
if not cluster.cells:
@@ -651,13 +650,13 @@ class LayoutPostprocessor:
return clusters
def _sort_cells(self, cells: List[TextCell]) -> List[TextCell]:
def _sort_cells(self, cells: list[TextCell]) -> list[TextCell]:
"""Sort cells in native reading order."""
return sorted(cells, key=lambda c: (c.index))
def _sort_clusters(
self, clusters: List[Cluster], mode: str = "id"
) -> List[Cluster]:
self, clusters: list[Cluster], mode: str = "id"
) -> list[Cluster]:
"""Sort clusters in reading order (top-to-bottom, left-to-right)."""
if mode == "id": # sort in the order the cells are printed in the PDF.
return sorted(

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,20 @@
# Introduction
This is the first paragraph of the introduction.
## Background
Some background information here.
Example image
<!-- image -->
- First item in unordered list
- Second item in unordered list
1. First item in ordered list
2. Second item in ordered list
42. First item in ordered list with start
43. Second item in ordered list with start

View File

@@ -1,36 +0,0 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: title: Introduction to parsing HTML files with Docling
item-2 at level 2: picture
item-2 at level 3: caption: Docling
item-3 at level 2: text: Docling simplifies document proc ... ntegrations with the gen AI ecosystem.
item-4 at level 2: section_header: Supported file formats
item-5 at level 3: text: Docling supports multiple file formats..
item-6 at level 3: list: group list
item-7 at level 4: list_item: Advanced PDF understanding
item-8 at level 4: picture
item-8 at level 5: caption: PDF
item-9 at level 4: list_item: Microsoft Office DOCX
item-10 at level 4: picture
item-10 at level 5: caption: DOCX
item-11 at level 4: list_item: HTML files (with optional support for images)
item-12 at level 4: picture
item-12 at level 5: caption: HTML
item-13 at level 3: section_header: Three backends for handling HTML files
item-14 at level 4: text: Docling has three backends for parsing HTML files:
item-15 at level 4: list: group ordered list
item-16 at level 5: list_item:
item-17 at level 6: inline: group group
item-18 at level 7: text: HTMLDocumentBackend
item-19 at level 7: text: Ignores images
item-20 at level 5: list_item:
item-21 at level 6: inline: group group
item-22 at level 7: text: HTMLDocumentBackendImagesInline
item-23 at level 7: text: Extracts images inline
item-24 at level 5: list_item:
item-25 at level 6: inline: group group
item-26 at level 7: text: HTMLDocumentBackendImagesReferenced
item-27 at level 7: text: Extracts images as references
item-28 at level 1: caption: Docling
item-29 at level 1: caption: PDF
item-30 at level 1: caption: DOCX
item-31 at level 1: caption: HTML

View File

@@ -1,560 +0,0 @@
{
"schema_name": "DoclingDocument",
"version": "1.7.0",
"name": "example_09",
"origin": {
"mimetype": "text/html",
"binary_hash": 6785336133244366107,
"filename": "example_09.html"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
},
{
"$ref": "#/texts/1"
},
{
"$ref": "#/texts/6"
},
{
"$ref": "#/texts/8"
},
{
"$ref": "#/texts/10"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/texts/3"
},
"children": [
{
"$ref": "#/texts/5"
},
{
"$ref": "#/pictures/1"
},
{
"$ref": "#/texts/7"
},
{
"$ref": "#/pictures/2"
},
{
"$ref": "#/texts/9"
},
{
"$ref": "#/pictures/3"
}
],
"content_layer": "body",
"name": "list",
"label": "list"
},
{
"self_ref": "#/groups/1",
"parent": {
"$ref": "#/texts/11"
},
"children": [
{
"$ref": "#/texts/13"
},
{
"$ref": "#/texts/16"
},
{
"$ref": "#/texts/19"
}
],
"content_layer": "body",
"name": "ordered list",
"label": "list"
},
{
"self_ref": "#/groups/2",
"parent": {
"$ref": "#/texts/13"
},
"children": [
{
"$ref": "#/texts/14"
},
{
"$ref": "#/texts/15"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/3",
"parent": {
"$ref": "#/texts/16"
},
"children": [
{
"$ref": "#/texts/17"
},
{
"$ref": "#/texts/18"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/4",
"parent": {
"$ref": "#/texts/19"
},
"children": [
{
"$ref": "#/texts/20"
},
{
"$ref": "#/texts/21"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/pictures/0"
},
{
"$ref": "#/texts/2"
},
{
"$ref": "#/texts/3"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Introduction to parsing HTML files with Docling",
"text": "Introduction to parsing HTML files with Docling"
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "caption",
"prov": [],
"orig": "Docling",
"text": "Docling"
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/texts/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Docling simplifies document processing, parsing diverse formats - including HTML - and providing seamless integrations with the gen AI ecosystem.",
"text": "Docling simplifies document processing, parsing diverse formats - including HTML - and providing seamless integrations with the gen AI ecosystem."
},
{
"self_ref": "#/texts/3",
"parent": {
"$ref": "#/texts/0"
},
"children": [
{
"$ref": "#/texts/4"
},
{
"$ref": "#/groups/0"
},
{
"$ref": "#/texts/11"
}
],
"content_layer": "body",
"label": "section_header",
"prov": [],
"orig": "Supported file formats",
"text": "Supported file formats",
"level": 1
},
{
"self_ref": "#/texts/4",
"parent": {
"$ref": "#/texts/3"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Docling supports multiple file formats..",
"text": "Docling supports multiple file formats.."
},
{
"self_ref": "#/texts/5",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Advanced PDF understanding",
"text": "Advanced PDF understanding",
"enumerated": false,
"marker": ""
},
{
"self_ref": "#/texts/6",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "caption",
"prov": [],
"orig": "PDF",
"text": "PDF"
},
{
"self_ref": "#/texts/7",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Microsoft Office DOCX",
"text": "Microsoft Office DOCX",
"enumerated": false,
"marker": ""
},
{
"self_ref": "#/texts/8",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "caption",
"prov": [],
"orig": "DOCX",
"text": "DOCX"
},
{
"self_ref": "#/texts/9",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "HTML files (with optional support for images)",
"text": "HTML files (with optional support for images)",
"enumerated": false,
"marker": ""
},
{
"self_ref": "#/texts/10",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "caption",
"prov": [],
"orig": "HTML",
"text": "HTML"
},
{
"self_ref": "#/texts/11",
"parent": {
"$ref": "#/texts/3"
},
"children": [
{
"$ref": "#/texts/12"
},
{
"$ref": "#/groups/1"
}
],
"content_layer": "body",
"label": "section_header",
"prov": [],
"orig": "Three backends for handling HTML files",
"text": "Three backends for handling HTML files",
"level": 2
},
{
"self_ref": "#/texts/12",
"parent": {
"$ref": "#/texts/11"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Docling has three backends for parsing HTML files:",
"text": "Docling has three backends for parsing HTML files:"
},
{
"self_ref": "#/texts/13",
"parent": {
"$ref": "#/groups/1"
},
"children": [
{
"$ref": "#/groups/2"
}
],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "",
"text": "",
"enumerated": true,
"marker": ""
},
{
"self_ref": "#/texts/14",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "HTMLDocumentBackend",
"text": "HTMLDocumentBackend",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/15",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Ignores images",
"text": "Ignores images"
},
{
"self_ref": "#/texts/16",
"parent": {
"$ref": "#/groups/1"
},
"children": [
{
"$ref": "#/groups/3"
}
],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "",
"text": "",
"enumerated": true,
"marker": ""
},
{
"self_ref": "#/texts/17",
"parent": {
"$ref": "#/groups/3"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "HTMLDocumentBackendImagesInline",
"text": "HTMLDocumentBackendImagesInline",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/18",
"parent": {
"$ref": "#/groups/3"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Extracts images inline",
"text": "Extracts images inline"
},
{
"self_ref": "#/texts/19",
"parent": {
"$ref": "#/groups/1"
},
"children": [
{
"$ref": "#/groups/4"
}
],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "",
"text": "",
"enumerated": true,
"marker": ""
},
{
"self_ref": "#/texts/20",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "HTMLDocumentBackendImagesReferenced",
"text": "HTMLDocumentBackendImagesReferenced",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/21",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Extracts images as references",
"text": "Extracts images as references"
}
],
"pictures": [
{
"self_ref": "#/pictures/0",
"parent": {
"$ref": "#/texts/0"
},
"children": [],
"content_layer": "body",
"label": "picture",
"prov": [],
"captions": [
{
"$ref": "#/texts/1"
}
],
"references": [],
"footnotes": [],
"annotations": []
},
{
"self_ref": "#/pictures/1",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "picture",
"prov": [],
"captions": [
{
"$ref": "#/texts/6"
}
],
"references": [],
"footnotes": [],
"annotations": []
},
{
"self_ref": "#/pictures/2",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "picture",
"prov": [],
"captions": [
{
"$ref": "#/texts/8"
}
],
"references": [],
"footnotes": [],
"annotations": []
},
{
"self_ref": "#/pictures/3",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "picture",
"prov": [],
"captions": [
{
"$ref": "#/texts/10"
}
],
"references": [],
"footnotes": [],
"annotations": []
}
],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {}
}

View File

@@ -1,32 +0,0 @@
# Introduction to parsing HTML files with Docling
Docling
<!-- image -->
Docling simplifies document processing, parsing diverse formats - including HTML - and providing seamless integrations with the gen AI ecosystem.
## Supported file formats
Docling supports multiple file formats..
- Advanced PDF understanding
PDF
<!-- image -->
- Microsoft Office DOCX
DOCX
<!-- image -->
- HTML files (with optional support for images)
HTML
<!-- image -->
### Three backends for handling HTML files
Docling has three backends for parsing HTML files:
1. **HTMLDocumentBackend** Ignores images
2. **HTMLDocumentBackendImagesInline** Extracts images inline
3. **HTMLDocumentBackendImagesReferenced** Extracts images as references

View File

@@ -17,6 +17,12 @@
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
},
{
"$ref": "#/pictures/0"
},
{
"$ref": "#/groups/0"
}
@@ -33,7 +39,7 @@
},
"children": [
{
"$ref": "#/texts/0"
"$ref": "#/texts/1"
}
],
"content_layer": "body",
@@ -44,6 +50,18 @@
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "furniture",
"label": "caption",
"prov": [],
"orig": "Image alt text",
"text": "Image alt text"
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/groups/0"
},
@@ -57,7 +75,26 @@
"level": 1
}
],
"pictures": [],
"pictures": [
{
"self_ref": "#/pictures/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "furniture",
"label": "picture",
"prov": [],
"captions": [
{
"$ref": "#/texts/0"
}
],
"references": [],
"footnotes": [],
"annotations": []
}
],
"tables": [],
"key_value_items": [],
"form_items": [],

View File

@@ -1,7 +1,7 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: caption: Image Hyperlink.
item-1 at level 1: caption: Clickable Example
item-2 at level 1: picture
item-2 at level 2: caption: Image Hyperlink.
item-2 at level 2: caption: Clickable Example
item-3 at level 1: caption: This is an example caption for the image.
item-4 at level 1: picture
item-4 at level 2: caption: This is an example caption for the image.

View File

@@ -66,8 +66,8 @@
"content_layer": "body",
"label": "caption",
"prov": [],
"orig": "Image Hyperlink.",
"text": "Image Hyperlink.",
"orig": "Clickable Example",
"text": "Clickable Example",
"hyperlink": "https://www.example.com/"
},
{

View File

@@ -1,4 +1,4 @@
Image Hyperlink.
Clickable Example
<!-- image -->

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -222,6 +222,10 @@ move to sidebar
hide
Page semi-protected
<!-- image -->
From Wikipedia, the free encyclopedia
(Redirected from [Duckling](/w/index.php?title=Duckling&redirect=no) )
@@ -233,10 +237,10 @@ This article is about the bird. For duck as a food, see [Duck as food](/wiki/Duc
"Duckling" redirects here. For other uses, see [Duckling (disambiguation)](/wiki/Duckling_(disambiguation)) .
| Duck | Duck |
|----------------------------------------------------------|---------------------------|
| | |
|-------------------------------------------------------------------------------------------------|--------------|
| <!-- image --> | |
| [Bufflehead](/wiki/Bufflehead) *Bucephala albeola* ( ) | |
| Scientific classification | Scientific classification |
| [Scientific classification](/wiki/Taxonomy_(biology)) Edit this classification <!-- image --> | |
| Domain: | Eukaryota |
| Kingdom: | Animalia |
| Phylum: | Chordata |
@@ -509,8 +513,8 @@ The 1992 Disney film [*The Mighty Ducks*](/wiki/The_Mighty_Ducks_(film)) , starr
- [Ducks on postage stamps](http://www.stampsbook.org/subject/Duck.html) [Archived](https://web.archive.org/web/20130513022903/http://www.stampsbook.org/subject/Duck.html) 2013-05-13 at the [Wayback Machine](/wiki/Wayback_Machine)
- [*Ducks at a Distance, by Rob Hines*](https://gutenberg.org/ebooks/18884) at [Project Gutenberg](/wiki/Project_Gutenberg) - A modern illustrated guide to identification of US waterfowl
| Authority control databases | Authority control databases |
|-------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| [Authority control databases](/wiki/Help:Authority_control) Edit this at Wikidata <!-- image --> | |
|------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| National | - [United States](https://id.loc.gov/authorities/sh85039879) - [France](https://catalogue.bnf.fr/ark:/12148/cb119761481) - [BnF data](https://data.bnf.fr/ark:/12148/cb119761481) - [Japan](https://id.ndl.go.jp/auth/ndlna/00564819) - [Latvia](https://kopkatalogs.lv/F?func=direct&local_base=lnc10&doc_number=000090751&P_CON_LNG=ENG) - [Israel](http://olduli.nli.org.il/F/?func=find-b&local_base=NLX10&find_code=UID&request=987007565486205171) |
| Other | - [IdRef](https://www.idref.fr/027796124) |

View File

@@ -4,7 +4,7 @@
<p>This is the first paragraph of the introduction.</p>
<h2>Background</h2>
<p>Some background information here.</p>
<img src="image1.png" alt="Example image"/>
<img src="example_image_01.png" alt="Example image"/>
<ul>
<li>First item in unordered list</li>
<li>Second item in unordered list</li>

View File

@@ -1,21 +0,0 @@
<html>
<body>
<h1>Introduction to parsing HTML files with <img src="https://docling-project.github.io/docling/assets/logo.png" alt="Docling" height="64"> Docling</h1>
<p>Docling simplifies document processing, parsing diverse formats — including HTML — and providing seamless integrations with the gen AI ecosystem.</p>
<h2>Supported file formats</h2>
<p>Docling supports multiple file formats..</p>
<ul>
<li><img src="https://github.com/docling-project/docling/tree/main/docs/assets/pdf.png" height="32" alt="PDF">Advanced PDF understanding</li>
<li><img src="https://github.com/docling-project/docling/tree/main/docs/assets/docx.png" height="32" alt="DOCX">Microsoft Office DOCX</li>
<li><img src="https://github.com/docling-project/docling/tree/main/docs/assets/html.png" height="32" alt="HTML">HTML files (with optional support for images)</li>
</ul>
<h3>Three backends for handling HTML files</h3>
<p>Docling has three backends for parsing HTML files:</p>
<ol>
<li><b>HTMLDocumentBackend</b> Ignores images</li>
<li><b>HTMLDocumentBackendImagesInline</b> Extracts images inline</li>
<li><b>HTMLDocumentBackendImagesReferenced</b> Extracts images as references</li>
</ol>
</body>
</html>

BIN
tests/data/html/example_image_01.png vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 548 KiB

View File

@@ -1,9 +1,14 @@
from io import BytesIO
from pathlib import Path
from pathlib import Path, PurePath
from unittest.mock import Mock, mock_open, patch
import pytest
from docling_core.types.doc import PictureItem
from docling_core.types.doc.document import ContentLayer
from pydantic import AnyUrl, ValidationError
from docling.backend.html_backend import HTMLDocumentBackend
from docling.datamodel.backend_options import HTMLBackendOptions
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import (
ConversionResult,
@@ -11,7 +16,7 @@ from docling.datamodel.document import (
InputDocument,
SectionHeaderItem,
)
from docling.document_converter import DocumentConverter
from docling.document_converter import DocumentConverter, HTMLFormatOption
from .test_data_gen_flag import GEN_TEST_DATA
from .verify_utils import verify_document, verify_export
@@ -19,6 +24,68 @@ from .verify_utils import verify_document, verify_export
# Alias of GEN_TEST_DATA (imported from .test_data_gen_flag). In this module it
# is handed to verify_export / verify_document; presumably it switches those
# helpers to (re)writing the ground-truth files instead of comparing against
# them -- TODO confirm against verify_utils.
GENERATE = GEN_TEST_DATA
def test_html_backend_options():
    """Check HTMLBackendOptions defaults and the values accepted as ``source_uri``."""
    # A bare instance: kind is "html", image fetching is off, no source URI.
    defaults = HTMLBackendOptions()
    assert defaults.kind == "html"
    assert not defaults.fetch_images
    assert defaults.source_uri is None

    # Both a URL and a pure path must compare equal to what was passed in.
    url = "http://example.com"
    for location in (AnyUrl(url=url), PurePath("/local/path/to/file.html")):
        configured = HTMLBackendOptions(source_uri=location)
        assert configured.source_uri == location

    # An integer is neither a URL nor a path, so validation must fail.
    with pytest.raises(ValidationError, match="Input is not a valid path"):
        HTMLBackendOptions(source_uri=12345)
def test_resolve_relative_path():
    """Exercise ``_resolve_relative_path`` with local, remote and missing base paths."""
    sample = Path("./tests/data/html/example_01.html")
    input_doc = InputDocument(
        path_or_stream=sample,
        format=InputFormat.HTML,
        backend=HTMLDocumentBackend,
        filename="test",
    )
    backend = HTMLDocumentBackend(path_or_stream=sample, in_doc=input_doc)

    # Each row: (base_path set on the backend, location to resolve, expected result)
    cases = [
        # relative location against a local base file
        (
            "/local/path/to/file.html",
            "subdir/another.html",
            "/local/path/to/subdir/another.html",
        ),
        # absolute local location is returned as is
        (
            "/local/path/to/file.html",
            "/absolute/path/to/file.html",
            "/absolute/path/to/file.html",
        ),
        # protocol-relative URL
        (
            "http://my_host.com",
            "//example.com/file.html",
            "https://example.com/file.html",
        ),
        # relative location against a remote base
        (
            "http://example.com",
            "subdir/file.html",
            "http://example.com/subdir/file.html",
        ),
        # fully qualified URL is returned as is
        (
            "http://example.com",
            "https://my_host.com/my_page.html",
            "https://my_host.com/my_page.html",
        ),
        # root-relative location against a remote base
        (
            "http://example.com",
            "/static/images/my_image.png",
            "http://example.com/static/images/my_image.png",
        ),
        # no base path: the location comes back untouched
        (None, "subdir/file.html", "subdir/file.html"),
    ]

    for base, location, expected in cases:
        backend.base_path = base
        assert backend._resolve_relative_path(location) == expected
def test_heading_levels():
in_path = Path("tests/data/html/wiki_duck.html")
in_doc = InputDocument(
@@ -158,8 +225,6 @@ def test_e2e_html_conversions():
converter = get_converter()
for html_path in html_paths:
# print(f"converting {html_path}")
gt_path = (
html_path.parent.parent / "groundtruth" / "docling_v2" / html_path.name
)
@@ -183,6 +248,76 @@ def test_e2e_html_conversions():
assert verify_document(doc, str(gt_path) + ".json", GENERATE)
@patch("docling.backend.html_backend.requests.get")
@patch("docling.backend.html_backend.open", new_callable=mock_open)
def test_e2e_html_conversion_with_images(mock_local, mock_remote):
    """Convert the sample HTML with image fetching, once locally and once remotely.

    ``mock_local`` replaces ``open`` and ``mock_remote`` replaces
    ``requests.get`` inside ``docling.backend.html_backend``; both are made to
    serve the bytes of the real sample image. The two conversions must yield
    the same document, which is then checked against the ground truth.
    """
    source = "tests/data/html/example_01.html"
    image_path = "tests/data/html/example_image_01.png"
    with open(image_path, "rb") as image_file:
        img_bytes = image_file.read()

    def convert_with(options):
        # Run an HTML-only converter configured with the given backend options.
        html_converter = DocumentConverter(
            allowed_formats=[InputFormat.HTML],
            format_options={
                InputFormat.HTML: HTMLFormatOption(backend_options=options)
            },
        )
        return html_converter.convert(source)

    def embedded_pictures(document):
        # All PictureItem elements met while iterating over the document.
        return [
            item for item, _ in document.iterate_items() if isinstance(item, PictureItem)
        ]

    # --- image served through the patched local `open` ---
    mock_local.return_value.__enter__.return_value = BytesIO(img_bytes)
    res_local = convert_with(
        HTMLBackendOptions(
            enable_local_fetch=True, fetch_images=True, source_uri=source
        )
    )
    mock_local.assert_called_once()
    assert res_local.document
    local_pictures = embedded_pictures(res_local.document)
    for picture in local_pictures:
        assert picture.image
    assert len(local_pictures) == 1, (
        "No embedded picture was found in the converted file"
    )

    # --- image served through the patched `requests.get` ---
    mock_remote.return_value = Mock(status_code=200, content=img_bytes)
    source_location = "https://example.com/example_01.html"
    res_remote = convert_with(
        HTMLBackendOptions(
            enable_remote_fetch=True, fetch_images=True, source_uri=source_location
        )
    )
    mock_remote.assert_called_once_with(
        "https://example.com/example_image_01.png", stream=True
    )
    assert res_remote.document
    remote_pictures = embedded_pictures(res_remote.document)
    for picture in remote_pictures:
        assert picture.image
        assert picture.image.mimetype == "image/png"
    assert len(remote_pictures) == 1, (
        "No embedded picture was found in the converted file"
    )

    # Local and remote fetching must produce the same DoclingDocument.
    assert res_remote.document == res_local.document

    # Compare the exports with the stored ground truth.
    gt_path = f"tests/data/groundtruth/docling_v2/{Path(source).stem}_images.html"
    pred_md: str = res_local.document.export_to_markdown()
    assert verify_export(pred_md, gt_path + ".md", generate=GENERATE)
    assert verify_document(res_local.document, gt_path + ".json", GENERATE)
def test_html_furniture():
raw_html = (
b"<html><body><p>Initial content with some <strong>bold text</strong></p>"
@@ -211,3 +346,98 @@ def test_html_furniture():
"Initial content with some **bold text**\n\n# Main Heading\n\nSome Content\n\n"
"Some Footer Content"
)
def test_fetch_remote_images(monkeypatch):
    """Check which HTMLBackendOptions combinations trigger an image fetch.

    The same sample file is converted five times with different options while
    ``requests.get`` (or ``open``) in ``docling.backend.html_backend`` is
    patched, so that calls can be counted without real I/O.
    """
    source = "./tests/data/html/example_01.html"
    requests_get = "docling.backend.html_backend.requests.get"
    local_open = "docling.backend.html_backend.open"

    def converter_for(**option_kwargs):
        # HTML-only converter whose backend options are built from the kwargs.
        html_options = HTMLBackendOptions(**option_kwargs)
        return DocumentConverter(
            allowed_formats=[InputFormat.HTML],
            format_options={
                InputFormat.HTML: HTMLFormatOption(backend_options=html_options)
            },
        )

    # 1) fetch_images is False: nothing is requested
    converter = converter_for(fetch_images=False, source_uri="http://example.com")
    with patch(requests_get) as mocked_get:
        res = converter.convert(source)
        mocked_get.assert_not_called()
    assert res.document

    # 2) fetch_images is True, but no source_uri is given and enable_local_fetch
    #    is left at its default: a warning is raised and nothing is requested
    converter = converter_for(fetch_images=True)
    with patch(requests_get) as mocked_get:
        with pytest.warns(
            match="Fetching local resources is only allowed when set explicitly"
        ):
            res = converter.convert(source)
        mocked_get.assert_not_called()
    assert res.document

    # 3) remote source_uri, but enable_remote_fetch is left at its default:
    #    a warning is raised and nothing is requested
    converter = converter_for(fetch_images=True, source_uri="http://example.com")
    with patch(requests_get) as mocked_get:
        with pytest.warns(
            match="Fetching remote resources is only allowed when set explicitly"
        ):
            res = converter.convert(source)
        mocked_get.assert_not_called()
    assert res.document

    # 4) remote fetching fully enabled: exactly one request is made; the
    #    patched response is not real image data, hence the expected warning
    converter = converter_for(
        enable_remote_fetch=True, fetch_images=True, source_uri="http://example.com"
    )
    with patch(requests_get) as mocked_get:
        with pytest.warns(match="a bytes-like object is required"):
            res = converter.convert(source)
        mocked_get.assert_called_once()
    assert res.document

    # 5) local fetching fully enabled: the image next to the source is opened
    #    once in binary mode; the patched file is not real image data either
    converter = converter_for(
        enable_local_fetch=True, fetch_images=True, source_uri=source
    )
    with patch(local_open) as mocked_open:
        with pytest.warns(match="a bytes-like object is required"):
            res = converter.convert(source)
        mocked_open.assert_called_once_with(
            "tests/data/html/example_image_01.png", "rb"
        )
    assert res.document

View File

@@ -6,13 +6,12 @@ from docling.datamodel.document import (
ConversionResult,
DoclingDocument,
InputDocument,
SectionHeaderItem,
)
from docling.document_converter import DocumentConverter
from tests.verify_utils import CONFID_PREC, COORD_PREC
from .test_data_gen_flag import GEN_TEST_DATA
from .verify_utils import verify_document, verify_export
from .verify_utils import verify_document
# Alias of GEN_TEST_DATA (imported from .test_data_gen_flag); presumably passed
# to verify_document further down in this module -- TODO confirm, the usage is
# not visible in this part of the file.
GENERATE = GEN_TEST_DATA

View File

@@ -1,10 +1,19 @@
from io import BytesIO
from pathlib import Path
import pytest
from pydantic import ValidationError
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.backend_options import (
BaseBackendOptions,
DeclarativeBackendOptions,
HTMLBackendOptions,
)
from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.document import InputDocument, _DocumentConversionInput
from docling.datamodel.settings import DocumentLimits
@@ -15,6 +24,7 @@ def test_in_doc_from_valid_path():
test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
doc = _make_input_doc(test_doc_path)
assert doc.valid is True
assert doc.backend_options is None
def test_in_doc_from_invalid_path():
@@ -105,6 +115,38 @@ def test_in_doc_with_page_range():
assert doc.valid is False
def test_in_doc_with_backend_options():
    """Check that InputDocument accepts matching backend options and rejects others."""
    test_doc_path = Path("./tests/data/html/example_01.html")
    # Arguments shared by every InputDocument built in this test.
    common = {
        "path_or_stream": test_doc_path,
        "format": InputFormat.HTML,
        "backend": HTMLDocumentBackend,
    }

    # Matching options: the document is valid and keeps the options, with all
    # fetch-related flags falsy.
    doc = InputDocument(**common, backend_options=HTMLBackendOptions())
    assert doc.valid
    stored = doc.backend_options
    assert stored
    assert isinstance(stored, HTMLBackendOptions)
    assert not stored.fetch_images
    assert not stored.enable_local_fetch
    assert not stored.enable_remote_fetch

    # Options of another declarative kind are reported as incompatible.
    with pytest.raises(ValueError, match="Incompatible types"):
        doc = InputDocument(**common, backend_options=DeclarativeBackendOptions())

    # The base options class fails validation.
    with pytest.raises(ValidationError):
        doc = InputDocument(**common, backend_options=BaseBackendOptions())
def test_guess_format(tmp_path):
"""Test docling.datamodel.document._DocumentConversionInput.__guess_format"""
dci = _DocumentConversionInput(path_or_stream_iterator=[])