mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
feat(backend): add generic options support and HTML image handling modes (#2011)
* feat: add backend options support to document backends Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * feat: enhance document backends with generic backend options and improve HTML image handling Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * Refactor tests for declarativebackend Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * fix(HTML): improve image caption handling and ensure backend options are set correctly Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * fix: enhance HTML backend image handling and add support for local file paths Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore: Add ground truth data for test data Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * fix(HTML): skip loading SVG files in image data handling Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * refactor(html): simplify backend options and address gaps Backend options for DeclarativeDocumentBackend classes and only when necessary. Refactor caption parsing in 'img' elements and remove dummy text. Replace deprecated annotations from Typing library with native types. 
Replace typing annotations according to pydantic guidelines. Some documentation with pydantic annotations. Fix diff issue with test files. Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * tests(html): add tests and fix bugs Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * refactor(html): refactor backend options Move backend option classes to its own module within datamodel package. Rename 'source_location' with 'source_uri' in HTMLBackendOptions. Rename 'image_fetch' with 'fetch_images' in HTMLBackendOptions. Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * refactor(markdown): create a class for the markdown backend options Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> --------- Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
This commit is contained in:
@@ -1,10 +1,12 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Set, Union
|
||||
from typing import TYPE_CHECKING, Union
|
||||
|
||||
from docling_core.types.doc import DoclingDocument
|
||||
|
||||
from docling.datamodel.backend_options import BackendOptions, DeclarativeBackendOptions
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
@@ -35,7 +37,7 @@ class AbstractDocumentBackend(ABC):
|
||||
|
||||
@classmethod
|
||||
@abstractmethod
|
||||
def supported_formats(cls) -> Set["InputFormat"]:
|
||||
def supported_formats(cls) -> set["InputFormat"]:
|
||||
pass
|
||||
|
||||
|
||||
@@ -58,6 +60,20 @@ class DeclarativeDocumentBackend(AbstractDocumentBackend):
|
||||
straight without a recognition pipeline.
|
||||
"""
|
||||
|
||||
@abstractmethod
def __init__(
    self,
    in_doc: "InputDocument",
    path_or_stream: Union[BytesIO, Path],
    options: Union[BackendOptions, None] = None,
) -> None:
    """Store the input document, its source and the backend options.

    Args:
        in_doc: The input document this backend operates on.
        path_or_stream: File path or in-memory stream holding the document.
        options: Options for the backend. When omitted, a fresh
            ``DeclarativeBackendOptions`` is created for this instance.
    """
    super().__init__(in_doc, path_or_stream)
    # Build the default per call: a default evaluated in the signature is
    # created once at definition time and the same mutable model would be
    # shared by every backend instance that relies on it.
    if options is None:
        options = DeclarativeBackendOptions()
    self.options: BackendOptions = options
|
||||
|
||||
@abstractmethod
|
||||
def convert(self) -> DoclingDocument:
|
||||
pass
|
||||
|
||||
@classmethod
def get_default_options(cls) -> BackendOptions:
    """Build a new ``DeclarativeBackendOptions`` with all fields at their defaults."""
    default_options = DeclarativeBackendOptions()
    return default_options
|
||||
|
||||
@@ -2,7 +2,7 @@ import logging
|
||||
import re
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Final, Set, Union
|
||||
from typing import Final, Union
|
||||
|
||||
from docling_core.types.doc import (
|
||||
DocItemLabel,
|
||||
@@ -27,7 +27,7 @@ DEFAULT_IMAGE_HEIGHT: Final = 128
|
||||
|
||||
|
||||
class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
|
||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
|
||||
self.path_or_stream = path_or_stream
|
||||
@@ -58,7 +58,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
return
|
||||
|
||||
@classmethod
|
||||
def supported_formats(cls) -> Set[InputFormat]:
|
||||
def supported_formats(cls) -> set[InputFormat]:
|
||||
return {InputFormat.ASCIIDOC}
|
||||
|
||||
def convert(self) -> DoclingDocument:
|
||||
|
||||
@@ -1,13 +1,16 @@
|
||||
import base64
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import traceback
|
||||
import warnings
|
||||
from contextlib import contextmanager
|
||||
from copy import deepcopy
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Final, Optional, Union, cast
|
||||
from urllib.parse import urljoin
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
|
||||
from bs4.element import PreformattedString
|
||||
from docling_core.types.doc import (
|
||||
@@ -17,6 +20,7 @@ from docling_core.types.doc import (
|
||||
DocumentOrigin,
|
||||
GroupItem,
|
||||
GroupLabel,
|
||||
PictureItem,
|
||||
RefItem,
|
||||
RichTableCell,
|
||||
TableCell,
|
||||
@@ -24,13 +28,18 @@ from docling_core.types.doc import (
|
||||
TableItem,
|
||||
TextItem,
|
||||
)
|
||||
from docling_core.types.doc.document import ContentLayer, Formatting, Script
|
||||
from docling_core.types.doc.document import ContentLayer, Formatting, ImageRef, Script
|
||||
from PIL import Image, UnidentifiedImageError
|
||||
from pydantic import AnyUrl, BaseModel, ValidationError
|
||||
from typing_extensions import override
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.backend.abstract_backend import (
|
||||
DeclarativeDocumentBackend,
|
||||
)
|
||||
from docling.datamodel.backend_options import HTMLBackendOptions
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
from docling.exceptions import OperationNotAllowed
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@@ -43,6 +52,7 @@ _BLOCK_TAGS: Final = {
|
||||
"details",
|
||||
"figure",
|
||||
"footer",
|
||||
"img",
|
||||
"h1",
|
||||
"h2",
|
||||
"h3",
|
||||
@@ -186,11 +196,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self,
|
||||
in_doc: InputDocument,
|
||||
path_or_stream: Union[BytesIO, Path],
|
||||
original_url: Optional[AnyUrl] = None,
|
||||
options: HTMLBackendOptions = HTMLBackendOptions(),
|
||||
):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
super().__init__(in_doc, path_or_stream, options)
|
||||
self.soup: Optional[Tag] = None
|
||||
self.path_or_stream = path_or_stream
|
||||
self.path_or_stream: Union[BytesIO, Path] = path_or_stream
|
||||
self.base_path: Optional[str] = str(options.source_uri)
|
||||
|
||||
# Initialize the parents for the hierarchy
|
||||
self.max_levels = 10
|
||||
@@ -200,7 +211,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
for i in range(self.max_levels):
|
||||
self.parents[i] = None
|
||||
self.hyperlink: Union[AnyUrl, Path, None] = None
|
||||
self.original_url = original_url
|
||||
self.format_tags: list[str] = []
|
||||
|
||||
try:
|
||||
@@ -236,6 +246,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
def supported_formats(cls) -> set[InputFormat]:
|
||||
return {InputFormat.HTML}
|
||||
|
||||
@classmethod
@override
def get_default_options(cls) -> HTMLBackendOptions:
    """Build a new ``HTMLBackendOptions`` with all fields at their defaults."""
    html_defaults = HTMLBackendOptions()
    return html_defaults
|
||||
|
||||
@override
|
||||
def convert(self) -> DoclingDocument:
|
||||
_log.debug("Starting HTML conversion...")
|
||||
@@ -261,7 +276,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
content_layer=ContentLayer.FURNITURE,
|
||||
)
|
||||
# remove script and style tags
|
||||
for tag in self.soup(["script", "style"]):
|
||||
for tag in self.soup(["script", "noscript", "style"]):
|
||||
tag.decompose()
|
||||
# remove any hidden tag
|
||||
for tag in self.soup(hidden=True):
|
||||
@@ -291,6 +306,28 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self._walk(content, doc)
|
||||
return doc
|
||||
|
||||
@staticmethod
|
||||
def _is_remote_url(value: str) -> bool:
|
||||
parsed = urlparse(value)
|
||||
return parsed.scheme in {"http", "https", "ftp", "s3", "gs"}
|
||||
|
||||
def _resolve_relative_path(self, loc: str) -> str:
    """Resolve a location found in the HTML against the document base path.

    Args:
        loc: The raw location, e.g. the value of a ``src`` or ``href``
            attribute.

    Returns:
        ``loc`` unchanged when there is no base path or when ``loc`` is
        already absolute (http, https, ftp, s3, gs, ``data:`` or
        ``file://``); ``https:`` + ``loc`` for protocol-relative locations;
        otherwise ``loc`` joined to the base URL (remote base) or to the
        directory of the base file (local base).
    """
    # NOTE(review): the constructor builds base_path with
    # str(options.source_uri), so a missing source URI arrives here as the
    # truthy string "None" rather than as None - confirm this is intended.
    if not self.base_path:
        abs_loc = loc
    elif loc.startswith("//"):
        # Protocol-relative URL - default to https
        abs_loc = "https:" + loc
    elif loc.startswith(
        ("http://", "https://", "data:", "file://")
    ) or HTMLDocumentBackend._is_remote_url(loc):
        # Already absolute. The scheme check also covers ftp/s3/gs, which
        # would otherwise be joined to a local base directory and mangled
        # by Path (e.g. "ftp://host/x" -> ".../ftp:/host/x").
        abs_loc = loc
    elif HTMLDocumentBackend._is_remote_url(self.base_path):  # remote fetch
        abs_loc = urljoin(self.base_path, loc)
    else:  # local fetch
        # For local files, resolve relative to the HTML file location
        abs_loc = str(Path(self.base_path).parent / loc)

    _log.debug(f"Resolved location {loc} to {abs_loc}")
    return abs_loc
|
||||
|
||||
@staticmethod
|
||||
def group_cell_elements(
|
||||
group_name: str,
|
||||
@@ -520,7 +557,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
if name == "img":
|
||||
flush_buffer()
|
||||
im_ref3 = self._emit_image(node, doc)
|
||||
added_refs.append(im_ref3)
|
||||
if im_ref3:
|
||||
added_refs.append(im_ref3)
|
||||
elif name in _FORMAT_TAG_MAP:
|
||||
with self._use_format([name]):
|
||||
wk = self._walk(node, doc)
|
||||
@@ -669,8 +707,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
else:
|
||||
if isinstance(this_href, str) and this_href:
|
||||
old_hyperlink = self.hyperlink
|
||||
if self.original_url is not None:
|
||||
this_href = urljoin(str(self.original_url), str(this_href))
|
||||
this_href = self._resolve_relative_path(this_href)
|
||||
# ugly fix for relative links since pydantic does not support them.
|
||||
try:
|
||||
new_hyperlink = AnyUrl(this_href)
|
||||
@@ -837,7 +874,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
for img_tag in tag("img"):
|
||||
if isinstance(img_tag, Tag):
|
||||
im_ref = self._emit_image(img_tag, doc)
|
||||
added_ref.append(im_ref)
|
||||
if im_ref:
|
||||
added_ref.append(im_ref)
|
||||
return added_ref
|
||||
|
||||
def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem:
|
||||
@@ -1003,7 +1041,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
img_tag = tag.find("img")
|
||||
if isinstance(img_tag, Tag):
|
||||
im_ref = self._emit_image(img_tag, doc)
|
||||
added_refs.append(im_ref)
|
||||
if im_ref is not None:
|
||||
added_refs.append(im_ref)
|
||||
|
||||
elif tag_name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
|
||||
heading_refs = self._handle_heading(tag, doc)
|
||||
@@ -1061,7 +1100,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
for img_tag in tag("img"):
|
||||
if isinstance(img_tag, Tag):
|
||||
im_ref2 = self._emit_image(tag, doc)
|
||||
added_refs.append(im_ref2)
|
||||
if im_ref2 is not None:
|
||||
added_refs.append(im_ref2)
|
||||
|
||||
elif tag_name in {"pre"}:
|
||||
# handle monospace code snippets (pre).
|
||||
@@ -1092,10 +1132,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self._walk(tag, doc)
|
||||
return added_refs
|
||||
|
||||
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> RefItem:
|
||||
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> Optional[RefItem]:
|
||||
figure = img_tag.find_parent("figure")
|
||||
caption: AnnotatedTextList = AnnotatedTextList()
|
||||
|
||||
parent = self.parents[self.level]
|
||||
|
||||
# check if the figure has a link - this is HACK:
|
||||
def get_img_hyperlink(img_tag):
|
||||
this_parent = img_tag.parent
|
||||
@@ -1106,9 +1148,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
return None
|
||||
|
||||
if img_hyperlink := get_img_hyperlink(img_tag):
|
||||
caption.append(
|
||||
AnnotatedText(text="Image Hyperlink.", hyperlink=img_hyperlink)
|
||||
)
|
||||
img_text = img_tag.get("alt") or ""
|
||||
caption.append(AnnotatedText(text=img_text, hyperlink=img_hyperlink))
|
||||
|
||||
if isinstance(figure, Tag):
|
||||
caption_tag = figure.find("figcaption", recursive=False)
|
||||
@@ -1135,13 +1176,78 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
hyperlink=caption_anno_text.hyperlink,
|
||||
)
|
||||
|
||||
src_loc: str = self._get_attr_as_string(img_tag, "src")
|
||||
if not cast(HTMLBackendOptions, self.options).fetch_images or not src_loc:
|
||||
# Do not fetch the image, just add a placeholder
|
||||
placeholder: PictureItem = doc.add_picture(
|
||||
caption=caption_item,
|
||||
parent=parent,
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
return placeholder.get_ref()
|
||||
|
||||
src_loc = self._resolve_relative_path(src_loc)
|
||||
img_ref = self._create_image_ref(src_loc)
|
||||
|
||||
docling_pic = doc.add_picture(
|
||||
image=img_ref,
|
||||
caption=caption_item,
|
||||
parent=self.parents[self.level],
|
||||
parent=parent,
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
return docling_pic.get_ref()
|
||||
|
||||
def _create_image_ref(self, src_url: str) -> Optional[ImageRef]:
    """Load the image at ``src_url`` and wrap it in an ``ImageRef``.

    This is best effort: a failure to fetch or decode the image is reported
    with ``warnings.warn`` and does not interrupt the conversion.

    Args:
        src_url: The resolved image location (remote URL, data URI or local
            path).

    Returns:
        The image reference, or None when no image data was loaded or the
        image could not be processed.
    """
    try:
        img_data = self._load_image_data(src_url)
        if img_data:
            img = Image.open(BytesIO(img_data))
            return ImageRef.from_pil(img, dpi=int(img.info.get("dpi", (72,))[0]))
    except (
        # RequestException is the base class of HTTPError and also covers
        # connection errors and timeouts, so an unreachable host only skips
        # this image instead of aborting the whole document.
        requests.RequestException,
        ValidationError,
        UnidentifiedImageError,
        OperationNotAllowed,
        TypeError,
        ValueError,
    ) as e:
        warnings.warn(f"Could not process an image from {src_url}: {e}")

    return None
|
||||
|
||||
def _load_image_data(self, src_loc: str) -> Optional[bytes]:
    """Return the raw bytes of the image located at ``src_loc``.

    Args:
        src_loc: A remote URL, a ``data:`` URI, a ``file://`` URI or a local
            file path.

    Returns:
        The image bytes, or None for locations ending in ``.svg``, which are
        skipped.

    Raises:
        OperationNotAllowed: If the location is remote or local and the
            matching ``enable_remote_fetch`` / ``enable_local_fetch`` option
            is not set.
        requests.RequestException: If the remote request fails, times out or
            returns an error status.
        ValueError: If the local file does not exist or is not readable.
    """
    if src_loc.lower().endswith(".svg"):
        _log.debug(f"Skipping SVG file: {src_loc}")
        return None

    if HTMLDocumentBackend._is_remote_url(src_loc):
        if not self.options.enable_remote_fetch:
            raise OperationNotAllowed(
                "Fetching remote resources is only allowed when set explicitly. "
                "Set options.enable_remote_fetch=True."
            )
        # Without a timeout a stalled server blocks the conversion forever.
        # The context manager releases the streamed connection even when
        # raise_for_status() raises before the body is read.
        with requests.get(src_loc, stream=True, timeout=30) as response:
            response.raise_for_status()
            return response.content
    elif src_loc.startswith("data:"):
        # Strip the data-URI header and decode the base64 payload.
        data = re.sub(r"^data:image/.+;base64,", "", src_loc)
        return base64.b64decode(data)

    if src_loc.startswith("file://"):
        # Drop the "file://" prefix to obtain a plain filesystem path.
        src_loc = src_loc[7:]

    if not self.options.enable_local_fetch:
        raise OperationNotAllowed(
            "Fetching local resources is only allowed when set explicitly. "
            "Set options.enable_local_fetch=True."
        )
    # add check that file exists and can read
    if os.path.isfile(src_loc) and os.access(src_loc, os.R_OK):
        with open(src_loc, "rb") as f:
            return f.read()
    else:
        raise ValueError("File does not exist or it is not readable.")
|
||||
|
||||
@staticmethod
|
||||
def get_text(item: PageElement) -> str:
|
||||
"""Concatenate all child strings of a PageElement.
|
||||
@@ -1238,3 +1344,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
|
||||
return int_spans
|
||||
|
||||
@staticmethod
|
||||
def _get_attr_as_string(tag: Tag, attr: str, default: str = "") -> str:
|
||||
"""Get attribute value as string, handling list values."""
|
||||
value = tag.get(attr)
|
||||
if not value:
|
||||
return default
|
||||
|
||||
return value[0] if isinstance(value, list) else value
|
||||
|
||||
@@ -24,10 +24,16 @@ from docling_core.types.doc import (
|
||||
from docling_core.types.doc.document import Formatting
|
||||
from marko import Markdown
|
||||
from pydantic import AnyUrl, BaseModel, Field, TypeAdapter
|
||||
from typing_extensions import Annotated
|
||||
from typing_extensions import Annotated, override
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.backend.abstract_backend import (
|
||||
DeclarativeDocumentBackend,
|
||||
)
|
||||
from docling.backend.html_backend import HTMLDocumentBackend
|
||||
from docling.datamodel.backend_options import (
|
||||
HTMLBackendOptions,
|
||||
MarkdownBackendOptions,
|
||||
)
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
@@ -88,8 +94,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
return shortened_text
|
||||
|
||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
@override
|
||||
def __init__(
|
||||
self,
|
||||
in_doc: InputDocument,
|
||||
path_or_stream: Union[BytesIO, Path],
|
||||
options: MarkdownBackendOptions = MarkdownBackendOptions(),
|
||||
):
|
||||
super().__init__(in_doc, path_or_stream, options)
|
||||
|
||||
_log.debug("Starting MarkdownDocumentBackend...")
|
||||
|
||||
@@ -580,9 +592,12 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
format=InputFormat.HTML,
|
||||
backend=html_backend_cls,
|
||||
filename=self.file.name,
|
||||
backend_options=self.options,
|
||||
)
|
||||
html_backend_obj = html_backend_cls(
|
||||
in_doc=in_doc, path_or_stream=stream
|
||||
in_doc=in_doc,
|
||||
path_or_stream=stream,
|
||||
options=cast(HTMLBackendOptions, self.options),
|
||||
)
|
||||
doc = html_backend_obj.convert()
|
||||
else:
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Set, Union
|
||||
from typing import Union
|
||||
|
||||
from docling_core.types.doc import (
|
||||
BoundingBox,
|
||||
@@ -80,7 +80,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
self.path_or_stream = None
|
||||
|
||||
@classmethod
|
||||
def supported_formats(cls) -> Set[InputFormat]:
|
||||
def supported_formats(cls) -> set[InputFormat]:
|
||||
return {InputFormat.PPTX}
|
||||
|
||||
def convert(self) -> DoclingDocument:
|
||||
|
||||
@@ -3,7 +3,7 @@ import re
|
||||
from copy import deepcopy
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, List, Optional, Union
|
||||
from typing import Any, Callable, Optional, Union
|
||||
|
||||
from docling_core.types.doc import (
|
||||
DocItemLabel,
|
||||
@@ -69,7 +69,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.numbered_headers: dict[int, int] = {}
|
||||
self.equation_bookends: str = "<eq>{EQ}</eq>"
|
||||
# Track processed textbox elements to avoid duplication
|
||||
self.processed_textbox_elements: List[int] = []
|
||||
self.processed_textbox_elements: list[int] = []
|
||||
self.docx_to_pdf_converter: Optional[Callable] = None
|
||||
self.docx_to_pdf_converter_init = False
|
||||
self.display_drawingml_warning = True
|
||||
@@ -726,8 +726,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
textbox_elements: list,
|
||||
docx_obj: DocxDocument,
|
||||
doc: DoclingDocument,
|
||||
) -> List[RefItem]:
|
||||
elem_ref: List[RefItem] = []
|
||||
) -> list[RefItem]:
|
||||
elem_ref: list[RefItem] = []
|
||||
"""Process textbox content and add it to the document structure."""
|
||||
level = self._get_level()
|
||||
# Create a textbox group to contain all text from the textbox
|
||||
@@ -856,8 +856,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
element: BaseOxmlElement,
|
||||
docx_obj: DocxDocument,
|
||||
doc: DoclingDocument,
|
||||
) -> List[RefItem]:
|
||||
elem_ref: List[RefItem] = []
|
||||
) -> list[RefItem]:
|
||||
elem_ref: list[RefItem] = []
|
||||
paragraph = Paragraph(element, docx_obj)
|
||||
paragraph_elements = self._get_paragraph_elements(paragraph)
|
||||
text, equations = self._handle_equations_in_text(
|
||||
@@ -1032,8 +1032,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
curr_level: Optional[int],
|
||||
text: str,
|
||||
is_numbered_style: bool = False,
|
||||
) -> List[RefItem]:
|
||||
elem_ref: List[RefItem] = []
|
||||
) -> list[RefItem]:
|
||||
elem_ref: list[RefItem] = []
|
||||
level = self._get_level()
|
||||
if isinstance(curr_level, int):
|
||||
if curr_level > level:
|
||||
@@ -1102,8 +1102,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
marker: str,
|
||||
enumerated: bool,
|
||||
level: int,
|
||||
) -> List[RefItem]:
|
||||
elem_ref: List[RefItem] = []
|
||||
) -> list[RefItem]:
|
||||
elem_ref: list[RefItem] = []
|
||||
# This should not happen by construction
|
||||
if not isinstance(self.parents[level], ListGroup):
|
||||
return elem_ref
|
||||
@@ -1148,8 +1148,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
ilevel: int,
|
||||
elements: list,
|
||||
is_numbered: bool = False,
|
||||
) -> List[RefItem]:
|
||||
elem_ref: List[RefItem] = []
|
||||
) -> list[RefItem]:
|
||||
elem_ref: list[RefItem] = []
|
||||
# this method is always called with is_numbered. Numbered lists should be properly addressed.
|
||||
if not elements:
|
||||
return elem_ref
|
||||
@@ -1244,8 +1244,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
element: BaseOxmlElement,
|
||||
docx_obj: DocxDocument,
|
||||
doc: DoclingDocument,
|
||||
) -> List[RefItem]:
|
||||
elem_ref: List[RefItem] = []
|
||||
) -> list[RefItem]:
|
||||
elem_ref: list[RefItem] = []
|
||||
table: Table = Table(element, docx_obj)
|
||||
num_rows = len(table.rows)
|
||||
num_cols = len(table.columns)
|
||||
@@ -1299,13 +1299,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
else:
|
||||
text = text.replace("<eq>", "$").replace("</eq>", "$")
|
||||
|
||||
provs_in_cell: List[RefItem] = []
|
||||
provs_in_cell: list[RefItem] = []
|
||||
_, provs_in_cell = self._walk_linear(cell._element, docx_obj, doc)
|
||||
ref_for_rich_cell = provs_in_cell[0]
|
||||
rich_table_cell = False
|
||||
|
||||
def group_cell_elements(
|
||||
group_name: str, doc: DoclingDocument, provs_in_cell: List[RefItem]
|
||||
group_name: str, doc: DoclingDocument, provs_in_cell: list[RefItem]
|
||||
) -> RefItem:
|
||||
group_element = doc.add_group(
|
||||
label=GroupLabel.UNSPECIFIED,
|
||||
@@ -1379,7 +1379,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
def _handle_pictures(
|
||||
self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
|
||||
) -> List[RefItem]:
|
||||
) -> list[RefItem]:
|
||||
def get_docx_image(drawing_blip: Any) -> Optional[bytes]:
|
||||
image_data: Optional[bytes] = None
|
||||
rId = drawing_blip[0].get(
|
||||
@@ -1391,7 +1391,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
image_data = image_part.blob # Get the binary image data
|
||||
return image_data
|
||||
|
||||
elem_ref: List[RefItem] = []
|
||||
elem_ref: list[RefItem] = []
|
||||
level = self._get_level()
|
||||
# Open the BytesIO object with PIL to create an Image
|
||||
image_data: Optional[bytes] = get_docx_image(drawing_blip)
|
||||
|
||||
53
docling/datamodel/backend_options.py
Normal file
53
docling/datamodel/backend_options.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from pathlib import PurePath
|
||||
from typing import Annotated, Literal, Optional, Union
|
||||
|
||||
from pydantic import AnyUrl, BaseModel, Field
|
||||
|
||||
|
||||
class BaseBackendOptions(BaseModel):
    """Common options for all declarative document backends."""

    # Both switches are off by default; fetching must be enabled explicitly.
    enable_remote_fetch: bool = Field(
        default=False,
        description="Enable remote resource fetching.",
    )
    enable_local_fetch: bool = Field(
        default=False,
        description="Enable local resource fetching.",
    )
|
||||
|
||||
|
||||
class DeclarativeBackendOptions(BaseBackendOptions):
    """Default backend options for a declarative document backend."""

    # Tag used as the discriminator of the BackendOptions union; it is
    # excluded from serialization and from the repr.
    kind: Literal["declarative"] = Field(
        default="declarative",
        exclude=True,
        repr=False,
    )
|
||||
|
||||
|
||||
class HTMLBackendOptions(BaseBackendOptions):
    """Options specific to the HTML backend.

    This class can be extended to include options specific to HTML processing.
    """

    # Tag used as the discriminator of the BackendOptions union; it is
    # excluded from serialization and from the repr.
    kind: Literal["html"] = Field(
        default="html",
        exclude=True,
        repr=False,
    )
    # Image fetching is off by default.
    fetch_images: bool = Field(
        default=False,
        description=(
            "Whether the backend should access remote or local resources to parse "
            "images in an HTML document."
        ),
    )
    # Either a URL or a filesystem path; unset by default.
    source_uri: Optional[Union[AnyUrl, PurePath]] = Field(
        default=None,
        description=(
            "The URI that originates the HTML document. If provided, the backend "
            "will use it to resolve relative paths in the HTML document."
        ),
    )
|
||||
|
||||
|
||||
class MarkdownBackendOptions(HTMLBackendOptions):
|
||||
"""Options specific to the Markdown backend."""
|
||||
|
||||
|
||||
BackendOptions = Annotated[
|
||||
Union[DeclarativeBackendOptions, HTMLBackendOptions], Field(discriminator="kind")
|
||||
]
|
||||
@@ -8,14 +8,12 @@ from io import BytesIO
|
||||
from pathlib import Path, PurePath
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Dict,
|
||||
List,
|
||||
Annotated,
|
||||
Literal,
|
||||
Optional,
|
||||
Set,
|
||||
Type,
|
||||
Union,
|
||||
cast,
|
||||
)
|
||||
|
||||
import filetype
|
||||
@@ -54,8 +52,10 @@ from typing_extensions import deprecated
|
||||
|
||||
from docling.backend.abstract_backend import (
|
||||
AbstractDocumentBackend,
|
||||
DeclarativeDocumentBackend,
|
||||
PaginatedDocumentBackend,
|
||||
)
|
||||
from docling.datamodel.backend_options import BackendOptions
|
||||
from docling.datamodel.base_models import (
|
||||
AssembledUnit,
|
||||
ConfidenceReport,
|
||||
@@ -74,6 +74,7 @@ from docling.utils.utils import create_file_hash
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from docling.datamodel.base_models import BaseFormatOption
|
||||
from docling.document_converter import FormatOption
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@@ -102,32 +103,58 @@ _EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")
|
||||
|
||||
|
||||
class InputDocument(BaseModel):
|
||||
file: PurePath
|
||||
document_hash: str # = None
|
||||
valid: bool = True
|
||||
limits: DocumentLimits = DocumentLimits()
|
||||
format: InputFormat # = None
|
||||
"""A document as an input of a Docling conversion."""
|
||||
|
||||
filesize: Optional[int] = None
|
||||
page_count: int = 0
|
||||
file: Annotated[
|
||||
PurePath, Field(description="A path representation the input document.")
|
||||
]
|
||||
document_hash: Annotated[
|
||||
str,
|
||||
Field(description="A stable hash of the path or stream of the input document."),
|
||||
]
|
||||
valid: bool = Field(True, description="Whether this is is a valid input document.")
|
||||
backend_options: Optional[BackendOptions] = Field(
|
||||
None, description="Custom options for declarative backends."
|
||||
)
|
||||
limits: DocumentLimits = Field(
|
||||
DocumentLimits(), description="Limits in the input document for the conversion."
|
||||
)
|
||||
format: Annotated[InputFormat, Field(description="The document format.")]
|
||||
|
||||
_backend: AbstractDocumentBackend # Internal PDF backend used
|
||||
filesize: Optional[int] = Field(
|
||||
None, description="Size of the input file, in bytes."
|
||||
)
|
||||
page_count: int = Field(0, description="Number of pages in the input document.")
|
||||
|
||||
_backend: AbstractDocumentBackend
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path_or_stream: Union[BytesIO, Path],
|
||||
format: InputFormat,
|
||||
backend: Type[AbstractDocumentBackend],
|
||||
backend_options: Optional[BackendOptions] = None,
|
||||
filename: Optional[str] = None,
|
||||
limits: Optional[DocumentLimits] = None,
|
||||
):
|
||||
) -> None:
|
||||
super().__init__(
|
||||
file="", document_hash="", format=InputFormat.PDF
|
||||
file="",
|
||||
document_hash="",
|
||||
format=InputFormat.PDF,
|
||||
backend_options=backend_options,
|
||||
) # initialize with dummy values
|
||||
|
||||
self.limits = limits or DocumentLimits()
|
||||
self.format = format
|
||||
|
||||
# check for backend incompatibilities
|
||||
if issubclass(backend, DeclarativeDocumentBackend) and backend_options:
|
||||
if not issubclass(
|
||||
type(backend_options), type(backend.get_default_options())
|
||||
):
|
||||
raise ValueError(
|
||||
"Incompatible types between backend and backend_options arguments."
|
||||
)
|
||||
|
||||
try:
|
||||
if isinstance(path_or_stream, Path):
|
||||
self.file = path_or_stream
|
||||
@@ -140,7 +167,8 @@ class InputDocument(BaseModel):
|
||||
|
||||
elif isinstance(path_or_stream, BytesIO):
|
||||
assert filename is not None, (
|
||||
"Can't construct InputDocument from stream without providing filename arg."
|
||||
"Can't construct InputDocument from stream without providing "
|
||||
"filename arg."
|
||||
)
|
||||
self.file = PurePath(filename)
|
||||
self.filesize = path_or_stream.getbuffer().nbytes
|
||||
@@ -175,7 +203,8 @@ class InputDocument(BaseModel):
|
||||
except RuntimeError as e:
|
||||
self.valid = False
|
||||
_log.exception(
|
||||
f"An unexpected error occurred while opening the document {self.file.name}",
|
||||
"An unexpected error occurred while opening the document "
|
||||
"f{self.file.name}",
|
||||
exc_info=e,
|
||||
)
|
||||
# raise
|
||||
@@ -185,7 +214,15 @@ class InputDocument(BaseModel):
|
||||
backend: Type[AbstractDocumentBackend],
|
||||
path_or_stream: Union[BytesIO, Path],
|
||||
) -> None:
|
||||
self._backend = backend(self, path_or_stream=path_or_stream)
|
||||
if issubclass(backend, DeclarativeDocumentBackend) and self.backend_options:
|
||||
self._backend = backend(
|
||||
self,
|
||||
path_or_stream=path_or_stream,
|
||||
options=self.backend_options,
|
||||
)
|
||||
else:
|
||||
self._backend = backend(self, path_or_stream=path_or_stream)
|
||||
|
||||
if not self._backend.is_valid():
|
||||
self.valid = False
|
||||
|
||||
@@ -199,11 +236,11 @@ class ConversionResult(BaseModel):
|
||||
input: InputDocument
|
||||
|
||||
status: ConversionStatus = ConversionStatus.PENDING # failure, success
|
||||
errors: List[ErrorItem] = [] # structure to keep errors
|
||||
errors: list[ErrorItem] = [] # structure to keep errors
|
||||
|
||||
pages: List[Page] = []
|
||||
pages: list[Page] = []
|
||||
assembled: AssembledUnit = AssembledUnit()
|
||||
timings: Dict[str, ProfilingItem] = {}
|
||||
timings: dict[str, ProfilingItem] = {}
|
||||
confidence: ConfidenceReport = Field(default_factory=ConfidenceReport)
|
||||
|
||||
document: DoclingDocument = _EMPTY_DOCLING_DOC
|
||||
@@ -222,7 +259,7 @@ class _DummyBackend(AbstractDocumentBackend):
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def supported_formats(cls) -> Set[InputFormat]:
|
||||
def supported_formats(cls) -> set[InputFormat]:
|
||||
return set()
|
||||
|
||||
@classmethod
|
||||
@@ -235,7 +272,7 @@ class _DummyBackend(AbstractDocumentBackend):
|
||||
|
||||
class _DocumentConversionInput(BaseModel):
|
||||
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
|
||||
headers: Optional[Dict[str, str]] = None
|
||||
headers: Optional[dict[str, str]] = None
|
||||
limits: Optional[DocumentLimits] = DocumentLimits()
|
||||
|
||||
def docs(
|
||||
@@ -250,33 +287,36 @@ class _DocumentConversionInput(BaseModel):
|
||||
)
|
||||
format = self._guess_format(obj)
|
||||
backend: Type[AbstractDocumentBackend]
|
||||
if format not in format_options.keys():
|
||||
backend_options: Optional[BackendOptions] = None
|
||||
if not format or format not in format_options:
|
||||
_log.error(
|
||||
f"Input document {obj.name} with format {format} does not match any allowed format: ({format_options.keys()})"
|
||||
f"Input document {obj.name} with format {format} does not match "
|
||||
f"any allowed format: ({format_options.keys()})"
|
||||
)
|
||||
backend = _DummyBackend
|
||||
else:
|
||||
backend = format_options[format].backend
|
||||
options = format_options[format]
|
||||
backend = options.backend
|
||||
if "backend_options" in options.model_fields_set:
|
||||
backend_options = cast("FormatOption", options).backend_options
|
||||
|
||||
path_or_stream: Union[BytesIO, Path]
|
||||
if isinstance(obj, Path):
|
||||
yield InputDocument(
|
||||
path_or_stream=obj,
|
||||
format=format, # type: ignore[arg-type]
|
||||
filename=obj.name,
|
||||
limits=self.limits,
|
||||
backend=backend,
|
||||
)
|
||||
path_or_stream = obj
|
||||
elif isinstance(obj, DocumentStream):
|
||||
yield InputDocument(
|
||||
path_or_stream=obj.stream,
|
||||
format=format, # type: ignore[arg-type]
|
||||
filename=obj.name,
|
||||
limits=self.limits,
|
||||
backend=backend,
|
||||
)
|
||||
path_or_stream = obj.stream
|
||||
else:
|
||||
raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
|
||||
|
||||
yield InputDocument(
|
||||
path_or_stream=path_or_stream,
|
||||
format=format, # type: ignore[arg-type]
|
||||
filename=obj.name,
|
||||
limits=self.limits,
|
||||
backend=backend,
|
||||
backend_options=backend_options,
|
||||
)
|
||||
|
||||
def _guess_format(self, obj: Union[Path, DocumentStream]) -> Optional[InputFormat]:
|
||||
content = b"" # empty binary blob
|
||||
formats: list[InputFormat] = []
|
||||
@@ -290,12 +330,13 @@ class _DocumentConversionInput(BaseModel):
|
||||
with obj.open("rb") as f:
|
||||
content = f.read(1024) # Read first 1KB
|
||||
if mime is not None and mime.lower() == "application/zip":
|
||||
mime_root = "application/vnd.openxmlformats-officedocument"
|
||||
if obj.suffixes[-1].lower() == ".xlsx":
|
||||
mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||
mime = mime_root + ".spreadsheetml.sheet"
|
||||
elif obj.suffixes[-1].lower() == ".docx":
|
||||
mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
mime = mime_root + ".wordprocessingml.document"
|
||||
elif obj.suffixes[-1].lower() == ".pptx":
|
||||
mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
||||
mime = mime_root + ".presentationml.presentation"
|
||||
|
||||
elif isinstance(obj, DocumentStream):
|
||||
content = obj.stream.read(8192)
|
||||
@@ -310,12 +351,13 @@ class _DocumentConversionInput(BaseModel):
|
||||
mime = _DocumentConversionInput._mime_from_extension(ext.lower())
|
||||
if mime is not None and mime.lower() == "application/zip":
|
||||
objname = obj.name.lower()
|
||||
mime_root = "application/vnd.openxmlformats-officedocument"
|
||||
if objname.endswith(".xlsx"):
|
||||
mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||
mime = mime_root + ".spreadsheetml.sheet"
|
||||
elif objname.endswith(".docx"):
|
||||
mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
mime = mime_root + ".wordprocessingml.document"
|
||||
elif objname.endswith(".pptx"):
|
||||
mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
||||
mime = mime_root + ".presentationml.presentation"
|
||||
|
||||
if mime is not None and mime.lower() == "application/gzip":
|
||||
if detected_mime := _DocumentConversionInput._detect_mets_gbs(obj):
|
||||
|
||||
@@ -9,11 +9,14 @@ from datetime import datetime
|
||||
from functools import partial
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple, Type, Union
|
||||
from typing import Optional, Type, Union
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
||||
from pydantic import ConfigDict, model_validator, validate_call
|
||||
from typing_extensions import Self
|
||||
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.abstract_backend import (
|
||||
AbstractDocumentBackend,
|
||||
)
|
||||
from docling.backend.asciidoc_backend import AsciiDocBackend
|
||||
from docling.backend.csv_backend import CsvDocumentBackend
|
||||
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
||||
@@ -28,6 +31,7 @@ from docling.backend.noop_backend import NoOpBackend
|
||||
from docling.backend.webvtt_backend import WebVTTDocumentBackend
|
||||
from docling.backend.xml.jats_backend import JatsDocumentBackend
|
||||
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
|
||||
from docling.datamodel.backend_options import BackendOptions, HTMLBackendOptions
|
||||
from docling.datamodel.base_models import (
|
||||
BaseFormatOption,
|
||||
ConversionStatus,
|
||||
@@ -61,11 +65,13 @@ _PIPELINE_CACHE_LOCK = threading.Lock()
|
||||
|
||||
class FormatOption(BaseFormatOption):
|
||||
pipeline_cls: Type[BasePipeline]
|
||||
backend_options: Optional[BackendOptions] = None
|
||||
|
||||
@model_validator(mode="after")
|
||||
def set_optional_field_default(self) -> "FormatOption":
|
||||
def set_optional_field_default(self) -> Self:
|
||||
if self.pipeline_options is None:
|
||||
self.pipeline_options = self.pipeline_cls.get_default_options()
|
||||
|
||||
return self
|
||||
|
||||
|
||||
@@ -92,6 +98,7 @@ class PowerpointFormatOption(FormatOption):
|
||||
class MarkdownFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
|
||||
backend_options: HTMLBackendOptions = HTMLBackendOptions()
|
||||
|
||||
|
||||
class AsciiDocFormatOption(FormatOption):
|
||||
@@ -102,6 +109,7 @@ class AsciiDocFormatOption(FormatOption):
|
||||
class HTMLFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
|
||||
backend_options: HTMLBackendOptions = HTMLBackendOptions()
|
||||
|
||||
|
||||
class PatentUsptoFormatOption(FormatOption):
|
||||
@@ -150,7 +158,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
|
||||
pipeline_cls=SimplePipeline, backend=AsciiDocBackend
|
||||
),
|
||||
InputFormat.HTML: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
|
||||
pipeline_cls=SimplePipeline,
|
||||
backend=HTMLDocumentBackend,
|
||||
backend_options=HTMLBackendOptions(),
|
||||
),
|
||||
InputFormat.XML_USPTO: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
|
||||
@@ -186,13 +196,13 @@ class DocumentConverter:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
allowed_formats: Optional[List[InputFormat]] = None,
|
||||
format_options: Optional[Dict[InputFormat, FormatOption]] = None,
|
||||
allowed_formats: Optional[list[InputFormat]] = None,
|
||||
format_options: Optional[dict[InputFormat, FormatOption]] = None,
|
||||
):
|
||||
self.allowed_formats = (
|
||||
allowed_formats if allowed_formats is not None else list(InputFormat)
|
||||
)
|
||||
self.format_to_options: Dict[InputFormat, FormatOption] = {
|
||||
self.format_to_options: dict[InputFormat, FormatOption] = {
|
||||
format: (
|
||||
_get_default_option(format=format)
|
||||
if (custom_option := (format_options or {}).get(format)) is None
|
||||
@@ -200,8 +210,8 @@ class DocumentConverter:
|
||||
)
|
||||
for format in self.allowed_formats
|
||||
}
|
||||
self.initialized_pipelines: Dict[
|
||||
Tuple[Type[BasePipeline], str], BasePipeline
|
||||
self.initialized_pipelines: dict[
|
||||
tuple[Type[BasePipeline], str], BasePipeline
|
||||
] = {}
|
||||
|
||||
def _get_initialized_pipelines(
|
||||
@@ -228,7 +238,7 @@ class DocumentConverter:
|
||||
def convert(
|
||||
self,
|
||||
source: Union[Path, str, DocumentStream], # TODO review naming
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
headers: Optional[dict[str, str]] = None,
|
||||
raises_on_error: bool = True,
|
||||
max_num_pages: int = sys.maxsize,
|
||||
max_file_size: int = sys.maxsize,
|
||||
@@ -248,7 +258,7 @@ class DocumentConverter:
|
||||
def convert_all(
|
||||
self,
|
||||
source: Iterable[Union[Path, str, DocumentStream]], # TODO review naming
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
headers: Optional[dict[str, str]] = None,
|
||||
raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
|
||||
max_num_pages: int = sys.maxsize,
|
||||
max_file_size: int = sys.maxsize,
|
||||
|
||||
@@ -8,9 +8,10 @@ from collections.abc import Iterable, Iterator
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from functools import partial
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple, Type, Union
|
||||
from typing import Optional, Type, Union
|
||||
|
||||
from pydantic import ConfigDict, model_validator, validate_call
|
||||
from typing_extensions import Self
|
||||
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
@@ -56,7 +57,7 @@ class ExtractionFormatOption(BaseFormatOption):
|
||||
pipeline_cls: Type[BaseExtractionPipeline]
|
||||
|
||||
@model_validator(mode="after")
|
||||
def set_optional_field_default(self) -> "ExtractionFormatOption":
|
||||
def set_optional_field_default(self) -> Self:
|
||||
if self.pipeline_options is None:
|
||||
# `get_default_options` comes from BaseExtractionPipeline
|
||||
self.pipeline_options = self.pipeline_cls.get_default_options() # type: ignore[assignment]
|
||||
@@ -70,7 +71,7 @@ def _get_default_extraction_option(fmt: InputFormat) -> ExtractionFormatOption:
|
||||
the VLM extractor. This duplication will be removed when we deduplicate
|
||||
the format registry between convert/extract.
|
||||
"""
|
||||
format_to_default_backend: Dict[InputFormat, Type[AbstractDocumentBackend]] = {
|
||||
format_to_default_backend: dict[InputFormat, Type[AbstractDocumentBackend]] = {
|
||||
InputFormat.IMAGE: PyPdfiumDocumentBackend,
|
||||
InputFormat.PDF: PyPdfiumDocumentBackend,
|
||||
}
|
||||
@@ -98,24 +99,24 @@ class DocumentExtractor:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
allowed_formats: Optional[List[InputFormat]] = None,
|
||||
allowed_formats: Optional[list[InputFormat]] = None,
|
||||
extraction_format_options: Optional[
|
||||
Dict[InputFormat, ExtractionFormatOption]
|
||||
dict[InputFormat, ExtractionFormatOption]
|
||||
] = None,
|
||||
) -> None:
|
||||
self.allowed_formats: List[InputFormat] = (
|
||||
self.allowed_formats: list[InputFormat] = (
|
||||
allowed_formats if allowed_formats is not None else list(InputFormat)
|
||||
)
|
||||
# Build per-format options with defaults, then apply any user overrides
|
||||
overrides = extraction_format_options or {}
|
||||
self.extraction_format_to_options: Dict[InputFormat, ExtractionFormatOption] = {
|
||||
self.extraction_format_to_options: dict[InputFormat, ExtractionFormatOption] = {
|
||||
fmt: overrides.get(fmt, _get_default_extraction_option(fmt))
|
||||
for fmt in self.allowed_formats
|
||||
}
|
||||
|
||||
# Cache pipelines by (class, options-hash)
|
||||
self._initialized_pipelines: Dict[
|
||||
Tuple[Type[BaseExtractionPipeline], str], BaseExtractionPipeline
|
||||
self._initialized_pipelines: dict[
|
||||
tuple[Type[BaseExtractionPipeline], str], BaseExtractionPipeline
|
||||
] = {}
|
||||
|
||||
# ---------------------------- Public API ---------------------------------
|
||||
@@ -125,7 +126,7 @@ class DocumentExtractor:
|
||||
self,
|
||||
source: Union[Path, str, DocumentStream],
|
||||
template: ExtractionTemplateType,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
headers: Optional[dict[str, str]] = None,
|
||||
raises_on_error: bool = True,
|
||||
max_num_pages: int = sys.maxsize,
|
||||
max_file_size: int = sys.maxsize,
|
||||
@@ -147,7 +148,7 @@ class DocumentExtractor:
|
||||
self,
|
||||
source: Iterable[Union[Path, str, DocumentStream]],
|
||||
template: ExtractionTemplateType,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
headers: Optional[dict[str, str]] = None,
|
||||
raises_on_error: bool = True,
|
||||
max_num_pages: int = sys.maxsize,
|
||||
max_file_size: int = sys.maxsize,
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
|
||||
from docling_core.types.doc import (
|
||||
DocItemLabel,
|
||||
@@ -48,8 +47,8 @@ class ReadingOrderModel:
|
||||
|
||||
def _assembled_to_readingorder_elements(
|
||||
self, conv_res: ConversionResult
|
||||
) -> List[ReadingOrderPageElement]:
|
||||
elements: List[ReadingOrderPageElement] = []
|
||||
) -> list[ReadingOrderPageElement]:
|
||||
elements: list[ReadingOrderPageElement] = []
|
||||
page_no_to_pages = {p.page_no: p for p in conv_res.pages}
|
||||
|
||||
for element in conv_res.assembled.elements:
|
||||
@@ -123,10 +122,10 @@ class ReadingOrderModel:
|
||||
def _readingorder_elements_to_docling_doc(
|
||||
self,
|
||||
conv_res: ConversionResult,
|
||||
ro_elements: List[ReadingOrderPageElement],
|
||||
el_to_captions_mapping: Dict[int, List[int]],
|
||||
el_to_footnotes_mapping: Dict[int, List[int]],
|
||||
el_merges_mapping: Dict[int, List[int]],
|
||||
ro_elements: list[ReadingOrderPageElement],
|
||||
el_to_captions_mapping: dict[int, list[int]],
|
||||
el_to_footnotes_mapping: dict[int, list[int]],
|
||||
el_merges_mapping: dict[int, list[int]],
|
||||
) -> DoclingDocument:
|
||||
id_to_elem = {
|
||||
RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem
|
||||
|
||||
@@ -2,7 +2,7 @@ import base64
|
||||
import json
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from typing import Dict, List, Optional
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
from PIL import Image
|
||||
@@ -19,7 +19,7 @@ def api_image_request(
|
||||
prompt: str,
|
||||
url: AnyUrl,
|
||||
timeout: float = 20,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
headers: Optional[dict[str, str]] = None,
|
||||
**params,
|
||||
) -> str:
|
||||
img_io = BytesIO()
|
||||
@@ -69,8 +69,8 @@ def api_image_request_streaming(
|
||||
url: AnyUrl,
|
||||
*,
|
||||
timeout: float = 20,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
generation_stoppers: List[GenerationStopper] = [],
|
||||
headers: Optional[dict[str, str]] = None,
|
||||
generation_stoppers: list[GenerationStopper] = [],
|
||||
**params,
|
||||
) -> str:
|
||||
"""
|
||||
|
||||
@@ -2,7 +2,6 @@ import bisect
|
||||
import logging
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from typing import Dict, List, Set, Tuple
|
||||
|
||||
from docling_core.types.doc import DocItemLabel, Size
|
||||
from docling_core.types.doc.page import TextCell
|
||||
@@ -39,7 +38,7 @@ class UnionFind:
|
||||
self.parent[root_y] = root_x
|
||||
self.rank[root_x] += 1
|
||||
|
||||
def get_groups(self) -> Dict[int, List[int]]:
|
||||
def get_groups(self) -> dict[int, list[int]]:
|
||||
"""Returns groups as {root: [elements]}."""
|
||||
groups = defaultdict(list)
|
||||
for elem in self.parent:
|
||||
@@ -50,13 +49,13 @@ class UnionFind:
|
||||
class SpatialClusterIndex:
|
||||
"""Efficient spatial indexing for clusters using R-tree and interval trees."""
|
||||
|
||||
def __init__(self, clusters: List[Cluster]):
|
||||
def __init__(self, clusters: list[Cluster]):
|
||||
p = index.Property()
|
||||
p.dimension = 2
|
||||
self.spatial_index = index.Index(properties=p)
|
||||
self.x_intervals = IntervalTree()
|
||||
self.y_intervals = IntervalTree()
|
||||
self.clusters_by_id: Dict[int, Cluster] = {}
|
||||
self.clusters_by_id: dict[int, Cluster] = {}
|
||||
|
||||
for cluster in clusters:
|
||||
self.add_cluster(cluster)
|
||||
@@ -72,7 +71,7 @@ class SpatialClusterIndex:
|
||||
self.spatial_index.delete(cluster.id, cluster.bbox.as_tuple())
|
||||
del self.clusters_by_id[cluster.id]
|
||||
|
||||
def find_candidates(self, bbox: BoundingBox) -> Set[int]:
|
||||
def find_candidates(self, bbox: BoundingBox) -> set[int]:
|
||||
"""Find potential overlapping cluster IDs using all indexes."""
|
||||
spatial = set(self.spatial_index.intersection(bbox.as_tuple()))
|
||||
x_candidates = self.x_intervals.find_containing(
|
||||
@@ -123,13 +122,13 @@ class IntervalTree:
|
||||
"""Memory-efficient interval tree for 1D overlap queries."""
|
||||
|
||||
def __init__(self):
|
||||
self.intervals: List[Interval] = [] # Sorted by min_val
|
||||
self.intervals: list[Interval] = [] # Sorted by min_val
|
||||
|
||||
def insert(self, min_val: float, max_val: float, id: int):
|
||||
interval = Interval(min_val, max_val, id)
|
||||
bisect.insort(self.intervals, interval)
|
||||
|
||||
def find_containing(self, point: float) -> Set[int]:
|
||||
def find_containing(self, point: float) -> set[int]:
|
||||
"""Find all intervals containing the point."""
|
||||
pos = bisect.bisect_left(self.intervals, point)
|
||||
result = set()
|
||||
@@ -196,7 +195,7 @@ class LayoutPostprocessor:
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self, page: Page, clusters: List[Cluster], options: LayoutOptions
|
||||
self, page: Page, clusters: list[Cluster], options: LayoutOptions
|
||||
) -> None:
|
||||
"""Initialize processor with page and clusters."""
|
||||
|
||||
@@ -219,7 +218,7 @@ class LayoutPostprocessor:
|
||||
[c for c in self.special_clusters if c.label in self.WRAPPER_TYPES]
|
||||
)
|
||||
|
||||
def postprocess(self) -> Tuple[List[Cluster], List[TextCell]]:
|
||||
def postprocess(self) -> tuple[list[Cluster], list[TextCell]]:
|
||||
"""Main processing pipeline."""
|
||||
self.regular_clusters = self._process_regular_clusters()
|
||||
self.special_clusters = self._process_special_clusters()
|
||||
@@ -254,7 +253,7 @@ class LayoutPostprocessor:
|
||||
|
||||
return final_clusters, self.cells
|
||||
|
||||
def _process_regular_clusters(self) -> List[Cluster]:
|
||||
def _process_regular_clusters(self) -> list[Cluster]:
|
||||
"""Process regular clusters with iterative refinement."""
|
||||
clusters = [
|
||||
c
|
||||
@@ -311,7 +310,7 @@ class LayoutPostprocessor:
|
||||
|
||||
return clusters
|
||||
|
||||
def _process_special_clusters(self) -> List[Cluster]:
|
||||
def _process_special_clusters(self) -> list[Cluster]:
|
||||
special_clusters = [
|
||||
c
|
||||
for c in self.special_clusters
|
||||
@@ -381,7 +380,7 @@ class LayoutPostprocessor:
|
||||
|
||||
return picture_clusters + wrapper_clusters
|
||||
|
||||
def _handle_cross_type_overlaps(self, special_clusters) -> List[Cluster]:
|
||||
def _handle_cross_type_overlaps(self, special_clusters) -> list[Cluster]:
|
||||
"""Handle overlaps between regular and wrapper clusters before child assignment.
|
||||
|
||||
In particular, KEY_VALUE_REGION proposals that are almost identical to a TABLE
|
||||
@@ -454,7 +453,7 @@ class LayoutPostprocessor:
|
||||
|
||||
def _select_best_cluster_from_group(
|
||||
self,
|
||||
group_clusters: List[Cluster],
|
||||
group_clusters: list[Cluster],
|
||||
params: dict,
|
||||
) -> Cluster:
|
||||
"""Select best cluster from a group of overlapping clusters based on all rules."""
|
||||
@@ -487,11 +486,11 @@ class LayoutPostprocessor:
|
||||
|
||||
def _remove_overlapping_clusters(
|
||||
self,
|
||||
clusters: List[Cluster],
|
||||
clusters: list[Cluster],
|
||||
cluster_type: str,
|
||||
overlap_threshold: float = 0.8,
|
||||
containment_threshold: float = 0.8,
|
||||
) -> List[Cluster]:
|
||||
) -> list[Cluster]:
|
||||
if not clusters:
|
||||
return []
|
||||
|
||||
@@ -544,7 +543,7 @@ class LayoutPostprocessor:
|
||||
|
||||
def _select_best_cluster(
|
||||
self,
|
||||
clusters: List[Cluster],
|
||||
clusters: list[Cluster],
|
||||
area_threshold: float,
|
||||
conf_threshold: float,
|
||||
) -> Cluster:
|
||||
@@ -572,7 +571,7 @@ class LayoutPostprocessor:
|
||||
|
||||
return current_best if current_best else clusters[0]
|
||||
|
||||
def _deduplicate_cells(self, cells: List[TextCell]) -> List[TextCell]:
|
||||
def _deduplicate_cells(self, cells: list[TextCell]) -> list[TextCell]:
|
||||
"""Ensure each cell appears only once, maintaining order of first appearance."""
|
||||
seen_ids = set()
|
||||
unique_cells = []
|
||||
@@ -583,8 +582,8 @@ class LayoutPostprocessor:
|
||||
return unique_cells
|
||||
|
||||
def _assign_cells_to_clusters(
|
||||
self, clusters: List[Cluster], min_overlap: float = 0.2
|
||||
) -> List[Cluster]:
|
||||
self, clusters: list[Cluster], min_overlap: float = 0.2
|
||||
) -> list[Cluster]:
|
||||
"""Assign cells to best overlapping cluster."""
|
||||
for cluster in clusters:
|
||||
cluster.cells = []
|
||||
@@ -616,7 +615,7 @@ class LayoutPostprocessor:
|
||||
|
||||
return clusters
|
||||
|
||||
def _find_unassigned_cells(self, clusters: List[Cluster]) -> List[TextCell]:
|
||||
def _find_unassigned_cells(self, clusters: list[Cluster]) -> list[TextCell]:
|
||||
"""Find cells not assigned to any cluster."""
|
||||
assigned = {cell.index for cluster in clusters for cell in cluster.cells}
|
||||
return [
|
||||
@@ -625,7 +624,7 @@ class LayoutPostprocessor:
|
||||
if cell.index not in assigned and cell.text.strip()
|
||||
]
|
||||
|
||||
def _adjust_cluster_bboxes(self, clusters: List[Cluster]) -> List[Cluster]:
|
||||
def _adjust_cluster_bboxes(self, clusters: list[Cluster]) -> list[Cluster]:
|
||||
"""Adjust cluster bounding boxes to contain their cells."""
|
||||
for cluster in clusters:
|
||||
if not cluster.cells:
|
||||
@@ -651,13 +650,13 @@ class LayoutPostprocessor:
|
||||
|
||||
return clusters
|
||||
|
||||
def _sort_cells(self, cells: List[TextCell]) -> List[TextCell]:
|
||||
def _sort_cells(self, cells: list[TextCell]) -> list[TextCell]:
|
||||
"""Sort cells in native reading order."""
|
||||
return sorted(cells, key=lambda c: (c.index))
|
||||
|
||||
def _sort_clusters(
|
||||
self, clusters: List[Cluster], mode: str = "id"
|
||||
) -> List[Cluster]:
|
||||
self, clusters: list[Cluster], mode: str = "id"
|
||||
) -> list[Cluster]:
|
||||
"""Sort clusters in reading order (top-to-bottom, left-to-right)."""
|
||||
if mode == "id": # sort in the order the cells are printed in the PDF.
|
||||
return sorted(
|
||||
|
||||
287
tests/data/groundtruth/docling_v2/example_01_images.html.json
vendored
Normal file
287
tests/data/groundtruth/docling_v2/example_01_images.html.json
vendored
Normal file
File diff suppressed because one or more lines are too long
20
tests/data/groundtruth/docling_v2/example_01_images.html.md
vendored
Normal file
20
tests/data/groundtruth/docling_v2/example_01_images.html.md
vendored
Normal file
@@ -0,0 +1,20 @@
|
||||
# Introduction
|
||||
|
||||
This is the first paragraph of the introduction.
|
||||
|
||||
## Background
|
||||
|
||||
Some background information here.
|
||||
|
||||
Example image
|
||||
|
||||
<!-- image -->
|
||||
|
||||
- First item in unordered list
|
||||
- Second item in unordered list
|
||||
|
||||
1. First item in ordered list
|
||||
2. Second item in ordered list
|
||||
|
||||
42. First item in ordered list with start
|
||||
43. Second item in ordered list with start
|
||||
@@ -1,36 +0,0 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: title: Introduction to parsing HTML files with Docling
|
||||
item-2 at level 2: picture
|
||||
item-2 at level 3: caption: Docling
|
||||
item-3 at level 2: text: Docling simplifies document proc ... ntegrations with the gen AI ecosystem.
|
||||
item-4 at level 2: section_header: Supported file formats
|
||||
item-5 at level 3: text: Docling supports multiple file formats..
|
||||
item-6 at level 3: list: group list
|
||||
item-7 at level 4: list_item: Advanced PDF understanding
|
||||
item-8 at level 4: picture
|
||||
item-8 at level 5: caption: PDF
|
||||
item-9 at level 4: list_item: Microsoft Office DOCX
|
||||
item-10 at level 4: picture
|
||||
item-10 at level 5: caption: DOCX
|
||||
item-11 at level 4: list_item: HTML files (with optional support for images)
|
||||
item-12 at level 4: picture
|
||||
item-12 at level 5: caption: HTML
|
||||
item-13 at level 3: section_header: Three backends for handling HTML files
|
||||
item-14 at level 4: text: Docling has three backends for parsing HTML files:
|
||||
item-15 at level 4: list: group ordered list
|
||||
item-16 at level 5: list_item:
|
||||
item-17 at level 6: inline: group group
|
||||
item-18 at level 7: text: HTMLDocumentBackend
|
||||
item-19 at level 7: text: Ignores images
|
||||
item-20 at level 5: list_item:
|
||||
item-21 at level 6: inline: group group
|
||||
item-22 at level 7: text: HTMLDocumentBackendImagesInline
|
||||
item-23 at level 7: text: Extracts images inline
|
||||
item-24 at level 5: list_item:
|
||||
item-25 at level 6: inline: group group
|
||||
item-26 at level 7: text: HTMLDocumentBackendImagesReferenced
|
||||
item-27 at level 7: text: Extracts images as references
|
||||
item-28 at level 1: caption: Docling
|
||||
item-29 at level 1: caption: PDF
|
||||
item-30 at level 1: caption: DOCX
|
||||
item-31 at level 1: caption: HTML
|
||||
@@ -1,560 +0,0 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.7.0",
|
||||
"name": "example_09",
|
||||
"origin": {
|
||||
"mimetype": "text/html",
|
||||
"binary_hash": 6785336133244366107,
|
||||
"filename": "example_09.html"
|
||||
},
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"body": {
|
||||
"self_ref": "#/body",
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/1"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/6"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/8"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/10"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"groups": [
|
||||
{
|
||||
"self_ref": "#/groups/0",
|
||||
"parent": {
|
||||
"$ref": "#/texts/3"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/5"
|
||||
},
|
||||
{
|
||||
"$ref": "#/pictures/1"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/7"
|
||||
},
|
||||
{
|
||||
"$ref": "#/pictures/2"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/9"
|
||||
},
|
||||
{
|
||||
"$ref": "#/pictures/3"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "list",
|
||||
"label": "list"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/1",
|
||||
"parent": {
|
||||
"$ref": "#/texts/11"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/13"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/16"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/19"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "ordered list",
|
||||
"label": "list"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/2",
|
||||
"parent": {
|
||||
"$ref": "#/texts/13"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/14"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/15"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "group",
|
||||
"label": "inline"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/3",
|
||||
"parent": {
|
||||
"$ref": "#/texts/16"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/17"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/18"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "group",
|
||||
"label": "inline"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/4",
|
||||
"parent": {
|
||||
"$ref": "#/texts/19"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/20"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/21"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "group",
|
||||
"label": "inline"
|
||||
}
|
||||
],
|
||||
"texts": [
|
||||
{
|
||||
"self_ref": "#/texts/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/pictures/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/2"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/3"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "title",
|
||||
"prov": [],
|
||||
"orig": "Introduction to parsing HTML files with Docling",
|
||||
"text": "Introduction to parsing HTML files with Docling"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/1",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "caption",
|
||||
"prov": [],
|
||||
"orig": "Docling",
|
||||
"text": "Docling"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/2",
|
||||
"parent": {
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Docling simplifies document processing, parsing diverse formats - including HTML - and providing seamless integrations with the gen AI ecosystem.",
|
||||
"text": "Docling simplifies document processing, parsing diverse formats - including HTML - and providing seamless integrations with the gen AI ecosystem."
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/3",
|
||||
"parent": {
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/4"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/11"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "section_header",
|
||||
"prov": [],
|
||||
"orig": "Supported file formats",
|
||||
"text": "Supported file formats",
|
||||
"level": 1
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/4",
|
||||
"parent": {
|
||||
"$ref": "#/texts/3"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Docling supports multiple file formats..",
|
||||
"text": "Docling supports multiple file formats.."
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/5",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "Advanced PDF understanding",
|
||||
"text": "Advanced PDF understanding",
|
||||
"enumerated": false,
|
||||
"marker": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/6",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "caption",
|
||||
"prov": [],
|
||||
"orig": "PDF",
|
||||
"text": "PDF"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/7",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "Microsoft Office DOCX",
|
||||
"text": "Microsoft Office DOCX",
|
||||
"enumerated": false,
|
||||
"marker": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/8",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "caption",
|
||||
"prov": [],
|
||||
"orig": "DOCX",
|
||||
"text": "DOCX"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/9",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "HTML files (with optional support for images)",
|
||||
"text": "HTML files (with optional support for images)",
|
||||
"enumerated": false,
|
||||
"marker": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/10",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "caption",
|
||||
"prov": [],
|
||||
"orig": "HTML",
|
||||
"text": "HTML"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/11",
|
||||
"parent": {
|
||||
"$ref": "#/texts/3"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/12"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/1"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "section_header",
|
||||
"prov": [],
|
||||
"orig": "Three backends for handling HTML files",
|
||||
"text": "Three backends for handling HTML files",
|
||||
"level": 2
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/12",
|
||||
"parent": {
|
||||
"$ref": "#/texts/11"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Docling has three backends for parsing HTML files:",
|
||||
"text": "Docling has three backends for parsing HTML files:"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/13",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/2"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": "",
|
||||
"enumerated": true,
|
||||
"marker": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/14",
|
||||
"parent": {
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "HTMLDocumentBackend",
|
||||
"text": "HTMLDocumentBackend",
|
||||
"formatting": {
|
||||
"bold": true,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/15",
|
||||
"parent": {
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Ignores images",
|
||||
"text": "Ignores images"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/16",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/3"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": "",
|
||||
"enumerated": true,
|
||||
"marker": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/17",
|
||||
"parent": {
|
||||
"$ref": "#/groups/3"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "HTMLDocumentBackendImagesInline",
|
||||
"text": "HTMLDocumentBackendImagesInline",
|
||||
"formatting": {
|
||||
"bold": true,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/18",
|
||||
"parent": {
|
||||
"$ref": "#/groups/3"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Extracts images inline",
|
||||
"text": "Extracts images inline"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/19",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/4"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": "",
|
||||
"enumerated": true,
|
||||
"marker": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/20",
|
||||
"parent": {
|
||||
"$ref": "#/groups/4"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "HTMLDocumentBackendImagesReferenced",
|
||||
"text": "HTMLDocumentBackendImagesReferenced",
|
||||
"formatting": {
|
||||
"bold": true,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/21",
|
||||
"parent": {
|
||||
"$ref": "#/groups/4"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Extracts images as references",
|
||||
"text": "Extracts images as references"
|
||||
}
|
||||
],
|
||||
"pictures": [
|
||||
{
|
||||
"self_ref": "#/pictures/0",
|
||||
"parent": {
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "picture",
|
||||
"prov": [],
|
||||
"captions": [
|
||||
{
|
||||
"$ref": "#/texts/1"
|
||||
}
|
||||
],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"annotations": []
|
||||
},
|
||||
{
|
||||
"self_ref": "#/pictures/1",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "picture",
|
||||
"prov": [],
|
||||
"captions": [
|
||||
{
|
||||
"$ref": "#/texts/6"
|
||||
}
|
||||
],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"annotations": []
|
||||
},
|
||||
{
|
||||
"self_ref": "#/pictures/2",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "picture",
|
||||
"prov": [],
|
||||
"captions": [
|
||||
{
|
||||
"$ref": "#/texts/8"
|
||||
}
|
||||
],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"annotations": []
|
||||
},
|
||||
{
|
||||
"self_ref": "#/pictures/3",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "picture",
|
||||
"prov": [],
|
||||
"captions": [
|
||||
{
|
||||
"$ref": "#/texts/10"
|
||||
}
|
||||
],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"annotations": []
|
||||
}
|
||||
],
|
||||
"tables": [],
|
||||
"key_value_items": [],
|
||||
"form_items": [],
|
||||
"pages": {}
|
||||
}
|
||||
@@ -1,32 +0,0 @@
|
||||
# Introduction to parsing HTML files with Docling
|
||||
|
||||
Docling
|
||||
|
||||
<!-- image -->
|
||||
|
||||
Docling simplifies document processing, parsing diverse formats - including HTML - and providing seamless integrations with the gen AI ecosystem.
|
||||
|
||||
## Supported file formats
|
||||
|
||||
Docling supports multiple file formats..
|
||||
|
||||
- Advanced PDF understanding
|
||||
PDF
|
||||
|
||||
<!-- image -->
|
||||
- Microsoft Office DOCX
|
||||
DOCX
|
||||
|
||||
<!-- image -->
|
||||
- HTML files (with optional support for images)
|
||||
HTML
|
||||
|
||||
<!-- image -->
|
||||
|
||||
### Three backends for handling HTML files
|
||||
|
||||
Docling has three backends for parsing HTML files:
|
||||
|
||||
1. **HTMLDocumentBackend** Ignores images
|
||||
2. **HTMLDocumentBackendImagesInline** Extracts images inline
|
||||
3. **HTMLDocumentBackendImagesReferenced** Extracts images as references
|
||||
@@ -17,6 +17,12 @@
|
||||
"body": {
|
||||
"self_ref": "#/body",
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/pictures/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/0"
|
||||
}
|
||||
@@ -33,7 +39,7 @@
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/0"
|
||||
"$ref": "#/texts/1"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
@@ -44,6 +50,18 @@
|
||||
"texts": [
|
||||
{
|
||||
"self_ref": "#/texts/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"label": "caption",
|
||||
"prov": [],
|
||||
"orig": "Image alt text",
|
||||
"text": "Image alt text"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/1",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
@@ -57,7 +75,26 @@
|
||||
"level": 1
|
||||
}
|
||||
],
|
||||
"pictures": [],
|
||||
"pictures": [
|
||||
{
|
||||
"self_ref": "#/pictures/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"label": "picture",
|
||||
"prov": [],
|
||||
"captions": [
|
||||
{
|
||||
"$ref": "#/texts/0"
|
||||
}
|
||||
],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"annotations": []
|
||||
}
|
||||
],
|
||||
"tables": [],
|
||||
"key_value_items": [],
|
||||
"form_items": [],
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: caption: Image Hyperlink.
|
||||
item-1 at level 1: caption: Clickable Example
|
||||
item-2 at level 1: picture
|
||||
item-2 at level 2: caption: Image Hyperlink.
|
||||
item-2 at level 2: caption: Clickable Example
|
||||
item-3 at level 1: caption: This is an example caption for the image.
|
||||
item-4 at level 1: picture
|
||||
item-4 at level 2: caption: This is an example caption for the image.
|
||||
|
||||
@@ -66,8 +66,8 @@
|
||||
"content_layer": "body",
|
||||
"label": "caption",
|
||||
"prov": [],
|
||||
"orig": "Image Hyperlink.",
|
||||
"text": "Image Hyperlink.",
|
||||
"orig": "Clickable Example",
|
||||
"text": "Clickable Example",
|
||||
"hyperlink": "https://www.example.com/"
|
||||
},
|
||||
{
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
Image Hyperlink.
|
||||
Clickable Example
|
||||
|
||||
<!-- image -->
|
||||
|
||||
|
||||
2521
tests/data/groundtruth/docling_v2/wiki_duck.html.itxt
vendored
2521
tests/data/groundtruth/docling_v2/wiki_duck.html.itxt
vendored
File diff suppressed because it is too large
Load Diff
10191
tests/data/groundtruth/docling_v2/wiki_duck.html.json
vendored
10191
tests/data/groundtruth/docling_v2/wiki_duck.html.json
vendored
File diff suppressed because it is too large
Load Diff
1114
tests/data/groundtruth/docling_v2/wiki_duck.html.md
vendored
1114
tests/data/groundtruth/docling_v2/wiki_duck.html.md
vendored
File diff suppressed because it is too large
Load Diff
2
tests/data/html/example_01.html
vendored
2
tests/data/html/example_01.html
vendored
@@ -4,7 +4,7 @@
|
||||
<p>This is the first paragraph of the introduction.</p>
|
||||
<h2>Background</h2>
|
||||
<p>Some background information here.</p>
|
||||
<img src="image1.png" alt="Example image"/>
|
||||
<img src="example_image_01.png" alt="Example image"/>
|
||||
<ul>
|
||||
<li>First item in unordered list</li>
|
||||
<li>Second item in unordered list</li>
|
||||
|
||||
21
tests/data/html/example_09.html
vendored
21
tests/data/html/example_09.html
vendored
@@ -1,21 +0,0 @@
|
||||
<html>
|
||||
<body>
|
||||
<h1>Introduction to parsing HTML files with <img src="https://docling-project.github.io/docling/assets/logo.png" alt="Docling" height="64"> Docling</h1>
|
||||
<p>Docling simplifies document processing, parsing diverse formats — including HTML — and providing seamless integrations with the gen AI ecosystem.</p>
|
||||
<h2>Supported file formats</h2>
|
||||
<p>Docling supports multiple file formats..</p>
|
||||
<ul>
|
||||
<li><img src="https://github.com/docling-project/docling/tree/main/docs/assets/pdf.png" height="32" alt="PDF">Advanced PDF understanding</li>
|
||||
<li><img src="https://github.com/docling-project/docling/tree/main/docs/assets/docx.png" height="32" alt="DOCX">Microsoft Office DOCX</li>
|
||||
<li><img src="https://github.com/docling-project/docling/tree/main/docs/assets/html.png" height="32" alt="HTML">HTML files (with optional support for images)</li>
|
||||
</ul>
|
||||
<h3>Three backends for handling HTML files</h3>
|
||||
<p>Docling has three backends for parsing HTML files:</p>
|
||||
<ol>
|
||||
<li><b>HTMLDocumentBackend</b> Ignores images</li>
|
||||
<li><b>HTMLDocumentBackendImagesInline</b> Extracts images inline</li>
|
||||
<li><b>HTMLDocumentBackendImagesReferenced</b> Extracts images as references</li>
|
||||
</ol>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
BIN
tests/data/html/example_image_01.png
vendored
Normal file
BIN
tests/data/html/example_image_01.png
vendored
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 548 KiB |
@@ -1,9 +1,14 @@
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from pathlib import Path, PurePath
|
||||
from unittest.mock import Mock, mock_open, patch
|
||||
|
||||
import pytest
|
||||
from docling_core.types.doc import PictureItem
|
||||
from docling_core.types.doc.document import ContentLayer
|
||||
from pydantic import AnyUrl, ValidationError
|
||||
|
||||
from docling.backend.html_backend import HTMLDocumentBackend
|
||||
from docling.datamodel.backend_options import HTMLBackendOptions
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import (
|
||||
ConversionResult,
|
||||
@@ -11,7 +16,7 @@ from docling.datamodel.document import (
|
||||
InputDocument,
|
||||
SectionHeaderItem,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docling.document_converter import DocumentConverter, HTMLFormatOption
|
||||
|
||||
from .test_data_gen_flag import GEN_TEST_DATA
|
||||
from .verify_utils import verify_document, verify_export
|
||||
@@ -19,6 +24,68 @@ from .verify_utils import verify_document, verify_export
|
||||
GENERATE = GEN_TEST_DATA
|
||||
|
||||
|
||||
def test_html_backend_options():
    """Check the defaults of ``HTMLBackendOptions`` and what ``source_uri`` accepts."""
    defaults = HTMLBackendOptions()
    assert defaults.kind == "html"
    assert not defaults.fetch_images
    assert defaults.source_uri is None

    # A URL object must compare equal to the value read back from the options.
    remote_uri = AnyUrl(url="http://example.com")
    with_url = HTMLBackendOptions(source_uri=remote_uri)
    assert with_url.source_uri == remote_uri

    # The same holds for a pure (filesystem-independent) path.
    local_uri = PurePath("/local/path/to/file.html")
    with_path = HTMLBackendOptions(source_uri=local_uri)
    assert with_path.source_uri == local_uri

    # An integer is neither a URL nor a path, so validation has to reject it.
    with pytest.raises(ValidationError, match="Input is not a valid path"):
        HTMLBackendOptions(source_uri=12345)
|
||||
|
||||
|
||||
def test_resolve_relative_path():
    """Check ``HTMLDocumentBackend._resolve_relative_path`` for several base paths.

    Each case sets ``base_path`` on the backend, resolves one location and
    compares the result with the expected absolute location.
    """
    source = Path("./tests/data/html/example_01.html")
    input_doc = InputDocument(
        path_or_stream=source,
        format=InputFormat.HTML,
        backend=HTMLDocumentBackend,
        filename="test",
    )
    backend = HTMLDocumentBackend(path_or_stream=source, in_doc=input_doc)

    # (base_path, location to resolve, expected result)
    cases = [
        # local base: relative location is joined to the base's directory
        (
            "/local/path/to/file.html",
            "subdir/another.html",
            "/local/path/to/subdir/another.html",
        ),
        # local base: absolute location is returned as is
        (
            "/local/path/to/file.html",
            "/absolute/path/to/file.html",
            "/absolute/path/to/file.html",
        ),
        # protocol-relative URL is expected to come back with an https scheme
        (
            "http://my_host.com",
            "//example.com/file.html",
            "https://example.com/file.html",
        ),
        # remote base: relative location
        (
            "http://example.com",
            "subdir/file.html",
            "http://example.com/subdir/file.html",
        ),
        # remote base: fully qualified URL is returned as is
        (
            "http://example.com",
            "https://my_host.com/my_page.html",
            "https://my_host.com/my_page.html",
        ),
        # remote base: host-absolute location
        (
            "http://example.com",
            "/static/images/my_image.png",
            "http://example.com/static/images/my_image.png",
        ),
        # no base at all: the location is returned as is
        (None, "subdir/file.html", "subdir/file.html"),
    ]
    for base_path, location, expected in cases:
        backend.base_path = base_path
        assert backend._resolve_relative_path(location) == expected
|
||||
|
||||
|
||||
def test_heading_levels():
|
||||
in_path = Path("tests/data/html/wiki_duck.html")
|
||||
in_doc = InputDocument(
|
||||
@@ -158,8 +225,6 @@ def test_e2e_html_conversions():
|
||||
converter = get_converter()
|
||||
|
||||
for html_path in html_paths:
|
||||
# print(f"converting {html_path}")
|
||||
|
||||
gt_path = (
|
||||
html_path.parent.parent / "groundtruth" / "docling_v2" / html_path.name
|
||||
)
|
||||
@@ -183,6 +248,76 @@ def test_e2e_html_conversions():
|
||||
assert verify_document(doc, str(gt_path) + ".json", GENERATE)
|
||||
|
||||
|
||||
@patch("docling.backend.html_backend.requests.get")
@patch("docling.backend.html_backend.open", new_callable=mock_open)
def test_e2e_html_conversion_with_images(mock_local, mock_remote):
    """Convert ``example_01.html`` with image fetching, locally and remotely.

    ``open`` and ``requests.get`` are patched in ``docling.backend.html_backend``
    so that both fetch paths are served the bytes of ``example_image_01.png``.
    The two resulting documents must be equal, and the local one is compared
    with the ground-truth markdown and JSON files.
    """
    source = "tests/data/html/example_01.html"
    img_bytes = Path("tests/data/html/example_image_01.png").read_bytes()

    def pictures_of(document):
        # All picture items of the document, in iteration order.
        return [
            item for item, _ in document.iterate_items() if isinstance(item, PictureItem)
        ]

    # --- image fetched from the local file system ---
    mock_local.return_value.__enter__.return_value = BytesIO(img_bytes)
    local_options = HTMLBackendOptions(
        enable_local_fetch=True, fetch_images=True, source_uri=source
    )
    local_converter = DocumentConverter(
        allowed_formats=[InputFormat.HTML],
        format_options={
            InputFormat.HTML: HTMLFormatOption(backend_options=local_options)
        },
    )
    res_local = local_converter.convert(source)
    mock_local.assert_called_once()
    assert res_local.document
    local_pictures = pictures_of(res_local.document)
    for picture in local_pictures:
        assert picture.image
    assert (
        len(local_pictures) == 1
    ), "No embedded picture was found in the converted file"

    # --- image fetched from a remote location ---
    mock_remote.return_value = Mock(status_code=200, content=img_bytes)
    remote_options = HTMLBackendOptions(
        enable_remote_fetch=True,
        fetch_images=True,
        source_uri="https://example.com/example_01.html",
    )
    remote_converter = DocumentConverter(
        allowed_formats=[InputFormat.HTML],
        format_options={
            InputFormat.HTML: HTMLFormatOption(backend_options=remote_options)
        },
    )
    res_remote = remote_converter.convert(source)
    # The image location is expected to be resolved against the source URI.
    mock_remote.assert_called_once_with(
        "https://example.com/example_image_01.png", stream=True
    )
    assert res_remote.document
    remote_pictures = pictures_of(res_remote.document)
    for picture in remote_pictures:
        assert picture.image
        assert picture.image.mimetype == "image/png"
    assert (
        len(remote_pictures) == 1
    ), "No embedded picture was found in the converted file"

    # Local and remote fetching must yield the same DoclingDocument.
    assert res_remote.document == res_local.document

    # Compare the exports with the ground-truth files.
    gt_path = f"tests/data/groundtruth/docling_v2/{Path(source).stem}_images.html"
    markdown: str = res_local.document.export_to_markdown()
    assert verify_export(markdown, gt_path + ".md", generate=GENERATE)
    assert verify_document(res_local.document, gt_path + ".json", GENERATE)
|
||||
|
||||
|
||||
def test_html_furniture():
|
||||
raw_html = (
|
||||
b"<html><body><p>Initial content with some <strong>bold text</strong></p>"
|
||||
@@ -211,3 +346,98 @@ def test_html_furniture():
|
||||
"Initial content with some **bold text**\n\n# Main Heading\n\nSome Content\n\n"
|
||||
"Some Footer Content"
|
||||
)
|
||||
|
||||
|
||||
def test_fetch_remote_images(monkeypatch):
    """Exercise the option combinations that gate image fetching in the HTML backend.

    ``example_01.html`` is converted five times with different
    ``HTMLBackendOptions``. ``requests.get`` (``open`` in the last case) is
    patched in ``docling.backend.html_backend`` so the test can assert whether a
    fetch was attempted. The ``monkeypatch`` fixture is requested but not used
    by the body.
    """
    source = "./tests/data/html/example_01.html"
    remote_get = "docling.backend.html_backend.requests.get"

    def converter_for(**option_kwargs):
        # HTML-only converter whose backend options are built from the keywords.
        backend_options = HTMLBackendOptions(**option_kwargs)
        return DocumentConverter(
            allowed_formats=[InputFormat.HTML],
            format_options={
                InputFormat.HTML: HTMLFormatOption(backend_options=backend_options)
            },
        )

    # No fetching: the fetch_images option is False.
    converter = converter_for(fetch_images=False, source_uri="http://example.com")
    with patch(remote_get) as mocked_get:
        res = converter.convert(source)
        mocked_get.assert_not_called()
    assert res.document

    # No fetching: no source URI is given and enable_local_fetch is left unset.
    converter = converter_for(fetch_images=True)
    with patch(remote_get) as mocked_get:
        with pytest.warns(
            match="Fetching local resources is only allowed when set explicitly"
        ):
            res = converter.convert(source)
            mocked_get.assert_not_called()
    assert res.document

    # No fetching: the source URI is remote but enable_remote_fetch is left unset.
    converter = converter_for(fetch_images=True, source_uri="http://example.com")
    with patch(remote_get) as mocked_get:
        with pytest.warns(
            match="Fetching remote resources is only allowed when set explicitly"
        ):
            res = converter.convert(source)
            mocked_get.assert_not_called()
    assert res.document

    # Fetching: remote source URI and remote fetching enabled. The expected
    # warning presumably stems from the mocked response carrying no real bytes.
    converter = converter_for(
        enable_remote_fetch=True, fetch_images=True, source_uri="http://example.com"
    )
    with patch(remote_get) as mocked_get:
        with pytest.warns(match="a bytes-like object is required"):
            res = converter.convert(source)
            mocked_get.assert_called_once()
    assert res.document

    # Fetching: local source URI and local fetching enabled.
    converter = converter_for(
        enable_local_fetch=True, fetch_images=True, source_uri=source
    )
    with patch("docling.backend.html_backend.open") as mocked_open:
        with pytest.warns(match="a bytes-like object is required"):
            res = converter.convert(source)
            mocked_open.assert_called_once_with(
                "tests/data/html/example_image_01.png", "rb"
            )
    assert res.document
|
||||
|
||||
@@ -6,13 +6,12 @@ from docling.datamodel.document import (
|
||||
ConversionResult,
|
||||
DoclingDocument,
|
||||
InputDocument,
|
||||
SectionHeaderItem,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter
|
||||
from tests.verify_utils import CONFID_PREC, COORD_PREC
|
||||
|
||||
from .test_data_gen_flag import GEN_TEST_DATA
|
||||
from .verify_utils import verify_document, verify_export
|
||||
from .verify_utils import verify_document
|
||||
|
||||
GENERATE = GEN_TEST_DATA
|
||||
|
||||
|
||||
@@ -1,10 +1,19 @@
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from pydantic import ValidationError
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
||||
from docling.backend.html_backend import HTMLDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.backend_options import (
|
||||
BaseBackendOptions,
|
||||
DeclarativeBackendOptions,
|
||||
HTMLBackendOptions,
|
||||
)
|
||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||
from docling.datamodel.document import InputDocument, _DocumentConversionInput
|
||||
from docling.datamodel.settings import DocumentLimits
|
||||
@@ -15,6 +24,7 @@ def test_in_doc_from_valid_path():
|
||||
test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
|
||||
doc = _make_input_doc(test_doc_path)
|
||||
assert doc.valid is True
|
||||
assert doc.backend_options is None
|
||||
|
||||
|
||||
def test_in_doc_from_invalid_path():
|
||||
@@ -105,6 +115,38 @@ def test_in_doc_with_page_range():
|
||||
assert doc.valid is False
|
||||
|
||||
|
||||
def test_in_doc_with_backend_options():
    """Check how ``InputDocument`` handles the ``backend_options`` argument.

    Matching HTML options are accepted and keep their default values, while
    options of another backend-option class make the constructor raise.
    """
    common = {
        "path_or_stream": Path("./tests/data/html/example_01.html"),
        "format": InputFormat.HTML,
        "backend": HTMLDocumentBackend,
    }

    in_doc = InputDocument(backend_options=HTMLBackendOptions(), **common)
    assert in_doc.valid
    html_options = in_doc.backend_options
    assert html_options
    assert isinstance(html_options, HTMLBackendOptions)
    # None of the fetching switches is enabled by default.
    assert not html_options.fetch_images
    assert not html_options.enable_local_fetch
    assert not html_options.enable_remote_fetch

    # Declarative options are expected to fail with an "Incompatible types" error.
    with pytest.raises(ValueError, match="Incompatible types"):
        InputDocument(backend_options=DeclarativeBackendOptions(), **common)

    # The base options class is expected to fail validation.
    with pytest.raises(ValidationError):
        InputDocument(backend_options=BaseBackendOptions(), **common)
|
||||
|
||||
|
||||
def test_guess_format(tmp_path):
|
||||
"""Test docling.datamodel.document._DocumentConversionInput.__guess_format"""
|
||||
dci = _DocumentConversionInput(path_or_stream_iterator=[])
|
||||
|
||||
Reference in New Issue
Block a user