mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
A new HTML backend that handles styled HTML (ignores it) as well as images.
Note: MyPy fails. Seems to be a known issue with BeautifulSoup: https://github.com/python/typeshed/pull/13604 - Fixed issues with handling nested lists. - Fixed some issues with spaces between text fragments. - Changed naming of image configuration from INLINE to EMBEDDED. Also renamed the corresponding class. - Introduced constants for default image width/height. Signed-off-by: vaaale <2428222+vaaale@users.noreply.github.com>
This commit is contained in:
parent
733360c7b2
commit
5d08b749af
@ -7,6 +7,7 @@ from pathlib import Path
|
||||
from typing import Optional, Union
|
||||
|
||||
import requests
|
||||
from PIL import Image, UnidentifiedImageError
|
||||
from bs4 import BeautifulSoup, NavigableString, Tag
|
||||
from docling_core.types.doc import (
|
||||
DocItemLabel,
|
||||
@ -17,8 +18,7 @@ from docling_core.types.doc import (
|
||||
TableData,
|
||||
)
|
||||
from docling_core.types.doc.document import ContentLayer, ImageRef
|
||||
from PIL import Image, UnidentifiedImageError
|
||||
from pydantic import AnyUrl, HttpUrl, ValidationError
|
||||
from pydantic import AnyUrl, ValidationError
|
||||
from typing_extensions import override
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
@ -27,15 +27,17 @@ from docling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
DEFAULT_IMAGE_WIDTH = 128
|
||||
DEFAULT_IMAGE_HEIGHT = 128
|
||||
|
||||
# Tags that initiate distinct Docling items
|
||||
_BLOCK_TAGS = {"h1", "h2", "h3", "h4", "h5", "h6", "p", "ul", "ol", "table"}
|
||||
|
||||
|
||||
class ImageOptions(str, Enum):
|
||||
"""Image options for HTML backend."""
|
||||
|
||||
NONE = "none"
|
||||
INLINE = "inline"
|
||||
EMBEDDED = "embedded"
|
||||
REFERENCED = "referenced"
|
||||
|
||||
|
||||
@ -49,7 +51,6 @@ class BaseHTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
self.image_options = image_options
|
||||
self.soup: Optional[Tag] = None
|
||||
try:
|
||||
raw = (
|
||||
path_or_stream.getvalue()
|
||||
@ -88,35 +89,27 @@ class BaseHTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
binary_hash=self.document_hash,
|
||||
)
|
||||
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
||||
_log.debug("Starting HTML conversion...")
|
||||
if not self.is_valid():
|
||||
raise RuntimeError("Invalid HTML document.")
|
||||
assert self.soup is not None
|
||||
|
||||
# Remove all script/style content
|
||||
title = self.soup.find("title")
|
||||
if title:
|
||||
doc.add_title(title.get_text())
|
||||
# remove scripts/styles
|
||||
for tag in self.soup.find_all(["script", "style"]):
|
||||
tag.decompose()
|
||||
|
||||
body = self.soup.body or self.soup
|
||||
# Normalize <br> tags to newline strings
|
||||
# normalize <br>
|
||||
for br in body.find_all("br"):
|
||||
br.replace_with(NavigableString("\n"))
|
||||
|
||||
# Decide content layer by presence of headers
|
||||
headers = body.find(list(_BLOCK_TAGS))
|
||||
self.content_layer = (
|
||||
ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
|
||||
)
|
||||
|
||||
# Walk the body to build the DoclingDocument
|
||||
self._walk(body, doc, parent=doc.body)
|
||||
return doc
|
||||
|
||||
def _walk(self, element: Tag, doc: DoclingDocument, parent) -> None:
|
||||
"""
|
||||
Recursively walk element.contents, buffering inline text across tags like <b> or <span>,
|
||||
emitting text nodes only at block boundaries, and extracting images immediately.
|
||||
"""
|
||||
buffer: list[str] = []
|
||||
|
||||
def flush_buffer():
|
||||
@ -126,88 +119,93 @@ class BaseHTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
buffer.clear()
|
||||
if not text:
|
||||
return
|
||||
# Split on newlines for <br>
|
||||
for part in text.split("\n"):
|
||||
seg = part.strip()
|
||||
if seg:
|
||||
doc.add_text(DocItemLabel.TEXT, seg, parent=parent)
|
||||
|
||||
for node in element.contents:
|
||||
# Skip scripts/styles
|
||||
if isinstance(node, Tag) and node.name.lower() in ("script", "style"):
|
||||
continue
|
||||
# Immediate image extraction
|
||||
if isinstance(node, Tag) and node.name.lower() == "img":
|
||||
flush_buffer()
|
||||
self._emit_image(node, doc, parent)
|
||||
continue
|
||||
# Block-level element triggers flush + handle
|
||||
if isinstance(node, Tag) and node.name.lower() in _BLOCK_TAGS:
|
||||
flush_buffer()
|
||||
self._handle_block(node, doc, parent)
|
||||
# Inline tag with nested blocks: recurse
|
||||
elif isinstance(node, Tag) and node.find(list(_BLOCK_TAGS)):
|
||||
flush_buffer()
|
||||
self._walk(node, doc, parent)
|
||||
# Inline text
|
||||
elif isinstance(node, Tag):
|
||||
buffer.append(node.get_text())
|
||||
elif isinstance(node, NavigableString):
|
||||
buffer.append(str(node))
|
||||
|
||||
# Flush any remaining text
|
||||
flush_buffer()
|
||||
|
||||
def _handle_block(self, tag: Tag, doc: DoclingDocument, parent) -> None:
|
||||
tag_name = tag.name.lower()
|
||||
if tag_name == "h1":
|
||||
text = tag.get_text(strip=True)
|
||||
if text:
|
||||
doc.add_title(text, parent=parent)
|
||||
for img_tag in tag.find_all("img", recursive=True):
|
||||
self._emit_image(img_tag, doc, parent)
|
||||
elif tag_name in {"h2", "h3", "h4", "h5", "h6"}:
|
||||
|
||||
if tag_name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
|
||||
level = int(tag_name[1])
|
||||
text = tag.get_text(strip=True)
|
||||
text = tag.get_text(strip=False)
|
||||
if text:
|
||||
doc.add_heading(text, level=level, parent=parent)
|
||||
doc.add_heading(text.strip(), level=level, parent=parent)
|
||||
for img_tag in tag.find_all("img", recursive=True):
|
||||
self._emit_image(img_tag, doc, parent)
|
||||
|
||||
elif tag_name == "p":
|
||||
for part in tag.get_text().split("\n"):
|
||||
seg = part.strip()
|
||||
if seg:
|
||||
doc.add_text(DocItemLabel.TEXT, seg, parent=parent)
|
||||
for img_tag in tag.find_all("img", recursive=True):
|
||||
self._emit_image(img_tag, doc, parent)
|
||||
for img_tag in tag.find_all("img", recursive=True):
|
||||
self._emit_image(img_tag, doc, parent)
|
||||
|
||||
elif tag_name in {"ul", "ol"}:
|
||||
is_ordered = tag_name == "ol"
|
||||
group = (
|
||||
is_ordered = (tag_name == "ol")
|
||||
# Create the list container
|
||||
list_group = (
|
||||
doc.add_ordered_list(parent=parent)
|
||||
if is_ordered
|
||||
else doc.add_unordered_list(parent=parent)
|
||||
)
|
||||
|
||||
# For each top-level <li> in this list
|
||||
for li in tag.find_all("li", recursive=False):
|
||||
li_text = li.get_text(separator=" ", strip=True)
|
||||
# 1) extract only the "direct" text from this <li>
|
||||
parts: list[str] = []
|
||||
for child in li.contents:
|
||||
if isinstance(child, NavigableString):
|
||||
text_part = child.strip()
|
||||
if text_part:
|
||||
parts.append(text_part)
|
||||
elif isinstance(child, Tag) and child.name not in ("ul", "ol"):
|
||||
text_part = child.get_text(separator=" ", strip=True)
|
||||
if text_part:
|
||||
parts.append(text_part)
|
||||
li_text = " ".join(parts)
|
||||
|
||||
# 2) add the list item
|
||||
li_item = doc.add_list_item(
|
||||
text=li_text, enumerated=is_ordered, parent=group
|
||||
text=li_text, enumerated=is_ordered, parent=list_group
|
||||
)
|
||||
# Nested lists inside <li>
|
||||
for sub in li.find_all(["ul", "ol"], recursive=False):
|
||||
self._handle_block(sub, doc, parent=group)
|
||||
|
||||
# 3) recurse into any nested lists, attaching them to this <li> item
|
||||
for sublist in li.find_all(["ul", "ol"], recursive=False):
|
||||
self._handle_block(sublist, doc, parent=li_item)
|
||||
|
||||
# 4) extract any images under this <li>
|
||||
for img_tag in li.find_all("img", recursive=True):
|
||||
self._emit_image(img_tag, doc, li_item)
|
||||
|
||||
elif tag_name == "table":
|
||||
# Add table item and extract nested images
|
||||
data = self._parse_table(tag, doc, parent)
|
||||
doc.add_table(data=data, parent=parent)
|
||||
|
||||
def _emit_image(self, img_tag: Tag, doc: DoclingDocument, parent) -> None:
|
||||
"""
|
||||
Helper to create a PictureItem (with optional CAPTION) for an <img> tag.
|
||||
"""
|
||||
|
||||
if ImageOptions.NONE == self.image_options:
|
||||
if self.image_options == ImageOptions.NONE:
|
||||
return
|
||||
|
||||
alt = (img_tag.get("alt") or "").strip()
|
||||
@ -215,46 +213,40 @@ class BaseHTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
if alt:
|
||||
caption_item = doc.add_text(DocItemLabel.CAPTION, alt, parent=parent)
|
||||
|
||||
src_url = img_tag.get("src")
|
||||
width = img_tag.get("width", "128")
|
||||
height = img_tag.get("height", "128")
|
||||
img_ref = None
|
||||
if ImageOptions.INLINE == self.image_options:
|
||||
src_url = img_tag.get("src", "")
|
||||
width = img_tag.get("width", str(DEFAULT_IMAGE_WIDTH))
|
||||
height = img_tag.get("height", str(DEFAULT_IMAGE_HEIGHT))
|
||||
img_ref: Optional[ImageRef] = None
|
||||
|
||||
if self.image_options == ImageOptions.EMBEDDED:
|
||||
try:
|
||||
if src_url.startswith("http"):
|
||||
img = Image.open(requests.get(src_url, stream=True).raw)
|
||||
elif src_url.startswith("file:"):
|
||||
img = Image.open(src_url)
|
||||
elif src_url.startswith("data:"):
|
||||
image_data = re.sub("^data:image/.+;base64,", "", src_url)
|
||||
img = Image.open(BytesIO(base64.b64decode(image_data)))
|
||||
data = re.sub(r"^data:image/.+;base64,", "", src_url)
|
||||
img = Image.open(BytesIO(base64.b64decode(data)))
|
||||
else:
|
||||
return
|
||||
img_ref = ImageRef.from_pil(img, dpi=int(img.info.get("dpi")[0]))
|
||||
except (FileNotFoundError, UnidentifiedImageError) as ve:
|
||||
_log.warning(f"Could not load image (src={src_url}): {ve}")
|
||||
img_ref = ImageRef.from_pil(img, dpi=int(img.info.get("dpi", (72,))[0]))
|
||||
except (FileNotFoundError, UnidentifiedImageError) as e:
|
||||
_log.warning(f"Could not load image (src={src_url}): {e}")
|
||||
return
|
||||
elif ImageOptions.REFERENCED == self.image_options:
|
||||
|
||||
elif self.image_options == ImageOptions.REFERENCED:
|
||||
try:
|
||||
img_url = AnyUrl(src_url)
|
||||
img_ref = ImageRef(
|
||||
uri=img_url,
|
||||
uri=AnyUrl(src_url),
|
||||
dpi=72,
|
||||
mimetype="image/png",
|
||||
size=Size(width=float(width), height=float(height)),
|
||||
)
|
||||
except ValidationError as ve:
|
||||
_log.warning(f"Could not load image (src={src_url}): {ve}")
|
||||
except ValidationError as e:
|
||||
_log.warning(f"Could not load image (src={src_url}): {e}")
|
||||
return
|
||||
|
||||
doc.add_picture(image=img_ref, caption=caption_item, parent=parent)
|
||||
|
||||
def _parse_table(self, table_tag: Tag, doc: DoclingDocument, parent) -> TableData:
|
||||
"""
|
||||
Convert an HTML table into TableData, capturing cell spans and text,
|
||||
and emitting any nested images as PictureItems.
|
||||
"""
|
||||
# Build TableData
|
||||
rows = []
|
||||
for sec in ("thead", "tbody", "tfoot"):
|
||||
section = table_tag.find(sec)
|
||||
@ -262,9 +254,11 @@ class BaseHTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
rows.extend(section.find_all("tr", recursive=False))
|
||||
if not rows:
|
||||
rows = table_tag.find_all("tr", recursive=False)
|
||||
|
||||
occupied: dict[tuple[int, int], bool] = {}
|
||||
cells: list[TableCell] = []
|
||||
max_cols = 0
|
||||
|
||||
for r, tr in enumerate(rows):
|
||||
c = 0
|
||||
for cell_tag in tr.find_all(("td", "th"), recursive=False):
|
||||
@ -292,9 +286,11 @@ class BaseHTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
occupied[(r + dr, c + dc)] = True
|
||||
c += cs
|
||||
max_cols = max(max_cols, c)
|
||||
# Emit images inside this table
|
||||
|
||||
# emit any images in the table
|
||||
for img_tag in table_tag.find_all("img", recursive=True):
|
||||
self._emit_image(img_tag, doc, parent)
|
||||
|
||||
return TableData(table_cells=cells, num_rows=len(rows), num_cols=max_cols)
|
||||
|
||||
|
||||
@ -308,14 +304,14 @@ class HTMLDocumentBackend(BaseHTMLDocumentBackend):
|
||||
super().__init__(in_doc, path_or_stream, image_options=ImageOptions.NONE)
|
||||
|
||||
|
||||
class HTMLDocumentBackendImagesInline(BaseHTMLDocumentBackend):
|
||||
class HTMLDocumentBackendImagesEmbedded(BaseHTMLDocumentBackend):
|
||||
@override
|
||||
def __init__(
|
||||
self,
|
||||
in_doc: InputDocument,
|
||||
path_or_stream: Union[BytesIO, Path],
|
||||
):
|
||||
super().__init__(in_doc, path_or_stream, image_options=ImageOptions.INLINE)
|
||||
super().__init__(in_doc, path_or_stream, image_options=ImageOptions.EMBEDDED)
|
||||
|
||||
|
||||
class HTMLDocumentBackendImagesReferenced(BaseHTMLDocumentBackend):
|
||||
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user