mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
A new HTML backend that handles styled HTML (the styling is ignored) as well as images.
Note: MyPy fails. This seems to be a known issue with BeautifulSoup: https://github.com/python/typeshed/pull/13604 - Fixed issues with handling nested lists. - Fixed some issues with spaces between text fragments. - Changed the naming of the image configuration from INLINE to EMBEDDED and renamed the corresponding class. - Introduced constants for the default image width and height. Signed-off-by: vaaale <2428222+vaaale@users.noreply.github.com>
This commit is contained in:
parent
733360c7b2
commit
5d08b749af
@ -7,6 +7,7 @@ from pathlib import Path
|
|||||||
from typing import Optional, Union
|
from typing import Optional, Union
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
from PIL import Image, UnidentifiedImageError
|
||||||
from bs4 import BeautifulSoup, NavigableString, Tag
|
from bs4 import BeautifulSoup, NavigableString, Tag
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
DocItemLabel,
|
DocItemLabel,
|
||||||
@ -17,8 +18,7 @@ from docling_core.types.doc import (
|
|||||||
TableData,
|
TableData,
|
||||||
)
|
)
|
||||||
from docling_core.types.doc.document import ContentLayer, ImageRef
|
from docling_core.types.doc.document import ContentLayer, ImageRef
|
||||||
from PIL import Image, UnidentifiedImageError
|
from pydantic import AnyUrl, ValidationError
|
||||||
from pydantic import AnyUrl, HttpUrl, ValidationError
|
|
||||||
from typing_extensions import override
|
from typing_extensions import override
|
||||||
|
|
||||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||||
@ -27,15 +27,17 @@ from docling.datamodel.document import InputDocument
|
|||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
DEFAULT_IMAGE_WIDTH = 128
|
||||||
|
DEFAULT_IMAGE_HEIGHT = 128
|
||||||
|
|
||||||
# Tags that initiate distinct Docling items
|
# Tags that initiate distinct Docling items
|
||||||
_BLOCK_TAGS = {"h1", "h2", "h3", "h4", "h5", "h6", "p", "ul", "ol", "table"}
|
_BLOCK_TAGS = {"h1", "h2", "h3", "h4", "h5", "h6", "p", "ul", "ol", "table"}
|
||||||
|
|
||||||
|
|
||||||
class ImageOptions(str, Enum):
|
class ImageOptions(str, Enum):
|
||||||
"""Image options for HTML backend."""
|
"""Image options for HTML backend."""
|
||||||
|
|
||||||
NONE = "none"
|
NONE = "none"
|
||||||
INLINE = "inline"
|
EMBEDDED = "embedded"
|
||||||
REFERENCED = "referenced"
|
REFERENCED = "referenced"
|
||||||
|
|
||||||
|
|
||||||
@ -49,7 +51,6 @@ class BaseHTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
):
|
):
|
||||||
super().__init__(in_doc, path_or_stream)
|
super().__init__(in_doc, path_or_stream)
|
||||||
self.image_options = image_options
|
self.image_options = image_options
|
||||||
self.soup: Optional[Tag] = None
|
|
||||||
try:
|
try:
|
||||||
raw = (
|
raw = (
|
||||||
path_or_stream.getvalue()
|
path_or_stream.getvalue()
|
||||||
@ -88,35 +89,27 @@ class BaseHTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
binary_hash=self.document_hash,
|
binary_hash=self.document_hash,
|
||||||
)
|
)
|
||||||
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
||||||
_log.debug("Starting HTML conversion...")
|
title = self.soup.find("title")
|
||||||
if not self.is_valid():
|
if title:
|
||||||
raise RuntimeError("Invalid HTML document.")
|
doc.add_title(title.get_text())
|
||||||
assert self.soup is not None
|
# remove scripts/styles
|
||||||
|
|
||||||
# Remove all script/style content
|
|
||||||
for tag in self.soup.find_all(["script", "style"]):
|
for tag in self.soup.find_all(["script", "style"]):
|
||||||
tag.decompose()
|
tag.decompose()
|
||||||
|
|
||||||
body = self.soup.body or self.soup
|
body = self.soup.body or self.soup
|
||||||
# Normalize <br> tags to newline strings
|
# normalize <br>
|
||||||
for br in body.find_all("br"):
|
for br in body.find_all("br"):
|
||||||
br.replace_with(NavigableString("\n"))
|
br.replace_with(NavigableString("\n"))
|
||||||
|
|
||||||
# Decide content layer by presence of headers
|
|
||||||
headers = body.find(list(_BLOCK_TAGS))
|
headers = body.find(list(_BLOCK_TAGS))
|
||||||
self.content_layer = (
|
self.content_layer = (
|
||||||
ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
|
ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
|
||||||
)
|
)
|
||||||
|
|
||||||
# Walk the body to build the DoclingDocument
|
|
||||||
self._walk(body, doc, parent=doc.body)
|
self._walk(body, doc, parent=doc.body)
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def _walk(self, element: Tag, doc: DoclingDocument, parent) -> None:
|
def _walk(self, element: Tag, doc: DoclingDocument, parent) -> None:
|
||||||
"""
|
|
||||||
Recursively walk element.contents, buffering inline text across tags like <b> or <span>,
|
|
||||||
emitting text nodes only at block boundaries, and extracting images immediately.
|
|
||||||
"""
|
|
||||||
buffer: list[str] = []
|
buffer: list[str] = []
|
||||||
|
|
||||||
def flush_buffer():
|
def flush_buffer():
|
||||||
@ -126,88 +119,93 @@ class BaseHTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
buffer.clear()
|
buffer.clear()
|
||||||
if not text:
|
if not text:
|
||||||
return
|
return
|
||||||
# Split on newlines for <br>
|
|
||||||
for part in text.split("\n"):
|
for part in text.split("\n"):
|
||||||
seg = part.strip()
|
seg = part.strip()
|
||||||
if seg:
|
if seg:
|
||||||
doc.add_text(DocItemLabel.TEXT, seg, parent=parent)
|
doc.add_text(DocItemLabel.TEXT, seg, parent=parent)
|
||||||
|
|
||||||
for node in element.contents:
|
for node in element.contents:
|
||||||
# Skip scripts/styles
|
|
||||||
if isinstance(node, Tag) and node.name.lower() in ("script", "style"):
|
if isinstance(node, Tag) and node.name.lower() in ("script", "style"):
|
||||||
continue
|
continue
|
||||||
# Immediate image extraction
|
|
||||||
if isinstance(node, Tag) and node.name.lower() == "img":
|
if isinstance(node, Tag) and node.name.lower() == "img":
|
||||||
flush_buffer()
|
flush_buffer()
|
||||||
self._emit_image(node, doc, parent)
|
self._emit_image(node, doc, parent)
|
||||||
continue
|
continue
|
||||||
# Block-level element triggers flush + handle
|
|
||||||
if isinstance(node, Tag) and node.name.lower() in _BLOCK_TAGS:
|
if isinstance(node, Tag) and node.name.lower() in _BLOCK_TAGS:
|
||||||
flush_buffer()
|
flush_buffer()
|
||||||
self._handle_block(node, doc, parent)
|
self._handle_block(node, doc, parent)
|
||||||
# Inline tag with nested blocks: recurse
|
|
||||||
elif isinstance(node, Tag) and node.find(list(_BLOCK_TAGS)):
|
elif isinstance(node, Tag) and node.find(list(_BLOCK_TAGS)):
|
||||||
flush_buffer()
|
flush_buffer()
|
||||||
self._walk(node, doc, parent)
|
self._walk(node, doc, parent)
|
||||||
# Inline text
|
|
||||||
elif isinstance(node, Tag):
|
elif isinstance(node, Tag):
|
||||||
buffer.append(node.get_text())
|
buffer.append(node.get_text())
|
||||||
elif isinstance(node, NavigableString):
|
elif isinstance(node, NavigableString):
|
||||||
buffer.append(str(node))
|
buffer.append(str(node))
|
||||||
|
|
||||||
# Flush any remaining text
|
|
||||||
flush_buffer()
|
flush_buffer()
|
||||||
|
|
||||||
def _handle_block(self, tag: Tag, doc: DoclingDocument, parent) -> None:
|
def _handle_block(self, tag: Tag, doc: DoclingDocument, parent) -> None:
|
||||||
tag_name = tag.name.lower()
|
tag_name = tag.name.lower()
|
||||||
if tag_name == "h1":
|
|
||||||
text = tag.get_text(strip=True)
|
if tag_name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
|
||||||
if text:
|
|
||||||
doc.add_title(text, parent=parent)
|
|
||||||
for img_tag in tag.find_all("img", recursive=True):
|
|
||||||
self._emit_image(img_tag, doc, parent)
|
|
||||||
elif tag_name in {"h2", "h3", "h4", "h5", "h6"}:
|
|
||||||
level = int(tag_name[1])
|
level = int(tag_name[1])
|
||||||
text = tag.get_text(strip=True)
|
text = tag.get_text(strip=False)
|
||||||
if text:
|
if text:
|
||||||
doc.add_heading(text, level=level, parent=parent)
|
doc.add_heading(text.strip(), level=level, parent=parent)
|
||||||
for img_tag in tag.find_all("img", recursive=True):
|
for img_tag in tag.find_all("img", recursive=True):
|
||||||
self._emit_image(img_tag, doc, parent)
|
self._emit_image(img_tag, doc, parent)
|
||||||
|
|
||||||
elif tag_name == "p":
|
elif tag_name == "p":
|
||||||
for part in tag.get_text().split("\n"):
|
for part in tag.get_text().split("\n"):
|
||||||
seg = part.strip()
|
seg = part.strip()
|
||||||
if seg:
|
if seg:
|
||||||
doc.add_text(DocItemLabel.TEXT, seg, parent=parent)
|
doc.add_text(DocItemLabel.TEXT, seg, parent=parent)
|
||||||
for img_tag in tag.find_all("img", recursive=True):
|
for img_tag in tag.find_all("img", recursive=True):
|
||||||
self._emit_image(img_tag, doc, parent)
|
self._emit_image(img_tag, doc, parent)
|
||||||
|
|
||||||
elif tag_name in {"ul", "ol"}:
|
elif tag_name in {"ul", "ol"}:
|
||||||
is_ordered = tag_name == "ol"
|
is_ordered = (tag_name == "ol")
|
||||||
group = (
|
# Create the list container
|
||||||
|
list_group = (
|
||||||
doc.add_ordered_list(parent=parent)
|
doc.add_ordered_list(parent=parent)
|
||||||
if is_ordered
|
if is_ordered
|
||||||
else doc.add_unordered_list(parent=parent)
|
else doc.add_unordered_list(parent=parent)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# For each top-level <li> in this list
|
||||||
for li in tag.find_all("li", recursive=False):
|
for li in tag.find_all("li", recursive=False):
|
||||||
li_text = li.get_text(separator=" ", strip=True)
|
# 1) extract only the "direct" text from this <li>
|
||||||
|
parts: list[str] = []
|
||||||
|
for child in li.contents:
|
||||||
|
if isinstance(child, NavigableString):
|
||||||
|
text_part = child.strip()
|
||||||
|
if text_part:
|
||||||
|
parts.append(text_part)
|
||||||
|
elif isinstance(child, Tag) and child.name not in ("ul", "ol"):
|
||||||
|
text_part = child.get_text(separator=" ", strip=True)
|
||||||
|
if text_part:
|
||||||
|
parts.append(text_part)
|
||||||
|
li_text = " ".join(parts)
|
||||||
|
|
||||||
|
# 2) add the list item
|
||||||
li_item = doc.add_list_item(
|
li_item = doc.add_list_item(
|
||||||
text=li_text, enumerated=is_ordered, parent=group
|
text=li_text, enumerated=is_ordered, parent=list_group
|
||||||
)
|
)
|
||||||
# Nested lists inside <li>
|
|
||||||
for sub in li.find_all(["ul", "ol"], recursive=False):
|
# 3) recurse into any nested lists, attaching them to this <li> item
|
||||||
self._handle_block(sub, doc, parent=group)
|
for sublist in li.find_all(["ul", "ol"], recursive=False):
|
||||||
|
self._handle_block(sublist, doc, parent=li_item)
|
||||||
|
|
||||||
|
# 4) extract any images under this <li>
|
||||||
for img_tag in li.find_all("img", recursive=True):
|
for img_tag in li.find_all("img", recursive=True):
|
||||||
self._emit_image(img_tag, doc, li_item)
|
self._emit_image(img_tag, doc, li_item)
|
||||||
|
|
||||||
elif tag_name == "table":
|
elif tag_name == "table":
|
||||||
# Add table item and extract nested images
|
|
||||||
data = self._parse_table(tag, doc, parent)
|
data = self._parse_table(tag, doc, parent)
|
||||||
doc.add_table(data=data, parent=parent)
|
doc.add_table(data=data, parent=parent)
|
||||||
|
|
||||||
def _emit_image(self, img_tag: Tag, doc: DoclingDocument, parent) -> None:
|
def _emit_image(self, img_tag: Tag, doc: DoclingDocument, parent) -> None:
|
||||||
"""
|
if self.image_options == ImageOptions.NONE:
|
||||||
Helper to create a PictureItem (with optional CAPTION) for an <img> tag.
|
|
||||||
"""
|
|
||||||
|
|
||||||
if ImageOptions.NONE == self.image_options:
|
|
||||||
return
|
return
|
||||||
|
|
||||||
alt = (img_tag.get("alt") or "").strip()
|
alt = (img_tag.get("alt") or "").strip()
|
||||||
@ -215,46 +213,40 @@ class BaseHTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if alt:
|
if alt:
|
||||||
caption_item = doc.add_text(DocItemLabel.CAPTION, alt, parent=parent)
|
caption_item = doc.add_text(DocItemLabel.CAPTION, alt, parent=parent)
|
||||||
|
|
||||||
src_url = img_tag.get("src")
|
src_url = img_tag.get("src", "")
|
||||||
width = img_tag.get("width", "128")
|
width = img_tag.get("width", str(DEFAULT_IMAGE_WIDTH))
|
||||||
height = img_tag.get("height", "128")
|
height = img_tag.get("height", str(DEFAULT_IMAGE_HEIGHT))
|
||||||
img_ref = None
|
img_ref: Optional[ImageRef] = None
|
||||||
if ImageOptions.INLINE == self.image_options:
|
|
||||||
|
if self.image_options == ImageOptions.EMBEDDED:
|
||||||
try:
|
try:
|
||||||
if src_url.startswith("http"):
|
if src_url.startswith("http"):
|
||||||
img = Image.open(requests.get(src_url, stream=True).raw)
|
img = Image.open(requests.get(src_url, stream=True).raw)
|
||||||
elif src_url.startswith("file:"):
|
|
||||||
img = Image.open(src_url)
|
|
||||||
elif src_url.startswith("data:"):
|
elif src_url.startswith("data:"):
|
||||||
image_data = re.sub("^data:image/.+;base64,", "", src_url)
|
data = re.sub(r"^data:image/.+;base64,", "", src_url)
|
||||||
img = Image.open(BytesIO(base64.b64decode(image_data)))
|
img = Image.open(BytesIO(base64.b64decode(data)))
|
||||||
else:
|
else:
|
||||||
return
|
return
|
||||||
img_ref = ImageRef.from_pil(img, dpi=int(img.info.get("dpi")[0]))
|
img_ref = ImageRef.from_pil(img, dpi=int(img.info.get("dpi", (72,))[0]))
|
||||||
except (FileNotFoundError, UnidentifiedImageError) as ve:
|
except (FileNotFoundError, UnidentifiedImageError) as e:
|
||||||
_log.warning(f"Could not load image (src={src_url}): {ve}")
|
_log.warning(f"Could not load image (src={src_url}): {e}")
|
||||||
return
|
return
|
||||||
elif ImageOptions.REFERENCED == self.image_options:
|
|
||||||
|
elif self.image_options == ImageOptions.REFERENCED:
|
||||||
try:
|
try:
|
||||||
img_url = AnyUrl(src_url)
|
|
||||||
img_ref = ImageRef(
|
img_ref = ImageRef(
|
||||||
uri=img_url,
|
uri=AnyUrl(src_url),
|
||||||
dpi=72,
|
dpi=72,
|
||||||
mimetype="image/png",
|
mimetype="image/png",
|
||||||
size=Size(width=float(width), height=float(height)),
|
size=Size(width=float(width), height=float(height)),
|
||||||
)
|
)
|
||||||
except ValidationError as ve:
|
except ValidationError as e:
|
||||||
_log.warning(f"Could not load image (src={src_url}): {ve}")
|
_log.warning(f"Could not load image (src={src_url}): {e}")
|
||||||
return
|
return
|
||||||
|
|
||||||
doc.add_picture(image=img_ref, caption=caption_item, parent=parent)
|
doc.add_picture(image=img_ref, caption=caption_item, parent=parent)
|
||||||
|
|
||||||
def _parse_table(self, table_tag: Tag, doc: DoclingDocument, parent) -> TableData:
|
def _parse_table(self, table_tag: Tag, doc: DoclingDocument, parent) -> TableData:
|
||||||
"""
|
|
||||||
Convert an HTML table into TableData, capturing cell spans and text,
|
|
||||||
and emitting any nested images as PictureItems.
|
|
||||||
"""
|
|
||||||
# Build TableData
|
|
||||||
rows = []
|
rows = []
|
||||||
for sec in ("thead", "tbody", "tfoot"):
|
for sec in ("thead", "tbody", "tfoot"):
|
||||||
section = table_tag.find(sec)
|
section = table_tag.find(sec)
|
||||||
@ -262,9 +254,11 @@ class BaseHTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
rows.extend(section.find_all("tr", recursive=False))
|
rows.extend(section.find_all("tr", recursive=False))
|
||||||
if not rows:
|
if not rows:
|
||||||
rows = table_tag.find_all("tr", recursive=False)
|
rows = table_tag.find_all("tr", recursive=False)
|
||||||
|
|
||||||
occupied: dict[tuple[int, int], bool] = {}
|
occupied: dict[tuple[int, int], bool] = {}
|
||||||
cells: list[TableCell] = []
|
cells: list[TableCell] = []
|
||||||
max_cols = 0
|
max_cols = 0
|
||||||
|
|
||||||
for r, tr in enumerate(rows):
|
for r, tr in enumerate(rows):
|
||||||
c = 0
|
c = 0
|
||||||
for cell_tag in tr.find_all(("td", "th"), recursive=False):
|
for cell_tag in tr.find_all(("td", "th"), recursive=False):
|
||||||
@ -292,9 +286,11 @@ class BaseHTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
occupied[(r + dr, c + dc)] = True
|
occupied[(r + dr, c + dc)] = True
|
||||||
c += cs
|
c += cs
|
||||||
max_cols = max(max_cols, c)
|
max_cols = max(max_cols, c)
|
||||||
# Emit images inside this table
|
|
||||||
|
# emit any images in the table
|
||||||
for img_tag in table_tag.find_all("img", recursive=True):
|
for img_tag in table_tag.find_all("img", recursive=True):
|
||||||
self._emit_image(img_tag, doc, parent)
|
self._emit_image(img_tag, doc, parent)
|
||||||
|
|
||||||
return TableData(table_cells=cells, num_rows=len(rows), num_cols=max_cols)
|
return TableData(table_cells=cells, num_rows=len(rows), num_cols=max_cols)
|
||||||
|
|
||||||
|
|
||||||
@ -308,14 +304,14 @@ class HTMLDocumentBackend(BaseHTMLDocumentBackend):
|
|||||||
super().__init__(in_doc, path_or_stream, image_options=ImageOptions.NONE)
|
super().__init__(in_doc, path_or_stream, image_options=ImageOptions.NONE)
|
||||||
|
|
||||||
|
|
||||||
class HTMLDocumentBackendImagesInline(BaseHTMLDocumentBackend):
|
class HTMLDocumentBackendImagesEmbedded(BaseHTMLDocumentBackend):
|
||||||
@override
|
@override
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
in_doc: InputDocument,
|
in_doc: InputDocument,
|
||||||
path_or_stream: Union[BytesIO, Path],
|
path_or_stream: Union[BytesIO, Path],
|
||||||
):
|
):
|
||||||
super().__init__(in_doc, path_or_stream, image_options=ImageOptions.INLINE)
|
super().__init__(in_doc, path_or_stream, image_options=ImageOptions.EMBEDDED)
|
||||||
|
|
||||||
|
|
||||||
class HTMLDocumentBackendImagesReferenced(BaseHTMLDocumentBackend):
|
class HTMLDocumentBackendImagesReferenced(BaseHTMLDocumentBackend):
|
||||||
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user