Added HTML backend implementation and a few improvements for other backends

Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>
Maxim Lysak 2024-10-08 11:14:44 +02:00
parent f773d8a621
commit 89e58ca730
6 changed files with 420 additions and 34 deletions


@@ -1,26 +1,49 @@
import logging
from io import BytesIO
from pathlib import Path
from typing import Set, Union

from bs4 import BeautifulSoup
from docling_core.types.experimental import (
    BasePictureData,
    BaseTableData,
    DescriptionItem,
    DocItemLabel,
    DoclingDocument,
    DocumentOrigin,
    ImageRef,
    PictureItem,
    SectionHeaderItem,
    TableCell,
    TableItem,
)
from docling_core.types.experimental.labels import DocItemLabel, GroupLabel

from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat

_log = logging.getLogger(__name__)


class HTMLDocumentBackend(DeclarativeDocumentBackend):
    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
        super().__init__(path_or_stream, document_hash)
        self.soup = None

        # HTML file:
        self.path_or_stream = path_or_stream

        # Initialise the parents for the hierarchy
        self.max_levels = 10
        self.level = 0
        self.parents = {}
        for i in range(0, self.max_levels):
            self.parents[i] = None

        self.labels = {}

    def is_valid(self) -> bool:
        return True

    def is_paginated(cls) -> bool:
-        False
+        return False

    def unload(self):
        if isinstance(self.path_or_stream, BytesIO):
@@ -33,8 +56,338 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        return {InputFormat.HTML}

    def convert(self) -> DoclingDocument:
        # access self.path_or_stream to load stuff
        doc = DoclingDocument(description=DescriptionItem(), name="dummy")
-        doc.add_text(text="I am a HTML document.", label=DocItemLabel.TEXT)

        try:
            with open(self.path_or_stream, "r", encoding="utf-8") as f:
                html_content = f.read()
                self.soup = BeautifulSoup(html_content, "html.parser")
        except Exception as e:
            _log.error("could not parse html: {}".format(e))
            return doc

        # Replace <br> tags with newline characters
        for br in self.soup.body.find_all("br"):
            br.replace_with("\n")

        doc = self.walk(self.soup.body, doc)

        return doc
    def walk(self, element, doc):
        try:
            # Iterate over elements in the body of the document
            for idx, element in enumerate(element.children):
                try:
                    self.analyse_element(element, idx, doc)
                except Exception as exc_child:
                    _log.error(" -> error treating child: ", exc_child)
                    _log.error(" => element: ", element, "\n")
                    pass
        except Exception as exc:
            pass

        return doc
    def analyse_element(self, element, idx, doc):
        """
        if element.name!=None:
            print("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
        """

        if element.name in self.labels:
            self.labels[element.name] += 1
        else:
            self.labels[element.name] = 1

        if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            self.handle_header(element, idx, doc)
        elif element.name in ["p"]:
            self.handle_paragraph(element, idx, doc)
        elif element.name in ["ul", "ol"]:
            self.handle_list(element, idx, doc)
        elif element.name in ["li"]:
            self.handle_listitem(element, idx, doc)
        elif element.name == "table":
            self.handle_table(element, idx, doc)
        elif element.name == "figure":
            self.handle_figure(element, idx, doc)
        elif element.name == "img":
            self.handle_image(element, idx, doc)
        else:
            self.walk(element, doc)
    def get_direct_text(self, item):
        """Get the direct text of the <li> element (ignoring nested lists)."""
        text = item.find(string=True, recursive=False)
        if isinstance(text, str):
            return text.strip()
        return ""

    # Function to recursively extract text from all child nodes
    def extract_text_recursively(self, item):
        result = []

        if isinstance(item, str):
            return [item]

        result.append(self.get_direct_text(item))
        try:
            # Iterate over the children (and their text and tails)
            for child in item:
                try:
                    # Recursively get the child's text content
                    result.extend(self.extract_text_recursively(child))
                except:
                    pass
        except:
            _log.warn("item has no children")
            pass

        return " ".join(result)
    def handle_header(self, element, idx, doc):
        """Handles header tags (h1, h2, etc.)."""
        hlevel = int(element.name.replace("h", ""))
        slevel = hlevel - 1

        label = DocItemLabel.SECTION_HEADER
        text = element.text.strip()

        if hlevel == 1:
            for key, val in self.parents.items():
                self.parents[key] = None

            self.level = 1
            self.parents[self.level] = doc.add_text(
                parent=self.parents[0], label=DocItemLabel.TITLE, text=text
            )
        elif hlevel == self.level:
            self.parents[hlevel] = doc.add_text(
                parent=self.parents[hlevel - 1], label=label, text=text
            )
        elif hlevel > self.level:
            # add invisible group
            for i in range(self.level + 1, hlevel):
                self.parents[i] = doc.add_group(
                    name=f"header-{i}",
                    label=GroupLabel.SECTION,
                    parent=self.parents[i - 1],
                )
            self.parents[hlevel] = doc.add_text(
                parent=self.parents[hlevel - 1], label=label, text=text
            )
            self.level = hlevel
        elif hlevel < self.level:
            # remove the tail
            for key, val in self.parents.items():
                if key > hlevel:
                    self.parents[key] = None

            self.parents[hlevel] = doc.add_text(
                parent=self.parents[hlevel - 1], label=label, text=text
            )
            self.level = hlevel
    def handle_paragraph(self, element, idx, doc):
        """Handles paragraph tags (p)."""
        if element.text is None:
            return
        text = element.text.strip()
        label = DocItemLabel.PARAGRAPH
        if len(text) == 0:
            return

        doc.add_text(parent=self.parents[self.level], label=label, text=text)

    def handle_list(self, element, idx, doc):
        """Handles list tags (ul, ol) and their list items."""
        # create a list group
        self.parents[self.level + 1] = doc.add_group(
            parent=self.parents[self.level], name="list", label=GroupLabel.LIST
        )
        self.level += 1

        self.walk(element, doc)

        self.parents[self.level + 1] = None
        self.level -= 1
    def handle_listitem(self, element, idx, doc):
        """Handles listitem tags (li)."""
        nested_lists = element.find(["ul", "ol"])
        if nested_lists:
            name = element.name
            text = self.get_direct_text(element)

            # create a list-item
            self.parents[self.level + 1] = doc.add_text(
                label=DocItemLabel.LIST_ITEM, text=text, parent=self.parents[self.level]
            )
            self.level += 1

            self.walk(element, doc)

            self.parents[self.level + 1] = None
            self.level -= 1
        elif isinstance(element.text, str):
            text = element.text.strip()

            doc.add_text(
                label=DocItemLabel.LIST_ITEM, text=text, parent=self.parents[self.level]
            )
        else:
            _log.warn("list-item has no text: ", element)
    def handle_table(self, element, idx, doc):
        """Handles table tags."""
        nested_tables = element.find("table")
        if nested_tables is not None:
            _log.warn("detected nested tables: skipping for now")
            return

        # Count the number of rows (number of <tr> elements)
        num_rows = len(element.find_all("tr"))

        # Find the number of columns (taking into account colspan)
        num_cols = 0
        for row in element.find_all("tr"):
            col_count = 0
            for cell in row.find_all(["td", "th"]):
                colspan = int(cell.get("colspan", 1))
                col_count += colspan
            num_cols = max(num_cols, col_count)

        grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]

        data = BaseTableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])

        # Iterate over the rows in the table
        for row_idx, row in enumerate(element.find_all("tr")):
            # For each row, find all the column cells (both <td> and <th>)
            cells = row.find_all(["td", "th"])

            # Check if each cell in the row is a header -> means it is a column header
            col_header = True
            for j, html_cell in enumerate(cells):
                if html_cell.name == "td":
                    col_header = False

            col_idx = 0
            # Extract and print the text content of each cell
            for _, html_cell in enumerate(cells):
                text = html_cell.text
                try:
                    text = self.extract_table_cell_text(html_cell)
                except Exception as exc:
                    _log.warn("exception: ", exc)
                    exit(-1)

                # label = html_cell.name

                col_span = int(html_cell.get("colspan", 1))
                row_span = int(html_cell.get("rowspan", 1))

                while grid[row_idx][col_idx] is not None:
                    col_idx += 1
                for r in range(row_span):
                    for c in range(col_span):
                        grid[row_idx + r][col_idx + c] = text

                cell = TableCell(
                    text=text,
                    row_span=row_span,
                    col_span=col_span,
                    start_row_offset_idx=row_idx,
                    end_row_offset_idx=row_idx + row_span,
                    start_col_offset_idx=col_idx,
                    end_col_offset_idx=col_idx + col_span,
                    col_header=col_header,
                    row_header=((not col_header) and html_cell.name == "th"),
                )
                data.table_cells.append(cell)

        doc.add_table(data=data, parent=self.parents[self.level])
    def get_list_text(list_element, level=0):
        """Recursively extract text from <ul> or <ol> with proper indentation."""
        result = []
        bullet_char = "*"  # Default bullet character for unordered lists

        if list_element.name == "ol":  # For ordered lists, use numbers
            for i, li in enumerate(list_element.find_all("li", recursive=False), 1):
                # Add numbering for ordered lists
                result.append(f"{' ' * level}{i}. {li.get_text(strip=True)}")
                # Handle nested lists
                nested_list = li.find(["ul", "ol"])
                if nested_list:
                    result.extend(get_list_text(nested_list, level + 1))
        elif list_element.name == "ul":  # For unordered lists, use bullet points
            for li in list_element.find_all("li", recursive=False):
                # Add bullet points for unordered lists
                result.append(
                    f"{' ' * level}{bullet_char} {li.get_text(strip=True)}"
                )
                # Handle nested lists
                nested_list = li.find(["ul", "ol"])
                if nested_list:
                    result.extend(get_list_text(nested_list, level + 1))

        return result
    def extract_table_cell_text(self, cell):
        """Extract text from a table cell, including lists with indents."""
        contains_lists = cell.find(["ul", "ol"])
        if contains_lists is None:
            return cell.text
        else:
            _log.warn(
                "should extract the content correctly for table-cells with lists ..."
            )
            return cell.text
    def handle_figure(self, element, idx, doc):
        """Handles image tags (img)."""

        # Extract the image URI from the <img> tag
        # image_uri = root.xpath('//figure//img/@src')[0]

        contains_captions = element.find(["figcaption"])
        if contains_captions is None:
            doc.add_picture(
                data=BasePictureData(), parent=self.parents[self.level], caption=None
            )
        else:
            texts = []
            for item in contains_captions:
                texts.append(item.text)

            fig_caption = doc.add_text(
                label=DocItemLabel.CAPTION, text=("".join(texts)).strip()
            )
            doc.add_picture(
                data=BasePictureData(),
                parent=self.parents[self.level],
                caption=fig_caption,
            )

    def handle_image(self, element, idx, doc):
        """Handles image tags (img)."""
        doc.add_picture(
            data=BasePictureData(), parent=self.parents[self.level], caption=None
        )
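
A minimal usage sketch for the new backend (not part of the commit): the module path docling.backend.html_backend and the hash value are assumptions for illustration; the constructor signature, is_valid(), and convert() are taken from the diff above, and the sample file comes from the test inputs enabled in this commit.

from pathlib import Path

from docling.backend.html_backend import HTMLDocumentBackend  # assumed module path

# The document hash is a placeholder; convert() parses the file with BeautifulSoup,
# walks the <body>, and returns a DoclingDocument with headers, paragraphs, lists,
# tables and figures.
backend = HTMLDocumentBackend(
    path_or_stream=Path("tests/data/wiki_duck.html"), document_hash="placeholder-hash"
)
if backend.is_valid():
    doc = backend.convert()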


@@ -43,7 +43,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend):
        return True

    def is_paginated(cls) -> bool:
-        False
+        return False  # True? if so, how to handle pages...

    def unload(self):
        if isinstance(self.path_or_stream, BytesIO):


@@ -55,7 +55,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        return True

    def is_paginated(cls) -> bool:
-        False
+        return False

    def unload(self):
        if isinstance(self.path_or_stream, BytesIO):
@@ -63,6 +63,24 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        self.path_or_stream = None

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        return {InputFormat.DOCX}

    def convert(self) -> DoclingDocument:
        # Parses the DOCX into a structured document model.
        doc = DoclingDocument(description=DescriptionItem(), name="dummy")

        docx_obj = None
        try:
            docx_obj = docx.Document(self.path_or_stream)
        except Exception:
            _log.error("could not parse docx")
            return doc

        # self.initialise()
        doc = self.walk_linear(docx_obj.element.body, docx_obj, doc)
        return doc

    def update_history(self, name, level, numid, ilevel):
        self.history["names"].append(name)
        self.history["levels"].append(level)
@@ -89,24 +107,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                return k
        return 0

-    @classmethod
-    def supported_formats(cls) -> Set[InputFormat]:
-        return {InputFormat.DOCX}
-    def convert(self) -> DoclingDocument:
-        # Parses the DOCX into a structured document model.
-        doc = DoclingDocument(description=DescriptionItem(), name="dummy")
-        docx_obj = None
-        try:
-            docx_obj = docx.Document(self.path_or_stream)
-        except Exception:
-            _log.error("could not parse docx")
-            return doc
-        # self.initialise()
-        doc = self.walk_linear(docx_obj.element.body, docx_obj, doc)
-        return doc

    def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
        for element in body:
            tag_name = etree.QName(element).localname


@@ -23,11 +23,11 @@ _log = logging.getLogger(__name__)
USE_EXPERIMENTAL = False

input_paths = [
-    # Path("tests/data/wiki_duck.html"),
+    Path("tests/data/wiki_duck.html"),
    Path("tests/data/word_sample.docx"),
    Path("tests/data/lorem_ipsum.docx"),
    Path("tests/data/powerpoint_sample.pptx"),
-    # Path("tests/data/2206.01062.pdf"),
+    Path("tests/data/2206.01062.pdf"),
]

input = DocumentConversionInput.from_paths(input_paths)

poetry.lock (generated)

@@ -196,8 +196,8 @@ files = [
lazy-object-proxy = ">=1.4.0"
typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""}
wrapt = [
-    {version = ">=1.14,<2", markers = "python_version >= \"3.11\""},
    {version = ">=1.11,<2", markers = "python_version < \"3.11\""},
+    {version = ">=1.14,<2", markers = "python_version >= \"3.11\""},
]

[[package]]
@@ -278,6 +278,27 @@ files = [
docs = ["furo", "jaraco.packaging (>=9.3)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
testing = ["jaraco.test", "pytest (!=8.0.*)", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)"]

[[package]]
name = "beautifulsoup4"
version = "4.12.3"
description = "Screen-scraping library"
optional = false
python-versions = ">=3.6.0"
files = [
    {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"},
    {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"},
]

[package.dependencies]
soupsieve = ">1.2"

[package.extras]
cchardet = ["cchardet"]
chardet = ["chardet"]
charset-normalizer = ["charset-normalizer"]
html5lib = ["html5lib"]
lxml = ["lxml"]

[[package]]
name = "black"
version = "24.8.0"
@@ -2369,8 +2390,8 @@ jsonpatch = ">=1.33,<2.0"
langsmith = ">=0.1.112,<0.2.0"
packaging = ">=23.2,<25"
pydantic = [
-    {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""},
    {version = ">=1,<3", markers = "python_full_version < \"3.12.4\""},
+    {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""},
]
PyYAML = ">=5.3"
tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<9.0.0"
@@ -2409,8 +2430,8 @@ files = [
langchain-core = {version = ">=0.2.38,<0.4", markers = "python_version >= \"3.9\""}
pymilvus = ">=2.4.3,<3.0.0"
scipy = [
-    {version = ">=1.9,<2.0", markers = "python_version >= \"3.12\""},
    {version = ">=1.7,<2.0", markers = "python_version < \"3.12\""},
+    {version = ">=1.9,<2.0", markers = "python_version >= \"3.12\""},
]

[[package]]
@@ -2442,8 +2463,8 @@ files = [
httpx = ">=0.23.0,<1"
orjson = ">=3.9.14,<4.0.0"
pydantic = [
-    {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""},
    {version = ">=1,<3", markers = "python_full_version < \"3.12.4\""},
+    {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""},
]
requests = ">=2,<3"
requests-toolbelt = ">=1.0.0,<2.0.0"
@@ -3640,10 +3661,10 @@ files = [
[package.dependencies]
numpy = [
-    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
    {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
    {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
    {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
+    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
]

[[package]]
@@ -3776,9 +3797,9 @@ files = [
[package.dependencies]
numpy = [
-    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
    {version = ">=1.22.4", markers = "python_version < \"3.11\""},
    {version = ">=1.23.2", markers = "python_version == \"3.11\""},
+    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
]
python-dateutil = ">=2.8.2"
pytz = ">=2020.1"
@@ -4250,8 +4271,8 @@ files = [
annotated-types = ">=0.6.0"
pydantic-core = "2.23.4"
typing-extensions = [
-    {version = ">=4.12.2", markers = "python_version >= \"3.13\""},
    {version = ">=4.6.1", markers = "python_version < \"3.13\""},
+    {version = ">=4.12.2", markers = "python_version >= \"3.13\""},
]

[package.extras]
@@ -4419,8 +4440,8 @@ files = [
astroid = ">=2.15.8,<=2.17.0-dev0"
colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""}
dill = [
-    {version = ">=0.3.6", markers = "python_version >= \"3.11\""},
    {version = ">=0.2", markers = "python_version < \"3.11\""},
+    {version = ">=0.3.6", markers = "python_version >= \"3.11\""},
]
isort = ">=4.2.5,<6"
mccabe = ">=0.6,<0.8"
@@ -5933,6 +5954,17 @@ files = [
    {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"},
]

[[package]]
name = "soupsieve"
version = "2.6"
description = "A modern CSS selector implementation for Beautiful Soup."
optional = false
python-versions = ">=3.8"
files = [
    {file = "soupsieve-2.6-py3-none-any.whl", hash = "sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9"},
    {file = "soupsieve-2.6.tar.gz", hash = "sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb"},
]

[[package]]
name = "sqlalchemy"
version = "2.0.35"
@@ -7287,4 +7319,4 @@ type = ["pytest-mypy"]
[metadata]
lock-version = "2.0"
python-versions = "^3.10"
-content-hash = "a7c265a72158d56174202bdf33259ec2a9a60c54e74e4a8d61647d463f906880"
+content-hash = "34dcde27e9214be7ebae7b2bc99e407e50c40cc44731552af7e8c97e53b2edf2"


@@ -55,6 +55,7 @@ pyarrow = "^16.1.0"
typer = "^0.12.5"
python-docx = "^1.1.2"
python-pptx = "^1.0.2"
beautifulsoup4 = "^4.12.3"

[tool.poetry.group.dev.dependencies]
black = {extras = ["jupyter"], version = "^24.4.2"}