From 89e58ca730a4cb79e000b842394957bcc866644c Mon Sep 17 00:00:00 2001 From: Maxim Lysak Date: Tue, 8 Oct 2024 11:14:44 +0200 Subject: [PATCH] Added HTML backend implementation, few improvements for other backends Signed-off-by: Maxim Lysak --- docling/backend/html_backend.py | 359 +++++++++++++++++++++++- docling/backend/mspowerpoint_backend.py | 2 +- docling/backend/msword_backend.py | 38 +-- examples/run_with_formats.py | 4 +- poetry.lock | 50 +++- pyproject.toml | 1 + 6 files changed, 420 insertions(+), 34 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 8ecb9579..c0315aaf 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -1,26 +1,49 @@ +import logging from io import BytesIO from pathlib import Path from typing import Set, Union +from bs4 import BeautifulSoup from docling_core.types.experimental import ( + BasePictureData, + BaseTableData, DescriptionItem, DocItemLabel, DoclingDocument, + DocumentOrigin, + ImageRef, + PictureItem, + SectionHeaderItem, + TableCell, + TableItem, ) +from docling_core.types.experimental.labels import DocItemLabel, GroupLabel from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.datamodel.base_models import InputFormat +_log = logging.getLogger(__name__) + class HTMLDocumentBackend(DeclarativeDocumentBackend): def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): super().__init__(path_or_stream, document_hash) + self.soup = None + # HTML file: + self.path_or_stream = path_or_stream + # Initialise the parents for the hierarchy + self.max_levels = 10 + self.level = 0 + self.parents = {} + for i in range(0, self.max_levels): + self.parents[i] = None + self.labels = {} def is_valid(self) -> bool: return True def is_paginated(cls) -> bool: - False + return False def unload(self): if isinstance(self.path_or_stream, BytesIO): @@ -33,8 +56,338 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): return {InputFormat.HTML} def convert(self) -> DoclingDocument: - # access self.path_or_stream to load stuff doc = DoclingDocument(description=DescriptionItem(), name="dummy") - doc.add_text(text="I am a HTML document.", label=DocItemLabel.TEXT) + + try: + with open(self.path_or_stream, "r", encoding="utf-8") as f: + html_content = f.read() + self.soup = BeautifulSoup(html_content, "html.parser") + except Exception as e: + _log.error("could not parse html: {}".format(e)) + return doc + + # Replace
tags with newline characters + for br in self.soup.body.find_all("br"): + br.replace_with("\n") + doc = self.walk(self.soup.body, doc) + return doc + + def walk(self, element, doc): + try: + # Iterate over elements in the body of the document + for idx, element in enumerate(element.children): + try: + self.analyse_element(element, idx, doc) + except Exception as exc_child: + _log.error(" -> error treating child: ", exc_child) + _log.error(" => element: ", element, "\n") + pass + + except Exception as exc: + pass + + return doc + + def analyse_element(self, element, idx, doc): + """ + if element.name!=None: + print("\t"*self.level, idx, "\t", f"{element.name} ({self.level})") + """ + + if element.name in self.labels: + self.labels[element.name] += 1 + else: + self.labels[element.name] = 1 + + if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]: + self.handle_header(element, idx, doc) + elif element.name in ["p"]: + self.handle_paragraph(element, idx, doc) + elif element.name in ["ul", "ol"]: + self.handle_list(element, idx, doc) + elif element.name in ["li"]: + self.handle_listitem(element, idx, doc) + elif element.name == "table": + self.handle_table(element, idx, doc) + elif element.name == "figure": + self.handle_figure(element, idx, doc) + elif element.name == "img": + self.handle_image(element, idx, doc) + else: + self.walk(element, doc) + + def get_direct_text(self, item): + """Get the direct text of the
  • element (ignoring nested lists).""" + text = item.find(string=True, recursive=False) + + if isinstance(text, str): + return text.strip() + + return "" + + # Function to recursively extract text from all child nodes + def extract_text_recursively(self, item): + result = [] + + if isinstance(item, str): + return [item] + + result.append(self.get_direct_text(item)) + + try: + # Iterate over the children (and their text and tails) + for child in item: + try: + # Recursively get the child's text content + result.extend(self.extract_text_recursively(child)) + except: + pass + except: + _log.warn("item has no children") + pass + + return " ".join(result) + + def handle_header(self, element, idx, doc): + """Handles header tags (h1, h2, etc.).""" + hlevel = int(element.name.replace("h", "")) + slevel = hlevel - 1 + + label = DocItemLabel.SECTION_HEADER + text = element.text.strip() + + if hlevel == 1: + for key, val in self.parents.items(): + self.parents[key] = None + + self.level = 1 + self.parents[self.level] = doc.add_text( + parent=self.parents[0], label=DocItemLabel.TITLE, text=text + ) + + elif hlevel == self.level: + self.parents[hlevel] = doc.add_text( + parent=self.parents[hlevel - 1], label=label, text=text + ) + + elif hlevel > self.level: + + # add invisible group + for i in range(self.level + 1, hlevel): + self.parents[i] = doc.add_group( + name=f"header-{i}", + label=GroupLabel.SECTION, + parent=self.parents[i - 1], + ) + + self.parents[hlevel] = doc.add_text( + parent=self.parents[hlevel - 1], label=label, text=text + ) + self.level = hlevel + + elif hlevel < self.level: + + # remove the tail + for key, val in self.parents.items(): + if key > hlevel: + self.parents[key] = None + + self.parents[hlevel] = doc.add_text( + parent=self.parents[hlevel - 1], label=label, text=text + ) + self.level = hlevel + + def handle_paragraph(self, element, idx, doc): + """Handles paragraph tags (p).""" + if element.text is None: + return + text = element.text.strip() + label = DocItemLabel.PARAGRAPH + if len(text) == 0: + return + doc.add_text(parent=self.parents[self.level], label=label, text=text) + + def handle_list(self, element, idx, doc): + """Handles list tags (ul, ol) and their list items.""" + + # create a list group + self.parents[self.level + 1] = doc.add_group( + parent=self.parents[self.level], name="list", label=GroupLabel.LIST + ) + self.level += 1 + + self.walk(element, doc) + + self.parents[self.level + 1] = None + self.level -= 1 + + def handle_listitem(self, element, idx, doc): + """Handles listitem tags (li).""" + nested_lists = element.find(["ul", "ol"]) + if nested_lists: + name = element.name + text = self.get_direct_text(element) + + # create a list-item + self.parents[self.level + 1] = doc.add_text( + label=DocItemLabel.LIST_ITEM, text=text, parent=self.parents[self.level] + ) + self.level += 1 + + self.walk(element, doc) + + self.parents[self.level + 1] = None + self.level -= 1 + + elif isinstance(element.text, str): + text = element.text.strip() + + doc.add_text( + label=DocItemLabel.LIST_ITEM, text=text, parent=self.parents[self.level] + ) + else: + _log.warn("list-item has no text: ", element) + + def handle_table(self, element, idx, doc): + """Handles table tags.""" + + nested_tables = element.find("table") + if nested_tables is not None: + _log.warn("detected nested tables: skipping for now") + return + + # Count the number of rows (number of elements) + num_rows = len(element.find_all("tr")) + + # Find the number of columns (taking into account colspan) + num_cols = 0 + for row in element.find_all("tr"): + col_count = 0 + for cell in row.find_all(["td", "th"]): + colspan = int(cell.get("colspan", 1)) + col_count += colspan + num_cols = max(num_cols, col_count) + + grid = [[None for _ in range(num_cols)] for _ in range(num_rows)] + + data = BaseTableData(num_rows=num_rows, num_cols=num_cols, table_cells=[]) + + # Iterate over the rows in the table + for row_idx, row in enumerate(element.find_all("tr")): + + # For each row, find all the column cells (both and ) + cells = row.find_all(["td", "th"]) + + # Check if each cell in the row is a header -> means it is a column header + col_header = True + for j, html_cell in enumerate(cells): + if html_cell.name == "td": + col_header = False + + col_idx = 0 + # Extract and print the text content of each cell + for _, html_cell in enumerate(cells): + + text = html_cell.text + try: + text = self.extract_table_cell_text(html_cell) + except Exception as exc: + _log.warn("exception: ", exc) + exit(-1) + + # label = html_cell.name + + col_span = int(html_cell.get("colspan", 1)) + row_span = int(html_cell.get("rowspan", 1)) + + while grid[row_idx][col_idx] is not None: + col_idx += 1 + for r in range(row_span): + for c in range(col_span): + grid[row_idx + r][col_idx + c] = text + + cell = TableCell( + text=text, + row_span=row_span, + col_span=col_span, + start_row_offset_idx=row_idx, + end_row_offset_idx=row_idx + row_span, + start_col_offset_idx=col_idx, + end_col_offset_idx=col_idx + col_span, + col_header=col_header, + row_header=((not col_header) and html_cell.name == "th"), + ) + data.table_cells.append(cell) + + doc.add_table(data=data, parent=self.parents[self.level]) + + def get_list_text(list_element, level=0): + """Recursively extract text from
      or
        with proper indentation.""" + result = [] + bullet_char = "*" # Default bullet character for unordered lists + + if list_element.name == "ol": # For ordered lists, use numbers + for i, li in enumerate(list_element.find_all("li", recursive=False), 1): + # Add numbering for ordered lists + result.append(f"{' ' * level}{i}. {li.get_text(strip=True)}") + # Handle nested lists + nested_list = li.find(["ul", "ol"]) + if nested_list: + result.extend(get_list_text(nested_list, level + 1)) + elif list_element.name == "ul": # For unordered lists, use bullet points + for li in list_element.find_all("li", recursive=False): + # Add bullet points for unordered lists + result.append( + f"{' ' * level}{bullet_char} {li.get_text(strip=True)}" + ) + # Handle nested lists + nested_list = li.find(["ul", "ol"]) + if nested_list: + result.extend(get_list_text(nested_list, level + 1)) + + return result + + def extract_table_cell_text(self, cell): + """Extract text from a table cell, including lists with indents.""" + contains_lists = cell.find(["ul", "ol"]) + if contains_lists is None: + return cell.text + else: + _log.warn( + "should extract the content correctly for table-cells with lists ..." + ) + return cell.text + + def handle_figure(self, element, idx, doc): + """Handles image tags (img).""" + + # Extract the image URI from the tag + # image_uri = root.xpath('//figure//img/@src')[0] + + contains_captions = element.find(["figcaption"]) + if contains_captions is None: + doc.add_picture( + data=BasePictureData(), parent=self.parents[self.level], caption=None + ) + + else: + texts = [] + for item in contains_captions: + texts.append(item.text) + + fig_caption = doc.add_text( + label=DocItemLabel.CAPTION, text=("".join(texts)).strip() + ) + doc.add_picture( + data=BasePictureData(), + parent=self.parents[self.level], + caption=fig_caption, + ) + + def handle_image(self, element, idx, doc): + """Handles image tags (img).""" + doc.add_picture( + data=BasePictureData(), parent=self.parents[self.level], caption=None + ) diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py index 2a1cf35b..d7ec1c67 100644 --- a/docling/backend/mspowerpoint_backend.py +++ b/docling/backend/mspowerpoint_backend.py @@ -43,7 +43,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend): return True def is_paginated(cls) -> bool: - False + return False # True? if so, how to handle pages... def unload(self): if isinstance(self.path_or_stream, BytesIO): diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index c284eee1..c9b12014 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -55,7 +55,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): return True def is_paginated(cls) -> bool: - False + return False def unload(self): if isinstance(self.path_or_stream, BytesIO): @@ -63,6 +63,24 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.path_or_stream = None + @classmethod + def supported_formats(cls) -> Set[InputFormat]: + return {InputFormat.DOCX} + + def convert(self) -> DoclingDocument: + # Parses the DOCX into a structured document model. + doc = DoclingDocument(description=DescriptionItem(), name="dummy") + docx_obj = None + try: + docx_obj = docx.Document(self.path_or_stream) + except Exception: + _log.error("could not parse docx") + return doc + + # self.initialise() + doc = self.walk_linear(docx_obj.element.body, docx_obj, doc) + return doc + def update_history(self, name, level, numid, ilevel): self.history["names"].append(name) self.history["levels"].append(level) @@ -89,24 +107,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): return k return 0 - @classmethod - def supported_formats(cls) -> Set[InputFormat]: - return {InputFormat.DOCX} - - def convert(self) -> DoclingDocument: - # Parses the DOCX into a structured document model. - doc = DoclingDocument(description=DescriptionItem(), name="dummy") - docx_obj = None - try: - docx_obj = docx.Document(self.path_or_stream) - except Exception: - _log.error("could not parse docx") - return doc - - # self.initialise() - doc = self.walk_linear(docx_obj.element.body, docx_obj, doc) - return doc - def walk_linear(self, body, docx_obj, doc) -> DoclingDocument: for element in body: tag_name = etree.QName(element).localname diff --git a/examples/run_with_formats.py b/examples/run_with_formats.py index 3080c0ab..f143c9f8 100644 --- a/examples/run_with_formats.py +++ b/examples/run_with_formats.py @@ -23,11 +23,11 @@ _log = logging.getLogger(__name__) USE_EXPERIMENTAL = False input_paths = [ - # Path("tests/data/wiki_duck.html"), + Path("tests/data/wiki_duck.html"), Path("tests/data/word_sample.docx"), Path("tests/data/lorem_ipsum.docx"), Path("tests/data/powerpoint_sample.pptx"), - # Path("tests/data/2206.01062.pdf"), + Path("tests/data/2206.01062.pdf"), ] input = DocumentConversionInput.from_paths(input_paths) diff --git a/poetry.lock b/poetry.lock index fe873c75..f6acbe3a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -196,8 +196,8 @@ files = [ lazy-object-proxy = ">=1.4.0" typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""} wrapt = [ - {version = ">=1.14,<2", markers = "python_version >= \"3.11\""}, {version = ">=1.11,<2", markers = "python_version < \"3.11\""}, + {version = ">=1.14,<2", markers = "python_version >= \"3.11\""}, ] [[package]] @@ -278,6 +278,27 @@ files = [ docs = ["furo", "jaraco.packaging (>=9.3)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] testing = ["jaraco.test", "pytest (!=8.0.*)", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)"] +[[package]] +name = "beautifulsoup4" +version = "4.12.3" +description = "Screen-scraping library" +optional = false +python-versions = ">=3.6.0" +files = [ + {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"}, + {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"}, +] + +[package.dependencies] +soupsieve = ">1.2" + +[package.extras] +cchardet = ["cchardet"] +chardet = ["chardet"] +charset-normalizer = ["charset-normalizer"] +html5lib = ["html5lib"] +lxml = ["lxml"] + [[package]] name = "black" version = "24.8.0" @@ -2369,8 +2390,8 @@ jsonpatch = ">=1.33,<2.0" langsmith = ">=0.1.112,<0.2.0" packaging = ">=23.2,<25" pydantic = [ - {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""}, {version = ">=1,<3", markers = "python_full_version < \"3.12.4\""}, + {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""}, ] PyYAML = ">=5.3" tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<9.0.0" @@ -2409,8 +2430,8 @@ files = [ langchain-core = {version = ">=0.2.38,<0.4", markers = "python_version >= \"3.9\""} pymilvus = ">=2.4.3,<3.0.0" scipy = [ - {version = ">=1.9,<2.0", markers = "python_version >= \"3.12\""}, {version = ">=1.7,<2.0", markers = "python_version < \"3.12\""}, + {version = ">=1.9,<2.0", markers = "python_version >= \"3.12\""}, ] [[package]] @@ -2442,8 +2463,8 @@ files = [ httpx = ">=0.23.0,<1" orjson = ">=3.9.14,<4.0.0" pydantic = [ - {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""}, {version = ">=1,<3", markers = "python_full_version < \"3.12.4\""}, + {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""}, ] requests = ">=2,<3" requests-toolbelt = ">=1.0.0,<2.0.0" @@ -3640,10 +3661,10 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] [[package]] @@ -3776,9 +3797,9 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.22.4", markers = "python_version < \"3.11\""}, {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" @@ -4250,8 +4271,8 @@ files = [ annotated-types = ">=0.6.0" pydantic-core = "2.23.4" typing-extensions = [ - {version = ">=4.12.2", markers = "python_version >= \"3.13\""}, {version = ">=4.6.1", markers = "python_version < \"3.13\""}, + {version = ">=4.12.2", markers = "python_version >= \"3.13\""}, ] [package.extras] @@ -4419,8 +4440,8 @@ files = [ astroid = ">=2.15.8,<=2.17.0-dev0" colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""} dill = [ - {version = ">=0.3.6", markers = "python_version >= \"3.11\""}, {version = ">=0.2", markers = "python_version < \"3.11\""}, + {version = ">=0.3.6", markers = "python_version >= \"3.11\""}, ] isort = ">=4.2.5,<6" mccabe = ">=0.6,<0.8" @@ -5933,6 +5954,17 @@ files = [ {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, ] +[[package]] +name = "soupsieve" +version = "2.6" +description = "A modern CSS selector implementation for Beautiful Soup." +optional = false +python-versions = ">=3.8" +files = [ + {file = "soupsieve-2.6-py3-none-any.whl", hash = "sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9"}, + {file = "soupsieve-2.6.tar.gz", hash = "sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb"}, +] + [[package]] name = "sqlalchemy" version = "2.0.35" @@ -7287,4 +7319,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "a7c265a72158d56174202bdf33259ec2a9a60c54e74e4a8d61647d463f906880" +content-hash = "34dcde27e9214be7ebae7b2bc99e407e50c40cc44731552af7e8c97e53b2edf2" diff --git a/pyproject.toml b/pyproject.toml index 17f587e3..6ad7ade5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,7 @@ pyarrow = "^16.1.0" typer = "^0.12.5" python-docx = "^1.1.2" python-pptx = "^1.0.2" +beautifulsoup4 = "^4.12.3" [tool.poetry.group.dev.dependencies] black = {extras = ["jupyter"], version = "^24.4.2"}