diff --git a/CHANGELOG.md b/CHANGELOG.md index 597dde6e..ab946a87 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,107 @@ +## [v2.24.0](https://github.com/DS4SD/docling/releases/tag/v2.24.0) - 2025-02-20 + +### Feature + +* Implement new reading-order model ([#916](https://github.com/DS4SD/docling/issues/916)) ([`c93e369`](https://github.com/DS4SD/docling/commit/c93e36988f1e1e461477223143c2c1fb2162d11f)) + +## [v2.23.1](https://github.com/DS4SD/docling/releases/tag/v2.23.1) - 2025-02-20 + +### Fix + +* Runtime error when Pandas Series is not always of string type ([#1024](https://github.com/DS4SD/docling/issues/1024)) ([`6796f0a`](https://github.com/DS4SD/docling/commit/6796f0a13263281cd48712b3c71579bfd81bb0d1)) + +### Documentation + +* Revamp picture description example ([#1015](https://github.com/DS4SD/docling/issues/1015)) ([`27c0400`](https://github.com/DS4SD/docling/commit/27c04007bc1be7a6f6c90aaf04ea9f4ff8eb1f3d)) + +## [v2.23.0](https://github.com/DS4SD/docling/releases/tag/v2.23.0) - 2025-02-17 + +### Feature + +* Support cuda:n GPU device allocation ([#694](https://github.com/DS4SD/docling/issues/694)) ([`77eb77b`](https://github.com/DS4SD/docling/commit/77eb77bdc2c07b632a1d171826d1855a5218399e)) +* **xml-jats:** Parse XML JATS documents ([#967](https://github.com/DS4SD/docling/issues/967)) ([`428b656`](https://github.com/DS4SD/docling/commit/428b656793cb75d108c69f20c254be7c198cee5c)) + +### Fix + +* Revise DocTags, fix iterate_items to output content_layer in items ([#965](https://github.com/DS4SD/docling/issues/965)) ([`6e75f0b`](https://github.com/DS4SD/docling/commit/6e75f0b5d3ee42738a80049d4cf2fa6d34e8ab97)) + +## [v2.22.0](https://github.com/DS4SD/docling/releases/tag/v2.22.0) - 2025-02-14 + +### Feature + +* Add support for CSV input with new backend to transform CSV files to DoclingDocument ([#945](https://github.com/DS4SD/docling/issues/945)) ([`00d9405`](https://github.com/DS4SD/docling/commit/00d9405b0ac519d321ae54e8150f5facbaabbe14)) +* 
Introduce the enable_remote_services option to allow remote connections while processing ([#941](https://github.com/DS4SD/docling/issues/941)) ([`2716c7d`](https://github.com/DS4SD/docling/commit/2716c7d4ffb836664178178d3f8d01b7f9112595)) +* Allow artifacts_path to be defined as ENV ([#940](https://github.com/DS4SD/docling/issues/940)) ([`5101e25`](https://github.com/DS4SD/docling/commit/5101e2519e7a5bb727531b1412b1131a7cfbda52)) + +### Fix + +* Update Pillow constraints ([#958](https://github.com/DS4SD/docling/issues/958)) ([`af19c03`](https://github.com/DS4SD/docling/commit/af19c03f6e5e0b24e12d6a3baac6c46a4c8b10d1)) +* Fix the initialization of the TesseractOcrModel ([#935](https://github.com/DS4SD/docling/issues/935)) ([`c47ae70`](https://github.com/DS4SD/docling/commit/c47ae700ece2ea4efee17f82e4667c1ce9a0ed2a)) + +### Documentation + +* Update example Dockerfile with download CLI ([#929](https://github.com/DS4SD/docling/issues/929)) ([`7493d5b`](https://github.com/DS4SD/docling/commit/7493d5b01f8be60294afeffdfb54a62bb74bcc92)) +* Examples for picture descriptions ([#951](https://github.com/DS4SD/docling/issues/951)) ([`2d66e99`](https://github.com/DS4SD/docling/commit/2d66e99b69f39a282109c366fae3679f41c6e081)) + +## [v2.21.0](https://github.com/DS4SD/docling/releases/tag/v2.21.0) - 2025-02-10 + +### Feature + +* Add content_layer property to items to address body, furniture and other roles ([#735](https://github.com/DS4SD/docling/issues/735)) ([`cf78d5b`](https://github.com/DS4SD/docling/commit/cf78d5b7b9f12728270e673857fd299efc01a7db)) + +## [v2.20.0](https://github.com/DS4SD/docling/releases/tag/v2.20.0) - 2025-02-07 + +### Feature + +* Describe pictures using vision models ([#259](https://github.com/DS4SD/docling/issues/259)) ([`4cc6e3e`](https://github.com/DS4SD/docling/commit/4cc6e3ea5e858b367136acc729b723ea0552d22a)) + +### Fix + +* Remove unused httpx ([#919](https://github.com/DS4SD/docling/issues/919)) 
([`c18f47c`](https://github.com/DS4SD/docling/commit/c18f47c5c032c49bf3175aecd2236df37c0e9ae1)) + +## [v2.19.0](https://github.com/DS4SD/docling/releases/tag/v2.19.0) - 2025-02-07 + +### Feature + +* New artifacts path and CLI utility ([#876](https://github.com/DS4SD/docling/issues/876)) ([`ed74fe2`](https://github.com/DS4SD/docling/commit/ed74fe2ec0a702834f0deacfdb5717c8c587dab1)) + +### Fix + +* **markdown:** Handle nested lists ([#910](https://github.com/DS4SD/docling/issues/910)) ([`90b766e`](https://github.com/DS4SD/docling/commit/90b766e2ae1695a759191df37c272efc09be5ee3)) +* Test cases for RTL programmatic PDFs and fixes for the formula model ([#903](https://github.com/DS4SD/docling/issues/903)) ([`9114ada`](https://github.com/DS4SD/docling/commit/9114ada7bc4dd45ce0046de2f9d00a80ccb25c79)) +* **msword_backend:** Handle conversion error in label parsing ([#896](https://github.com/DS4SD/docling/issues/896)) ([`722a6eb`](https://github.com/DS4SD/docling/commit/722a6eb7b994a0261312a356df80b2fced121812)) +* Enrichment models batch size and expose picture classifier ([#878](https://github.com/DS4SD/docling/issues/878)) ([`5ad6de0`](https://github.com/DS4SD/docling/commit/5ad6de05600315617b574bd12af553e00b4d316e)) + +### Documentation + +* Introduce example with custom models for RapidOCR ([#874](https://github.com/DS4SD/docling/issues/874)) ([`6d3fea0`](https://github.com/DS4SD/docling/commit/6d3fea019635bd6ca94bd36c3928b28c245d638d)) + +## [v2.18.0](https://github.com/DS4SD/docling/releases/tag/v2.18.0) - 2025-02-03 + +### Feature + +* Expose equation exports ([#869](https://github.com/DS4SD/docling/issues/869)) ([`6a76b49`](https://github.com/DS4SD/docling/commit/6a76b49a4756fd00503d0baec5db8d23be8207e8)) +* Add option to define page range ([#852](https://github.com/DS4SD/docling/issues/852)) ([`70d68b6`](https://github.com/DS4SD/docling/commit/70d68b6164c6c7029b39dd65c5a278278768c381)) +* **docx:** Support of SDTs in docx backend 
([#853](https://github.com/DS4SD/docling/issues/853)) ([`d727b04`](https://github.com/DS4SD/docling/commit/d727b04ad080df0b3811902059e0fe0539f7037e)) +* Python 3.13 support ([#841](https://github.com/DS4SD/docling/issues/841)) ([`4df085a`](https://github.com/DS4SD/docling/commit/4df085aa6c6f5cc043f4f7a9f0c1b4af43f95e8f)) + +### Fix + +* **markdown:** Fix parsing if doc ending with table ([#873](https://github.com/DS4SD/docling/issues/873)) ([`5ac2887`](https://github.com/DS4SD/docling/commit/5ac2887e4ad52ed6e7147e3af1e3ee5eb0006a70)) +* **markdown:** Add support for HTML content ([#855](https://github.com/DS4SD/docling/issues/855)) ([`94751a7`](https://github.com/DS4SD/docling/commit/94751a78f4f61b78f64952190717440ec6d84c62)) +* **docx:** Merged table cells not properly converted ([#857](https://github.com/DS4SD/docling/issues/857)) ([`0cd81a8`](https://github.com/DS4SD/docling/commit/0cd81a81226c0d4aa4f20e4e58c3b33e4fe50ce0)) +* Processing of placeholder shapes in pptx that have text but no bbox ([#868](https://github.com/DS4SD/docling/issues/868)) ([`eff16b6`](https://github.com/DS4SD/docling/commit/eff16b62ccdb0eb764eeacee550563898784dd6a)) +* KeyError in tableformer prediction ([#854](https://github.com/DS4SD/docling/issues/854)) ([`b1cf796`](https://github.com/DS4SD/docling/commit/b1cf796730901222ad0882ff44efa0ef43a743ee)) +* Fixed docx import with headers that are also lists ([#842](https://github.com/DS4SD/docling/issues/842)) ([`2c037ae`](https://github.com/DS4SD/docling/commit/2c037ae62e123967eddf065ccb2abbaf78cdcab3)) +* Use new add_code in html backend and add more typing hints ([#850](https://github.com/DS4SD/docling/issues/850)) ([`2a1f8af`](https://github.com/DS4SD/docling/commit/2a1f8afe7e8d9d508aebcfd3998ee1625c938933)) +* **markdown:** Fix empty block handling ([#843](https://github.com/DS4SD/docling/issues/843)) ([`bccb022`](https://github.com/DS4SD/docling/commit/bccb022fc82d4d0ef2ed2d8bea5f5d8e6400c1d9)) +* Fix for the crash when encountering 
WMF images in pptx and docx ([#837](https://github.com/DS4SD/docling/issues/837)) ([`fea0a99`](https://github.com/DS4SD/docling/commit/fea0a99a95d97e72687f48f8174d31102655483e)) + +### Documentation + +* Updated the readme with upcoming features ([#831](https://github.com/DS4SD/docling/issues/831)) ([`d7c0828`](https://github.com/DS4SD/docling/commit/d7c082894e3ef85881665d20167198adcbc1becd)) +* Add example for inspection of picture content ([#624](https://github.com/DS4SD/docling/issues/624)) ([`f9144f2`](https://github.com/DS4SD/docling/commit/f9144f2bb6b322244c9d37683dca1e537ec6d781)) + ## [v2.17.0](https://github.com/DS4SD/docling/releases/tag/v2.17.0) - 2025-01-28 ### Feature diff --git a/Dockerfile b/Dockerfile index c863f1c2..d210b5ad 100644 --- a/Dockerfile +++ b/Dockerfile @@ -16,8 +16,7 @@ ENV TORCH_HOME=/tmp/ COPY docs/examples/minimal.py /root/minimal.py -RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);' -RUN python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; StandardPdfPipeline.download_models_hf(force=True);' +RUN docling-tools models download # On container environments, always set a thread budget to avoid undesired thread congestion. ENV OMP_NUM_THREADS=4 @@ -25,3 +24,6 @@ ENV OMP_NUM_THREADS=4 # On container shell: # > cd /root/ # > python minimal.py + +# Running as `docker run -e DOCLING_ARTIFACTS_PATH=/root/.cache/docling/models` will use the +# model weights included in the container image. 
diff --git a/docling/backend/csv_backend.py b/docling/backend/csv_backend.py new file mode 100644 index 00000000..9097acf8 --- /dev/null +++ b/docling/backend/csv_backend.py @@ -0,0 +1,125 @@ +import csv +import logging +import warnings +from io import BytesIO, StringIO +from pathlib import Path +from typing import Set, Union + +from docling_core.types.doc import DoclingDocument, DocumentOrigin, TableCell, TableData + +from docling.backend.abstract_backend import DeclarativeDocumentBackend +from docling.datamodel.base_models import InputFormat +from docling.datamodel.document import InputDocument + +_log = logging.getLogger(__name__) + + +class CsvDocumentBackend(DeclarativeDocumentBackend): + content: StringIO + + def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): + super().__init__(in_doc, path_or_stream) + + # Load content + try: + if isinstance(self.path_or_stream, BytesIO): + self.content = StringIO(self.path_or_stream.getvalue().decode("utf-8")) + elif isinstance(self.path_or_stream, Path): + self.content = StringIO(self.path_or_stream.read_text("utf-8")) + self.valid = True + except Exception as e: + raise RuntimeError( + f"CsvDocumentBackend could not load document with hash {self.document_hash}" + ) from e + return + + def is_valid(self) -> bool: + return self.valid + + @classmethod + def supports_pagination(cls) -> bool: + return False + + def unload(self): + if isinstance(self.path_or_stream, BytesIO): + self.path_or_stream.close() + self.path_or_stream = None + + @classmethod + def supported_formats(cls) -> Set[InputFormat]: + return {InputFormat.CSV} + + def convert(self) -> DoclingDocument: + """ + Parses the CSV data into a structured document model. 
+ """ + + # Detect CSV dialect + head = self.content.readline() + dialect = csv.Sniffer().sniff(head, ",;\t|:") + _log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"') + if not dialect.delimiter in {",", ";", "\t", "|", ":"}: + raise RuntimeError( + f"Cannot convert csv with unknown delimiter {dialect.delimiter}." + ) + + # Parse CSV + self.content.seek(0) + result = csv.reader(self.content, dialect=dialect, strict=True) + self.csv_data = list(result) + _log.info(f"Detected {len(self.csv_data)} lines") + + # Ensure uniform column length + expected_length = len(self.csv_data[0]) + is_uniform = all(len(row) == expected_length for row in self.csv_data) + if not is_uniform: + warnings.warn( + f"Inconsistent column lengths detected in CSV data. " + f"Expected {expected_length} columns, but found rows with varying lengths. " + f"Ensure all rows have the same number of columns." + ) + + # Parse the CSV into a structured document model + origin = DocumentOrigin( + filename=self.file.name or "file.csv", + mimetype="text/csv", + binary_hash=self.document_hash, + ) + + doc = DoclingDocument(name=self.file.stem or "file.csv", origin=origin) + + if self.is_valid(): + # Convert CSV data to table + if self.csv_data: + num_rows = len(self.csv_data) + num_cols = max(len(row) for row in self.csv_data) + + table_data = TableData( + num_rows=num_rows, + num_cols=num_cols, + table_cells=[], + ) + + # Convert each cell to TableCell + for row_idx, row in enumerate(self.csv_data): + for col_idx, cell_value in enumerate(row): + cell = TableCell( + text=str(cell_value), + row_span=1, # CSV doesn't support merged cells + col_span=1, + start_row_offset_idx=row_idx, + end_row_offset_idx=row_idx + 1, + start_col_offset_idx=col_idx, + end_col_offset_idx=col_idx + 1, + col_header=row_idx == 0, # First row as header + row_header=False, + ) + table_data.table_cells.append(cell) + + doc.add_table(data=table_data) + else: + raise RuntimeError( + f"Cannot convert doc with 
{self.document_hash} because the backend failed to init." + ) + + return doc diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 286dfbfa..00ef05b4 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -1,17 +1,21 @@ import logging from io import BytesIO from pathlib import Path -from typing import Optional, Set, Union +from typing import Final, Optional, Union, cast -from bs4 import BeautifulSoup, Tag +from bs4 import BeautifulSoup, NavigableString, PageElement, Tag +from bs4.element import PreformattedString from docling_core.types.doc import ( + DocItem, DocItemLabel, DoclingDocument, DocumentOrigin, + GroupItem, GroupLabel, TableCell, TableData, ) +from typing_extensions import override from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.datamodel.base_models import InputFormat @@ -19,21 +23,38 @@ from docling.datamodel.document import InputDocument _log = logging.getLogger(__name__) +# tags that generate NodeItem elements +TAGS_FOR_NODE_ITEMS: Final = [ + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "p", + "pre", + "ul", + "ol", + "li", + "table", + "figure", + "img", +] + class HTMLDocumentBackend(DeclarativeDocumentBackend): + @override def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): super().__init__(in_doc, path_or_stream) - _log.debug("About to init HTML backend...") self.soup: Optional[Tag] = None # HTML file: self.path_or_stream = path_or_stream # Initialise the parents for the hierarchy self.max_levels = 10 self.level = 0 - self.parents = {} # type: ignore + self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {} for i in range(0, self.max_levels): self.parents[i] = None - self.labels = {} # type: ignore try: if isinstance(self.path_or_stream, BytesIO): @@ -48,13 +69,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): f"Could not initialize HTML backend for file with hash {self.document_hash}." 
) from e + @override def is_valid(self) -> bool: return self.soup is not None @classmethod + @override def supports_pagination(cls) -> bool: return False + @override def unload(self): if isinstance(self.path_or_stream, BytesIO): self.path_or_stream.close() @@ -62,9 +86,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.path_or_stream = None @classmethod - def supported_formats(cls) -> Set[InputFormat]: + @override + def supported_formats(cls) -> set[InputFormat]: return {InputFormat.HTML} + @override def convert(self) -> DoclingDocument: # access self.path_or_stream to load stuff origin = DocumentOrigin( @@ -80,102 +106,98 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): assert self.soup is not None content = self.soup.body or self.soup # Replace <br>
tags with newline characters - for br in content.find_all("br"): - br.replace_with("\n") - doc = self.walk(content, doc) + # TODO: remove style to avoid losing text from tags like i, b, span, ... + for br in content("br"): + br.replace_with(NavigableString("\n")) + self.walk(content, doc) else: raise RuntimeError( f"Cannot convert doc with {self.document_hash} because the backend failed to init." ) return doc - def walk(self, element: Tag, doc: DoclingDocument): - try: - # Iterate over elements in the body of the document - for idx, element in enumerate(element.children): + def walk(self, tag: Tag, doc: DoclingDocument) -> None: + # Iterate over elements in the body of the document + text: str = "" + for element in tag.children: + if isinstance(element, Tag): try: - self.analyse_element(element, idx, doc) + self.analyze_tag(cast(Tag, element), doc) except Exception as exc_child: - - _log.error(" -> error treating child: ", exc_child) - _log.error(" => element: ", element, "\n") + _log.error( + f"Error processing child from tag{tag.name}: {exc_child}" + ) raise exc_child + elif isinstance(element, NavigableString) and not isinstance( + element, PreformattedString + ): + # Floating text outside paragraphs or analyzed tags + text += element + siblings: list[Tag] = [ + item for item in element.next_siblings if isinstance(item, Tag) + ] + if element.next_sibling is None or any( + [item.name in TAGS_FOR_NODE_ITEMS for item in siblings] + ): + text = text.strip() + if text and tag.name in ["div"]: + doc.add_text( + parent=self.parents[self.level], + label=DocItemLabel.PARAGRAPH, + text=text, + ) + text = "" - except Exception as exc: - pass + return - return doc - - def analyse_element(self, element: Tag, idx: int, doc: DoclingDocument): - """ - if element.name!=None: - _log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})") - """ - - if element.name in self.labels: - self.labels[element.name] += 1 + def analyze_tag(self, tag: Tag, doc: DoclingDocument) 
-> None: + if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]: + self.handle_header(tag, doc) + elif tag.name in ["p"]: + self.handle_paragraph(tag, doc) + elif tag.name in ["pre"]: + self.handle_code(tag, doc) + elif tag.name in ["ul", "ol"]: + self.handle_list(tag, doc) + elif tag.name in ["li"]: + self.handle_list_item(tag, doc) + elif tag.name == "table": + self.handle_table(tag, doc) + elif tag.name == "figure": + self.handle_figure(tag, doc) + elif tag.name == "img": + self.handle_image(doc) else: - self.labels[element.name] = 1 + self.walk(tag, doc) - if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]: - self.handle_header(element, idx, doc) - elif element.name in ["p"]: - self.handle_paragraph(element, idx, doc) - elif element.name in ["pre"]: - self.handle_code(element, idx, doc) - elif element.name in ["ul", "ol"]: - self.handle_list(element, idx, doc) - elif element.name in ["li"]: - self.handle_listitem(element, idx, doc) - elif element.name == "table": - self.handle_table(element, idx, doc) - elif element.name == "figure": - self.handle_figure(element, idx, doc) - elif element.name == "img": - self.handle_image(element, idx, doc) - else: - self.walk(element, doc) + def get_text(self, item: PageElement) -> str: + """Get the text content of a tag.""" + parts: list[str] = self.extract_text_recursively(item) - def get_direct_text(self, item: Tag): - """Get the direct text of the
<li> element (ignoring nested lists).""" - text = item.find(string=True, recursive=False) - if isinstance(text, str): - return text.strip() - - return "" + return "".join(parts) + " " # Function to recursively extract text from all child nodes - def extract_text_recursively(self, item: Tag): - result = [] + def extract_text_recursively(self, item: PageElement) -> list[str]: + result: list[str] = [] - if isinstance(item, str): + if isinstance(item, NavigableString): return [item] - if item.name not in ["ul", "ol"]: - try: - # Iterate over the children (and their text and tails) - for child in item: - try: - # Recursively get the child's text content - result.extend(self.extract_text_recursively(child)) - except: - pass - except: - _log.warn("item has no children") - pass + tag = cast(Tag, item) + if tag.name not in ["ul", "ol"]: + for child in tag: + # Recursively get the child's text content + result.extend(self.extract_text_recursively(child)) - return "".join(result) + " " + return ["".join(result) + " "] - def handle_header(self, element: Tag, idx: int, doc: DoclingDocument): + def handle_header(self, element: Tag, doc: DoclingDocument) -> None: """Handles header tags (h1, h2, etc.).""" hlevel = int(element.name.replace("h", "")) - slevel = hlevel - 1 - - label = DocItemLabel.SECTION_HEADER text = element.text.strip() if hlevel == 1: - for key, val in self.parents.items(): + for key in self.parents.keys(): self.parents[key] = None self.level = 1 @@ -197,7 +219,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): elif hlevel < self.level: # remove the tail - for key, val in self.parents.items(): + for key in self.parents.keys(): if key > hlevel: self.parents[key] = None self.level = hlevel @@ -208,27 +230,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): level=hlevel, ) - def handle_code(self, element: Tag, idx: int, doc: DoclingDocument): + def handle_code(self, element: Tag, doc: DoclingDocument) -> None: """Handles monospace code snippets 
(pre).""" if element.text is None: return text = element.text.strip() - label = DocItemLabel.CODE - if len(text) == 0: - return - doc.add_code(parent=self.parents[self.level], text=text) + if text: + doc.add_code(parent=self.parents[self.level], text=text) - def handle_paragraph(self, element: Tag, idx: int, doc: DoclingDocument): + def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None: """Handles paragraph tags (p).""" if element.text is None: return text = element.text.strip() label = DocItemLabel.PARAGRAPH - if len(text) == 0: - return - doc.add_text(parent=self.parents[self.level], label=label, text=text) + if text: + doc.add_text(parent=self.parents[self.level], label=label, text=text) - def handle_list(self, element: Tag, idx: int, doc: DoclingDocument): + def handle_list(self, element: Tag, doc: DoclingDocument) -> None: """Handles list tags (ul, ol) and their list items.""" if element.name == "ul": @@ -250,25 +269,28 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.parents[self.level + 1] = None self.level -= 1 - def handle_listitem(self, element: Tag, idx: int, doc: DoclingDocument): + def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None: """Handles listitem tags (li).""" - nested_lists = element.find(["ul", "ol"]) + nested_list = element.find(["ul", "ol"]) - parent_list_label = self.parents[self.level].label - index_in_list = len(self.parents[self.level].children) + 1 + parent = self.parents[self.level] + if parent is None: + _log.warning(f"list-item has no parent in DoclingDocument: {element}") + return + parent_label: str = parent.label + index_in_list = len(parent.children) + 1 - if nested_lists: - name = element.name + if nested_list: # Text in list item can be hidden within hierarchy, hence # we need to extract it recursively - text = self.extract_text_recursively(element) + text: str = self.get_text(element) # Flatten text, remove break lines: text = text.replace("\n", "").replace("\r", "") text = " 
".join(text.split()).strip() marker = "" enumerated = False - if parent_list_label == GroupLabel.ORDERED_LIST: + if parent_label == GroupLabel.ORDERED_LIST: marker = str(index_in_list) enumerated = True @@ -278,7 +300,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): text=text, enumerated=enumerated, marker=marker, - parent=self.parents[self.level], + parent=parent, ) self.level += 1 @@ -287,74 +309,94 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.parents[self.level + 1] = None self.level -= 1 - elif isinstance(element.text, str): + elif element.text.strip(): text = element.text.strip() marker = "" enumerated = False - if parent_list_label == GroupLabel.ORDERED_LIST: + if parent_label == GroupLabel.ORDERED_LIST: marker = f"{str(index_in_list)}." enumerated = True doc.add_list_item( text=text, enumerated=enumerated, marker=marker, - parent=self.parents[self.level], + parent=parent, ) else: - _log.warn("list-item has no text: ", element) - - def handle_table(self, element: Tag, idx: int, doc: DoclingDocument): - """Handles table tags.""" + _log.warning(f"list-item has no text: {element}") + @staticmethod + def parse_table_data(element: Tag) -> Optional[TableData]: nested_tables = element.find("table") if nested_tables is not None: - _log.warn("detected nested tables: skipping for now") - return + _log.warning("Skipping nested table.") + return None # Count the number of rows (number of elements) - num_rows = len(element.find_all("tr")) + num_rows = len(element("tr")) # Find the number of columns (taking into account colspan) num_cols = 0 - for row in element.find_all("tr"): + for row in element("tr"): col_count = 0 - for cell in row.find_all(["td", "th"]): - colspan = int(cell.get("colspan", 1)) + if not isinstance(row, Tag): + continue + for cell in row(["td", "th"]): + if not isinstance(row, Tag): + continue + val = cast(Tag, cell).get("colspan", "1") + colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1 col_count += 
colspan num_cols = max(num_cols, col_count) - grid = [[None for _ in range(num_cols)] for _ in range(num_rows)] + grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)] data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[]) # Iterate over the rows in the table - for row_idx, row in enumerate(element.find_all("tr")): + for row_idx, row in enumerate(element("tr")): + if not isinstance(row, Tag): + continue # For each row, find all the column cells (both <td> and <th>) - cells = row.find_all(["td", "th"]) + cells = row(["td", "th"]) # Check if each cell in the row is a header -> means it is a column header col_header = True - for j, html_cell in enumerate(cells): - if html_cell.name == "td": + for html_cell in cells: + if isinstance(html_cell, Tag) and html_cell.name == "td": col_header = False + # Extract the text content of each cell col_idx = 0 - # Extract and print the text content of each cell - for _, html_cell in enumerate(cells): + for html_cell in cells: + if not isinstance(html_cell, Tag): + continue + # extract inline formulas + for formula in html_cell("inline-formula"): + math_parts = formula.text.split("$$") + if len(math_parts) == 3: + math_formula = f"$${math_parts[1]}$$" + formula.replace_with(NavigableString(math_formula)) + + # TODO: extract content correctly from table-cells with lists text = html_cell.text - try: - text = self.extract_table_cell_text(html_cell) - except Exception as exc: - _log.warn("exception: ", exc) - exit(-1) # label = html_cell.name - - col_span = int(html_cell.get("colspan", 1)) - row_span = int(html_cell.get("rowspan", 1)) + col_val = html_cell.get("colspan", "1") + col_span = ( + int(col_val) + if isinstance(col_val, str) and col_val.isnumeric() + else 1 + ) + row_val = html_cell.get("rowspan", "1") + row_span = ( + int(row_val) + if isinstance(row_val, str) and row_val.isnumeric() + else 1 + ) while grid[row_idx][col_idx] is not None: col_idx += 1 @@ -362,7 +404,7 @@ class 
HTMLDocumentBackend(DeclarativeDocumentBackend): for c in range(col_span): grid[row_idx + r][col_idx + c] = text - cell = TableCell( + table_cell = TableCell( text=text, row_span=row_span, col_span=col_span, @@ -373,57 +415,57 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): col_header=col_header, row_header=((not col_header) and html_cell.name == "th"), ) - data.table_cells.append(cell) + data.table_cells.append(table_cell) - doc.add_table(data=data, parent=self.parents[self.level]) + return data - def get_list_text(self, list_element: Tag, level=0): + def handle_table(self, element: Tag, doc: DoclingDocument) -> None: + """Handles table tags.""" + + table_data = HTMLDocumentBackend.parse_table_data(element) + + if table_data is not None: + doc.add_table(data=table_data, parent=self.parents[self.level]) + + def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]: """Recursively extract text from