diff --git a/.github/actions/setup-poetry/action.yml b/.github/actions/setup-poetry/action.yml index 0bdd730c..473326dc 100644 --- a/.github/actions/setup-poetry/action.yml +++ b/.github/actions/setup-poetry/action.yml @@ -8,7 +8,7 @@ runs: using: 'composite' steps: - name: Install poetry - run: pipx install poetry==1.8.3 + run: pipx install poetry==1.8.5 shell: bash - uses: actions/setup-python@v5 with: diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index 1cd08f2c..b2a295dc 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -1,19 +1,28 @@ on: workflow_call: +env: + HF_HUB_DOWNLOAD_TIMEOUT: "60" + HF_HUB_ETAG_TIMEOUT: "60" + jobs: run-checks: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.9', '3.10', '3.11', '3.12'] + python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] steps: - uses: actions/checkout@v4 - name: Install tesseract - run: sudo apt-get update && sudo apt-get install -y tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa libleptonica-dev libtesseract-dev pkg-config + run: sudo apt-get update && sudo apt-get install -y tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config - name: Set TESSDATA_PREFIX run: | echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV" + - name: Cache Hugging Face models + uses: actions/cache@v4 + with: + path: ~/.cache/huggingface + key: huggingface-cache-py${{ matrix.python-version }} - uses: ./.github/actions/setup-poetry with: python-version: ${{ matrix.python-version }} @@ -28,7 +37,7 @@ jobs: run: | for file in docs/examples/*.py; do # Skip batch_convert.py - if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment).py ]]; then + if [[ "$(basename "$file")" =~ 
^(batch_convert|minimal_vlm_pipeline|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api).py ]]; then echo "Skipping $file" continue fi diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 2733b522..dd976ea3 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -17,4 +17,3 @@ jobs: - name: Build and push docs if: inputs.deploy run: poetry run mkdocs gh-deploy --force - \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index c863f1c2..d210b5ad 100644 --- a/Dockerfile +++ b/Dockerfile @@ -16,8 +16,7 @@ ENV TORCH_HOME=/tmp/ COPY docs/examples/minimal.py /root/minimal.py -RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);' -RUN python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; StandardPdfPipeline.download_models_hf(force=True);' +RUN docling-tools models download # On container environments, always set a thread budget to avoid undesired thread congestion. ENV OMP_NUM_THREADS=4 @@ -25,3 +24,6 @@ ENV OMP_NUM_THREADS=4 # On container shell: # > cd /root/ # > python minimal.py + +# Running as `docker run -e DOCLING_ARTIFACTS_PATH=/root/.cache/docling/models` will use the +# model weights included in the container image. 
diff --git a/docling/backend/abstract_backend.py b/docling/backend/abstract_backend.py index b47b11cd..491330b3 100644 --- a/docling/backend/abstract_backend.py +++ b/docling/backend/abstract_backend.py @@ -27,7 +27,6 @@ class AbstractDocumentBackend(ABC): def supports_pagination(cls) -> bool: pass - @abstractmethod def unload(self): if isinstance(self.path_or_stream, BytesIO): self.path_or_stream.close() diff --git a/docling/backend/asciidoc_backend.py b/docling/backend/asciidoc_backend.py index 829419af..397bfc44 100644 --- a/docling/backend/asciidoc_backend.py +++ b/docling/backend/asciidoc_backend.py @@ -24,7 +24,6 @@ _log = logging.getLogger(__name__) class AsciiDocBackend(DeclarativeDocumentBackend): - def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]): super().__init__(in_doc, path_or_stream) diff --git a/docling/backend/csv_backend.py b/docling/backend/csv_backend.py new file mode 100644 index 00000000..9097acf8 --- /dev/null +++ b/docling/backend/csv_backend.py @@ -0,0 +1,125 @@ +import csv +import logging +import warnings +from io import BytesIO, StringIO +from pathlib import Path +from typing import Set, Union + +from docling_core.types.doc import DoclingDocument, DocumentOrigin, TableCell, TableData + +from docling.backend.abstract_backend import DeclarativeDocumentBackend +from docling.datamodel.base_models import InputFormat +from docling.datamodel.document import InputDocument + +_log = logging.getLogger(__name__) + + +class CsvDocumentBackend(DeclarativeDocumentBackend): + content: StringIO + + def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): + super().__init__(in_doc, path_or_stream) + + # Load content + try: + if isinstance(self.path_or_stream, BytesIO): + self.content = StringIO(self.path_or_stream.getvalue().decode("utf-8")) + elif isinstance(self.path_or_stream, Path): + self.content = StringIO(self.path_or_stream.read_text("utf-8")) + self.valid = True + except Exception as e: + 
raise RuntimeError( + f"CsvDocumentBackend could not load document with hash {self.document_hash}" + ) from e + return + + def is_valid(self) -> bool: + return self.valid + + @classmethod + def supports_pagination(cls) -> bool: + return False + + def unload(self): + if isinstance(self.path_or_stream, BytesIO): + self.path_or_stream.close() + self.path_or_stream = None + + @classmethod + def supported_formats(cls) -> Set[InputFormat]: + return {InputFormat.CSV} + + def convert(self) -> DoclingDocument: + """ + Parses the CSV data into a structured document model. + """ + + # Detect CSV dialect + head = self.content.readline() + dialect = csv.Sniffer().sniff(head, ",;\t|:") + _log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"') + if not dialect.delimiter in {",", ";", "\t", "|", ":"}: + raise RuntimeError( + f"Cannot convert csv with unknown delimiter {dialect.delimiter}." + ) + + # Parse CSV + self.content.seek(0) + result = csv.reader(self.content, dialect=dialect, strict=True) + self.csv_data = list(result) + _log.info(f"Detected {len(self.csv_data)} lines") + + # Ensure uniform column length + expected_length = len(self.csv_data[0]) + is_uniform = all(len(row) == expected_length for row in self.csv_data) + if not is_uniform: + warnings.warn( + f"Inconsistent column lengths detected in CSV data. " + f"Expected {expected_length} columns, but found rows with varying lengths. " + f"Ensure all rows have the same number of columns." 
+ ) + + # Parse the CSV into a structured document model + origin = DocumentOrigin( + filename=self.file.name or "file.csv", + mimetype="text/csv", + binary_hash=self.document_hash, + ) + + doc = DoclingDocument(name=self.file.stem or "file.csv", origin=origin) + + if self.is_valid(): + # Convert CSV data to table + if self.csv_data: + num_rows = len(self.csv_data) + num_cols = max(len(row) for row in self.csv_data) + + table_data = TableData( + num_rows=num_rows, + num_cols=num_cols, + table_cells=[], + ) + + # Convert each cell to TableCell + for row_idx, row in enumerate(self.csv_data): + for col_idx, cell_value in enumerate(row): + cell = TableCell( + text=str(cell_value), + row_span=1, # CSV doesn't support merged cells + col_span=1, + start_row_offset_idx=row_idx, + end_row_offset_idx=row_idx + 1, + start_col_offset_idx=col_idx, + end_col_offset_idx=col_idx + 1, + col_header=row_idx == 0, # First row as header + row_header=False, + ) + table_data.table_cells.append(cell) + + doc.add_table(data=table_data) + else: + raise RuntimeError( + f"Cannot convert doc with {self.document_hash} because the backend failed to init." 
+ ) + + return doc diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index ae478885..d14b422f 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -1,17 +1,22 @@ import logging from io import BytesIO from pathlib import Path -from typing import Set, Union +from typing import Final, Optional, Union, cast -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, NavigableString, PageElement, Tag +from bs4.element import PreformattedString from docling_core.types.doc import ( + DocItem, DocItemLabel, DoclingDocument, DocumentOrigin, + GroupItem, GroupLabel, TableCell, TableData, ) +from docling_core.types.doc.document import ContentLayer +from typing_extensions import override from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.datamodel.base_models import InputFormat @@ -19,21 +24,38 @@ from docling.datamodel.document import InputDocument _log = logging.getLogger(__name__) +# tags that generate NodeItem elements +TAGS_FOR_NODE_ITEMS: Final = [ + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "p", + "pre", + "ul", + "ol", + "li", + "table", + "figure", + "img", +] + class HTMLDocumentBackend(DeclarativeDocumentBackend): + @override def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): super().__init__(in_doc, path_or_stream) - _log.debug("About to init HTML backend...") - self.soup = None + self.soup: Optional[Tag] = None # HTML file: self.path_or_stream = path_or_stream # Initialise the parents for the hierarchy self.max_levels = 10 self.level = 0 - self.parents = {} # type: ignore + self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {} for i in range(0, self.max_levels): self.parents[i] = None - self.labels = {} # type: ignore try: if isinstance(self.path_or_stream, BytesIO): @@ -45,16 +67,20 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.soup = BeautifulSoup(html_content, "html.parser") except Exception as e: 
raise RuntimeError( - f"Could not initialize HTML backend for file with hash {self.document_hash}." + "Could not initialize HTML backend for file with " + f"hash {self.document_hash}." ) from e + @override def is_valid(self) -> bool: return self.soup is not None @classmethod + @override def supports_pagination(cls) -> bool: return False + @override def unload(self): if isinstance(self.path_or_stream, BytesIO): self.path_or_stream.close() @@ -62,9 +88,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.path_or_stream = None @classmethod - def supported_formats(cls) -> Set[InputFormat]: + @override + def supported_formats(cls) -> set[InputFormat]: return {InputFormat.HTML} + @override def convert(self) -> DoclingDocument: # access self.path_or_stream to load stuff origin = DocumentOrigin( @@ -78,108 +106,118 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): if self.is_valid(): assert self.soup is not None + content = self.soup.body or self.soup # Replace <br> tags with newline characters - for br in self.soup.body.find_all("br"): - br.replace_with("\n") - doc = self.walk(self.soup.body, doc) + # TODO: remove style to avoid losing text from tags like i, b, span, ... + for br in content("br"): + br.replace_with(NavigableString("\n")) + + headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"]) + self.content_layer = ( + ContentLayer.BODY if headers is None else ContentLayer.FURNITURE + ) + self.walk(content, doc) else: raise RuntimeError( - f"Cannot convert doc with {self.document_hash} because the backend failed to init." + f"Cannot convert doc with {self.document_hash} because the backend " + "failed to init." ) return doc - def walk(self, element, doc): - try: - # Iterate over elements in the body of the document - for idx, element in enumerate(element.children): + def walk(self, tag: Tag, doc: DoclingDocument) -> None: + + # Iterate over elements in the body of the document + text: str = "" + for element in tag.children: + if isinstance(element, Tag): try: - self.analyse_element(element, idx, doc) + self.analyze_tag(cast(Tag, element), doc) except Exception as exc_child: - - _log.error(" -> error treating child: ", exc_child) - _log.error(" => element: ", element, "\n") + _log.error( + f"Error processing child from tag{tag.name}: {exc_child}" + ) raise exc_child + elif isinstance(element, NavigableString) and not isinstance( + element, PreformattedString + ): + # Floating text outside paragraphs or analyzed tags + text += element + siblings: list[Tag] = [ + item for item in element.next_siblings if isinstance(item, Tag) + ] + if element.next_sibling is None or any( + [item.name in TAGS_FOR_NODE_ITEMS for item in siblings] + ): + text = text.strip() + if text and tag.name in ["div"]: + doc.add_text( + parent=self.parents[self.level], + label=DocItemLabel.TEXT, + text=text, + content_layer=self.content_layer, + ) + text = "" - except Exception as exc: - pass + return - return doc - - def analyse_element(self, 
element, idx, doc): - """ - if element.name!=None: - _log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})") - """ - - if element.name in self.labels: - self.labels[element.name] += 1 + def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None: + if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]: + self.handle_header(tag, doc) + elif tag.name in ["p"]: + self.handle_paragraph(tag, doc) + elif tag.name in ["pre"]: + self.handle_code(tag, doc) + elif tag.name in ["ul", "ol"]: + self.handle_list(tag, doc) + elif tag.name in ["li"]: + self.handle_list_item(tag, doc) + elif tag.name == "table": + self.handle_table(tag, doc) + elif tag.name == "figure": + self.handle_figure(tag, doc) + elif tag.name == "img": + self.handle_image(tag, doc) else: - self.labels[element.name] = 1 + self.walk(tag, doc) - if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]: - self.handle_header(element, idx, doc) - elif element.name in ["p"]: - self.handle_paragraph(element, idx, doc) - elif element.name in ["pre"]: - self.handle_code(element, idx, doc) - elif element.name in ["ul", "ol"]: - self.handle_list(element, idx, doc) - elif element.name in ["li"]: - self.handle_listitem(element, idx, doc) - elif element.name == "table": - self.handle_table(element, idx, doc) - elif element.name == "figure": - self.handle_figure(element, idx, doc) - elif element.name == "img": - self.handle_image(element, idx, doc) - else: - self.walk(element, doc) + def get_text(self, item: PageElement) -> str: + """Get the text content of a tag.""" + parts: list[str] = self.extract_text_recursively(item) - def get_direct_text(self, item): - """Get the direct text of the <li> element (ignoring nested lists).""" - text = item.find(string=True, recursive=False) - if isinstance(text, str): - return text.strip() - - return "" + return "".join(parts) + " " # Function to recursively extract text from all child nodes - def extract_text_recursively(self, item): - result = [] + def extract_text_recursively(self, item: PageElement) -> list[str]: + result: list[str] = [] - if isinstance(item, str): + if isinstance(item, NavigableString): return [item] - if item.name not in ["ul", "ol"]: - try: - # Iterate over the children (and their text and tails) - for child in item: - try: - # Recursively get the child's text content - result.extend(self.extract_text_recursively(child)) - except: - pass - except: - _log.warn("item has no children") - pass + tag = cast(Tag, item) + if tag.name not in ["ul", "ol"]: + for child in tag: + # Recursively get the child's text content + result.extend(self.extract_text_recursively(child)) - return "".join(result) + " " + return ["".join(result) + " "] - def handle_header(self, element, idx, doc): + def handle_header(self, element: Tag, doc: DoclingDocument) -> None: """Handles header tags (h1, h2, etc.).""" hlevel = int(element.name.replace("h", "")) - slevel = hlevel - 1 - - label = DocItemLabel.SECTION_HEADER text = element.text.strip() if hlevel == 1: - for key, val in self.parents.items(): + self.content_layer = ContentLayer.BODY + + for key in self.parents.keys(): self.parents[key] = None self.level = 1 self.parents[self.level] = doc.add_text( - parent=self.parents[0], label=DocItemLabel.TITLE, text=text + parent=self.parents[0], + label=DocItemLabel.TITLE, + text=text, + content_layer=self.content_layer, ) else: if hlevel > self.level: @@ -190,13 +228,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): name=f"header-{i}", label=GroupLabel.SECTION, parent=self.parents[i - 1], + content_layer=self.content_layer, ) self.level = hlevel elif hlevel < self.level: # remove the tail - for key, val in 
self.parents.items(): + for key in self.parents.keys(): if key > hlevel: self.parents[key] = None self.level = hlevel @@ -205,42 +244,58 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): parent=self.parents[hlevel - 1], text=text, level=hlevel, + content_layer=self.content_layer, ) - def handle_code(self, element, idx, doc): + def handle_code(self, element: Tag, doc: DoclingDocument) -> None: """Handles monospace code snippets (pre).""" if element.text is None: return text = element.text.strip() - label = DocItemLabel.CODE - if len(text) == 0: - return - doc.add_text(parent=self.parents[self.level], label=label, text=text) + if text: + doc.add_code( + parent=self.parents[self.level], + text=text, + content_layer=self.content_layer, + ) - def handle_paragraph(self, element, idx, doc): + def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None: """Handles paragraph tags (p).""" if element.text is None: return text = element.text.strip() - label = DocItemLabel.PARAGRAPH - if len(text) == 0: - return - doc.add_text(parent=self.parents[self.level], label=label, text=text) + if text: + doc.add_text( + parent=self.parents[self.level], + label=DocItemLabel.TEXT, + text=text, + content_layer=self.content_layer, + ) - def handle_list(self, element, idx, doc): + def handle_list(self, element: Tag, doc: DoclingDocument) -> None: """Handles list tags (ul, ol) and their list items.""" if element.name == "ul": # create a list group self.parents[self.level + 1] = doc.add_group( - parent=self.parents[self.level], name="list", label=GroupLabel.LIST + parent=self.parents[self.level], + name="list", + label=GroupLabel.LIST, + content_layer=self.content_layer, ) elif element.name == "ol": + start_attr = element.get("start") + start: int = ( + int(start_attr) + if isinstance(start_attr, str) and start_attr.isnumeric() + else 1 + ) # create a list group self.parents[self.level + 1] = doc.add_group( parent=self.parents[self.level], - name="ordered list", + 
name="ordered list" + (f" start {start}" if start != 1 else ""), label=GroupLabel.ORDERED_LIST, + content_layer=self.content_layer, ) self.level += 1 @@ -249,25 +304,36 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.parents[self.level + 1] = None self.level -= 1 - def handle_listitem(self, element, idx, doc): - """Handles listitem tags (li).""" - nested_lists = element.find(["ul", "ol"]) + def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None: + """Handles list item tags (li).""" + nested_list = element.find(["ul", "ol"]) - parent_list_label = self.parents[self.level].label - index_in_list = len(self.parents[self.level].children) + 1 + parent = self.parents[self.level] + if parent is None: + _log.debug(f"list-item has no parent in DoclingDocument: {element}") + return + parent_label: str = parent.label + index_in_list = len(parent.children) + 1 + if ( + parent_label == GroupLabel.ORDERED_LIST + and isinstance(parent, GroupItem) + and parent.name + ): + start_in_list: str = parent.name.split(" ")[-1] + start: int = int(start_in_list) if start_in_list.isnumeric() else 1 + index_in_list += start - 1 - if nested_lists: - name = element.name + if nested_list: # Text in list item can be hidden within hierarchy, hence # we need to extract it recursively - text = self.extract_text_recursively(element) + text: str = self.get_text(element) # Flatten text, remove break lines: text = text.replace("\n", "").replace("\r", "") text = " ".join(text.split()).strip() marker = "" enumerated = False - if parent_list_label == GroupLabel.ORDERED_LIST: + if parent_label == GroupLabel.ORDERED_LIST: marker = str(index_in_list) enumerated = True @@ -277,7 +343,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): text=text, enumerated=enumerated, marker=marker, - parent=self.parents[self.level], + parent=parent, + content_layer=self.content_layer, ) self.level += 1 @@ -286,74 +353,95 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): 
self.parents[self.level + 1] = None self.level -= 1 - elif isinstance(element.text, str): + elif element.text.strip(): text = element.text.strip() marker = "" enumerated = False - if parent_list_label == GroupLabel.ORDERED_LIST: + if parent_label == GroupLabel.ORDERED_LIST: marker = f"{str(index_in_list)}." enumerated = True doc.add_list_item( text=text, enumerated=enumerated, marker=marker, - parent=self.parents[self.level], + parent=parent, + content_layer=self.content_layer, ) else: - _log.warn("list-item has no text: ", element) - - def handle_table(self, element, idx, doc): - """Handles table tags.""" + _log.debug(f"list-item has no text: {element}") + @staticmethod + def parse_table_data(element: Tag) -> Optional[TableData]: nested_tables = element.find("table") if nested_tables is not None: - _log.warn("detected nested tables: skipping for now") - return + _log.debug("Skipping nested table.") + return None # Count the number of rows (number of <tr> elements) - num_rows = len(element.find_all("tr")) + num_rows = len(element("tr")) # Find the number of columns (taking into account colspan) num_cols = 0 - for row in element.find_all("tr"): + for row in element("tr"): col_count = 0 - for cell in row.find_all(["td", "th"]): - colspan = int(cell.get("colspan", 1)) + if not isinstance(row, Tag): + continue + for cell in row(["td", "th"]): + if not isinstance(row, Tag): + continue + val = cast(Tag, cell).get("colspan", "1") + colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1 col_count += colspan num_cols = max(num_cols, col_count) - grid = [[None for _ in range(num_cols)] for _ in range(num_rows)] + grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)] data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[]) # Iterate over the rows in the table - for row_idx, row in enumerate(element.find_all("tr")): + for row_idx, row in enumerate(element("tr")): + if not isinstance(row, Tag): + continue # For each row, find all the 
column cells (both <td> and <th>) - cells = row.find_all(["td", "th"]) + cells = row(["td", "th"]) # Check if each cell in the row is a header -> means it is a column header col_header = True - for j, html_cell in enumerate(cells): - if html_cell.name == "td": + for html_cell in cells: + if isinstance(html_cell, Tag) and html_cell.name == "td": col_header = False + # Extract the text content of each cell col_idx = 0 - # Extract and print the text content of each cell - for _, html_cell in enumerate(cells): + for html_cell in cells: + if not isinstance(html_cell, Tag): + continue + # extract inline formulas + for formula in html_cell("inline-formula"): + math_parts = formula.text.split("$$") + if len(math_parts) == 3: + math_formula = f"$${math_parts[1]}$$" + formula.replace_with(NavigableString(math_formula)) + + # TODO: extract content correctly from table-cells with lists text = html_cell.text - try: - text = self.extract_table_cell_text(html_cell) - except Exception as exc: - _log.warn("exception: ", exc) - exit(-1) # label = html_cell.name - - col_span = int(html_cell.get("colspan", 1)) - row_span = int(html_cell.get("rowspan", 1)) + col_val = html_cell.get("colspan", "1") + col_span = ( + int(col_val) + if isinstance(col_val, str) and col_val.isnumeric() + else 1 + ) + row_val = html_cell.get("rowspan", "1") + row_span = ( + int(row_val) + if isinstance(row_val, str) and row_val.isnumeric() + else 1 + ) while grid[row_idx][col_idx] is not None: col_idx += 1 @@ -361,7 +449,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): for c in range(col_span): grid[row_idx + r][col_idx + c] = text - cell = TableCell( + table_cell = TableCell( text=text, row_span=row_span, col_span=col_span, @@ -372,70 +460,87 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): col_header=col_header, row_header=((not col_header) and html_cell.name == "th"), ) - data.table_cells.append(cell) + data.table_cells.append(table_cell) - doc.add_table(data=data, 
parent=self.parents[self.level]) + return data - def get_list_text(self, list_element, level=0): + def handle_table(self, element: Tag, doc: DoclingDocument) -> None: + """Handles table tags.""" + + table_data = HTMLDocumentBackend.parse_table_data(element) + + if table_data is not None: + doc.add_table( + data=table_data, + parent=self.parents[self.level], + content_layer=self.content_layer, + ) + + def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]: """Recursively extract text from