diff --git a/docling/backend/docling_parse_v2_backend.py b/docling/backend/docling_parse_v2_backend.py index b518850e..60bb838e 100644 --- a/docling/backend/docling_parse_v2_backend.py +++ b/docling/backend/docling_parse_v2_backend.py @@ -2,7 +2,7 @@ import logging import random from io import BytesIO from pathlib import Path -from typing import TYPE_CHECKING, Iterable, List, Optional, Union +from typing import TYPE_CHECKING, Iterable, List, Optional, Tuple, Union import pypdfium2 as pdfium from docling_core.types.doc import BoundingBox, CoordOrigin @@ -77,21 +77,9 @@ class DoclingParseV2PageBackend(PdfPageBackend): return text_piece - def get_text_cells(self) -> Iterable[Cell]: + def make_cells(self, page_size, cells_data, cells_header): cells: List[Cell] = [] cell_counter = 0 - - if not self.valid: - return cells - - page_size = self.get_size() - - parser_width = self._dpage["sanitized"]["dimension"]["width"] - parser_height = self._dpage["sanitized"]["dimension"]["height"] - - cells_data = self._dpage["sanitized"]["cells"]["data"] - cells_header = self._dpage["sanitized"]["cells"]["header"] - for i, cell_data in enumerate(cells_data): x0 = cell_data[cells_header.index("x0")] y0 = cell_data[cells_header.index("y0")] @@ -119,6 +107,28 @@ class DoclingParseV2PageBackend(PdfPageBackend): ) ) cell_counter += 1 + return cells + + def get_text_cells(self) -> Tuple[Iterable[Cell], Iterable[Cell]]: + cells: List[Cell] = [] + word_cells: List[Cell] = [] + + if not self.valid: + return cells, word_cells + + page_size = self.get_size() + + parser_width = self._dpage["sanitized"]["dimension"]["width"] + parser_height = self._dpage["sanitized"]["dimension"]["height"] + + cells_data = self._dpage["sanitized"]["cells"]["data"] + cells_header = self._dpage["sanitized"]["cells"]["header"] + + word_cells_data = self._dpage["original"]["cells"]["data"] + word_cells_header = self._dpage["original"]["cells"]["header"] + + cells = self.make_cells(page_size, cells_data, cells_header) + word_cells = self.make_cells(page_size, word_cells_data, word_cells_header) def draw_clusters_and_cells(): image = ( @@ -137,7 +147,7 @@ class DoclingParseV2PageBackend(PdfPageBackend): # draw_clusters_and_cells() - return cells + return cells, word_cells def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]: AREA_THRESHOLD = 32 * 32 diff --git a/docling/backend/pdf_backend.py b/docling/backend/pdf_backend.py index cd7a0815..295f9030 100644 --- a/docling/backend/pdf_backend.py +++ b/docling/backend/pdf_backend.py @@ -1,7 +1,7 @@ from abc import ABC, abstractmethod from io import BytesIO from pathlib import Path -from typing import Iterable, Optional, Set, Union +from typing import Iterable, Optional, Set, Tuple, Union from docling_core.types.doc import BoundingBox, Size from PIL import Image @@ -18,7 +18,7 @@ class PdfPageBackend(ABC): pass @abstractmethod - def get_text_cells(self) -> Iterable[Cell]: + def get_text_cells(self) -> Tuple[Iterable[Cell], Optional[Iterable[Cell]]]: pass @abstractmethod diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index d06b6097..9cbd87a2 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -180,6 +180,7 @@ class Page(BaseModel): # page_hash: Optional[str] = None size: Optional[Size] = None cells: List[Cell] = [] + word_cells: List[Cell] = [] predictions: PagePredictions = PagePredictions() assembled: Optional[AssembledUnit] = None diff --git a/docling/models/page_preprocessing_model.py b/docling/models/page_preprocessing_model.py index 63f1a4f6..316ce2b3 100644 --- a/docling/models/page_preprocessing_model.py +++ b/docling/models/page_preprocessing_model.py @@ -53,7 +53,17 @@ class PagePreprocessingModel(BasePageModel): def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page: assert page._backend is not None - page.cells = list(page._backend.get_text_cells()) + # page.cells = list(page._backend.get_text_cells()) + cells_result = page._backend.get_text_cells() + if type(cells_result) is not tuple: + # Backend supports just PDF cells + page.cells = list(cells_result) + else: + # Backend also supports word cells + pdf_cells, word_cells = cells_result + page.cells = list(pdf_cells) + # preserve word_cells for usage in tables + page.word_cells = list(word_cells) # DEBUG code: def draw_text_boxes(image, cells, show: bool = False): diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py index 12bc2838..309b7110 100644 --- a/docling/models/table_structure_model.py +++ b/docling/models/table_structure_model.py @@ -122,7 +122,12 @@ class TableStructureModel(BasePageModel): continue tokens = [] - for c in page.cells: + + page_cells = page.cells + if hasattr(page, "word_cells"): + page_cells = page.word_cells + + for c in page_cells: for cluster, _ in in_tables: if c.bbox.area() > 0: if (