Experimental (WIP): add optional word-level cells to page_processing and the table model

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maksym Lysak 2024-11-13 09:45:23 +01:00
parent fb8ba861e2
commit e4383cee56
5 changed files with 45 additions and 19 deletions

View File

@ -2,7 +2,7 @@ import logging
import random
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Iterable, List, Optional, Union
from typing import TYPE_CHECKING, Iterable, List, Optional, Tuple, Union
import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin
@ -77,21 +77,9 @@ class DoclingParseV2PageBackend(PdfPageBackend):
return text_piece
def get_text_cells(self) -> Iterable[Cell]:
def make_cells(self, page_size, cells_data, cells_header):
cells: List[Cell] = []
cell_counter = 0
if not self.valid:
return cells
page_size = self.get_size()
parser_width = self._dpage["sanitized"]["dimension"]["width"]
parser_height = self._dpage["sanitized"]["dimension"]["height"]
cells_data = self._dpage["sanitized"]["cells"]["data"]
cells_header = self._dpage["sanitized"]["cells"]["header"]
for i, cell_data in enumerate(cells_data):
x0 = cell_data[cells_header.index("x0")]
y0 = cell_data[cells_header.index("y0")]
@ -119,6 +107,28 @@ class DoclingParseV2PageBackend(PdfPageBackend):
)
)
cell_counter += 1
return cells
def get_text_cells(self) -> Tuple[Iterable[Cell], Iterable[Cell]]:
cells: List[Cell] = []
word_cells: List[Cell] = []
if not self.valid:
return cells, word_cells
page_size = self.get_size()
parser_width = self._dpage["sanitized"]["dimension"]["width"]
parser_height = self._dpage["sanitized"]["dimension"]["height"]
cells_data = self._dpage["sanitized"]["cells"]["data"]
cells_header = self._dpage["sanitized"]["cells"]["header"]
word_cells_data = self._dpage["original"]["cells"]["data"]
word_cells_header = self._dpage["original"]["cells"]["header"]
cells = self.make_cells(page_size, cells_data, cells_header)
word_cells = self.make_cells(page_size, word_cells_data, word_cells_header)
def draw_clusters_and_cells():
image = (
@ -137,7 +147,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
# draw_clusters_and_cells()
return cells
return cells, word_cells
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
AREA_THRESHOLD = 32 * 32

View File

@ -1,7 +1,7 @@
from abc import ABC, abstractmethod
from io import BytesIO
from pathlib import Path
from typing import Iterable, Optional, Set, Union
from typing import Iterable, Optional, Set, Tuple, Union
from docling_core.types.doc import BoundingBox, Size
from PIL import Image
@ -18,7 +18,7 @@ class PdfPageBackend(ABC):
pass
@abstractmethod
def get_text_cells(self) -> Iterable[Cell]:
def get_text_cells(self) -> Tuple[Iterable[Cell], Optional[Iterable[Cell]]]:
pass
@abstractmethod

View File

@ -180,6 +180,7 @@ class Page(BaseModel):
# page_hash: Optional[str] = None
size: Optional[Size] = None
cells: List[Cell] = []
word_cells: List[Cell] = []
predictions: PagePredictions = PagePredictions()
assembled: Optional[AssembledUnit] = None

View File

@ -53,7 +53,17 @@ class PagePreprocessingModel(BasePageModel):
def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
assert page._backend is not None
page.cells = list(page._backend.get_text_cells())
# page.cells = list(page._backend.get_text_cells())
cells_result = page._backend.get_text_cells()
if type(cells_result) is not tuple:
# Backend supports just PDF cells
page.cells = list(cells_result)
else:
# Backend also supports word cells
pdf_cells, word_cells = cells_result
page.cells = list(pdf_cells)
# preserve word_cells for usage in tables
page.word_cells = list(word_cells)
# DEBUG code:
def draw_text_boxes(image, cells, show: bool = False):

View File

@ -122,7 +122,12 @@ class TableStructureModel(BasePageModel):
continue
tokens = []
for c in page.cells:
page_cells = page.cells
if hasattr(page, "word_cells"):
page_cells = page.word_cells
for c in page_cells:
for cluster, _ in in_tables:
if c.bbox.area() > 0:
if (