Experimental, WIP: add optional word-level cells to page preprocessing and the table structure model

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maksym Lysak 2024-11-13 09:45:23 +01:00
parent fb8ba861e2
commit e4383cee56
5 changed files with 45 additions and 19 deletions

View File

@ -2,7 +2,7 @@ import logging
import random import random
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING, Iterable, List, Optional, Union from typing import TYPE_CHECKING, Iterable, List, Optional, Tuple, Union
import pypdfium2 as pdfium import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
@ -77,21 +77,9 @@ class DoclingParseV2PageBackend(PdfPageBackend):
return text_piece return text_piece
def get_text_cells(self) -> Iterable[Cell]: def make_cells(self, page_size, cells_data, cells_header):
cells: List[Cell] = [] cells: List[Cell] = []
cell_counter = 0 cell_counter = 0
if not self.valid:
return cells
page_size = self.get_size()
parser_width = self._dpage["sanitized"]["dimension"]["width"]
parser_height = self._dpage["sanitized"]["dimension"]["height"]
cells_data = self._dpage["sanitized"]["cells"]["data"]
cells_header = self._dpage["sanitized"]["cells"]["header"]
for i, cell_data in enumerate(cells_data): for i, cell_data in enumerate(cells_data):
x0 = cell_data[cells_header.index("x0")] x0 = cell_data[cells_header.index("x0")]
y0 = cell_data[cells_header.index("y0")] y0 = cell_data[cells_header.index("y0")]
@ -119,6 +107,28 @@ class DoclingParseV2PageBackend(PdfPageBackend):
) )
) )
cell_counter += 1 cell_counter += 1
return cells
def get_text_cells(self) -> Tuple[Iterable[Cell], Iterable[Cell]]:
cells: List[Cell] = []
word_cells: List[Cell] = []
if not self.valid:
return cells, word_cells
page_size = self.get_size()
parser_width = self._dpage["sanitized"]["dimension"]["width"]
parser_height = self._dpage["sanitized"]["dimension"]["height"]
cells_data = self._dpage["sanitized"]["cells"]["data"]
cells_header = self._dpage["sanitized"]["cells"]["header"]
word_cells_data = self._dpage["original"]["cells"]["data"]
word_cells_header = self._dpage["original"]["cells"]["header"]
cells = self.make_cells(page_size, cells_data, cells_header)
word_cells = self.make_cells(page_size, word_cells_data, word_cells_header)
def draw_clusters_and_cells(): def draw_clusters_and_cells():
image = ( image = (
@ -137,7 +147,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
# draw_clusters_and_cells() # draw_clusters_and_cells()
return cells return cells, word_cells
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]: def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
AREA_THRESHOLD = 32 * 32 AREA_THRESHOLD = 32 * 32

View File

@ -1,7 +1,7 @@
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Iterable, Optional, Set, Union from typing import Iterable, Optional, Set, Tuple, Union
from docling_core.types.doc import BoundingBox, Size from docling_core.types.doc import BoundingBox, Size
from PIL import Image from PIL import Image
@ -18,7 +18,7 @@ class PdfPageBackend(ABC):
pass pass
@abstractmethod @abstractmethod
def get_text_cells(self) -> Iterable[Cell]: def get_text_cells(self) -> Tuple[Iterable[Cell], Optional[Iterable[Cell]]]:
pass pass
@abstractmethod @abstractmethod

View File

@ -180,6 +180,7 @@ class Page(BaseModel):
# page_hash: Optional[str] = None # page_hash: Optional[str] = None
size: Optional[Size] = None size: Optional[Size] = None
cells: List[Cell] = [] cells: List[Cell] = []
word_cells: List[Cell] = []
predictions: PagePredictions = PagePredictions() predictions: PagePredictions = PagePredictions()
assembled: Optional[AssembledUnit] = None assembled: Optional[AssembledUnit] = None

View File

@ -53,7 +53,17 @@ class PagePreprocessingModel(BasePageModel):
def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page: def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
assert page._backend is not None assert page._backend is not None
page.cells = list(page._backend.get_text_cells()) # page.cells = list(page._backend.get_text_cells())
cells_result = page._backend.get_text_cells()
if type(cells_result) is not tuple:
# Backend supports just PDF cells
page.cells = list(cells_result)
else:
# Backend also supports word cells
pdf_cells, word_cells = cells_result
page.cells = list(pdf_cells)
# preserve word_cells for usage in tables
page.word_cells = list(word_cells)
# DEBUG code: # DEBUG code:
def draw_text_boxes(image, cells, show: bool = False): def draw_text_boxes(image, cells, show: bool = False):

View File

@ -122,7 +122,12 @@ class TableStructureModel(BasePageModel):
continue continue
tokens = [] tokens = []
for c in page.cells:
page_cells = page.cells
if hasattr(page, "word_cells"):
page_cells = page.word_cells
for c in page_cells:
for cluster, _ in in_tables: for cluster, _ in in_tables:
if c.bbox.area() > 0: if c.bbox.area() > 0:
if ( if (