mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 22:14:37 +00:00
Experimental (WIP): add optional word-level cells to page preprocessing and the table structure model
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
fb8ba861e2
commit
e4383cee56
@ -2,7 +2,7 @@ import logging
|
||||
import random
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Iterable, List, Optional, Union
|
||||
from typing import TYPE_CHECKING, Iterable, List, Optional, Tuple, Union
|
||||
|
||||
import pypdfium2 as pdfium
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
@ -77,21 +77,9 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
||||
|
||||
return text_piece
|
||||
|
||||
def get_text_cells(self) -> Iterable[Cell]:
|
||||
def make_cells(self, page_size, cells_data, cells_header):
|
||||
cells: List[Cell] = []
|
||||
cell_counter = 0
|
||||
|
||||
if not self.valid:
|
||||
return cells
|
||||
|
||||
page_size = self.get_size()
|
||||
|
||||
parser_width = self._dpage["sanitized"]["dimension"]["width"]
|
||||
parser_height = self._dpage["sanitized"]["dimension"]["height"]
|
||||
|
||||
cells_data = self._dpage["sanitized"]["cells"]["data"]
|
||||
cells_header = self._dpage["sanitized"]["cells"]["header"]
|
||||
|
||||
for i, cell_data in enumerate(cells_data):
|
||||
x0 = cell_data[cells_header.index("x0")]
|
||||
y0 = cell_data[cells_header.index("y0")]
|
||||
@ -119,6 +107,28 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
||||
)
|
||||
)
|
||||
cell_counter += 1
|
||||
return cells
|
||||
|
||||
def get_text_cells(self) -> Tuple[Iterable[Cell], Iterable[Cell]]:
|
||||
cells: List[Cell] = []
|
||||
word_cells: List[Cell] = []
|
||||
|
||||
if not self.valid:
|
||||
return cells, word_cells
|
||||
|
||||
page_size = self.get_size()
|
||||
|
||||
parser_width = self._dpage["sanitized"]["dimension"]["width"]
|
||||
parser_height = self._dpage["sanitized"]["dimension"]["height"]
|
||||
|
||||
cells_data = self._dpage["sanitized"]["cells"]["data"]
|
||||
cells_header = self._dpage["sanitized"]["cells"]["header"]
|
||||
|
||||
word_cells_data = self._dpage["original"]["cells"]["data"]
|
||||
word_cells_header = self._dpage["original"]["cells"]["header"]
|
||||
|
||||
cells = self.make_cells(page_size, cells_data, cells_header)
|
||||
word_cells = self.make_cells(page_size, word_cells_data, word_cells_header)
|
||||
|
||||
def draw_clusters_and_cells():
|
||||
image = (
|
||||
@ -137,7 +147,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
||||
|
||||
# draw_clusters_and_cells()
|
||||
|
||||
return cells
|
||||
return cells, word_cells
|
||||
|
||||
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
||||
AREA_THRESHOLD = 32 * 32
|
||||
|
@ -1,7 +1,7 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Optional, Set, Union
|
||||
from typing import Iterable, Optional, Set, Tuple, Union
|
||||
|
||||
from docling_core.types.doc import BoundingBox, Size
|
||||
from PIL import Image
|
||||
@ -18,7 +18,7 @@ class PdfPageBackend(ABC):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_text_cells(self) -> Iterable[Cell]:
|
||||
def get_text_cells(self) -> Tuple[Iterable[Cell], Optional[Iterable[Cell]]]:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
|
@ -180,6 +180,7 @@ class Page(BaseModel):
|
||||
# page_hash: Optional[str] = None
|
||||
size: Optional[Size] = None
|
||||
cells: List[Cell] = []
|
||||
word_cells: List[Cell] = []
|
||||
predictions: PagePredictions = PagePredictions()
|
||||
assembled: Optional[AssembledUnit] = None
|
||||
|
||||
|
@ -53,7 +53,17 @@ class PagePreprocessingModel(BasePageModel):
|
||||
def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
|
||||
assert page._backend is not None
|
||||
|
||||
page.cells = list(page._backend.get_text_cells())
|
||||
# page.cells = list(page._backend.get_text_cells())
|
||||
cells_result = page._backend.get_text_cells()
|
||||
if type(cells_result) is not tuple:
|
||||
# Backend supports just PDF cells
|
||||
page.cells = list(cells_result)
|
||||
else:
|
||||
# Backend also supports word cells
|
||||
pdf_cells, word_cells = cells_result
|
||||
page.cells = list(pdf_cells)
|
||||
# preserve word_cells for usage in tables
|
||||
page.word_cells = list(word_cells)
|
||||
|
||||
# DEBUG code:
|
||||
def draw_text_boxes(image, cells, show: bool = False):
|
||||
|
@ -122,7 +122,12 @@ class TableStructureModel(BasePageModel):
|
||||
continue
|
||||
|
||||
tokens = []
|
||||
for c in page.cells:
|
||||
|
||||
page_cells = page.cells
|
||||
if hasattr(page, "word_cells"):
|
||||
page_cells = page.word_cells
|
||||
|
||||
for c in page_cells:
|
||||
for cluster, _ in in_tables:
|
||||
if c.bbox.area() > 0:
|
||||
if (
|
||||
|
Loading…
Reference in New Issue
Block a user