mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 15:32:30 +00:00
Experimental (WIP): add optional word-level cells to the page preprocessing model and the table structure model
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
fb8ba861e2
commit
e4383cee56
@ -2,7 +2,7 @@ import logging
|
|||||||
import random
|
import random
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING, Iterable, List, Optional, Union
|
from typing import TYPE_CHECKING, Iterable, List, Optional, Tuple, Union
|
||||||
|
|
||||||
import pypdfium2 as pdfium
|
import pypdfium2 as pdfium
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
@ -77,21 +77,9 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
|||||||
|
|
||||||
return text_piece
|
return text_piece
|
||||||
|
|
||||||
def get_text_cells(self) -> Iterable[Cell]:
|
def make_cells(self, page_size, cells_data, cells_header):
|
||||||
cells: List[Cell] = []
|
cells: List[Cell] = []
|
||||||
cell_counter = 0
|
cell_counter = 0
|
||||||
|
|
||||||
if not self.valid:
|
|
||||||
return cells
|
|
||||||
|
|
||||||
page_size = self.get_size()
|
|
||||||
|
|
||||||
parser_width = self._dpage["sanitized"]["dimension"]["width"]
|
|
||||||
parser_height = self._dpage["sanitized"]["dimension"]["height"]
|
|
||||||
|
|
||||||
cells_data = self._dpage["sanitized"]["cells"]["data"]
|
|
||||||
cells_header = self._dpage["sanitized"]["cells"]["header"]
|
|
||||||
|
|
||||||
for i, cell_data in enumerate(cells_data):
|
for i, cell_data in enumerate(cells_data):
|
||||||
x0 = cell_data[cells_header.index("x0")]
|
x0 = cell_data[cells_header.index("x0")]
|
||||||
y0 = cell_data[cells_header.index("y0")]
|
y0 = cell_data[cells_header.index("y0")]
|
||||||
@ -119,6 +107,28 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
cell_counter += 1
|
cell_counter += 1
|
||||||
|
return cells
|
||||||
|
|
||||||
|
def get_text_cells(self) -> Tuple[Iterable[Cell], Iterable[Cell]]:
|
||||||
|
cells: List[Cell] = []
|
||||||
|
word_cells: List[Cell] = []
|
||||||
|
|
||||||
|
if not self.valid:
|
||||||
|
return cells, word_cells
|
||||||
|
|
||||||
|
page_size = self.get_size()
|
||||||
|
|
||||||
|
parser_width = self._dpage["sanitized"]["dimension"]["width"]
|
||||||
|
parser_height = self._dpage["sanitized"]["dimension"]["height"]
|
||||||
|
|
||||||
|
cells_data = self._dpage["sanitized"]["cells"]["data"]
|
||||||
|
cells_header = self._dpage["sanitized"]["cells"]["header"]
|
||||||
|
|
||||||
|
word_cells_data = self._dpage["original"]["cells"]["data"]
|
||||||
|
word_cells_header = self._dpage["original"]["cells"]["header"]
|
||||||
|
|
||||||
|
cells = self.make_cells(page_size, cells_data, cells_header)
|
||||||
|
word_cells = self.make_cells(page_size, word_cells_data, word_cells_header)
|
||||||
|
|
||||||
def draw_clusters_and_cells():
|
def draw_clusters_and_cells():
|
||||||
image = (
|
image = (
|
||||||
@ -137,7 +147,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
|||||||
|
|
||||||
# draw_clusters_and_cells()
|
# draw_clusters_and_cells()
|
||||||
|
|
||||||
return cells
|
return cells, word_cells
|
||||||
|
|
||||||
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
||||||
AREA_THRESHOLD = 32 * 32
|
AREA_THRESHOLD = 32 * 32
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, Optional, Set, Union
|
from typing import Iterable, Optional, Set, Tuple, Union
|
||||||
|
|
||||||
from docling_core.types.doc import BoundingBox, Size
|
from docling_core.types.doc import BoundingBox, Size
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
@ -18,7 +18,7 @@ class PdfPageBackend(ABC):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def get_text_cells(self) -> Iterable[Cell]:
|
def get_text_cells(self) -> Tuple[Iterable[Cell], Optional[Iterable[Cell]]]:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
|
@ -180,6 +180,7 @@ class Page(BaseModel):
|
|||||||
# page_hash: Optional[str] = None
|
# page_hash: Optional[str] = None
|
||||||
size: Optional[Size] = None
|
size: Optional[Size] = None
|
||||||
cells: List[Cell] = []
|
cells: List[Cell] = []
|
||||||
|
word_cells: List[Cell] = []
|
||||||
predictions: PagePredictions = PagePredictions()
|
predictions: PagePredictions = PagePredictions()
|
||||||
assembled: Optional[AssembledUnit] = None
|
assembled: Optional[AssembledUnit] = None
|
||||||
|
|
||||||
|
@ -53,7 +53,17 @@ class PagePreprocessingModel(BasePageModel):
|
|||||||
def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
|
def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
|
||||||
assert page._backend is not None
|
assert page._backend is not None
|
||||||
|
|
||||||
page.cells = list(page._backend.get_text_cells())
|
# page.cells = list(page._backend.get_text_cells())
|
||||||
|
cells_result = page._backend.get_text_cells()
|
||||||
|
if type(cells_result) is not tuple:
|
||||||
|
# Backend supports just PDF cells
|
||||||
|
page.cells = list(cells_result)
|
||||||
|
else:
|
||||||
|
# Backend also supports word cells
|
||||||
|
pdf_cells, word_cells = cells_result
|
||||||
|
page.cells = list(pdf_cells)
|
||||||
|
# preserve word_cells for usage in tables
|
||||||
|
page.word_cells = list(word_cells)
|
||||||
|
|
||||||
# DEBUG code:
|
# DEBUG code:
|
||||||
def draw_text_boxes(image, cells, show: bool = False):
|
def draw_text_boxes(image, cells, show: bool = False):
|
||||||
|
@ -122,7 +122,12 @@ class TableStructureModel(BasePageModel):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
tokens = []
|
tokens = []
|
||||||
for c in page.cells:
|
|
||||||
|
page_cells = page.cells
|
||||||
|
if hasattr(page, "word_cells"):
|
||||||
|
page_cells = page.word_cells
|
||||||
|
|
||||||
|
for c in page_cells:
|
||||||
for cluster, _ in in_tables:
|
for cluster, _ in in_tables:
|
||||||
if c.bbox.area() > 0:
|
if c.bbox.area() > 0:
|
||||||
if (
|
if (
|
||||||
|
Loading…
Reference in New Issue
Block a user