mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 20:58:11 +00:00
feat: Add DoclingParseV4 backend, using high-level docling-parse API (#905)
Some checks failed
Run Docs CD / build-deploy-docs (push) Failing after 1m25s
Run Docs CI / build-docs (push) Failing after 52s
Some checks failed
Run Docs CD / build-deploy-docs (push) Failing after 1m25s
Run Docs CI / build-docs (push) Failing after 52s
* Add DoclingParseV3 backend implementation Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Use docling-core with docling-parse types Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fixes and test updates Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix streams Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix streams Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Reset tests Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * update test cases Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * update test units Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add back DoclingParse v1 backend, pipeline options Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update locks Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * fix: update docling-core to 2.22.0 Update dependency library docling-core to latest release 2.22.0 Fix regression tests and ground truth files Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * Ground-truth files updated Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update tests, use TextCell.from_ocr property Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Text fixes, new test data Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Rename docling backend to v4 Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Test all backends, fixes Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Reset all tests to use docling-parse v1 for now Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fixes for DPv4 backend init, better test coverage Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * test_input_doc use default backend Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
@@ -6,12 +6,12 @@ from typing import Iterable, List, Optional, Union
|
||||
|
||||
import pypdfium2 as pdfium
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
||||
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
|
||||
from docling_parse.pdf_parsers import pdf_parser_v1
|
||||
from PIL import Image, ImageDraw
|
||||
from pypdfium2 import PdfPage
|
||||
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||
from docling.datamodel.base_models import Cell
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
@@ -68,8 +68,11 @@ class DoclingParsePageBackend(PdfPageBackend):
|
||||
|
||||
return text_piece
|
||||
|
||||
def get_text_cells(self) -> Iterable[Cell]:
|
||||
cells: List[Cell] = []
|
||||
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
|
||||
return None
|
||||
|
||||
def get_text_cells(self) -> Iterable[TextCell]:
|
||||
cells: List[TextCell] = []
|
||||
cell_counter = 0
|
||||
|
||||
if not self.valid:
|
||||
@@ -91,19 +94,24 @@ class DoclingParsePageBackend(PdfPageBackend):
|
||||
|
||||
text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
|
||||
cells.append(
|
||||
Cell(
|
||||
id=cell_counter,
|
||||
TextCell(
|
||||
index=cell_counter,
|
||||
text=text_piece,
|
||||
bbox=BoundingBox(
|
||||
# l=x0, b=y0, r=x1, t=y1,
|
||||
l=x0 * page_size.width / parser_width,
|
||||
b=y0 * page_size.height / parser_height,
|
||||
r=x1 * page_size.width / parser_width,
|
||||
t=y1 * page_size.height / parser_height,
|
||||
coord_origin=CoordOrigin.BOTTOMLEFT,
|
||||
orig=text_piece,
|
||||
from_ocr=False,
|
||||
rect=BoundingRectangle.from_bounding_box(
|
||||
BoundingBox(
|
||||
# l=x0, b=y0, r=x1, t=y1,
|
||||
l=x0 * page_size.width / parser_width,
|
||||
b=y0 * page_size.height / parser_height,
|
||||
r=x1 * page_size.width / parser_width,
|
||||
t=y1 * page_size.height / parser_height,
|
||||
coord_origin=CoordOrigin.BOTTOMLEFT,
|
||||
)
|
||||
).to_top_left_origin(page_size.height),
|
||||
)
|
||||
)
|
||||
|
||||
cell_counter += 1
|
||||
|
||||
def draw_clusters_and_cells():
|
||||
@@ -112,7 +120,7 @@ class DoclingParsePageBackend(PdfPageBackend):
|
||||
) # make new image to avoid drawing on the saved ones
|
||||
draw = ImageDraw.Draw(image)
|
||||
for c in cells:
|
||||
x0, y0, x1, y1 = c.bbox.as_tuple()
|
||||
x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
|
||||
cell_color = (
|
||||
random.randint(30, 140),
|
||||
random.randint(30, 140),
|
||||
|
||||
@@ -6,12 +6,13 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
|
||||
|
||||
import pypdfium2 as pdfium
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
|
||||
from docling_parse.pdf_parsers import pdf_parser_v2
|
||||
from PIL import Image, ImageDraw
|
||||
from pypdfium2 import PdfPage
|
||||
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||
from docling.datamodel.base_models import Cell, Size
|
||||
from docling.datamodel.base_models import Size
|
||||
from docling.utils.locks import pypdfium2_lock
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -78,8 +79,11 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
||||
|
||||
return text_piece
|
||||
|
||||
def get_text_cells(self) -> Iterable[Cell]:
|
||||
cells: List[Cell] = []
|
||||
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
|
||||
return None
|
||||
|
||||
def get_text_cells(self) -> Iterable[TextCell]:
|
||||
cells: List[TextCell] = []
|
||||
cell_counter = 0
|
||||
|
||||
if not self.valid:
|
||||
@@ -106,16 +110,20 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
||||
|
||||
text_piece = cell_data[cells_header.index("text")]
|
||||
cells.append(
|
||||
Cell(
|
||||
id=cell_counter,
|
||||
TextCell(
|
||||
index=cell_counter,
|
||||
text=text_piece,
|
||||
bbox=BoundingBox(
|
||||
# l=x0, b=y0, r=x1, t=y1,
|
||||
l=x0 * page_size.width / parser_width,
|
||||
b=y0 * page_size.height / parser_height,
|
||||
r=x1 * page_size.width / parser_width,
|
||||
t=y1 * page_size.height / parser_height,
|
||||
coord_origin=CoordOrigin.BOTTOMLEFT,
|
||||
orig=text_piece,
|
||||
from_ocr=False,
|
||||
rect=BoundingRectangle.from_bounding_box(
|
||||
BoundingBox(
|
||||
# l=x0, b=y0, r=x1, t=y1,
|
||||
l=x0 * page_size.width / parser_width,
|
||||
b=y0 * page_size.height / parser_height,
|
||||
r=x1 * page_size.width / parser_width,
|
||||
t=y1 * page_size.height / parser_height,
|
||||
coord_origin=CoordOrigin.BOTTOMLEFT,
|
||||
)
|
||||
).to_top_left_origin(page_size.height),
|
||||
)
|
||||
)
|
||||
|
||||
185
docling/backend/docling_parse_v4_backend.py
Normal file
185
docling/backend/docling_parse_v4_backend.py
Normal file
@@ -0,0 +1,185 @@
|
||||
import logging
|
||||
import random
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Iterable, List, Optional, Union
|
||||
|
||||
import pypdfium2 as pdfium
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
||||
from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
|
||||
from PIL import Image, ImageDraw
|
||||
from pypdfium2 import PdfPage
|
||||
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||
from docling.datamodel.base_models import Size
|
||||
from docling.utils.locks import pypdfium2_lock
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DoclingParseV4PageBackend(PdfPageBackend):
    """Page backend that combines a docling-parse page with a pypdfium2 page.

    Text cells, bitmap resources and the page dimension are read from the
    ``SegmentedPdfPage`` passed as ``parsed_page``; page images are rendered
    through the pypdfium2 ``PdfPage`` passed as ``page_obj``.
    """

    def __init__(self, parsed_page: SegmentedPdfPage, page_obj: PdfPage):
        """Store both page handles; the page is valid iff ``parsed_page`` is set."""
        self._ppage = page_obj
        self._dpage = parsed_page
        self.valid = parsed_page is not None

    def is_valid(self) -> bool:
        """Return whether a parsed page was supplied at construction time."""
        return self.valid

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return the space-joined text of all textline cells mostly inside ``bbox``.

        A cell contributes when more than half of its own area intersects
        ``bbox``. Cells are converted to top-left origin before the comparison,
        so ``bbox`` is presumably expected in top-left coordinates as well
        (NOTE(review): confirm against callers).
        """
        # Find intersecting cells on the page
        text_piece = ""
        page_size = self.get_size()

        scale = (
            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
        )

        for cell in self._dpage.textline_cells:
            cell_bbox = (
                cell.rect.to_bounding_box()
                .to_top_left_origin(page_height=page_size.height)
                .scaled(scale)
            )

            cell_area = cell_bbox.area()
            if cell_area == 0:
                # A degenerate (zero-area) cell has no defined overlap fraction;
                # skip it instead of failing with ZeroDivisionError.
                continue

            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_area

            if overlap_frac > 0.5:
                if len(text_piece) > 0:
                    text_piece += " "
                text_piece += cell.text

        return text_piece

    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
        """Return the docling-parse page object this backend wraps."""
        return self._dpage

    def get_text_cells(self) -> Iterable[TextCell]:
        """Return the page's textline cells after converting them to top-left origin.

        The returned collection is the parsed page's own ``textline_cells``
        attribute, not a copy.
        """
        page_size = self.get_size()

        # The result of to_top_left_origin() is not used here: the call is
        # relied on for its effect on the cell itself, so a plain loop is used
        # rather than building a throwaway list.
        for tc in self._dpage.textline_cells:
            tc.to_top_left_origin(page_size.height)

        return self._dpage.textline_cells

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        """Yield the scaled top-left-origin bounding box of each bitmap resource.

        Only boxes whose unscaled area exceeds ``AREA_THRESHOLD`` are yielded.
        """
        AREA_THRESHOLD = 0  # 32 * 32

        images = self._dpage.bitmap_resources
        # The page height does not change while iterating; look it up once.
        page_height = self.get_size().height

        for img in images:
            cropbox = img.rect.to_bounding_box().to_top_left_origin(page_height)

            if cropbox.area() > AREA_THRESHOLD:
                cropbox = cropbox.scaled(scale=scale)

                yield cropbox

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Render the page, or the ``cropbox`` region of it, as a PIL image.

        The page is rendered at 1.5x the requested scale and then resized to
        ``cropbox`` size times ``scale``. Without a ``cropbox`` the full page
        is rendered.
        """
        page_size = self.get_size()

        if not cropbox:
            cropbox = BoundingBox(
                l=0,
                r=page_size.width,
                t=0,
                b=page_size.height,
                coord_origin=CoordOrigin.TOPLEFT,
            )
            padbox = BoundingBox(
                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
            )
        else:
            padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
            # Turn the right/top coordinates into distances from the right/top
            # page edges; l and b already are distances from the left/bottom.
            padbox.r = page_size.width - padbox.r
            padbox.t = page_size.height - padbox.t

        # NOTE(review): other pypdfium2 calls in this file run under
        # pypdfium2_lock; this render call does not - confirm that is intended.
        image = (
            self._ppage.render(
                scale=scale * 1.5,
                rotation=0,  # no additional rotation
                crop=padbox.as_tuple(),
            )
            .to_pil()
            .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
        )  # We resize the image from 1.5x the given scale to make it sharper.

        return image

    def get_size(self) -> Size:
        """Return the page size taken from the parsed page's ``dimension``."""
        return Size(
            width=self._dpage.dimension.width,
            height=self._dpage.dimension.height,
        )

    def unload(self):
        """Drop the references to both underlying page objects."""
        self._ppage = None
        self._dpage = None
|
||||
|
||||
|
||||
class DoclingParseV4DocumentBackend(PdfDocumentBackend):
    """Document backend that opens a PDF with both pypdfium2 and docling-parse.

    The pypdfium2 document supplies the page objects used for rendering; the
    docling-parse document supplies the parsed pages and the page count.
    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Open the document with both libraries.

        Raises:
            RuntimeError: if docling-parse returns no document.
        """
        super().__init__(in_doc, path_or_stream)

        with pypdfium2_lock:
            self._pdoc = pdfium.PdfDocument(self.path_or_stream)
        self.parser = DoclingPdfParser(loglevel="fatal")
        self.dp_doc: PdfDocument = self.parser.load(path_or_stream=self.path_or_stream)
        success = self.dp_doc is not None

        if not success:
            # Do not leave the already-opened pypdfium2 document behind when
            # construction fails: close it before reporting the error.
            with pypdfium2_lock:
                self._pdoc.close()
            self._pdoc = None
            raise RuntimeError(
                f"docling-parse v4 could not load document {self.document_hash}."
            )

    def page_count(self) -> int:
        """Return the page count reported by docling-parse.

        The pypdfium2 page count is only used for a consistency check; a
        mismatch is logged as an error and does not change the result.
        """
        # return len(self._pdoc)  # To be replaced with docling-parse API

        len_1 = len(self._pdoc)
        len_2 = self.dp_doc.number_of_pages()

        if len_1 != len_2:
            _log.error(f"Inconsistent number of pages: {len_1}!={len_2}")

        return len_2

    def load_page(
        self, page_no: int, create_words: bool = True, create_textlines: bool = True
    ) -> DoclingParseV4PageBackend:
        """Build the page backend for ``page_no``.

        ``page_no`` indexes the pypdfium2 document directly, while
        docling-parse is asked for ``page_no + 1``.
        """
        with pypdfium2_lock:
            return DoclingParseV4PageBackend(
                self.dp_doc.get_page(
                    page_no + 1,
                    create_words=create_words,
                    create_textlines=create_textlines,
                ),
                self._pdoc[page_no],
            )

    def is_valid(self) -> bool:
        """Return True when the document has at least one page."""
        return self.page_count() > 0

    def unload(self):
        """Release the docling-parse document and close the pypdfium2 document."""
        super().unload()
        self.dp_doc.unload()
        # _pdoc is reset to None below, so guard against a repeated unload()
        # calling close() on None.
        if self._pdoc is not None:
            with pypdfium2_lock:
                self._pdoc.close()
            self._pdoc = None
|
||||
@@ -4,10 +4,11 @@ from pathlib import Path
|
||||
from typing import Iterable, Optional, Set, Union
|
||||
|
||||
from docling_core.types.doc import BoundingBox, Size
|
||||
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
||||
from PIL import Image
|
||||
|
||||
from docling.backend.abstract_backend import PaginatedDocumentBackend
|
||||
from docling.datamodel.base_models import Cell, InputFormat
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
|
||||
@@ -17,7 +18,11 @@ class PdfPageBackend(ABC):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_text_cells(self) -> Iterable[Cell]:
|
||||
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_text_cells(self) -> Iterable[TextCell]:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
|
||||
@@ -7,12 +7,12 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
|
||||
import pypdfium2 as pdfium
|
||||
import pypdfium2.raw as pdfium_c
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
||||
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
|
||||
from PIL import Image, ImageDraw
|
||||
from pypdfium2 import PdfTextPage
|
||||
from pypdfium2._helpers.misc import PdfiumError
|
||||
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||
from docling.datamodel.base_models import Cell
|
||||
from docling.utils.locks import pypdfium2_lock
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -68,7 +68,10 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
||||
|
||||
return text_piece
|
||||
|
||||
def get_text_cells(self) -> Iterable[Cell]:
|
||||
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
|
||||
return None
|
||||
|
||||
def get_text_cells(self) -> Iterable[TextCell]:
|
||||
with pypdfium2_lock:
|
||||
if not self.text_page:
|
||||
self.text_page = self._ppage.get_textpage()
|
||||
@@ -84,11 +87,19 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
||||
text_piece = self.text_page.get_text_bounded(*rect)
|
||||
x0, y0, x1, y1 = rect
|
||||
cells.append(
|
||||
Cell(
|
||||
id=cell_counter,
|
||||
TextCell(
|
||||
index=cell_counter,
|
||||
text=text_piece,
|
||||
bbox=BoundingBox(
|
||||
l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT
|
||||
orig=text_piece,
|
||||
from_ocr=False,
|
||||
rect=BoundingRectangle.from_bounding_box(
|
||||
BoundingBox(
|
||||
l=x0,
|
||||
b=y0,
|
||||
r=x1,
|
||||
t=y1,
|
||||
coord_origin=CoordOrigin.BOTTOMLEFT,
|
||||
)
|
||||
).to_top_left_origin(page_size.height),
|
||||
)
|
||||
)
|
||||
@@ -97,51 +108,56 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
||||
# PyPdfium2 produces very fragmented cells, with sub-word level boundaries, in many PDFs.
|
||||
# The cell merging code below is to clean this up.
|
||||
def merge_horizontal_cells(
|
||||
cells: List[Cell],
|
||||
cells: List[TextCell],
|
||||
horizontal_threshold_factor: float = 1.0,
|
||||
vertical_threshold_factor: float = 0.5,
|
||||
) -> List[Cell]:
|
||||
) -> List[TextCell]:
|
||||
if not cells:
|
||||
return []
|
||||
|
||||
def group_rows(cells: List[Cell]) -> List[List[Cell]]:
|
||||
def group_rows(cells: List[TextCell]) -> List[List[TextCell]]:
|
||||
rows = []
|
||||
current_row = [cells[0]]
|
||||
row_top = cells[0].bbox.t
|
||||
row_bottom = cells[0].bbox.b
|
||||
row_height = cells[0].bbox.height
|
||||
row_top = cells[0].rect.to_bounding_box().t
|
||||
row_bottom = cells[0].rect.to_bounding_box().b
|
||||
row_height = cells[0].rect.to_bounding_box().height
|
||||
|
||||
for cell in cells[1:]:
|
||||
vertical_threshold = row_height * vertical_threshold_factor
|
||||
if (
|
||||
abs(cell.bbox.t - row_top) <= vertical_threshold
|
||||
and abs(cell.bbox.b - row_bottom) <= vertical_threshold
|
||||
abs(cell.rect.to_bounding_box().t - row_top)
|
||||
<= vertical_threshold
|
||||
and abs(cell.rect.to_bounding_box().b - row_bottom)
|
||||
<= vertical_threshold
|
||||
):
|
||||
current_row.append(cell)
|
||||
row_top = min(row_top, cell.bbox.t)
|
||||
row_bottom = max(row_bottom, cell.bbox.b)
|
||||
row_top = min(row_top, cell.rect.to_bounding_box().t)
|
||||
row_bottom = max(row_bottom, cell.rect.to_bounding_box().b)
|
||||
row_height = row_bottom - row_top
|
||||
else:
|
||||
rows.append(current_row)
|
||||
current_row = [cell]
|
||||
row_top = cell.bbox.t
|
||||
row_bottom = cell.bbox.b
|
||||
row_height = cell.bbox.height
|
||||
row_top = cell.rect.to_bounding_box().t
|
||||
row_bottom = cell.rect.to_bounding_box().b
|
||||
row_height = cell.rect.to_bounding_box().height
|
||||
|
||||
if current_row:
|
||||
rows.append(current_row)
|
||||
|
||||
return rows
|
||||
|
||||
def merge_row(row: List[Cell]) -> List[Cell]:
|
||||
def merge_row(row: List[TextCell]) -> List[TextCell]:
|
||||
merged = []
|
||||
current_group = [row[0]]
|
||||
|
||||
for cell in row[1:]:
|
||||
prev_cell = current_group[-1]
|
||||
avg_height = (prev_cell.bbox.height + cell.bbox.height) / 2
|
||||
avg_height = (
|
||||
prev_cell.rect.height + cell.rect.to_bounding_box().height
|
||||
) / 2
|
||||
if (
|
||||
cell.bbox.l - prev_cell.bbox.r
|
||||
cell.rect.to_bounding_box().l
|
||||
- prev_cell.rect.to_bounding_box().r
|
||||
<= avg_height * horizontal_threshold_factor
|
||||
):
|
||||
current_group.append(cell)
|
||||
@@ -154,24 +170,30 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
||||
|
||||
return merged
|
||||
|
||||
def merge_group(group: List[Cell]) -> Cell:
|
||||
def merge_group(group: List[TextCell]) -> TextCell:
|
||||
if len(group) == 1:
|
||||
return group[0]
|
||||
|
||||
merged_text = "".join(cell.text for cell in group)
|
||||
merged_bbox = BoundingBox(
|
||||
l=min(cell.bbox.l for cell in group),
|
||||
t=min(cell.bbox.t for cell in group),
|
||||
r=max(cell.bbox.r for cell in group),
|
||||
b=max(cell.bbox.b for cell in group),
|
||||
l=min(cell.rect.to_bounding_box().l for cell in group),
|
||||
t=min(cell.rect.to_bounding_box().t for cell in group),
|
||||
r=max(cell.rect.to_bounding_box().r for cell in group),
|
||||
b=max(cell.rect.to_bounding_box().b for cell in group),
|
||||
)
|
||||
return TextCell(
|
||||
index=group[0].index,
|
||||
text=merged_text,
|
||||
orig=merged_text,
|
||||
rect=BoundingRectangle.from_bounding_box(merged_bbox),
|
||||
from_ocr=False,
|
||||
)
|
||||
return Cell(id=group[0].id, text=merged_text, bbox=merged_bbox)
|
||||
|
||||
rows = group_rows(cells)
|
||||
merged_cells = [cell for row in rows for cell in merge_row(row)]
|
||||
|
||||
for i, cell in enumerate(merged_cells, 1):
|
||||
cell.id = i
|
||||
cell.index = i
|
||||
|
||||
return merged_cells
|
||||
|
||||
@@ -181,7 +203,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
||||
) # make new image to avoid drawing on the saved ones
|
||||
draw = ImageDraw.Draw(image)
|
||||
for c in cells:
|
||||
x0, y0, x1, y1 = c.bbox.as_tuple()
|
||||
x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
|
||||
cell_color = (
|
||||
random.randint(30, 140),
|
||||
random.randint(30, 140),
|
||||
|
||||
@@ -16,6 +16,7 @@ from pydantic import TypeAdapter
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import (
|
||||
@@ -412,12 +413,15 @@ def convert(
|
||||
if artifacts_path is not None:
|
||||
pipeline_options.artifacts_path = artifacts_path
|
||||
|
||||
backend: Type[PdfDocumentBackend]
|
||||
if pdf_backend == PdfBackend.DLPARSE_V1:
|
||||
backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
|
||||
backend = DoclingParseDocumentBackend
|
||||
elif pdf_backend == PdfBackend.DLPARSE_V2:
|
||||
backend = DoclingParseV2DocumentBackend
|
||||
elif pdf_backend == PdfBackend.DLPARSE_V4:
|
||||
backend = DoclingParseV4DocumentBackend # type: ignore
|
||||
elif pdf_backend == PdfBackend.PYPDFIUM2:
|
||||
backend = PyPdfiumDocumentBackend
|
||||
backend = PyPdfiumDocumentBackend # type: ignore
|
||||
else:
|
||||
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ from docling_core.types.doc import (
|
||||
Size,
|
||||
TableCell,
|
||||
)
|
||||
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
||||
from docling_core.types.io import ( # DO NOT REMOVE; explicitly exposed from this location
|
||||
DocumentStream,
|
||||
)
|
||||
@@ -123,14 +124,10 @@ class ErrorItem(BaseModel):
|
||||
error_message: str
|
||||
|
||||
|
||||
class Cell(BaseModel):
|
||||
id: int
|
||||
text: str
|
||||
bbox: BoundingBox
|
||||
|
||||
|
||||
class OcrCell(Cell):
|
||||
confidence: float
|
||||
# class Cell(BaseModel):
|
||||
# id: int
|
||||
# text: str
|
||||
# bbox: BoundingBox
|
||||
|
||||
|
||||
class Cluster(BaseModel):
|
||||
@@ -138,7 +135,7 @@ class Cluster(BaseModel):
|
||||
label: DocItemLabel
|
||||
bbox: BoundingBox
|
||||
confidence: float = 1.0
|
||||
cells: List[Cell] = []
|
||||
cells: List[TextCell] = []
|
||||
children: List["Cluster"] = [] # Add child cluster support
|
||||
|
||||
|
||||
@@ -226,7 +223,8 @@ class Page(BaseModel):
|
||||
page_no: int
|
||||
# page_hash: Optional[str] = None
|
||||
size: Optional[Size] = None
|
||||
cells: List[Cell] = []
|
||||
cells: List[TextCell] = []
|
||||
parsed_page: Optional[SegmentedPdfPage] = None
|
||||
predictions: PagePredictions = PagePredictions()
|
||||
assembled: Optional[AssembledUnit] = None
|
||||
|
||||
|
||||
@@ -301,6 +301,7 @@ class PdfBackend(str, Enum):
|
||||
PYPDFIUM2 = "pypdfium2"
|
||||
DLPARSE_V1 = "dlparse_v1"
|
||||
DLPARSE_V2 = "dlparse_v2"
|
||||
DLPARSE_V4 = "dlparse_v4"
|
||||
|
||||
|
||||
# Define an enum for the ocr engines
|
||||
@@ -381,3 +382,5 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
|
||||
"before conversion and then use the `TableItem.get_image` function."
|
||||
),
|
||||
)
|
||||
|
||||
generate_parsed_pages: bool = False
|
||||
|
||||
@@ -11,7 +11,7 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.asciidoc_backend import AsciiDocBackend
|
||||
from docling.backend.csv_backend import CsvDocumentBackend
|
||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
||||
from docling.backend.html_backend import HTMLDocumentBackend
|
||||
from docling.backend.json.docling_json_backend import DoclingJSONBackend
|
||||
from docling.backend.md_backend import MarkdownDocumentBackend
|
||||
@@ -109,12 +109,12 @@ class XMLJatsFormatOption(FormatOption):
|
||||
|
||||
class ImageFormatOption(FormatOption):
|
||||
pipeline_cls: Type = StandardPdfPipeline
|
||||
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
|
||||
backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
|
||||
|
||||
|
||||
class PdfFormatOption(FormatOption):
|
||||
pipeline_cls: Type = StandardPdfPipeline
|
||||
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
|
||||
backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
|
||||
|
||||
|
||||
def _get_default_option(format: InputFormat) -> FormatOption:
|
||||
@@ -147,10 +147,10 @@ def _get_default_option(format: InputFormat) -> FormatOption:
|
||||
pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
|
||||
),
|
||||
InputFormat.IMAGE: FormatOption(
|
||||
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
|
||||
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend
|
||||
),
|
||||
InputFormat.PDF: FormatOption(
|
||||
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
|
||||
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend
|
||||
),
|
||||
InputFormat.JSON_DOCLING: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
|
||||
|
||||
@@ -6,11 +6,12 @@ from typing import Iterable, List
|
||||
|
||||
import numpy as np
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import BoundingRectangle, PdfTextCell, TextCell
|
||||
from PIL import Image, ImageDraw
|
||||
from rtree import index
|
||||
from scipy.ndimage import binary_dilation, find_objects, label
|
||||
|
||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
||||
from docling.datamodel.base_models import Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import OcrOptions
|
||||
from docling.datamodel.settings import settings
|
||||
@@ -104,11 +105,13 @@ class BaseOcrModel(BasePageModel):
|
||||
p.dimension = 2
|
||||
idx = index.Index(properties=p)
|
||||
for i, cell in enumerate(programmatic_cells):
|
||||
idx.insert(i, cell.bbox.as_tuple())
|
||||
idx.insert(i, cell.rect.to_bounding_box().as_tuple())
|
||||
|
||||
def is_overlapping_with_existing_cells(ocr_cell):
|
||||
# Query the R-tree to get overlapping rectangles
|
||||
possible_matches_index = list(idx.intersection(ocr_cell.bbox.as_tuple()))
|
||||
possible_matches_index = list(
|
||||
idx.intersection(ocr_cell.rect.to_bounding_box().as_tuple())
|
||||
)
|
||||
|
||||
return (
|
||||
len(possible_matches_index) > 0
|
||||
@@ -125,10 +128,7 @@ class BaseOcrModel(BasePageModel):
|
||||
"""
|
||||
if self.options.force_full_page_ocr:
|
||||
# If a full page OCR is forced, use only the OCR cells
|
||||
cells = [
|
||||
Cell(id=c_ocr.id, text=c_ocr.text, bbox=c_ocr.bbox)
|
||||
for c_ocr in ocr_cells
|
||||
]
|
||||
cells = ocr_cells
|
||||
return cells
|
||||
|
||||
## Remove OCR cells which overlap with programmatic cells.
|
||||
@@ -156,7 +156,7 @@ class BaseOcrModel(BasePageModel):
|
||||
|
||||
# Draw OCR and programmatic cells
|
||||
for tc in page.cells:
|
||||
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
||||
x0, y0, x1, y1 = tc.rect.to_bounding_box().as_tuple()
|
||||
y0 *= scale_x
|
||||
y1 *= scale_y
|
||||
x0 *= scale_x
|
||||
@@ -165,9 +165,8 @@ class BaseOcrModel(BasePageModel):
|
||||
if y1 <= y0:
|
||||
y1, y0 = y0, y1
|
||||
|
||||
color = "gray"
|
||||
if isinstance(tc, OcrCell):
|
||||
color = "magenta"
|
||||
color = "magenta" if tc.from_ocr else "gray"
|
||||
|
||||
draw.rectangle([(x0, y0), (x1, y1)], outline=color)
|
||||
|
||||
if show:
|
||||
|
||||
@@ -6,8 +6,9 @@ from typing import Iterable, List, Optional
|
||||
|
||||
import numpy
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||
|
||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
||||
from docling.datamodel.base_models import Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
AcceleratorDevice,
|
||||
@@ -148,18 +149,22 @@ class EasyOcrModel(BaseOcrModel):
|
||||
del im
|
||||
|
||||
cells = [
|
||||
OcrCell(
|
||||
id=ix,
|
||||
TextCell(
|
||||
index=ix,
|
||||
text=line[1],
|
||||
orig=line[1],
|
||||
from_ocr=True,
|
||||
confidence=line[2],
|
||||
bbox=BoundingBox.from_tuple(
|
||||
coord=(
|
||||
(line[0][0][0] / self.scale) + ocr_rect.l,
|
||||
(line[0][0][1] / self.scale) + ocr_rect.t,
|
||||
(line[0][2][0] / self.scale) + ocr_rect.l,
|
||||
(line[0][2][1] / self.scale) + ocr_rect.t,
|
||||
),
|
||||
origin=CoordOrigin.TOPLEFT,
|
||||
rect=BoundingRectangle.from_bounding_box(
|
||||
BoundingBox.from_tuple(
|
||||
coord=(
|
||||
(line[0][0][0] / self.scale) + ocr_rect.l,
|
||||
(line[0][0][1] / self.scale) + ocr_rect.t,
|
||||
(line[0][2][0] / self.scale) + ocr_rect.l,
|
||||
(line[0][2][1] / self.scale) + ocr_rect.t,
|
||||
),
|
||||
origin=CoordOrigin.TOPLEFT,
|
||||
)
|
||||
),
|
||||
)
|
||||
for ix, line in enumerate(result)
|
||||
|
||||
@@ -3,8 +3,9 @@ import tempfile
|
||||
from typing import Iterable, Optional, Tuple
|
||||
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.base_models import Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import OcrMacOptions
|
||||
from docling.datamodel.settings import settings
|
||||
@@ -94,13 +95,17 @@ class OcrMacModel(BaseOcrModel):
|
||||
bottom = y2 / self.scale
|
||||
|
||||
cells.append(
|
||||
OcrCell(
|
||||
id=ix,
|
||||
TextCell(
|
||||
index=ix,
|
||||
text=text,
|
||||
orig=text,
|
||||
from_ocr=True,
|
||||
confidence=confidence,
|
||||
bbox=BoundingBox.from_tuple(
|
||||
coord=(left, top, right, bottom),
|
||||
origin=CoordOrigin.TOPLEFT,
|
||||
rect=BoundingRectangle.from_bounding_box(
|
||||
BoundingBox.from_tuple(
|
||||
coord=(left, top, right, bottom),
|
||||
origin=CoordOrigin.TOPLEFT,
|
||||
)
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
@@ -13,6 +13,7 @@ from docling.utils.profiling import TimeRecorder
|
||||
|
||||
class PagePreprocessingOptions(BaseModel):
|
||||
images_scale: Optional[float]
|
||||
create_parsed_page: bool
|
||||
|
||||
|
||||
class PagePreprocessingModel(BasePageModel):
|
||||
@@ -55,6 +56,9 @@ class PagePreprocessingModel(BasePageModel):
|
||||
|
||||
page.cells = list(page._backend.get_text_cells())
|
||||
|
||||
if self.options.create_parsed_page:
|
||||
page.parsed_page = page._backend.get_segmented_page()
|
||||
|
||||
# DEBUG code:
|
||||
def draw_text_boxes(image, cells, show: bool = False):
|
||||
draw = ImageDraw.Draw(image)
|
||||
|
||||
@@ -3,8 +3,9 @@ from typing import Iterable
|
||||
|
||||
import numpy
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.base_models import Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
AcceleratorDevice,
|
||||
@@ -100,18 +101,26 @@ class RapidOcrModel(BaseOcrModel):
|
||||
|
||||
if result is not None:
|
||||
cells = [
|
||||
OcrCell(
|
||||
id=ix,
|
||||
TextCell(
|
||||
index=ix,
|
||||
text=line[1],
|
||||
orig=line[1],
|
||||
confidence=line[2],
|
||||
bbox=BoundingBox.from_tuple(
|
||||
coord=(
|
||||
(line[0][0][0] / self.scale) + ocr_rect.l,
|
||||
(line[0][0][1] / self.scale) + ocr_rect.t,
|
||||
(line[0][2][0] / self.scale) + ocr_rect.l,
|
||||
(line[0][2][1] / self.scale) + ocr_rect.t,
|
||||
),
|
||||
origin=CoordOrigin.TOPLEFT,
|
||||
from_ocr=True,
|
||||
rect=BoundingRectangle.from_bounding_box(
|
||||
BoundingBox.from_tuple(
|
||||
coord=(
|
||||
(line[0][0][0] / self.scale)
|
||||
+ ocr_rect.l,
|
||||
(line[0][0][1] / self.scale)
|
||||
+ ocr_rect.t,
|
||||
(line[0][2][0] / self.scale)
|
||||
+ ocr_rect.l,
|
||||
(line[0][2][1] / self.scale)
|
||||
+ ocr_rect.t,
|
||||
),
|
||||
origin=CoordOrigin.TOPLEFT,
|
||||
)
|
||||
),
|
||||
)
|
||||
for ix, line in enumerate(result)
|
||||
|
||||
@@ -5,6 +5,7 @@ from typing import Iterable, Optional, Union
|
||||
|
||||
import numpy
|
||||
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
|
||||
from docling_core.types.doc.page import BoundingRectangle
|
||||
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
|
||||
from PIL import ImageDraw
|
||||
|
||||
@@ -129,7 +130,7 @@ class TableStructureModel(BasePageModel):
|
||||
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
|
||||
|
||||
for cell in table_element.cluster.cells:
|
||||
x0, y0, x1, y1 = cell.bbox.as_tuple()
|
||||
x0, y0, x1, y1 = cell.rect.to_bounding_box().as_tuple()
|
||||
x0 *= scale_x
|
||||
x1 *= scale_x
|
||||
y0 *= scale_x
|
||||
@@ -223,11 +224,19 @@ class TableStructureModel(BasePageModel):
|
||||
# Only allow non-empty strings (not just spaces) into the cells of a table
|
||||
if len(c.text.strip()) > 0:
|
||||
new_cell = copy.deepcopy(c)
|
||||
new_cell.bbox = new_cell.bbox.scaled(
|
||||
scale=self.scale
|
||||
new_cell.rect = BoundingRectangle.from_bounding_box(
|
||||
new_cell.rect.to_bounding_box().scaled(
|
||||
scale=self.scale
|
||||
)
|
||||
)
|
||||
|
||||
tokens.append(new_cell.model_dump())
|
||||
tokens.append(
|
||||
{
|
||||
"id": new_cell.index,
|
||||
"text": new_cell.text,
|
||||
"bbox": new_cell.rect.to_bounding_box().model_dump(),
|
||||
}
|
||||
)
|
||||
page_input["tokens"] = tokens
|
||||
|
||||
tf_output = self.tf_predictor.multi_table_predict(
|
||||
|
||||
@@ -8,8 +8,9 @@ from typing import Iterable, List, Optional, Tuple
|
||||
|
||||
import pandas as pd
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||
|
||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
||||
from docling.datamodel.base_models import Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
|
||||
from docling.datamodel.settings import settings
|
||||
@@ -228,18 +229,22 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
t = b + h
|
||||
r = l + w
|
||||
|
||||
cell = OcrCell(
|
||||
id=ix,
|
||||
cell = TextCell(
|
||||
index=ix,
|
||||
text=text,
|
||||
orig=text,
|
||||
from_ocr=True,
|
||||
confidence=conf / 100.0,
|
||||
bbox=BoundingBox.from_tuple(
|
||||
coord=(
|
||||
(l / self.scale) + ocr_rect.l,
|
||||
(b / self.scale) + ocr_rect.t,
|
||||
(r / self.scale) + ocr_rect.l,
|
||||
(t / self.scale) + ocr_rect.t,
|
||||
),
|
||||
origin=CoordOrigin.TOPLEFT,
|
||||
rect=BoundingRectangle.from_bounding_box(
|
||||
BoundingBox.from_tuple(
|
||||
coord=(
|
||||
(l / self.scale) + ocr_rect.l,
|
||||
(b / self.scale) + ocr_rect.t,
|
||||
(r / self.scale) + ocr_rect.l,
|
||||
(t / self.scale) + ocr_rect.t,
|
||||
),
|
||||
origin=CoordOrigin.TOPLEFT,
|
||||
)
|
||||
),
|
||||
)
|
||||
all_ocr_cells.append(cell)
|
||||
|
||||
@@ -2,8 +2,9 @@ import logging
|
||||
from typing import Iterable
|
||||
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||
|
||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
||||
from docling.datamodel.base_models import Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import TesseractOcrOptions
|
||||
from docling.datamodel.settings import settings
|
||||
@@ -173,13 +174,17 @@ class TesseractOcrModel(BaseOcrModel):
|
||||
top = (box["y"] + box["h"]) / self.scale
|
||||
|
||||
cells.append(
|
||||
OcrCell(
|
||||
id=ix,
|
||||
TextCell(
|
||||
index=ix,
|
||||
text=text,
|
||||
orig=text,
|
||||
from_ocr=True,
|
||||
confidence=confidence,
|
||||
bbox=BoundingBox.from_tuple(
|
||||
coord=(left, top, right, bottom),
|
||||
origin=CoordOrigin.TOPLEFT,
|
||||
rect=BoundingRectangle.from_bounding_box(
|
||||
BoundingBox.from_tuple(
|
||||
coord=(left, top, right, bottom),
|
||||
origin=CoordOrigin.TOPLEFT,
|
||||
),
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
@@ -87,7 +87,8 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
# Pre-processing
|
||||
PagePreprocessingModel(
|
||||
options=PagePreprocessingOptions(
|
||||
images_scale=pipeline_options.images_scale
|
||||
images_scale=pipeline_options.images_scale,
|
||||
create_parsed_page=pipeline_options.generate_parsed_pages,
|
||||
)
|
||||
),
|
||||
# OCR
|
||||
|
||||
@@ -2,9 +2,9 @@ import logging
|
||||
from typing import Any, Dict, Iterable, List, Tuple, Union
|
||||
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import TextCell
|
||||
from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
|
||||
|
||||
from docling.datamodel.base_models import OcrCell
|
||||
from docling.datamodel.document import ConversionResult, Page
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
@@ -86,11 +86,13 @@ def generate_multimodal_pages(
|
||||
if page.size is None:
|
||||
return cells
|
||||
for cell in page.cells:
|
||||
new_bbox = cell.bbox.to_top_left_origin(
|
||||
page_height=page.size.height
|
||||
).normalized(page_size=page.size)
|
||||
is_ocr = isinstance(cell, OcrCell)
|
||||
ocr_confidence = cell.confidence if isinstance(cell, OcrCell) else 1.0
|
||||
new_bbox = (
|
||||
cell.rect.to_bounding_box()
|
||||
.to_top_left_origin(page_height=page.size.height)
|
||||
.normalized(page_size=page.size)
|
||||
)
|
||||
is_ocr = cell.from_ocr
|
||||
ocr_confidence = cell.confidence
|
||||
cells.append(
|
||||
{
|
||||
"text": cell.text,
|
||||
|
||||
@@ -5,9 +5,10 @@ from collections import defaultdict
|
||||
from typing import Dict, List, Set, Tuple
|
||||
|
||||
from docling_core.types.doc import DocItemLabel, Size
|
||||
from docling_core.types.doc.page import TextCell
|
||||
from rtree import index
|
||||
|
||||
from docling.datamodel.base_models import BoundingBox, Cell, Cluster, OcrCell
|
||||
from docling.datamodel.base_models import BoundingBox, Cluster
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@@ -198,7 +199,7 @@ class LayoutPostprocessor:
|
||||
DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
|
||||
}
|
||||
|
||||
def __init__(self, cells: List[Cell], clusters: List[Cluster], page_size: Size):
|
||||
def __init__(self, cells: List[TextCell], clusters: List[Cluster], page_size: Size):
|
||||
"""Initialize processor with cells and clusters."""
|
||||
"""Initialize processor with cells and spatial indices."""
|
||||
self.cells = cells
|
||||
@@ -218,7 +219,7 @@ class LayoutPostprocessor:
|
||||
[c for c in self.special_clusters if c.label in self.WRAPPER_TYPES]
|
||||
)
|
||||
|
||||
def postprocess(self) -> Tuple[List[Cluster], List[Cell]]:
|
||||
def postprocess(self) -> Tuple[List[Cluster], List[TextCell]]:
|
||||
"""Main processing pipeline."""
|
||||
self.regular_clusters = self._process_regular_clusters()
|
||||
self.special_clusters = self._process_special_clusters()
|
||||
@@ -271,15 +272,13 @@ class LayoutPostprocessor:
|
||||
next_id = max((c.id for c in self.all_clusters), default=0) + 1
|
||||
orphan_clusters = []
|
||||
for i, cell in enumerate(unassigned):
|
||||
conf = 1.0
|
||||
if isinstance(cell, OcrCell):
|
||||
conf = cell.confidence
|
||||
conf = cell.confidence
|
||||
|
||||
orphan_clusters.append(
|
||||
Cluster(
|
||||
id=next_id + i,
|
||||
label=DocItemLabel.TEXT,
|
||||
bbox=cell.bbox,
|
||||
bbox=cell.to_bounding_box(),
|
||||
confidence=conf,
|
||||
cells=[cell],
|
||||
)
|
||||
@@ -557,13 +556,13 @@ class LayoutPostprocessor:
|
||||
|
||||
return current_best if current_best else clusters[0]
|
||||
|
||||
def _deduplicate_cells(self, cells: List[Cell]) -> List[Cell]:
|
||||
def _deduplicate_cells(self, cells: List[TextCell]) -> List[TextCell]:
|
||||
"""Ensure each cell appears only once, maintaining order of first appearance."""
|
||||
seen_ids = set()
|
||||
unique_cells = []
|
||||
for cell in cells:
|
||||
if cell.id not in seen_ids:
|
||||
seen_ids.add(cell.id)
|
||||
if cell.index not in seen_ids:
|
||||
seen_ids.add(cell.index)
|
||||
unique_cells.append(cell)
|
||||
return unique_cells
|
||||
|
||||
@@ -582,11 +581,13 @@ class LayoutPostprocessor:
|
||||
best_cluster = None
|
||||
|
||||
for cluster in clusters:
|
||||
if cell.bbox.area() <= 0:
|
||||
if cell.rect.to_bounding_box().area() <= 0:
|
||||
continue
|
||||
|
||||
overlap = cell.bbox.intersection_area_with(cluster.bbox)
|
||||
overlap_ratio = overlap / cell.bbox.area()
|
||||
overlap = cell.rect.to_bounding_box().intersection_area_with(
|
||||
cluster.bbox
|
||||
)
|
||||
overlap_ratio = overlap / cell.rect.to_bounding_box().area()
|
||||
|
||||
if overlap_ratio > best_overlap:
|
||||
best_overlap = overlap_ratio
|
||||
@@ -601,11 +602,13 @@ class LayoutPostprocessor:
|
||||
|
||||
return clusters
|
||||
|
||||
def _find_unassigned_cells(self, clusters: List[Cluster]) -> List[Cell]:
|
||||
def _find_unassigned_cells(self, clusters: List[Cluster]) -> List[TextCell]:
|
||||
"""Find cells not assigned to any cluster."""
|
||||
assigned = {cell.id for cluster in clusters for cell in cluster.cells}
|
||||
assigned = {cell.index for cluster in clusters for cell in cluster.cells}
|
||||
return [
|
||||
cell for cell in self.cells if cell.id not in assigned and cell.text.strip()
|
||||
cell
|
||||
for cell in self.cells
|
||||
if cell.index not in assigned and cell.text.strip()
|
||||
]
|
||||
|
||||
def _adjust_cluster_bboxes(self, clusters: List[Cluster]) -> List[Cluster]:
|
||||
@@ -615,10 +618,10 @@ class LayoutPostprocessor:
|
||||
continue
|
||||
|
||||
cells_bbox = BoundingBox(
|
||||
l=min(cell.bbox.l for cell in cluster.cells),
|
||||
t=min(cell.bbox.t for cell in cluster.cells),
|
||||
r=max(cell.bbox.r for cell in cluster.cells),
|
||||
b=max(cell.bbox.b for cell in cluster.cells),
|
||||
l=min(cell.rect.to_bounding_box().l for cell in cluster.cells),
|
||||
t=min(cell.rect.to_bounding_box().t for cell in cluster.cells),
|
||||
r=max(cell.rect.to_bounding_box().r for cell in cluster.cells),
|
||||
b=max(cell.rect.to_bounding_box().b for cell in cluster.cells),
|
||||
)
|
||||
|
||||
if cluster.label == DocItemLabel.TABLE:
|
||||
@@ -634,9 +637,9 @@ class LayoutPostprocessor:
|
||||
|
||||
return clusters
|
||||
|
||||
def _sort_cells(self, cells: List[Cell]) -> List[Cell]:
|
||||
def _sort_cells(self, cells: List[TextCell]) -> List[TextCell]:
|
||||
"""Sort cells in native reading order."""
|
||||
return sorted(cells, key=lambda c: (c.id))
|
||||
return sorted(cells, key=lambda c: (c.index))
|
||||
|
||||
def _sort_clusters(
|
||||
self, clusters: List[Cluster], mode: str = "id"
|
||||
@@ -647,7 +650,7 @@ class LayoutPostprocessor:
|
||||
clusters,
|
||||
key=lambda cluster: (
|
||||
(
|
||||
min(cell.id for cell in cluster.cells)
|
||||
min(cell.index for cell in cluster.cells)
|
||||
if cluster.cells
|
||||
else sys.maxsize
|
||||
),
|
||||
|
||||
@@ -25,7 +25,7 @@ def draw_clusters(
|
||||
# Draw cells first (underneath)
|
||||
cell_color = (0, 0, 0, 40) # Transparent black for cells
|
||||
for tc in c.cells:
|
||||
cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
|
||||
cx0, cy0, cx1, cy1 = tc.rect.to_bounding_box().as_tuple()
|
||||
cx0 *= scale_x
|
||||
cx1 *= scale_x
|
||||
cy0 *= scale_x
|
||||
|
||||
Reference in New Issue
Block a user