Mirror of https://github.com/DS4SD/docling.git (synced 2025-07-26 20:14:47 +00:00)

Make page.parsed_page the only source of truth for text cells

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

This commit is contained in: parent e310c5cff3, commit d73c9a2995
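At a glance, the commit replaces the free-standing Page.cells list with a read-only view over page.parsed_page.textline_cells, so the SegmentedPdfPage produced by the PDF backends becomes the single source of truth for text cells. The sketch below is schematic only (stub classes stand in for the real docling models); the property body mirrors the base_models.py hunk further down.

# Schematic sketch of the new contract; not the real docling classes.
from typing import List, Optional


class ParsedPageStub:
    """Stand-in for docling_core's SegmentedPdfPage."""

    def __init__(self, textline_cells: Optional[List[str]] = None):
        self.textline_cells = textline_cells or []
        self.has_lines = bool(self.textline_cells)


class PageStub:
    """Stand-in for docling.datamodel.base_models.Page after this commit."""

    def __init__(self, parsed_page: Optional[ParsedPageStub] = None):
        self.parsed_page = parsed_page

    @property
    def cells(self) -> List[str]:
        # Read-only view: parsed_page is the only source of truth.
        if self.parsed_page is not None:
            return self.parsed_page.textline_cells
        return []


page = PageStub(ParsedPageStub(["cell A", "cell B"]))
print(page.cells)                                  # ['cell A', 'cell B']
page.parsed_page.textline_cells.append("cell C")
print(page.cells)                                  # updates flow through parsed_page only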
@@ -7,7 +7,13 @@ from typing import List, Optional, Union

import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
-from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
+from docling_core.types.doc.page import (
+    BoundingRectangle,
+    PdfPageBoundaryType,
+    PdfPageGeometry,
+    SegmentedPdfPage,
+    TextCell,
+)
from docling_parse.pdf_parsers import pdf_parser_v1
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage
@@ -36,6 +42,51 @@ class DoclingParsePageBackend(PdfPageBackend):
    def is_valid(self) -> bool:
        return self.valid

+    def _compute_text_cells(self) -> List[TextCell]:
+        """Compute text cells from docling-parse data."""
+        cells: List[TextCell] = []
+        cell_counter = 0
+
+        if not self.valid:
+            return cells
+
+        page_size = self.get_size()
+
+        parser_width = self._dpage["width"]
+        parser_height = self._dpage["height"]
+
+        for i in range(len(self._dpage["cells"])):
+            rect = self._dpage["cells"][i]["box"]["device"]
+            x0, y0, x1, y1 = rect
+
+            if x1 < x0:
+                x0, x1 = x1, x0
+            if y1 < y0:
+                y0, y1 = y1, y0
+
+            text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
+            cells.append(
+                TextCell(
+                    index=cell_counter,
+                    text=text_piece,
+                    orig=text_piece,
+                    from_ocr=False,
+                    rect=BoundingRectangle.from_bounding_box(
+                        BoundingBox(
+                            l=x0 * page_size.width / parser_width,
+                            b=y0 * page_size.height / parser_height,
+                            r=x1 * page_size.width / parser_width,
+                            t=y1 * page_size.height / parser_height,
+                            coord_origin=CoordOrigin.BOTTOMLEFT,
+                        )
+                    ).to_top_left_origin(page_size.height),
+                )
+            )
+
+            cell_counter += 1
+
+        return cells
+
    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        if not self.valid:
            return ""
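The cell boxes come back from docling-parse in the parser's own page coordinates with a bottom-left origin; _compute_text_cells rescales them to the reported page size and flips them to a top-left origin. A small worked example of that conversion, using only the BoundingBox helpers imported above (the numbers are made up for illustration):

from docling_core.types.doc import BoundingBox, CoordOrigin

# Illustrative values: parser reports a 612x792 pt page, same as the rendered page size.
page_width, page_height = 612.0, 792.0
parser_width, parser_height = 612.0, 792.0

x0, y0, x1, y1 = 100.0, 700.0, 200.0, 720.0  # bottom-left origin, parser units

bbox = BoundingBox(
    l=x0 * page_width / parser_width,
    b=y0 * page_height / parser_height,
    r=x1 * page_width / parser_width,
    t=y1 * page_height / parser_height,
    coord_origin=CoordOrigin.BOTTOMLEFT,
).to_top_left_origin(page_height)

# In bottom-left coordinates the cell sits near the top of the page (y in 700..720);
# after the flip, t and b become 792 - 720 and 792 - 700.
print(bbox.t, bbox.b)  # expected: 72.0 92.0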
@@ -70,75 +121,45 @@ class DoclingParsePageBackend(PdfPageBackend):
        return text_piece

    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
-        return None
-
-    def get_text_cells(self) -> Iterable[TextCell]:
-        cells: List[TextCell] = []
-        cell_counter = 0
-
-        if not self.valid:
-            return cells
-
-        page_size = self.get_size()
-
-        parser_width = self._dpage["width"]
-        parser_height = self._dpage["height"]
-
-        for i in range(len(self._dpage["cells"])):
-            rect = self._dpage["cells"][i]["box"]["device"]
-            x0, y0, x1, y1 = rect
-
-            if x1 < x0:
-                x0, x1 = x1, x0
-            if y1 < y0:
-                y0, y1 = y1, y0
-
-            text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
-            cells.append(
-                TextCell(
-                    index=cell_counter,
-                    text=text_piece,
-                    orig=text_piece,
-                    from_ocr=False,
-                    rect=BoundingRectangle.from_bounding_box(
-                        BoundingBox(
-                            # l=x0, b=y0, r=x1, t=y1,
-                            l=x0 * page_size.width / parser_width,
-                            b=y0 * page_size.height / parser_height,
-                            r=x1 * page_size.width / parser_width,
-                            t=y1 * page_size.height / parser_height,
-                            coord_origin=CoordOrigin.BOTTOMLEFT,
-                        )
-                    ).to_top_left_origin(page_size.height),
-                )
-            )
-
-            cell_counter += 1
-
-        def draw_clusters_and_cells():
-            image = (
-                self.get_page_image()
-            )  # make new image to avoid drawing on the saved ones
-            draw = ImageDraw.Draw(image)
-            for c in cells:
-                x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
-                cell_color = (
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                )
-                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
-            image.show()
-
-        # before merge:
-        # draw_clusters_and_cells()
-
-        # cells = merge_horizontal_cells(cells)
-
-        # after merge:
-        # draw_clusters_and_cells()
-
-        return cells
+        if not self.valid:
+            return None
+
+        page_size = self.get_size()
+        text_cells = self._compute_text_cells()
+
+        # Create page geometry
+        crop_bbox = BoundingBox(
+            l=0,
+            r=page_size.width,
+            t=0,
+            b=page_size.height,
+            coord_origin=CoordOrigin.TOPLEFT,
+        ).to_bottom_left_origin(page_size.height)
+
+        dimension = PdfPageGeometry(
+            angle=0.0,
+            rect=BoundingRectangle.from_bounding_box(crop_bbox),
+            boundary_type=PdfPageBoundaryType.CROP_BOX,
+            art_bbox=crop_bbox,
+            bleed_bbox=crop_bbox,
+            crop_bbox=crop_bbox,
+            media_bbox=crop_bbox,
+            trim_bbox=crop_bbox,
+        )
+
+        # Create SegmentedPdfPage
+        return SegmentedPdfPage(
+            dimension=dimension,
+            textline_cells=text_cells,
+            char_cells=[],
+            word_cells=[],
+            has_lines=len(text_cells) > 0,
+            has_words=False,
+            has_chars=False,
+        )
+
+    def get_text_cells(self) -> Iterable[TextCell]:
+        return self._compute_text_cells()

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        AREA_THRESHOLD = 0  # 32 * 32
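With get_segmented_page now returning a populated SegmentedPdfPage instead of None, downstream code can ask the backend for the segmented page once and read line cells from it. A hedged usage sketch; the converter and backend wiring is elided, and dump_text_lines is a hypothetical helper, not part of docling:

from docling_core.types.doc.page import SegmentedPdfPage


def dump_text_lines(seg_page: SegmentedPdfPage) -> None:
    """Print every text line cell of a segmented page (illustrative only)."""
    if not seg_page.has_lines:
        return
    for cell in seg_page.textline_cells:
        # Each TextCell carries the normalized text plus its rectangle in
        # top-left-origin page coordinates (see _compute_text_cells above).
        print(cell.index, repr(cell.text), cell.rect)


# Typical call site (hypothetical): seg = backend.get_segmented_page(); dump_text_lines(seg)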
@@ -7,7 +7,13 @@ from typing import TYPE_CHECKING, List, Optional, Union

import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
+from docling_core.types.doc.page import (
+    BoundingRectangle,
+    PdfPageBoundaryType,
+    PdfPageGeometry,
+    SegmentedPdfPage,
+    TextCell,
+)
from docling_parse.pdf_parsers import pdf_parser_v2
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage
@@ -40,6 +46,55 @@ class DoclingParseV2PageBackend(PdfPageBackend):
    def is_valid(self) -> bool:
        return self.valid

+    def _compute_text_cells(self) -> List[TextCell]:
+        """Compute text cells from docling-parse v2 data."""
+        cells: List[TextCell] = []
+        cell_counter = 0
+
+        if not self.valid:
+            return cells
+
+        page_size = self.get_size()
+
+        parser_width = self._dpage["sanitized"]["dimension"]["width"]
+        parser_height = self._dpage["sanitized"]["dimension"]["height"]
+
+        cells_data = self._dpage["sanitized"]["cells"]["data"]
+        cells_header = self._dpage["sanitized"]["cells"]["header"]
+
+        for i, cell_data in enumerate(cells_data):
+            x0 = cell_data[cells_header.index("x0")]
+            y0 = cell_data[cells_header.index("y0")]
+            x1 = cell_data[cells_header.index("x1")]
+            y1 = cell_data[cells_header.index("y1")]
+
+            if x1 < x0:
+                x0, x1 = x1, x0
+            if y1 < y0:
+                y0, y1 = y1, y0
+
+            text_piece = cell_data[cells_header.index("text")]
+            cells.append(
+                TextCell(
+                    index=cell_counter,
+                    text=text_piece,
+                    orig=text_piece,
+                    from_ocr=False,
+                    rect=BoundingRectangle.from_bounding_box(
+                        BoundingBox(
+                            l=x0 * page_size.width / parser_width,
+                            b=y0 * page_size.height / parser_height,
+                            r=x1 * page_size.width / parser_width,
+                            t=y1 * page_size.height / parser_height,
+                            coord_origin=CoordOrigin.BOTTOMLEFT,
+                        )
+                    ).to_top_left_origin(page_size.height),
+                )
+            )
+            cell_counter += 1
+
+        return cells
+
    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        if not self.valid:
            return ""
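The v2 parser returns the sanitized cells as a columnar table: a header list of column names and a data list of rows, so each field is fetched by cells_header.index(...). A tiny sketch of that lookup pattern with made-up data; resolving the column positions once outside the loop is an optional optimization over the per-row index() calls in the diff:

# Made-up miniature of the docling-parse v2 "sanitized" cell table.
cells_header = ["x0", "y0", "x1", "y1", "text"]
cells_data = [
    [72.0, 700.0, 210.5, 715.0, "Docling bundles PDF document conversion"],
    [72.0, 680.0, 160.0, 695.0, "to JSON and Markdown"],
]

# Resolve column positions once instead of calling .index() for every row.
col = {name: cells_header.index(name) for name in cells_header}

for row in cells_data:
    x0, y0, x1, y1 = row[col["x0"]], row[col["y0"]], row[col["x1"]], row[col["y1"]]
    print(f"{row[col['text']]!r} at ({x0}, {y0}) - ({x1}, {y1})")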
@@ -81,73 +136,45 @@ class DoclingParseV2PageBackend(PdfPageBackend):
        return text_piece

    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
-        return None
-
-    def get_text_cells(self) -> Iterable[TextCell]:
-        cells: List[TextCell] = []
-        cell_counter = 0
-
-        if not self.valid:
-            return cells
-
-        page_size = self.get_size()
-
-        parser_width = self._dpage["sanitized"]["dimension"]["width"]
-        parser_height = self._dpage["sanitized"]["dimension"]["height"]
-
-        cells_data = self._dpage["sanitized"]["cells"]["data"]
-        cells_header = self._dpage["sanitized"]["cells"]["header"]
-
-        for i, cell_data in enumerate(cells_data):
-            x0 = cell_data[cells_header.index("x0")]
-            y0 = cell_data[cells_header.index("y0")]
-            x1 = cell_data[cells_header.index("x1")]
-            y1 = cell_data[cells_header.index("y1")]
-
-            if x1 < x0:
-                x0, x1 = x1, x0
-            if y1 < y0:
-                y0, y1 = y1, y0
-
-            text_piece = cell_data[cells_header.index("text")]
-            cells.append(
-                TextCell(
-                    index=cell_counter,
-                    text=text_piece,
-                    orig=text_piece,
-                    from_ocr=False,
-                    rect=BoundingRectangle.from_bounding_box(
-                        BoundingBox(
-                            # l=x0, b=y0, r=x1, t=y1,
-                            l=x0 * page_size.width / parser_width,
-                            b=y0 * page_size.height / parser_height,
-                            r=x1 * page_size.width / parser_width,
-                            t=y1 * page_size.height / parser_height,
-                            coord_origin=CoordOrigin.BOTTOMLEFT,
-                        )
-                    ).to_top_left_origin(page_size.height),
-                )
-            )
-            cell_counter += 1
-
-        def draw_clusters_and_cells():
-            image = (
-                self.get_page_image()
-            )  # make new image to avoid drawing on the saved ones
-            draw = ImageDraw.Draw(image)
-            for c in cells:
-                x0, y0, x1, y1 = c.bbox.as_tuple()
-                cell_color = (
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                )
-                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
-            image.show()
-
-        # draw_clusters_and_cells()
-
-        return cells
+        if not self.valid:
+            return None
+
+        page_size = self.get_size()
+        text_cells = self._compute_text_cells()
+
+        # Create page geometry
+        crop_bbox = BoundingBox(
+            l=0,
+            r=page_size.width,
+            t=0,
+            b=page_size.height,
+            coord_origin=CoordOrigin.TOPLEFT,
+        ).to_bottom_left_origin(page_size.height)
+
+        dimension = PdfPageGeometry(
+            angle=0.0,
+            rect=BoundingRectangle.from_bounding_box(crop_bbox),
+            boundary_type=PdfPageBoundaryType.CROP_BOX,
+            art_bbox=crop_bbox,
+            bleed_bbox=crop_bbox,
+            crop_bbox=crop_bbox,
+            media_bbox=crop_bbox,
+            trim_bbox=crop_bbox,
+        )
+
+        # Create SegmentedPdfPage
+        return SegmentedPdfPage(
+            dimension=dimension,
+            textline_cells=text_cells,
+            char_cells=[],
+            word_cells=[],
+            has_lines=len(text_cells) > 0,
+            has_words=False,
+            has_chars=False,
+        )
+
+    def get_text_cells(self) -> Iterable[TextCell]:
+        return self._compute_text_cells()

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        AREA_THRESHOLD = 0  # 32 * 32
@@ -8,7 +8,13 @@ from typing import TYPE_CHECKING, List, Optional, Union
import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
-from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
+from docling_core.types.doc.page import (
+    BoundingRectangle,
+    PdfPageBoundaryType,
+    PdfPageGeometry,
+    SegmentedPdfPage,
+    TextCell,
+)
from PIL import Image, ImageDraw
from pypdfium2 import PdfTextPage
from pypdfium2._helpers.misc import PdfiumError
@@ -41,38 +47,8 @@ class PyPdfiumPageBackend(PdfPageBackend):
    def is_valid(self) -> bool:
        return self.valid

-    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
-        AREA_THRESHOLD = 0  # 32 * 32
-        page_size = self.get_size()
-        with pypdfium2_lock:
-            for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
-                pos = obj.get_pos()
-                cropbox = BoundingBox.from_tuple(
-                    pos, origin=CoordOrigin.BOTTOMLEFT
-                ).to_top_left_origin(page_height=page_size.height)
-
-                if cropbox.area() > AREA_THRESHOLD:
-                    cropbox = cropbox.scaled(scale=scale)
-
-                    yield cropbox
-
-    def get_text_in_rect(self, bbox: BoundingBox) -> str:
-        with pypdfium2_lock:
-            if not self.text_page:
-                self.text_page = self._ppage.get_textpage()
-
-        if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
-            bbox = bbox.to_bottom_left_origin(self.get_size().height)
-
-        with pypdfium2_lock:
-            text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
-
-        return text_piece
-
-    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
-        return None
-
-    def get_text_cells(self) -> Iterable[TextCell]:
+    def _compute_text_cells(self) -> List[TextCell]:
+        """Compute text cells from pypdfium."""
        with pypdfium2_lock:
            if not self.text_page:
                self.text_page = self._ppage.get_textpage()
@@ -203,30 +179,76 @@ class PyPdfiumPageBackend(PdfPageBackend):

            return merged_cells

-        def draw_clusters_and_cells():
-            image = (
-                self.get_page_image()
-            )  # make new image to avoid drawing on the saved ones
-            draw = ImageDraw.Draw(image)
-            for c in cells:
-                x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
-                cell_color = (
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                )
-                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
-            image.show()
-
-        # before merge:
-        # draw_clusters_and_cells()
-
-        cells = merge_horizontal_cells(cells)
-
-        # after merge:
-        # draw_clusters_and_cells()
-
-        return cells
+        return merge_horizontal_cells(cells)
+
+    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
+        AREA_THRESHOLD = 0  # 32 * 32
+        page_size = self.get_size()
+        with pypdfium2_lock:
+            for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
+                pos = obj.get_pos()
+                cropbox = BoundingBox.from_tuple(
+                    pos, origin=CoordOrigin.BOTTOMLEFT
+                ).to_top_left_origin(page_height=page_size.height)
+
+                if cropbox.area() > AREA_THRESHOLD:
+                    cropbox = cropbox.scaled(scale=scale)
+
+                    yield cropbox
+
+    def get_text_in_rect(self, bbox: BoundingBox) -> str:
+        with pypdfium2_lock:
+            if not self.text_page:
+                self.text_page = self._ppage.get_textpage()
+
+        if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
+            bbox = bbox.to_bottom_left_origin(self.get_size().height)
+
+        with pypdfium2_lock:
+            text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
+
+        return text_piece
+
+    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
+        if not self.valid:
+            return None
+
+        page_size = self.get_size()
+        text_cells = self._compute_text_cells()
+
+        # Create page geometry
+        crop_bbox = BoundingBox(
+            l=0,
+            r=page_size.width,
+            t=0,
+            b=page_size.height,
+            coord_origin=CoordOrigin.TOPLEFT,
+        ).to_bottom_left_origin(page_size.height)
+
+        dimension = PdfPageGeometry(
+            angle=0.0,
+            rect=BoundingRectangle.from_bounding_box(crop_bbox),
+            boundary_type=PdfPageBoundaryType.CROP_BOX,
+            art_bbox=crop_bbox,
+            bleed_bbox=crop_bbox,
+            crop_bbox=crop_bbox,
+            media_bbox=crop_bbox,
+            trim_bbox=crop_bbox,
+        )
+
+        # Create SegmentedPdfPage
+        return SegmentedPdfPage(
+            dimension=dimension,
+            textline_cells=text_cells,
+            char_cells=[],
+            word_cells=[],
+            has_lines=len(text_cells) > 0,
+            has_words=False,
+            has_chars=False,
+        )
+
+    def get_text_cells(self) -> Iterable[TextCell]:
+        return self._compute_text_cells()

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
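The cells pypdfium produces are post-processed by a merge_horizontal_cells helper before being returned; that helper's body is outside this hunk, so the sketch below is a generic illustration of that kind of horizontal merge (fuse cells that share a baseline and nearly touch), not docling's actual implementation:

from typing import List, Tuple

# A cell here is just (x0, y0, x1, y1, text) in top-left-origin coordinates.
Cell = Tuple[float, float, float, float, str]


def merge_horizontal_cells_sketch(cells: List[Cell], max_gap: float = 2.0) -> List[Cell]:
    """Generic illustration: join cells on the same line whose horizontal gap is small."""
    merged: List[Cell] = []
    for cell in sorted(cells, key=lambda c: (round(c[1], 1), c[0])):
        if merged:
            x0, y0, x1, y1, text = merged[-1]
            same_line = abs(cell[1] - y0) < 1.0 and abs(cell[3] - y1) < 1.0
            if same_line and 0 <= cell[0] - x1 <= max_gap:
                merged[-1] = (x0, y0, max(x1, cell[2]), y1, text + cell[4])
                continue
        merged.append(cell)
    return merged


print(merge_horizontal_cells_sketch([(10, 5, 30, 15, "Doc"), (31, 5, 55, 15, "ling")]))
# -> [(10, 5, 55, 15, 'Docling')]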
@@ -232,7 +232,6 @@ class Page(BaseModel):
    page_no: int
    # page_hash: Optional[str] = None
    size: Optional[Size] = None
-    cells: List[TextCell] = []
    parsed_page: Optional[SegmentedPdfPage] = None
    predictions: PagePredictions = PagePredictions()
    assembled: Optional[AssembledUnit] = None
@@ -245,6 +244,14 @@ class Page(BaseModel):
        float, Image
    ] = {}  # Cache of images in different scales. By default it is cleared during assembling.

+    @property
+    def cells(self) -> List[TextCell]:
+        """Return text cells as a read-only view of parsed_page.textline_cells."""
+        if self.parsed_page is not None:
+            return self.parsed_page.textline_cells
+        else:
+            return []
+
    def get_image(
        self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
    ) -> Optional[Image]:
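Because cells is now a property without a setter, any code that previously assigned page.cells = ... has to write through parsed_page instead, which is exactly what the OCR and layout hunks below do. Schematic before/after, assuming a populated page.parsed_page; apply_final_cells is a hypothetical helper, not part of docling:

# Before this commit (no longer possible, the property has no setter):
# page.cells = final_cells


def apply_final_cells(page, final_cells):
    """Schematic mirror of the OCR/layout write-back pattern in this diff."""
    assert page.parsed_page is not None
    page.parsed_page.textline_cells = final_cells
    page.parsed_page.has_lines = len(final_cells) > 0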
@@ -292,7 +292,9 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
        ),
    )

-    generate_parsed_pages: bool = False
+    generate_parsed_pages: Literal[True] = (
+        True  # Always True since parsed_page is now mandatory
+    )


class PdfPipeline(str, Enum):
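Typing generate_parsed_pages as Literal[True] turns the old boolean option into a constant that pydantic refuses to override, which is how the pipeline guarantees parsed_page always exists. A minimal sketch of the pattern on a standalone pydantic model (not the real PdfPipelineOptions):

from typing import Literal

from pydantic import BaseModel, ValidationError


class OptionsSketch(BaseModel):
    # Mirrors the pattern in the diff: the field only admits the value True.
    generate_parsed_pages: Literal[True] = True


OptionsSketch()                                 # fine, defaults to True
OptionsSketch(generate_parsed_pages=True)       # fine
try:
    OptionsSketch(generate_parsed_pages=False)  # rejected by literal validation
except ValidationError:
    print("rejected as expected")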
@@ -133,20 +133,19 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
    def post_process_cells(self, ocr_cells, page):
        r"""
        Post-process the OCR cells and update the page object.
-        Treats page.parsed_page as authoritative when available, with page.cells for compatibility.
+        Updates parsed_page.textline_cells directly since page.cells is now read-only.
        """
-        # Get existing cells (prefer parsed_page, fallback to page.cells)
-        existing_cells = self._get_existing_cells(page)
+        # Get existing cells from the read-only property
+        existing_cells = page.cells

        # Combine existing and OCR cells with overlap filtering
        final_cells = self._combine_cells(existing_cells, ocr_cells)

-        # Update both structures efficiently
-        self._update_page_structures(page, final_cells)
+        assert page.parsed_page is not None

-    def _get_existing_cells(self, page):
-        """Get existing cells, preferring parsed_page when available."""
-        return page.parsed_page.textline_cells if page.parsed_page else page.cells
+        # Update parsed_page.textline_cells directly
+        page.parsed_page.textline_cells = final_cells
+        page.parsed_page.has_lines = bool(final_cells)

    def _combine_cells(self, existing_cells, ocr_cells):
        """Combine existing and OCR cells with filtering and re-indexing."""
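post_process_cells now reads the existing programmatic cells through the read-only page.cells view, merges in the OCR cells, and writes the result straight back into parsed_page. A schematic version of that flow follows; the real overlap filtering lives in _combine_cells, which is unchanged and not shown in this hunk, so a naive stand-in is used here:

from typing import List


def combine_cells_stub(existing_cells: List, ocr_cells: List) -> List:
    """Stand-in for BaseOcrModel._combine_cells: naive concatenation for illustration."""
    return list(existing_cells) + list(ocr_cells)


def post_process_cells_sketch(ocr_cells: List, page) -> None:
    """Schematic mirror of the new BaseOcrModel.post_process_cells flow."""
    existing_cells = page.cells                          # read-only view over parsed_page
    final_cells = combine_cells_stub(existing_cells, ocr_cells)

    assert page.parsed_page is not None                  # parsed pages are now mandatory
    page.parsed_page.textline_cells = final_cells
    page.parsed_page.has_lines = bool(final_cells)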
@@ -162,18 +161,6 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):

        return combined

-    def _update_page_structures(self, page, final_cells):
-        """Update both page structures efficiently."""
-        if page.parsed_page:
-            # Update parsed_page as primary source
-            page.parsed_page.textline_cells = final_cells
-            page.parsed_page.has_lines = bool(final_cells)
-            # Sync to page.cells for compatibility
-            page.cells = final_cells
-        else:
-            # Legacy fallback: only page.cells available
-            page.cells = final_cells
-
    def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
        image = copy.deepcopy(page.image)
        scale_x = image.width / page.size.width
@@ -198,7 +198,6 @@ class LayoutModel(BasePageModel):
                    )
                )

-                # page.cells is already updated by LayoutPostprocessor
                page.predictions.layout = LayoutPrediction(
                    clusters=processed_clusters
                )
@@ -2,7 +2,7 @@ import re
import warnings
from collections.abc import Iterable
from pathlib import Path
-from typing import Optional
+from typing import Literal, Optional

import numpy as np
from PIL import ImageDraw
@@ -17,7 +17,6 @@ from docling.utils.profiling import TimeRecorder

class PagePreprocessingOptions(BaseModel):
    images_scale: Optional[float]
-    create_parsed_page: bool


class PagePreprocessingModel(BasePageModel):
@@ -66,10 +65,8 @@ class PagePreprocessingModel(BasePageModel):
    def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
        assert page._backend is not None

-        page.cells = list(page._backend.get_text_cells())
-
-        if self.options.create_parsed_page:
-            page.parsed_page = page._backend.get_segmented_page()
+        page.parsed_page = page._backend.get_segmented_page()
+        assert page.parsed_page is not None

        # Rate the text quality from the PDF parser, and aggregate on page
        text_scores = []
@@ -134,7 +134,7 @@ class RapidOcrModel(BaseOcrModel):
                        all_ocr_cells.extend(cells)

                    # Post-process the cells
-                    page.cells = self.post_process_cells(all_ocr_cells, page.cells)
+                    self.post_process_cells(all_ocr_cells, page.cells)

                # DEBUG code:
                if settings.debug.visualize_ocr:
@@ -72,7 +72,6 @@ class StandardPdfPipeline(PaginatedPipeline):
            PagePreprocessingModel(
                options=PagePreprocessingOptions(
                    images_scale=pipeline_options.images_scale,
-                    create_parsed_page=pipeline_options.generate_parsed_pages,
                )
            ),
            # OCR
@@ -196,8 +196,7 @@ class LayoutPostprocessor:

    def __init__(self, page, clusters: List[Cluster]):
        """Initialize processor with page and clusters."""
-        # Get cells from best available source (prefer parsed_page)
-        self.cells = self._get_page_cells(page)
+        self.cells = page.cells
        self.page = page
        self.page_size = page.size
        self.all_clusters = clusters
@@ -215,24 +214,6 @@ class LayoutPostprocessor:
            [c for c in self.special_clusters if c.label in self.WRAPPER_TYPES]
        )

-    def _get_page_cells(self, page):
-        """Get cells from best available source (prefer parsed_page)."""
-        return (
-            page.parsed_page.textline_cells
-            if page.parsed_page is not None
-            else page.cells
-        )
-
-    def _update_page_structures(self, final_cells):
-        """Update both page structures efficiently."""
-        if self.page.parsed_page is not None:
-            # Update parsed_page as primary source
-            self.page.parsed_page.textline_cells = final_cells
-            self.page.parsed_page.has_lines = len(final_cells) > 0
-
-        # Legacy fallback: only page.cells available
-        self.page.cells = final_cells
-
    def postprocess(self) -> Tuple[List[Cluster], List[TextCell]]:
        """Main processing pipeline."""
        self.regular_clusters = self._process_regular_clusters()
@@ -259,8 +240,9 @@ class LayoutPostprocessor:
            for child in cluster.children:
                child.cells = self._sort_cells(child.cells)

-        # Update page structures with processed cells
-        self._update_page_structures(self.cells)
+        assert self.page.parsed_page is not None
+        self.page.parsed_page.textline_cells = self.cells
+        self.page.parsed_page.has_lines = len(self.cells) > 0

        return final_clusters, self.cells
Changed ground-truth test data (file diffs suppressed because they are too large):

101474  tests/data/groundtruth/docling_v1/2203.01017v2.pages.json (vendored)
 89985  tests/data/groundtruth/docling_v1/2206.01062.pages.json (vendored)
 56232  tests/data/groundtruth/docling_v1/2305.03393v1.pages.json (vendored)
  9633  tests/data/groundtruth/docling_v1/multi_page.pages.json (vendored)
101474  tests/data/groundtruth/docling_v2/2203.01017v2.pages.json (vendored)
 89985  tests/data/groundtruth/docling_v2/2206.01062.pages.json (vendored)
 56232  tests/data/groundtruth/docling_v2/2305.03393v1.pages.json (vendored)
  9633  tests/data/groundtruth/docling_v2/multi_page.pages.json (vendored)

Several further ground-truth file diffs are likewise suppressed because they are too large; the smaller ground-truth updates are shown below.
@@ -44,10 +44,10 @@
      "prov": [
        {
          "bbox": [
-            69.6796630536824,
-            689.0124221922704,
-            504.8720051760782,
-            764.9216921155637
+            72.33333333333333,
+            691.5883585611979,
+            503.3333333333333,
+            763.9216918945312
          ],
          "page": 1,
          "span": [

File diff suppressed because it is too large.

@@ -1,4 +1,4 @@
<document>
-<paragraph><location><page_1><loc_74><loc_16><loc_88><loc_18></location>package</paragraph>
+<paragraph><location><page_1><loc_75><loc_16><loc_88><loc_18></location>package</paragraph>
<paragraph><location><page_1><loc_15><loc_9><loc_88><loc_15></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</paragraph>
</document>

@@ -44,10 +44,10 @@
      "prov": [
        {
          "bbox": [
-            441.2561096985719,
-            131.89488404865142,
-            522.0347860494834,
-            151.87873262042876
+            444.6666666666667,
+            131.58835856119788,
+            521.6666666666666,
+            150.25502522786462
          ],
          "page": 1,
          "span": [
@@ -67,10 +67,10 @@
      "prov": [
        {
          "bbox": [
-            89.23887497045128,
-            77.02339852098021,
-            523.208764293368,
-            124.75312428291147
+            92.0,
+            77.92169189453125,
+            523.0,
+            123.25502522786462
          ],
          "page": 1,
          "span": [

File diff suppressed because it is too large.

@@ -1,3 +1,3 @@
<document>
-<paragraph><location><page_1><loc_82><loc_74><loc_84><loc_88></location>package</paragraph>
+<paragraph><location><page_1><loc_82><loc_75><loc_84><loc_88></location>package</paragraph>
</document>

@@ -44,10 +44,10 @@
      "prov": [
        {
          "bbox": [
-            690.2441821046808,
-            442.39487414368364,
-            709.8255852011977,
-            523.076601235155
+            691.6666666666666,
+            444.53450520833337,
+            710.3333333333334,
+            521.5345052083334
          ],
          "page": 1,
          "span": [

File diff suppressed because it is too large.

@@ -1,3 +1,3 @@
<document>
-<paragraph><location><page_1><loc_16><loc_12><loc_18><loc_26></location>package</paragraph>
+<paragraph><location><page_1><loc_16><loc_12><loc_18><loc_25></location>package</paragraph>
</document>

@@ -44,10 +44,10 @@
      "prov": [
        {
          "bbox": [
-            131.21306574279092,
-            74.12495603322407,
-            152.19606490864376,
-            154.19400205373182
+            131.66666666666666,
+            73.53450520833337,
+            150.33333333333334,
+            150.53450520833331
          ],
          "page": 1,
          "span": [

File diff suppressed because it is too large.

@@ -1,2 +1,2 @@
-<doctag><text><loc_59><loc_46><loc_424><loc_91>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
+<doctag><text><loc_61><loc_46><loc_423><loc_89>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
</doctag>

@@ -42,10 +42,10 @@
    {
      "page_no": 1,
      "bbox": {
-        "l": 69.6796630536824,
-        "t": 764.9216921155637,
-        "r": 504.8720051760782,
-        "b": 689.0124221922704,
+        "l": 72.33333333333333,
+        "t": 763.9216918945312,
+        "r": 503.3333333333333,
+        "b": 691.5883585611979,
        "coord_origin": "BOTTOMLEFT"
      },
      "charspan": [

File diff suppressed because it is too large.

@@ -1,3 +1,3 @@
-<doctag><text><loc_371><loc_410><loc_439><loc_422>package</text>
-<text><loc_75><loc_426><loc_440><loc_454>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</text>
+<doctag><text><loc_374><loc_411><loc_438><loc_422>package</text>
+<text><loc_77><loc_427><loc_439><loc_454>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</text>
</doctag>

@@ -45,10 +45,10 @@
    {
      "page_no": 1,
      "bbox": {
-        "l": 441.2561096985719,
-        "t": 151.87873262042876,
-        "r": 522.0347860494834,
-        "b": 131.89488404865142,
+        "l": 444.6666666666667,
+        "t": 150.25502522786462,
+        "r": 521.6666666666666,
+        "b": 131.58835856119788,
        "coord_origin": "BOTTOMLEFT"
      },
      "charspan": [
@@ -74,10 +74,10 @@
    {
      "page_no": 1,
      "bbox": {
-        "l": 89.23887497045128,
-        "t": 124.75312428291147,
-        "r": 523.208764293368,
-        "b": 77.02339852098021,
+        "l": 92.0,
+        "t": 123.25502522786462,
+        "r": 523.0,
+        "b": 77.92169189453125,
        "coord_origin": "BOTTOMLEFT"
      },
      "charspan": [

File diff suppressed because it is too large.

@@ -1,3 +1,3 @@
-<doctag><page_header><loc_426><loc_60><loc_454><loc_424>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
-<text><loc_410><loc_61><loc_422><loc_128>package</text>
+<doctag><page_header><loc_427><loc_61><loc_454><loc_423>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
+<text><loc_411><loc_62><loc_422><loc_127>package</text>
</doctag>

@@ -45,10 +45,10 @@
    {
      "page_no": 1,
      "bbox": {
-        "l": 717.168585936602,
-        "t": 524.2990550512769,
-        "r": 764.8982839673505,
-        "b": 90.3291657283603,
+        "l": 718.6666666666666,
+        "t": 522.8678385416666,
+        "r": 764.0,
+        "b": 91.86783854166669,
        "coord_origin": "BOTTOMLEFT"
      },
      "charspan": [
@@ -74,10 +74,10 @@
    {
      "page_no": 1,
      "bbox": {
-        "l": 690.2441821046808,
-        "t": 523.076601235155,
-        "r": 709.8255852011977,
-        "b": 442.39487414368364,
+        "l": 691.6666666666666,
+        "t": 521.5345052083334,
+        "r": 710.3333333333334,
+        "b": 444.53450520833337,
        "coord_origin": "BOTTOMLEFT"
      },
      "charspan": [

File diff suppressed because it is too large.

@@ -1,3 +1,3 @@
-<doctag><page_header><loc_46><loc_75><loc_75><loc_440>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
-<text><loc_78><loc_370><loc_90><loc_438>package</text>
+<doctag><page_header><loc_46><loc_77><loc_73><loc_439>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
+<text><loc_78><loc_374><loc_89><loc_438>package</text>
</doctag>

@@ -45,10 +45,10 @@
    {
      "page_no": 1,
      "bbox": {
-        "l": 77.10171545548258,
-        "t": 506.0744964609271,
-        "r": 126.08064862014129,
-        "b": 71.87755635676046,
+        "l": 78.0,
+        "t": 503.201171875,
+        "r": 123.33333333333333,
+        "b": 72.201171875,
        "coord_origin": "BOTTOMLEFT"
      },
      "charspan": [
@@ -74,10 +74,10 @@
    {
      "page_no": 1,
      "bbox": {
-        "l": 131.21306574279092,
-        "t": 154.19400205373182,
-        "r": 152.19606490864376,
-        "b": 74.12495603322407,
+        "l": 131.66666666666666,
+        "t": 150.53450520833331,
+        "r": 150.33333333333334,
+        "b": 73.53450520833337,
        "coord_origin": "BOTTOMLEFT"
      },
      "charspan": [

File diff suppressed because it is too large.