Make page.parsed_page the only source of truth for text cells

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2025-06-10 19:55:49 +02:00
parent e310c5cff3
commit d73c9a2995
58 changed files with 349497 additions and 331004 deletions

View File

@ -7,7 +7,13 @@ from typing import List, Optional, Union
import pypdfium2 as pdfium import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin, Size from docling_core.types.doc import BoundingBox, CoordOrigin, Size
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell from docling_core.types.doc.page import (
BoundingRectangle,
PdfPageBoundaryType,
PdfPageGeometry,
SegmentedPdfPage,
TextCell,
)
from docling_parse.pdf_parsers import pdf_parser_v1 from docling_parse.pdf_parsers import pdf_parser_v1
from PIL import Image, ImageDraw from PIL import Image, ImageDraw
from pypdfium2 import PdfPage from pypdfium2 import PdfPage
@ -36,6 +42,51 @@ class DoclingParsePageBackend(PdfPageBackend):
def is_valid(self) -> bool: def is_valid(self) -> bool:
return self.valid return self.valid
def _compute_text_cells(self) -> List[TextCell]:
    """Build the page's text cells from the docling-parse output.

    For every parser cell the device box is normalised so that x0 <= x1 and
    y0 <= y1, rescaled from the parser's page dimensions to the size
    reported by ``get_size()``, and converted to a top-left origin.

    Returns:
        The text cells in parser order, indexed from 0. The list is empty
        when the page is not valid.
    """
    result: List[TextCell] = []
    if not self.valid:
        return result

    page_size = self.get_size()
    parser_width = self._dpage["width"]
    parser_height = self._dpage["height"]

    for index, raw_cell in enumerate(self._dpage["cells"]):
        left, bottom, right, top = raw_cell["box"]["device"]

        # The parser may report flipped boxes; order the corners first.
        if right < left:
            left, right = right, left
        if top < bottom:
            bottom, top = top, bottom

        content = raw_cell["content"]["rnormalized"]

        scaled_box = BoundingBox(
            l=left * page_size.width / parser_width,
            b=bottom * page_size.height / parser_height,
            r=right * page_size.width / parser_width,
            t=top * page_size.height / parser_height,
            coord_origin=CoordOrigin.BOTTOMLEFT,
        )
        rect = BoundingRectangle.from_bounding_box(scaled_box).to_top_left_origin(
            page_size.height
        )
        result.append(
            TextCell(
                index=index,
                text=content,
                orig=content,
                from_ocr=False,
                rect=rect,
            )
        )

    return result
def get_text_in_rect(self, bbox: BoundingBox) -> str: def get_text_in_rect(self, bbox: BoundingBox) -> str:
if not self.valid: if not self.valid:
return "" return ""
@ -70,75 +121,45 @@ class DoclingParsePageBackend(PdfPageBackend):
return text_piece return text_piece
def get_segmented_page(self) -> Optional[SegmentedPdfPage]: def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
return None
def get_text_cells(self) -> Iterable[TextCell]:
cells: List[TextCell] = []
cell_counter = 0
if not self.valid: if not self.valid:
return cells return None
page_size = self.get_size() page_size = self.get_size()
text_cells = self._compute_text_cells()
parser_width = self._dpage["width"] # Create page geometry
parser_height = self._dpage["height"] crop_bbox = BoundingBox(
l=0,
r=page_size.width,
t=0,
b=page_size.height,
coord_origin=CoordOrigin.TOPLEFT,
).to_bottom_left_origin(page_size.height)
for i in range(len(self._dpage["cells"])): dimension = PdfPageGeometry(
rect = self._dpage["cells"][i]["box"]["device"] angle=0.0,
x0, y0, x1, y1 = rect rect=BoundingRectangle.from_bounding_box(crop_bbox),
boundary_type=PdfPageBoundaryType.CROP_BOX,
art_bbox=crop_bbox,
bleed_bbox=crop_bbox,
crop_bbox=crop_bbox,
media_bbox=crop_bbox,
trim_bbox=crop_bbox,
)
if x1 < x0: # Create SegmentedPdfPage
x0, x1 = x1, x0 return SegmentedPdfPage(
if y1 < y0: dimension=dimension,
y0, y1 = y1, y0 textline_cells=text_cells,
char_cells=[],
word_cells=[],
has_lines=len(text_cells) > 0,
has_words=False,
has_chars=False,
)
text_piece = self._dpage["cells"][i]["content"]["rnormalized"] def get_text_cells(self) -> Iterable[TextCell]:
cells.append( return self._compute_text_cells()
TextCell(
index=cell_counter,
text=text_piece,
orig=text_piece,
from_ocr=False,
rect=BoundingRectangle.from_bounding_box(
BoundingBox(
# l=x0, b=y0, r=x1, t=y1,
l=x0 * page_size.width / parser_width,
b=y0 * page_size.height / parser_height,
r=x1 * page_size.width / parser_width,
t=y1 * page_size.height / parser_height,
coord_origin=CoordOrigin.BOTTOMLEFT,
)
).to_top_left_origin(page_size.height),
)
)
cell_counter += 1
def draw_clusters_and_cells():
image = (
self.get_page_image()
) # make new image to avoid drawing on the saved ones
draw = ImageDraw.Draw(image)
for c in cells:
x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
cell_color = (
random.randint(30, 140),
random.randint(30, 140),
random.randint(30, 140),
)
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
image.show()
# before merge:
# draw_clusters_and_cells()
# cells = merge_horizontal_cells(cells)
# after merge:
# draw_clusters_and_cells()
return cells
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]: def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
AREA_THRESHOLD = 0 # 32 * 32 AREA_THRESHOLD = 0 # 32 * 32

View File

@ -7,7 +7,13 @@ from typing import TYPE_CHECKING, List, Optional, Union
import pypdfium2 as pdfium import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell from docling_core.types.doc.page import (
BoundingRectangle,
PdfPageBoundaryType,
PdfPageGeometry,
SegmentedPdfPage,
TextCell,
)
from docling_parse.pdf_parsers import pdf_parser_v2 from docling_parse.pdf_parsers import pdf_parser_v2
from PIL import Image, ImageDraw from PIL import Image, ImageDraw
from pypdfium2 import PdfPage from pypdfium2 import PdfPage
@ -40,6 +46,55 @@ class DoclingParseV2PageBackend(PdfPageBackend):
def is_valid(self) -> bool: def is_valid(self) -> bool:
return self.valid return self.valid
def _compute_text_cells(self) -> List[TextCell]:
    """Build the page's text cells from the docling-parse v2 output.

    The sanitized cell table is stored as rows (``data``) plus a list of
    column names (``header``). For every row the box is normalised so that
    x0 <= x1 and y0 <= y1, rescaled from the parser's page dimensions to the
    size reported by ``get_size()``, and converted to a top-left origin.

    Returns:
        The text cells in parser order, indexed from 0. The list is empty
        when the page is not valid or the parser produced no cells.

    Raises:
        ValueError: If cells exist but the header lacks one of the columns
            ``x0``, ``y0``, ``x1``, ``y1`` or ``text``.
    """
    cells: List[TextCell] = []
    if not self.valid:
        return cells

    page_size = self.get_size()

    sanitized = self._dpage["sanitized"]
    parser_width = sanitized["dimension"]["width"]
    parser_height = sanitized["dimension"]["height"]

    cells_data = sanitized["cells"]["data"]
    cells_header = sanitized["cells"]["header"]

    # No rows: return before touching the header, so an empty page never
    # fails on a missing column name.
    if not cells_data:
        return cells

    # Column positions are the same for every row; resolve them once
    # instead of scanning the header five times per cell.
    x0_col = cells_header.index("x0")
    y0_col = cells_header.index("y0")
    x1_col = cells_header.index("x1")
    y1_col = cells_header.index("y1")
    text_col = cells_header.index("text")

    for cell_index, cell_data in enumerate(cells_data):
        x0 = cell_data[x0_col]
        y0 = cell_data[y0_col]
        x1 = cell_data[x1_col]
        y1 = cell_data[y1_col]

        # The parser may report flipped boxes; order the corners first.
        if x1 < x0:
            x0, x1 = x1, x0
        if y1 < y0:
            y0, y1 = y1, y0

        text_piece = cell_data[text_col]
        cells.append(
            TextCell(
                index=cell_index,
                text=text_piece,
                orig=text_piece,
                from_ocr=False,
                rect=BoundingRectangle.from_bounding_box(
                    BoundingBox(
                        l=x0 * page_size.width / parser_width,
                        b=y0 * page_size.height / parser_height,
                        r=x1 * page_size.width / parser_width,
                        t=y1 * page_size.height / parser_height,
                        coord_origin=CoordOrigin.BOTTOMLEFT,
                    )
                ).to_top_left_origin(page_size.height),
            )
        )

    return cells
def get_text_in_rect(self, bbox: BoundingBox) -> str: def get_text_in_rect(self, bbox: BoundingBox) -> str:
if not self.valid: if not self.valid:
return "" return ""
@ -81,73 +136,45 @@ class DoclingParseV2PageBackend(PdfPageBackend):
return text_piece return text_piece
def get_segmented_page(self) -> Optional[SegmentedPdfPage]: def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
return None
def get_text_cells(self) -> Iterable[TextCell]:
cells: List[TextCell] = []
cell_counter = 0
if not self.valid: if not self.valid:
return cells return None
page_size = self.get_size() page_size = self.get_size()
text_cells = self._compute_text_cells()
parser_width = self._dpage["sanitized"]["dimension"]["width"] # Create page geometry
parser_height = self._dpage["sanitized"]["dimension"]["height"] crop_bbox = BoundingBox(
l=0,
r=page_size.width,
t=0,
b=page_size.height,
coord_origin=CoordOrigin.TOPLEFT,
).to_bottom_left_origin(page_size.height)
cells_data = self._dpage["sanitized"]["cells"]["data"] dimension = PdfPageGeometry(
cells_header = self._dpage["sanitized"]["cells"]["header"] angle=0.0,
rect=BoundingRectangle.from_bounding_box(crop_bbox),
boundary_type=PdfPageBoundaryType.CROP_BOX,
art_bbox=crop_bbox,
bleed_bbox=crop_bbox,
crop_bbox=crop_bbox,
media_bbox=crop_bbox,
trim_bbox=crop_bbox,
)
for i, cell_data in enumerate(cells_data): # Create SegmentedPdfPage
x0 = cell_data[cells_header.index("x0")] return SegmentedPdfPage(
y0 = cell_data[cells_header.index("y0")] dimension=dimension,
x1 = cell_data[cells_header.index("x1")] textline_cells=text_cells,
y1 = cell_data[cells_header.index("y1")] char_cells=[],
word_cells=[],
has_lines=len(text_cells) > 0,
has_words=False,
has_chars=False,
)
if x1 < x0: def get_text_cells(self) -> Iterable[TextCell]:
x0, x1 = x1, x0 return self._compute_text_cells()
if y1 < y0:
y0, y1 = y1, y0
text_piece = cell_data[cells_header.index("text")]
cells.append(
TextCell(
index=cell_counter,
text=text_piece,
orig=text_piece,
from_ocr=False,
rect=BoundingRectangle.from_bounding_box(
BoundingBox(
# l=x0, b=y0, r=x1, t=y1,
l=x0 * page_size.width / parser_width,
b=y0 * page_size.height / parser_height,
r=x1 * page_size.width / parser_width,
t=y1 * page_size.height / parser_height,
coord_origin=CoordOrigin.BOTTOMLEFT,
)
).to_top_left_origin(page_size.height),
)
)
cell_counter += 1
def draw_clusters_and_cells():
image = (
self.get_page_image()
) # make new image to avoid drawing on the saved ones
draw = ImageDraw.Draw(image)
for c in cells:
x0, y0, x1, y1 = c.bbox.as_tuple()
cell_color = (
random.randint(30, 140),
random.randint(30, 140),
random.randint(30, 140),
)
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
image.show()
# draw_clusters_and_cells()
return cells
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]: def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
AREA_THRESHOLD = 0 # 32 * 32 AREA_THRESHOLD = 0 # 32 * 32

View File

@ -8,7 +8,13 @@ from typing import TYPE_CHECKING, List, Optional, Union
import pypdfium2 as pdfium import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c import pypdfium2.raw as pdfium_c
from docling_core.types.doc import BoundingBox, CoordOrigin, Size from docling_core.types.doc import BoundingBox, CoordOrigin, Size
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell from docling_core.types.doc.page import (
BoundingRectangle,
PdfPageBoundaryType,
PdfPageGeometry,
SegmentedPdfPage,
TextCell,
)
from PIL import Image, ImageDraw from PIL import Image, ImageDraw
from pypdfium2 import PdfTextPage from pypdfium2 import PdfTextPage
from pypdfium2._helpers.misc import PdfiumError from pypdfium2._helpers.misc import PdfiumError
@ -41,38 +47,8 @@ class PyPdfiumPageBackend(PdfPageBackend):
def is_valid(self) -> bool: def is_valid(self) -> bool:
return self.valid return self.valid
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]: def _compute_text_cells(self) -> List[TextCell]:
AREA_THRESHOLD = 0 # 32 * 32 """Compute text cells from pypdfium."""
page_size = self.get_size()
with pypdfium2_lock:
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
pos = obj.get_pos()
cropbox = BoundingBox.from_tuple(
pos, origin=CoordOrigin.BOTTOMLEFT
).to_top_left_origin(page_height=page_size.height)
if cropbox.area() > AREA_THRESHOLD:
cropbox = cropbox.scaled(scale=scale)
yield cropbox
def get_text_in_rect(self, bbox: BoundingBox) -> str:
with pypdfium2_lock:
if not self.text_page:
self.text_page = self._ppage.get_textpage()
if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
bbox = bbox.to_bottom_left_origin(self.get_size().height)
with pypdfium2_lock:
text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
return text_piece
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
return None
def get_text_cells(self) -> Iterable[TextCell]:
with pypdfium2_lock: with pypdfium2_lock:
if not self.text_page: if not self.text_page:
self.text_page = self._ppage.get_textpage() self.text_page = self._ppage.get_textpage()
@ -203,30 +179,76 @@ class PyPdfiumPageBackend(PdfPageBackend):
return merged_cells return merged_cells
def draw_clusters_and_cells(): return merge_horizontal_cells(cells)
image = (
self.get_page_image()
) # make new image to avoid drawing on the saved ones
draw = ImageDraw.Draw(image)
for c in cells:
x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
cell_color = (
random.randint(30, 140),
random.randint(30, 140),
random.randint(30, 140),
)
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
image.show()
# before merge: def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
# draw_clusters_and_cells() AREA_THRESHOLD = 0 # 32 * 32
page_size = self.get_size()
with pypdfium2_lock:
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
pos = obj.get_pos()
cropbox = BoundingBox.from_tuple(
pos, origin=CoordOrigin.BOTTOMLEFT
).to_top_left_origin(page_height=page_size.height)
cells = merge_horizontal_cells(cells) if cropbox.area() > AREA_THRESHOLD:
cropbox = cropbox.scaled(scale=scale)
# after merge: yield cropbox
# draw_clusters_and_cells()
return cells def get_text_in_rect(self, bbox: BoundingBox) -> str:
with pypdfium2_lock:
if not self.text_page:
self.text_page = self._ppage.get_textpage()
if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
bbox = bbox.to_bottom_left_origin(self.get_size().height)
with pypdfium2_lock:
text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
return text_piece
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
    """Wrap this page's text cells in a ``SegmentedPdfPage``.

    The computed cells are exposed as textline cells; char and word cells
    are left empty. All page boundary boxes are set to the same full-page
    rectangle derived from ``get_size()``.

    Returns:
        The segmented page, or ``None`` when the page is not valid.
    """
    if not self.valid:
        return None

    size = self.get_size()
    line_cells = self._compute_text_cells()

    # Full-page rectangle, declared with a top-left origin and then
    # converted to bottom-left.
    full_page_box = BoundingBox(
        l=0,
        r=size.width,
        t=0,
        b=size.height,
        coord_origin=CoordOrigin.TOPLEFT,
    ).to_bottom_left_origin(size.height)

    geometry = PdfPageGeometry(
        angle=0.0,
        rect=BoundingRectangle.from_bounding_box(full_page_box),
        boundary_type=PdfPageBoundaryType.CROP_BOX,
        art_bbox=full_page_box,
        bleed_bbox=full_page_box,
        crop_bbox=full_page_box,
        media_bbox=full_page_box,
        trim_bbox=full_page_box,
    )

    return SegmentedPdfPage(
        dimension=geometry,
        textline_cells=line_cells,
        char_cells=[],
        word_cells=[],
        has_lines=len(line_cells) > 0,
        has_words=False,
        has_chars=False,
    )
def get_text_cells(self) -> Iterable[TextCell]:
    """Return the text cells produced by ``_compute_text_cells``."""
    computed_cells = self._compute_text_cells()
    return computed_cells
def get_page_image( def get_page_image(
self, scale: float = 1, cropbox: Optional[BoundingBox] = None self, scale: float = 1, cropbox: Optional[BoundingBox] = None

View File

@ -232,7 +232,6 @@ class Page(BaseModel):
page_no: int page_no: int
# page_hash: Optional[str] = None # page_hash: Optional[str] = None
size: Optional[Size] = None size: Optional[Size] = None
cells: List[TextCell] = []
parsed_page: Optional[SegmentedPdfPage] = None parsed_page: Optional[SegmentedPdfPage] = None
predictions: PagePredictions = PagePredictions() predictions: PagePredictions = PagePredictions()
assembled: Optional[AssembledUnit] = None assembled: Optional[AssembledUnit] = None
@ -245,6 +244,14 @@ class Page(BaseModel):
float, Image float, Image
] = {} # Cache of images in different scales. By default it is cleared during assembling. ] = {} # Cache of images in different scales. By default it is cleared during assembling.
@property
def cells(self) -> List[TextCell]:
    """Text cells of the page, taken from ``parsed_page.textline_cells``.

    The property has no setter. Returns an empty list when no parsed page
    is set.
    """
    parsed = self.parsed_page
    if parsed is None:
        return []
    return parsed.textline_cells
def get_image( def get_image(
self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
) -> Optional[Image]: ) -> Optional[Image]:

View File

@ -292,7 +292,9 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
), ),
) )
generate_parsed_pages: bool = False generate_parsed_pages: Literal[True] = (
True # Always True since parsed_page is now mandatory
)
class PdfPipeline(str, Enum): class PdfPipeline(str, Enum):

View File

@ -133,20 +133,19 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
def post_process_cells(self, ocr_cells, page): def post_process_cells(self, ocr_cells, page):
r""" r"""
Post-process the OCR cells and update the page object. Post-process the OCR cells and update the page object.
Treats page.parsed_page as authoritative when available, with page.cells for compatibility. Updates parsed_page.textline_cells directly since page.cells is now read-only.
""" """
# Get existing cells (prefer parsed_page, fallback to page.cells) # Get existing cells from the read-only property
existing_cells = self._get_existing_cells(page) existing_cells = page.cells
# Combine existing and OCR cells with overlap filtering # Combine existing and OCR cells with overlap filtering
final_cells = self._combine_cells(existing_cells, ocr_cells) final_cells = self._combine_cells(existing_cells, ocr_cells)
# Update both structures efficiently assert page.parsed_page is not None
self._update_page_structures(page, final_cells)
def _get_existing_cells(self, page): # Update parsed_page.textline_cells directly
"""Get existing cells, preferring parsed_page when available.""" page.parsed_page.textline_cells = final_cells
return page.parsed_page.textline_cells if page.parsed_page else page.cells page.parsed_page.has_lines = bool(final_cells)
def _combine_cells(self, existing_cells, ocr_cells): def _combine_cells(self, existing_cells, ocr_cells):
"""Combine existing and OCR cells with filtering and re-indexing.""" """Combine existing and OCR cells with filtering and re-indexing."""
@ -162,18 +161,6 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
return combined return combined
def _update_page_structures(self, page, final_cells):
"""Update both page structures efficiently."""
if page.parsed_page:
# Update parsed_page as primary source
page.parsed_page.textline_cells = final_cells
page.parsed_page.has_lines = bool(final_cells)
# Sync to page.cells for compatibility
page.cells = final_cells
else:
# Legacy fallback: only page.cells available
page.cells = final_cells
def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False): def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
image = copy.deepcopy(page.image) image = copy.deepcopy(page.image)
scale_x = image.width / page.size.width scale_x = image.width / page.size.width

View File

@ -198,7 +198,6 @@ class LayoutModel(BasePageModel):
) )
) )
# page.cells is already updated by LayoutPostprocessor
page.predictions.layout = LayoutPrediction( page.predictions.layout = LayoutPrediction(
clusters=processed_clusters clusters=processed_clusters
) )

View File

@ -2,7 +2,7 @@ import re
import warnings import warnings
from collections.abc import Iterable from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Literal, Optional
import numpy as np import numpy as np
from PIL import ImageDraw from PIL import ImageDraw
@ -17,7 +17,6 @@ from docling.utils.profiling import TimeRecorder
class PagePreprocessingOptions(BaseModel): class PagePreprocessingOptions(BaseModel):
images_scale: Optional[float] images_scale: Optional[float]
create_parsed_page: bool
class PagePreprocessingModel(BasePageModel): class PagePreprocessingModel(BasePageModel):
@ -66,10 +65,8 @@ class PagePreprocessingModel(BasePageModel):
def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page: def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
assert page._backend is not None assert page._backend is not None
page.cells = list(page._backend.get_text_cells()) page.parsed_page = page._backend.get_segmented_page()
assert page.parsed_page is not None
if self.options.create_parsed_page:
page.parsed_page = page._backend.get_segmented_page()
# Rate the text quality from the PDF parser, and aggregate on page # Rate the text quality from the PDF parser, and aggregate on page
text_scores = [] text_scores = []

View File

@ -134,7 +134,7 @@ class RapidOcrModel(BaseOcrModel):
all_ocr_cells.extend(cells) all_ocr_cells.extend(cells)
# Post-process the cells # Post-process the cells
page.cells = self.post_process_cells(all_ocr_cells, page.cells) self.post_process_cells(all_ocr_cells, page.cells)
# DEBUG code: # DEBUG code:
if settings.debug.visualize_ocr: if settings.debug.visualize_ocr:

View File

@ -72,7 +72,6 @@ class StandardPdfPipeline(PaginatedPipeline):
PagePreprocessingModel( PagePreprocessingModel(
options=PagePreprocessingOptions( options=PagePreprocessingOptions(
images_scale=pipeline_options.images_scale, images_scale=pipeline_options.images_scale,
create_parsed_page=pipeline_options.generate_parsed_pages,
) )
), ),
# OCR # OCR

View File

@ -196,8 +196,7 @@ class LayoutPostprocessor:
def __init__(self, page, clusters: List[Cluster]): def __init__(self, page, clusters: List[Cluster]):
"""Initialize processor with page and clusters.""" """Initialize processor with page and clusters."""
# Get cells from best available source (prefer parsed_page) self.cells = page.cells
self.cells = self._get_page_cells(page)
self.page = page self.page = page
self.page_size = page.size self.page_size = page.size
self.all_clusters = clusters self.all_clusters = clusters
@ -215,24 +214,6 @@ class LayoutPostprocessor:
[c for c in self.special_clusters if c.label in self.WRAPPER_TYPES] [c for c in self.special_clusters if c.label in self.WRAPPER_TYPES]
) )
def _get_page_cells(self, page):
"""Get cells from best available source (prefer parsed_page)."""
return (
page.parsed_page.textline_cells
if page.parsed_page is not None
else page.cells
)
def _update_page_structures(self, final_cells):
"""Update both page structures efficiently."""
if self.page.parsed_page is not None:
# Update parsed_page as primary source
self.page.parsed_page.textline_cells = final_cells
self.page.parsed_page.has_lines = len(final_cells) > 0
# Legacy fallback: only page.cells available
self.page.cells = final_cells
def postprocess(self) -> Tuple[List[Cluster], List[TextCell]]: def postprocess(self) -> Tuple[List[Cluster], List[TextCell]]:
"""Main processing pipeline.""" """Main processing pipeline."""
self.regular_clusters = self._process_regular_clusters() self.regular_clusters = self._process_regular_clusters()
@ -259,8 +240,9 @@ class LayoutPostprocessor:
for child in cluster.children: for child in cluster.children:
child.cells = self._sort_cells(child.cells) child.cells = self._sort_cells(child.cells)
# Update page structures with processed cells assert self.page.parsed_page is not None
self._update_page_structures(self.cells) self.page.parsed_page.textline_cells = self.cells
self.page.parsed_page.has_lines = len(self.cells) > 0
return final_clusters, self.cells return final_clusters, self.cells

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -44,10 +44,10 @@
"prov": [ "prov": [
{ {
"bbox": [ "bbox": [
69.6796630536824, 72.33333333333333,
689.0124221922704, 691.5883585611979,
504.8720051760782, 503.3333333333333,
764.9216921155637 763.9216918945312
], ],
"page": 1, "page": 1,
"span": [ "span": [

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,4 @@
<document> <document>
<paragraph><location><page_1><loc_74><loc_16><loc_88><loc_18></location>package</paragraph> <paragraph><location><page_1><loc_75><loc_16><loc_88><loc_18></location>package</paragraph>
<paragraph><location><page_1><loc_15><loc_9><loc_88><loc_15></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</paragraph> <paragraph><location><page_1><loc_15><loc_9><loc_88><loc_15></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</paragraph>
</document> </document>

View File

@ -44,10 +44,10 @@
"prov": [ "prov": [
{ {
"bbox": [ "bbox": [
441.2561096985719, 444.6666666666667,
131.89488404865142, 131.58835856119788,
522.0347860494834, 521.6666666666666,
151.87873262042876 150.25502522786462
], ],
"page": 1, "page": 1,
"span": [ "span": [
@ -67,10 +67,10 @@
"prov": [ "prov": [
{ {
"bbox": [ "bbox": [
89.23887497045128, 92.0,
77.02339852098021, 77.92169189453125,
523.208764293368, 523.0,
124.75312428291147 123.25502522786462
], ],
"page": 1, "page": 1,
"span": [ "span": [

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,3 @@
<document> <document>
<paragraph><location><page_1><loc_82><loc_74><loc_84><loc_88></location>package</paragraph> <paragraph><location><page_1><loc_82><loc_75><loc_84><loc_88></location>package</paragraph>
</document> </document>

View File

@ -44,10 +44,10 @@
"prov": [ "prov": [
{ {
"bbox": [ "bbox": [
690.2441821046808, 691.6666666666666,
442.39487414368364, 444.53450520833337,
709.8255852011977, 710.3333333333334,
523.076601235155 521.5345052083334
], ],
"page": 1, "page": 1,
"span": [ "span": [

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,3 @@
<document> <document>
<paragraph><location><page_1><loc_16><loc_12><loc_18><loc_26></location>package</paragraph> <paragraph><location><page_1><loc_16><loc_12><loc_18><loc_25></location>package</paragraph>
</document> </document>

View File

@ -44,10 +44,10 @@
"prov": [ "prov": [
{ {
"bbox": [ "bbox": [
131.21306574279092, 131.66666666666666,
74.12495603322407, 73.53450520833337,
152.19606490864376, 150.33333333333334,
154.19400205373182 150.53450520833331
], ],
"page": 1, "page": 1,
"span": [ "span": [

File diff suppressed because it is too large Load Diff

View File

@ -1,2 +1,2 @@
<doctag><text><loc_59><loc_46><loc_424><loc_91>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text> <doctag><text><loc_61><loc_46><loc_423><loc_89>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
</doctag> </doctag>

View File

@ -42,10 +42,10 @@
{ {
"page_no": 1, "page_no": 1,
"bbox": { "bbox": {
"l": 69.6796630536824, "l": 72.33333333333333,
"t": 764.9216921155637, "t": 763.9216918945312,
"r": 504.8720051760782, "r": 503.3333333333333,
"b": 689.0124221922704, "b": 691.5883585611979,
"coord_origin": "BOTTOMLEFT" "coord_origin": "BOTTOMLEFT"
}, },
"charspan": [ "charspan": [

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,3 @@
<doctag><text><loc_371><loc_410><loc_439><loc_422>package</text> <doctag><text><loc_374><loc_411><loc_438><loc_422>package</text>
<text><loc_75><loc_426><loc_440><loc_454>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</text> <text><loc_77><loc_427><loc_439><loc_454>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</text>
</doctag> </doctag>

View File

@ -45,10 +45,10 @@
{ {
"page_no": 1, "page_no": 1,
"bbox": { "bbox": {
"l": 441.2561096985719, "l": 444.6666666666667,
"t": 151.87873262042876, "t": 150.25502522786462,
"r": 522.0347860494834, "r": 521.6666666666666,
"b": 131.89488404865142, "b": 131.58835856119788,
"coord_origin": "BOTTOMLEFT" "coord_origin": "BOTTOMLEFT"
}, },
"charspan": [ "charspan": [
@ -74,10 +74,10 @@
{ {
"page_no": 1, "page_no": 1,
"bbox": { "bbox": {
"l": 89.23887497045128, "l": 92.0,
"t": 124.75312428291147, "t": 123.25502522786462,
"r": 523.208764293368, "r": 523.0,
"b": 77.02339852098021, "b": 77.92169189453125,
"coord_origin": "BOTTOMLEFT" "coord_origin": "BOTTOMLEFT"
}, },
"charspan": [ "charspan": [

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,3 @@
<doctag><page_header><loc_426><loc_60><loc_454><loc_424>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header> <doctag><page_header><loc_427><loc_61><loc_454><loc_423>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
<text><loc_410><loc_61><loc_422><loc_128>package</text> <text><loc_411><loc_62><loc_422><loc_127>package</text>
</doctag> </doctag>

View File

@ -45,10 +45,10 @@
{ {
"page_no": 1, "page_no": 1,
"bbox": { "bbox": {
"l": 717.168585936602, "l": 718.6666666666666,
"t": 524.2990550512769, "t": 522.8678385416666,
"r": 764.8982839673505, "r": 764.0,
"b": 90.3291657283603, "b": 91.86783854166669,
"coord_origin": "BOTTOMLEFT" "coord_origin": "BOTTOMLEFT"
}, },
"charspan": [ "charspan": [
@ -74,10 +74,10 @@
{ {
"page_no": 1, "page_no": 1,
"bbox": { "bbox": {
"l": 690.2441821046808, "l": 691.6666666666666,
"t": 523.076601235155, "t": 521.5345052083334,
"r": 709.8255852011977, "r": 710.3333333333334,
"b": 442.39487414368364, "b": 444.53450520833337,
"coord_origin": "BOTTOMLEFT" "coord_origin": "BOTTOMLEFT"
}, },
"charspan": [ "charspan": [

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,3 @@
<doctag><page_header><loc_46><loc_75><loc_75><loc_440>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header> <doctag><page_header><loc_46><loc_77><loc_73><loc_439>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained</page_header>
<text><loc_78><loc_370><loc_90><loc_438>package</text> <text><loc_78><loc_374><loc_89><loc_438>package</text>
</doctag> </doctag>

View File

@ -45,10 +45,10 @@
{ {
"page_no": 1, "page_no": 1,
"bbox": { "bbox": {
"l": 77.10171545548258, "l": 78.0,
"t": 506.0744964609271, "t": 503.201171875,
"r": 126.08064862014129, "r": 123.33333333333333,
"b": 71.87755635676046, "b": 72.201171875,
"coord_origin": "BOTTOMLEFT" "coord_origin": "BOTTOMLEFT"
}, },
"charspan": [ "charspan": [
@ -74,10 +74,10 @@
{ {
"page_no": 1, "page_no": 1,
"bbox": { "bbox": {
"l": 131.21306574279092, "l": 131.66666666666666,
"t": 154.19400205373182, "t": 150.53450520833331,
"r": 152.19606490864376, "r": 150.33333333333334,
"b": 74.12495603322407, "b": 73.53450520833337,
"coord_origin": "BOTTOMLEFT" "coord_origin": "BOTTOMLEFT"
}, },
"charspan": [ "charspan": [

File diff suppressed because it is too large Load Diff