Keep page.parsed_page.textline_cells and page.cells in sync, including OCR

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2025-06-10 15:05:56 +02:00
parent 6613b9e98b
commit e310c5cff3
7 changed files with 74 additions and 22 deletions

View File

@ -130,19 +130,49 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
] ]
return filtered_ocr_cells return filtered_ocr_cells
def post_process_cells(self, ocr_cells, programmatic_cells): def post_process_cells(self, ocr_cells, page):
r""" r"""
Post-process the ocr and programmatic cells and return the final list of of cells Post-process the OCR cells and update the page object.
Treats page.parsed_page as authoritative when available, with page.cells for compatibility.
""" """
if self.options.force_full_page_ocr: # Get existing cells (prefer parsed_page, fallback to page.cells)
# If a full page OCR is forced, use only the OCR cells existing_cells = self._get_existing_cells(page)
cells = ocr_cells
return cells
## Remove OCR cells which overlap with programmatic cells. # Combine existing and OCR cells with overlap filtering
filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, programmatic_cells) final_cells = self._combine_cells(existing_cells, ocr_cells)
programmatic_cells.extend(filtered_ocr_cells)
return programmatic_cells # Update both structures efficiently
self._update_page_structures(page, final_cells)
def _get_existing_cells(self, page):
"""Get existing cells, preferring parsed_page when available."""
return page.parsed_page.textline_cells if page.parsed_page else page.cells
def _combine_cells(self, existing_cells, ocr_cells):
"""Combine existing and OCR cells with filtering and re-indexing."""
if self.options.force_full_page_ocr:
combined = ocr_cells
else:
filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, existing_cells)
combined = list(existing_cells) + filtered_ocr_cells
# Re-index in-place
for i, cell in enumerate(combined):
cell.index = i
return combined
def _update_page_structures(self, page, final_cells):
"""Update both page structures efficiently."""
if page.parsed_page:
# Update parsed_page as primary source
page.parsed_page.textline_cells = final_cells
page.parsed_page.has_lines = bool(final_cells)
# Sync to page.cells for compatibility
page.cells = final_cells
else:
# Legacy fallback: only page.cells available
page.cells = final_cells
def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False): def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
image = copy.deepcopy(page.image) image = copy.deepcopy(page.image)

View File

@ -177,7 +177,7 @@ class EasyOcrModel(BaseOcrModel):
all_ocr_cells.extend(cells) all_ocr_cells.extend(cells)
# Post-process the cells # Post-process the cells
page.cells = self.post_process_cells(all_ocr_cells, page.cells) self.post_process_cells(all_ocr_cells, page)
# DEBUG code: # DEBUG code:
if settings.debug.visualize_ocr: if settings.debug.visualize_ocr:

View File

@ -176,9 +176,9 @@ class LayoutModel(BasePageModel):
# Apply postprocessing # Apply postprocessing
processed_clusters, processed_cells = LayoutPostprocessor( processed_clusters, processed_cells = LayoutPostprocessor(
page.cells, clusters, page.size page, clusters
).postprocess() ).postprocess()
# processed_clusters, processed_cells = clusters, page.cells # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
with warnings.catch_warnings(): with warnings.catch_warnings():
warnings.filterwarnings( warnings.filterwarnings(
@ -198,7 +198,7 @@ class LayoutModel(BasePageModel):
) )
) )
page.cells = processed_cells # page.cells is already updated by LayoutPostprocessor
page.predictions.layout = LayoutPrediction( page.predictions.layout = LayoutPrediction(
clusters=processed_clusters clusters=processed_clusters
) )

View File

@ -132,7 +132,7 @@ class OcrMacModel(BaseOcrModel):
all_ocr_cells.extend(cells) all_ocr_cells.extend(cells)
# Post-process the cells # Post-process the cells
page.cells = self.post_process_cells(all_ocr_cells, page.cells) self.post_process_cells(all_ocr_cells, page)
# DEBUG code: # DEBUG code:
if settings.debug.visualize_ocr: if settings.debug.visualize_ocr:

View File

@ -306,7 +306,7 @@ class TesseractOcrCliModel(BaseOcrModel):
all_ocr_cells.append(cell) all_ocr_cells.append(cell)
# Post-process the cells # Post-process the cells
page.cells = self.post_process_cells(all_ocr_cells, page.cells) self.post_process_cells(all_ocr_cells, page)
# DEBUG code: # DEBUG code:
if settings.debug.visualize_ocr: if settings.debug.visualize_ocr:

View File

@ -235,7 +235,7 @@ class TesseractOcrModel(BaseOcrModel):
all_ocr_cells.extend(cells) all_ocr_cells.extend(cells)
# Post-process the cells # Post-process the cells
page.cells = self.post_process_cells(all_ocr_cells, page.cells) self.post_process_cells(all_ocr_cells, page)
# DEBUG code: # DEBUG code:
if settings.debug.visualize_ocr: if settings.debug.visualize_ocr:

View File

@ -194,11 +194,12 @@ class LayoutPostprocessor:
DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER, DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
} }
def __init__(self, cells: List[TextCell], clusters: List[Cluster], page_size: Size): def __init__(self, page, clusters: List[Cluster]):
"""Initialize processor with cells and clusters.""" """Initialize processor with page and clusters."""
"""Initialize processor with cells and spatial indices.""" # Get cells from best available source (prefer parsed_page)
self.cells = cells self.cells = self._get_page_cells(page)
self.page_size = page_size self.page = page
self.page_size = page.size
self.all_clusters = clusters self.all_clusters = clusters
self.regular_clusters = [ self.regular_clusters = [
c for c in clusters if c.label not in self.SPECIAL_TYPES c for c in clusters if c.label not in self.SPECIAL_TYPES
@ -214,6 +215,24 @@ class LayoutPostprocessor:
[c for c in self.special_clusters if c.label in self.WRAPPER_TYPES] [c for c in self.special_clusters if c.label in self.WRAPPER_TYPES]
) )
def _get_page_cells(self, page):
"""Get cells from best available source (prefer parsed_page)."""
return (
page.parsed_page.textline_cells
if page.parsed_page is not None
else page.cells
)
def _update_page_structures(self, final_cells):
"""Update both page structures efficiently."""
if self.page.parsed_page is not None:
# Update parsed_page as primary source
self.page.parsed_page.textline_cells = final_cells
self.page.parsed_page.has_lines = len(final_cells) > 0
# Legacy fallback: only page.cells available
self.page.cells = final_cells
def postprocess(self) -> Tuple[List[Cluster], List[TextCell]]: def postprocess(self) -> Tuple[List[Cluster], List[TextCell]]:
"""Main processing pipeline.""" """Main processing pipeline."""
self.regular_clusters = self._process_regular_clusters() self.regular_clusters = self._process_regular_clusters()
@ -240,6 +259,9 @@ class LayoutPostprocessor:
for child in cluster.children: for child in cluster.children:
child.cells = self._sort_cells(child.cells) child.cells = self._sort_cells(child.cells)
# Update page structures with processed cells
self._update_page_structures(self.cells)
return final_clusters, self.cells return final_clusters, self.cells
def _process_regular_clusters(self) -> List[Cluster]: def _process_regular_clusters(self) -> List[Cluster]: