From a4f4e3fc5cf9822d192dc2cc6248010593f7e761 Mon Sep 17 00:00:00 2001
From: Myles McNamara
Date: Fri, 5 Dec 2025 17:17:10 -0500
Subject: [PATCH] fix: Filter out PDF-extracted word/char cells when force_full_page_ocr is used

When force_full_page_ocr=True, the OCR model correctly replaces
textline_cells with OCR-extracted text. However, word_cells and
char_cells from the PDF backend were not handled, causing downstream
components like TableStructureModel to use unreliable PDF-extracted
text containing GLYPH artifacts.

Instead of clearing all word/char cells (which would be destructive for
backends like mets_gbs that provide OCR-generated word cells), this fix
filters out only cells where from_ocr=False, preserving any
OCR-generated cells.

This ensures TableStructureModel falls back to the OCR-extracted
textline cells via its existing fallback logic when word_cells is empty
or only contains OCR cells.

Fixes issue where PDFs with problematic fonts (Type3, missing ToUnicode
CMap) produced GLYPH artifacts in table content despite
force_full_page_ocr being triggered.
---
 docling/models/base_ocr_model.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/docling/models/base_ocr_model.py b/docling/models/base_ocr_model.py
index 75f36195..31f44ae0 100644
--- a/docling/models/base_ocr_model.py
+++ b/docling/models/base_ocr_model.py
@@ -154,14 +154,19 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
         page.parsed_page.textline_cells = final_cells
         page.parsed_page.has_lines = len(final_cells) > 0
 
-        # When force_full_page_ocr is used, word/char-level cells from PDF
-        # are also unreliable. Clear them so downstream components (e.g., table
+        # When force_full_page_ocr is used, PDF-extracted word/char cells are
+        # unreliable. Filter out cells where from_ocr=False, keeping any OCR-
+        # generated cells. This ensures downstream components (e.g., table
         # structure model) fall back to OCR-extracted textline cells.
         if self.options.force_full_page_ocr:
-            page.parsed_page.word_cells = []
-            page.parsed_page.char_cells = []
-            page.parsed_page.has_words = False
-            page.parsed_page.has_chars = False
+            page.parsed_page.word_cells = [
+                c for c in page.parsed_page.word_cells if c.from_ocr
+            ]
+            page.parsed_page.char_cells = [
+                c for c in page.parsed_page.char_cells if c.from_ocr
+            ]
+            page.parsed_page.has_words = len(page.parsed_page.word_cells) > 0
+            page.parsed_page.has_chars = len(page.parsed_page.char_cells) > 0
 
     def _combine_cells(
         self, existing_cells: List[TextCell], ocr_cells: List[TextCell]