From 4197a4e273250637e474804517c0cd76bf5ea56e Mon Sep 17 00:00:00 2001 From: Myles McNamara Date: Fri, 5 Dec 2025 17:03:47 -0500 Subject: [PATCH] fix: Clear word/char cells when force_full_page_ocr is used When force_full_page_ocr=True, the OCR model correctly replaces textline_cells with OCR-extracted text. However, word_cells and char_cells were not cleared, causing downstream components like TableStructureModel to use unreliable PDF-extracted text containing GLYPH artifacts (e.g., GLYPH). This fix clears word_cells and char_cells when force_full_page_ocr is enabled, ensuring TableStructureModel falls back to the OCR-extracted textline cells via its existing fallback logic. Fixes an issue where PDFs with problematic fonts (Type3, missing ToUnicode CMap) produced GLYPH artifacts in table content despite force_full_page_ocr being triggered. --- docling/models/base_ocr_model.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docling/models/base_ocr_model.py b/docling/models/base_ocr_model.py index 67ada340..75f36195 100644 --- a/docling/models/base_ocr_model.py +++ b/docling/models/base_ocr_model.py @@ -154,6 +154,15 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions): page.parsed_page.textline_cells = final_cells page.parsed_page.has_lines = len(final_cells) > 0 + # When force_full_page_ocr is used, word/char-level cells from PDF + # are also unreliable. Clear them so downstream components (e.g., table + # structure model) fall back to OCR-extracted textline cells. + if self.options.force_full_page_ocr: + page.parsed_page.word_cells = [] + page.parsed_page.char_cells = [] + page.parsed_page.has_words = False + page.parsed_page.has_chars = False + def _combine_cells( self, existing_cells: List[TextCell], ocr_cells: List[TextCell] ) -> List[TextCell]: