mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-11 22:28:31 +00:00
fix: Filter out PDF-extracted word/char cells when force_full_page_ocr is used
When force_full_page_ocr=True, the OCR model correctly replaces textline_cells with OCR-extracted text. However, word_cells and char_cells from the PDF backend were not handled, causing downstream components like TableStructureModel to use unreliable PDF-extracted text containing GLYPH artifacts.

Instead of clearing all word/char cells (which would be destructive for backends like mets_gbs that provide OCR-generated word cells), this fix filters out only cells where from_ocr=False, preserving any OCR-generated cells.

This ensures TableStructureModel falls back to the OCR-extracted textline cells via its existing fallback logic when word_cells is empty or only contains OCR cells.

Fixes an issue where PDFs with problematic fonts (Type3, missing ToUnicode CMap) produced GLYPH artifacts in table content despite force_full_page_ocr being triggered.
This commit is contained in:
@@ -154,14 +154,19 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
|
|||||||
page.parsed_page.textline_cells = final_cells
|
page.parsed_page.textline_cells = final_cells
|
||||||
page.parsed_page.has_lines = len(final_cells) > 0
|
page.parsed_page.has_lines = len(final_cells) > 0
|
||||||
|
|
||||||
# When force_full_page_ocr is used, word/char-level cells from PDF
|
# When force_full_page_ocr is used, PDF-extracted word/char cells are
|
||||||
# are also unreliable. Clear them so downstream components (e.g., table
|
# unreliable. Filter out cells where from_ocr=False, keeping any OCR-
|
||||||
|
# generated cells. This ensures downstream components (e.g., table
|
||||||
# structure model) fall back to OCR-extracted textline cells.
|
# structure model) fall back to OCR-extracted textline cells.
|
||||||
if self.options.force_full_page_ocr:
|
if self.options.force_full_page_ocr:
|
||||||
page.parsed_page.word_cells = []
|
page.parsed_page.word_cells = [
|
||||||
page.parsed_page.char_cells = []
|
c for c in page.parsed_page.word_cells if c.from_ocr
|
||||||
page.parsed_page.has_words = False
|
]
|
||||||
page.parsed_page.has_chars = False
|
page.parsed_page.char_cells = [
|
||||||
|
c for c in page.parsed_page.char_cells if c.from_ocr
|
||||||
|
]
|
||||||
|
page.parsed_page.has_words = len(page.parsed_page.word_cells) > 0
|
||||||
|
page.parsed_page.has_chars = len(page.parsed_page.char_cells) > 0
|
||||||
|
|
||||||
def _combine_cells(
|
def _combine_cells(
|
||||||
self, existing_cells: List[TextCell], ocr_cells: List[TextCell]
|
self, existing_cells: List[TextCell], ocr_cells: List[TextCell]
|
||||||
|
|||||||
Reference in New Issue
Block a user