diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index d57f1671..2b9d228c 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -22,6 +22,7 @@ class TableStructureOptions(BaseModel): class OcrOptions(BaseModel): kind: str + force_full_page_ocr: bool = False # If enabled a full page OCR is always applied bitmap_area_threshold: float = ( 0.05 # percentage of the area for a bitmap to processed with OCR ) diff --git a/docling/models/base_ocr_model.py b/docling/models/base_ocr_model.py index 9d26a317..d8b3262e 100644 --- a/docling/models/base_ocr_model.py +++ b/docling/models/base_ocr_model.py @@ -73,7 +73,9 @@ class BaseOcrModel(BasePageModel): coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects) # return full-page rectangle if sufficiently covered with bitmaps - if coverage > max(BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold): + if self.options.force_full_page_ocr or coverage > max( + BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold + ): return [ BoundingBox( l=0, diff --git a/docling/models/easyocr_model.py b/docling/models/easyocr_model.py index 1b8e914f..824242ca 100644 --- a/docling/models/easyocr_model.py +++ b/docling/models/easyocr_model.py @@ -5,7 +5,7 @@ import numpy import torch from docling_core.types.doc import BoundingBox, CoordOrigin -from docling.datamodel.base_models import OcrCell, Page +from docling.datamodel.base_models import Cell, OcrCell, Page from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import EasyOcrOptions from docling.datamodel.settings import settings @@ -88,12 +88,18 @@ class EasyOcrModel(BaseOcrModel): ] all_ocr_cells.extend(cells) - ## Remove OCR cells which overlap with programmatic cells. - filtered_ocr_cells = self.filter_ocr_cells( - all_ocr_cells, page.cells - ) - - page.cells.extend(filtered_ocr_cells) + if self.options.force_full_page_ocr: + # If a full page OCR is forced, use only the OCR cells + page.cells = [ + Cell(id=c_ocr.id, text=c_ocr.text, bbox=c_ocr.bbox) + for c_ocr in all_ocr_cells + ] + else: + ## Remove OCR cells which overlap with programmatic cells. + filtered_ocr_cells = self.filter_ocr_cells( + all_ocr_cells, page.cells + ) + page.cells.extend(filtered_ocr_cells) # DEBUG code: if settings.debug.visualize_ocr: diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py index 6f939351..daee0572 100644 --- a/docling/models/tesseract_ocr_cli_model.py +++ b/docling/models/tesseract_ocr_cli_model.py @@ -7,7 +7,7 @@ from typing import Iterable, Optional, Tuple import pandas as pd from docling_core.types.doc import BoundingBox, CoordOrigin -from docling.datamodel.base_models import OcrCell, Page +from docling.datamodel.base_models import Cell, OcrCell, Page from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import TesseractCliOcrOptions from docling.datamodel.settings import settings @@ -170,12 +170,18 @@ class TesseractOcrCliModel(BaseOcrModel): ) all_ocr_cells.append(cell) - ## Remove OCR cells which overlap with programmatic cells. - filtered_ocr_cells = self.filter_ocr_cells( - all_ocr_cells, page.cells - ) - - page.cells.extend(filtered_ocr_cells) + if self.options.force_full_page_ocr: + # If a full page OCR is forced, use only the OCR cells + page.cells = [ + Cell(id=c_ocr.id, text=c_ocr.text, bbox=c_ocr.bbox) + for c_ocr in all_ocr_cells + ] + else: + ## Remove OCR cells which overlap with programmatic cells. + filtered_ocr_cells = self.filter_ocr_cells( + all_ocr_cells, page.cells + ) + page.cells.extend(filtered_ocr_cells) # DEBUG code: if settings.debug.visualize_ocr: diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py index 83f23837..bb33327d 100644 --- a/docling/models/tesseract_ocr_model.py +++ b/docling/models/tesseract_ocr_model.py @@ -3,7 +3,7 @@ from typing import Iterable from docling_core.types.doc import BoundingBox, CoordOrigin -from docling.datamodel.base_models import OcrCell, Page +from docling.datamodel.base_models import Cell, OcrCell, Page from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import TesseractOcrOptions from docling.datamodel.settings import settings @@ -140,12 +140,18 @@ class TesseractOcrModel(BaseOcrModel): # del high_res_image all_ocr_cells.extend(cells) - ## Remove OCR cells which overlap with programmatic cells. - filtered_ocr_cells = self.filter_ocr_cells( - all_ocr_cells, page.cells - ) - - page.cells.extend(filtered_ocr_cells) + if self.options.force_full_page_ocr: + # If a full page OCR is forced, use only the OCR cells + page.cells = [ + Cell(id=c_ocr.id, text=c_ocr.text, bbox=c_ocr.bbox) + for c_ocr in all_ocr_cells + ] + else: + ## Remove OCR cells which overlap with programmatic cells. + filtered_ocr_cells = self.filter_ocr_cells( + all_ocr_cells, page.cells + ) + page.cells.extend(filtered_ocr_cells) # DEBUG code: if settings.debug.visualize_ocr: diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py index 2aeda467..324a4a14 100644 --- a/tests/test_e2e_ocr_conversion.py +++ b/tests/test_e2e_ocr_conversion.py @@ -15,34 +15,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2 -GENERATE = False - - -# Debug -def save_output(pdf_path: Path, doc_result: ConversionResult, engine: str): - r""" """ - import json - import os - - parent = pdf_path.parent - eng = "" if engine is None else f".{engine}" - - dict_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.json") - with open(dict_fn, "w") as fd: - json.dump(doc_result.legacy_document.export_to_dict(), fd) - - pages_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.pages.json") - pages = [p.model_dump() for p in doc_result.pages] - with open(pages_fn, "w") as fd: - json.dump(pages, fd) - - doctags_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.doctags.txt") - with open(doctags_fn, "w") as fd: - fd.write(doc_result.legacy_document.export_to_doctags()) - - md_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.md") - with open(md_fn, "w") as fd: - fd.write(doc_result.legacy_document.export_to_markdown()) +GENERATE_V1 = False +GENERATE_V2 = False def get_pdf_paths(): @@ -74,13 +48,15 @@ def get_converter(ocr_options: OcrOptions): def test_e2e_conversions(): - pdf_paths = get_pdf_paths() engines: List[OcrOptions] = [ EasyOcrOptions(), TesseractOcrOptions(), TesseractCliOcrOptions(), + EasyOcrOptions(force_full_page_ocr=True), + TesseractOcrOptions(force_full_page_ocr=True), + TesseractCliOcrOptions(force_full_page_ocr=True), ] for ocr_options in engines: @@ -91,20 +67,16 @@ def test_e2e_conversions(): doc_result: ConversionResult = converter.convert(pdf_path) - # Save conversions - # save_output(pdf_path, doc_result, None) - - # Debug verify_conversion_result_v1( input_path=pdf_path, doc_result=doc_result, - generate=GENERATE, + generate=GENERATE_V1, fuzzy=True, ) verify_conversion_result_v2( input_path=pdf_path, doc_result=doc_result, - generate=GENERATE, + generate=GENERATE_V2, fuzzy=True, ) diff --git a/tests/verify_utils.py b/tests/verify_utils.py index 20f5eef0..c444266b 100644 --- a/tests/verify_utils.py +++ b/tests/verify_utils.py @@ -256,15 +256,19 @@ def verify_conversion_result_v1( dt_path = gt_subpath.with_suffix(f"{engine_suffix}.doctags.txt") if generate: # only used when re-generating truth + pages_path.parent.mkdir(parents=True, exist_ok=True) with open(pages_path, "w") as fw: fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder)) + json_path.parent.mkdir(parents=True, exist_ok=True) with open(json_path, "w") as fw: fw.write(json.dumps(doc_pred, default=pydantic_encoder)) + md_path.parent.mkdir(parents=True, exist_ok=True) with open(md_path, "w") as fw: fw.write(doc_pred_md) + dt_path.parent.mkdir(parents=True, exist_ok=True) with open(dt_path, "w") as fw: fw.write(doc_pred_dt) else: # default branch in test @@ -328,15 +332,19 @@ def verify_conversion_result_v2( dt_path = gt_subpath.with_suffix(f"{engine_suffix}.doctags.txt") if generate: # only used when re-generating truth + pages_path.parent.mkdir(parents=True, exist_ok=True) with open(pages_path, "w") as fw: fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder)) + json_path.parent.mkdir(parents=True, exist_ok=True) with open(json_path, "w") as fw: fw.write(json.dumps(doc_pred, default=pydantic_encoder)) + md_path.parent.mkdir(parents=True, exist_ok=True) with open(md_path, "w") as fw: fw.write(doc_pred_md) + dt_path.parent.mkdir(parents=True, exist_ok=True) with open(dt_path, "w") as fw: fw.write(doc_pred_dt) else: # default branch in test