From 9901729d8caa4ef50c6935843b8a16c6d460b2b9 Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Thu, 16 Jan 2025 14:23:59 +0100 Subject: [PATCH] Exposed "force_backend_text" as pipeline parameter Signed-off-by: Maksym Lysak --- docling/datamodel/pipeline_options.py | 4 ++++ docling/pipeline/vlm_pipeline.py | 27 +++++++++++++-------------- docs/examples/minimal_smol_docling.py | 3 +++ 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index d317e7d9..f08a1bff 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -295,6 +295,10 @@ class PdfPipelineOptions(PipelineOptions): do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code do_picture_classification: bool = False # True: classify pictures in documents do_picture_description: bool = False # True: run describe pictures in documents + force_backend_text: bool = ( + False # (To be used with vlms, or other generative models) + ) + # If True, text from backend will be used instead of generated text table_structure_options: TableStructureOptions = TableStructureOptions() ocr_options: Union[ diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py index 318d56ea..20015748 100644 --- a/docling/pipeline/vlm_pipeline.py +++ b/docling/pipeline/vlm_pipeline.py @@ -42,10 +42,9 @@ class VlmPipeline(PaginatedPipeline): super().__init__(pipeline_options) self.pipeline_options: PdfPipelineOptions - # TODO: Move "use_backend_text" to pipeline parameters! 
- # use_backend_text = False - use text that is coming from SmolDocling - # use_backend_text = True - get text from backend using bounding boxes predicted by SmolDoclingss - self.use_backend_text = False + # force_backend_text = False - use text that is coming from SmolDocling + # force_backend_text = True - get text from backend using bounding boxes predicted by SmolDocling + self.force_backend_text = pipeline_options.force_backend_text if pipeline_options.artifacts_path is None: self.artifacts_path = self.download_models_hf() @@ -324,7 +323,7 @@ class VlmPipeline(PaginatedPipeline): line = line.replace("", "") if line.startswith(""): prov_item = extract_bounding_box(line) - if self.use_backend_text: + if self.force_backend_text: content = extract_text_from_backend(page, prov_item) else: content = extract_text(line) @@ -345,7 +344,7 @@ class VlmPipeline(PaginatedPipeline): ) elif line.startswith(""): prov_item = extract_bounding_box(line) - if self.use_backend_text: + if self.force_backend_text: content = extract_text_from_backend(page, prov_item) else: content = extract_text(line) @@ -370,7 +369,7 @@ class VlmPipeline(PaginatedPipeline): elif line.startswith("<section-header>"): prov_item = extract_bounding_box(line) - if self.use_backend_text: + if self.force_backend_text: content = extract_text_from_backend(page, prov_item) else: content = extract_text(line) @@ -403,7 +402,7 @@ class VlmPipeline(PaginatedPipeline): elif line.startswith("<footnote>"): prov_item = extract_bounding_box(line) - if self.use_backend_text: + if self.force_backend_text: content = extract_text_from_backend(page, prov_item) else: content = extract_text(line) @@ -424,7 +423,7 @@ class VlmPipeline(PaginatedPipeline): elif line.startswith("<page-header>"): prov_item = extract_bounding_box(line) - if self.use_backend_text: + if self.force_backend_text: content = extract_text_from_backend(page, prov_item) else: content = extract_text(line) @@ -445,7 +444,7 @@ class 
VlmPipeline(PaginatedPipeline): elif line.startswith("<page-footer>"): prov_item = extract_bounding_box(line) - if self.use_backend_text: + if self.force_backend_text: content = extract_text_from_backend(page, prov_item) else: content = extract_text(line) @@ -496,7 +495,7 @@ class VlmPipeline(PaginatedPipeline): elif line.startswith("<list>"): prov_item_inst = None prov_item = extract_bounding_box(line) - if self.use_backend_text: + if self.force_backend_text: content = extract_text_from_backend(page, prov_item) else: content = extract_text(line) @@ -515,7 +514,7 @@ class VlmPipeline(PaginatedPipeline): elif line.startswith("<caption>"): prov_item_inst = None prov_item = extract_bounding_box(line) - if self.use_backend_text: + if self.force_backend_text: content = extract_text_from_backend(page, prov_item) else: content = extract_text(line) @@ -533,7 +532,7 @@ class VlmPipeline(PaginatedPipeline): elif line.startswith("<checkbox-unselected>"): prov_item_inst = None prov_item = extract_bounding_box(line) - if self.use_backend_text: + if self.force_backend_text: content = extract_text_from_backend(page, prov_item) else: content = extract_text(line) @@ -552,7 +551,7 @@ class VlmPipeline(PaginatedPipeline): elif line.startswith("<checkbox-selected>"): prov_item_inst = None prov_item = extract_bounding_box(line) - if self.use_backend_text: + if self.force_backend_text: content = extract_text_from_backend(page, prov_item) else: content = extract_text(line) diff --git a/docs/examples/minimal_smol_docling.py b/docs/examples/minimal_smol_docling.py index 97ae9a78..14e340b9 100644 --- a/docs/examples/minimal_smol_docling.py +++ b/docs/examples/minimal_smol_docling.py @@ -21,6 +21,9 @@ sources = [ pipeline_options = PdfPipelineOptions() pipeline_options.generate_page_images = True +pipeline_options.force_backend_text = ( + False # If True, text from backend will be used instead of generated text +) pipeline_options.artifacts_path = "model_artifacts" from 
docling_core.types.doc import DocItemLabel, ImageRefMode