Exposed "force_backend_text" as pipeline parameter

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maksym Lysak 2025-01-16 14:23:59 +01:00
parent 0dc3ac43b1
commit 9901729d8c
3 changed files with 20 additions and 14 deletions

View File

@ -295,6 +295,10 @@ class PdfPipelineOptions(PipelineOptions):
do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code
do_picture_classification: bool = False # True: classify pictures in documents do_picture_classification: bool = False # True: classify pictures in documents
do_picture_description: bool = False # True: describe pictures in documents do_picture_description: bool = False # True: describe pictures in documents
force_backend_text: bool = (
False # (To be used with VLMs or other generative models)
)
# If True, text from backend will be used instead of generated text
table_structure_options: TableStructureOptions = TableStructureOptions() table_structure_options: TableStructureOptions = TableStructureOptions()
ocr_options: Union[ ocr_options: Union[

View File

@ -42,10 +42,9 @@ class VlmPipeline(PaginatedPipeline):
super().__init__(pipeline_options) super().__init__(pipeline_options)
self.pipeline_options: PdfPipelineOptions self.pipeline_options: PdfPipelineOptions
# TODO: Move "use_backend_text" to pipeline parameters! # force_backend_text = False - use text that is coming from SmolDocling
# use_backend_text = False - use text that is coming from SmolDocling # force_backend_text = True - get text from backend using bounding boxes predicted by SmolDocling
# use_backend_text = True - get text from backend using bounding boxes predicted by SmolDocling self.force_backend_text = pipeline_options.force_backend_text
self.use_backend_text = False
if pipeline_options.artifacts_path is None: if pipeline_options.artifacts_path is None:
self.artifacts_path = self.download_models_hf() self.artifacts_path = self.download_models_hf()
@ -324,7 +323,7 @@ class VlmPipeline(PaginatedPipeline):
line = line.replace("<doc_tag>", "") line = line.replace("<doc_tag>", "")
if line.startswith("<paragraph>"): if line.startswith("<paragraph>"):
prov_item = extract_bounding_box(line) prov_item = extract_bounding_box(line)
if self.use_backend_text: if self.force_backend_text:
content = extract_text_from_backend(page, prov_item) content = extract_text_from_backend(page, prov_item)
else: else:
content = extract_text(line) content = extract_text(line)
@ -345,7 +344,7 @@ class VlmPipeline(PaginatedPipeline):
) )
elif line.startswith("<title>"): elif line.startswith("<title>"):
prov_item = extract_bounding_box(line) prov_item = extract_bounding_box(line)
if self.use_backend_text: if self.force_backend_text:
content = extract_text_from_backend(page, prov_item) content = extract_text_from_backend(page, prov_item)
else: else:
content = extract_text(line) content = extract_text(line)
@ -370,7 +369,7 @@ class VlmPipeline(PaginatedPipeline):
elif line.startswith("<section-header>"): elif line.startswith("<section-header>"):
prov_item = extract_bounding_box(line) prov_item = extract_bounding_box(line)
if self.use_backend_text: if self.force_backend_text:
content = extract_text_from_backend(page, prov_item) content = extract_text_from_backend(page, prov_item)
else: else:
content = extract_text(line) content = extract_text(line)
@ -403,7 +402,7 @@ class VlmPipeline(PaginatedPipeline):
elif line.startswith("<footnote>"): elif line.startswith("<footnote>"):
prov_item = extract_bounding_box(line) prov_item = extract_bounding_box(line)
if self.use_backend_text: if self.force_backend_text:
content = extract_text_from_backend(page, prov_item) content = extract_text_from_backend(page, prov_item)
else: else:
content = extract_text(line) content = extract_text(line)
@ -424,7 +423,7 @@ class VlmPipeline(PaginatedPipeline):
elif line.startswith("<page-header>"): elif line.startswith("<page-header>"):
prov_item = extract_bounding_box(line) prov_item = extract_bounding_box(line)
if self.use_backend_text: if self.force_backend_text:
content = extract_text_from_backend(page, prov_item) content = extract_text_from_backend(page, prov_item)
else: else:
content = extract_text(line) content = extract_text(line)
@ -445,7 +444,7 @@ class VlmPipeline(PaginatedPipeline):
elif line.startswith("<page-footer>"): elif line.startswith("<page-footer>"):
prov_item = extract_bounding_box(line) prov_item = extract_bounding_box(line)
if self.use_backend_text: if self.force_backend_text:
content = extract_text_from_backend(page, prov_item) content = extract_text_from_backend(page, prov_item)
else: else:
content = extract_text(line) content = extract_text(line)
@ -496,7 +495,7 @@ class VlmPipeline(PaginatedPipeline):
elif line.startswith("<list>"): elif line.startswith("<list>"):
prov_item_inst = None prov_item_inst = None
prov_item = extract_bounding_box(line) prov_item = extract_bounding_box(line)
if self.use_backend_text: if self.force_backend_text:
content = extract_text_from_backend(page, prov_item) content = extract_text_from_backend(page, prov_item)
else: else:
content = extract_text(line) content = extract_text(line)
@ -515,7 +514,7 @@ class VlmPipeline(PaginatedPipeline):
elif line.startswith("<caption>"): elif line.startswith("<caption>"):
prov_item_inst = None prov_item_inst = None
prov_item = extract_bounding_box(line) prov_item = extract_bounding_box(line)
if self.use_backend_text: if self.force_backend_text:
content = extract_text_from_backend(page, prov_item) content = extract_text_from_backend(page, prov_item)
else: else:
content = extract_text(line) content = extract_text(line)
@ -533,7 +532,7 @@ class VlmPipeline(PaginatedPipeline):
elif line.startswith("<checkbox-unselected>"): elif line.startswith("<checkbox-unselected>"):
prov_item_inst = None prov_item_inst = None
prov_item = extract_bounding_box(line) prov_item = extract_bounding_box(line)
if self.use_backend_text: if self.force_backend_text:
content = extract_text_from_backend(page, prov_item) content = extract_text_from_backend(page, prov_item)
else: else:
content = extract_text(line) content = extract_text(line)
@ -552,7 +551,7 @@ class VlmPipeline(PaginatedPipeline):
elif line.startswith("<checkbox-selected>"): elif line.startswith("<checkbox-selected>"):
prov_item_inst = None prov_item_inst = None
prov_item = extract_bounding_box(line) prov_item = extract_bounding_box(line)
if self.use_backend_text: if self.force_backend_text:
content = extract_text_from_backend(page, prov_item) content = extract_text_from_backend(page, prov_item)
else: else:
content = extract_text(line) content = extract_text(line)

View File

@ -21,6 +21,9 @@ sources = [
pipeline_options = PdfPipelineOptions() pipeline_options = PdfPipelineOptions()
pipeline_options.generate_page_images = True pipeline_options.generate_page_images = True
pipeline_options.force_backend_text = (
False # If True, text from backend will be used instead of generated text
)
pipeline_options.artifacts_path = "model_artifacts" pipeline_options.artifacts_path = "model_artifacts"
from docling_core.types.doc import DocItemLabel, ImageRefMode from docling_core.types.doc import DocItemLabel, ImageRefMode