mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Exposed "force_backend_text" as pipeline parameter
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
0dc3ac43b1
commit
9901729d8c
@ -295,6 +295,10 @@ class PdfPipelineOptions(PipelineOptions):
|
|||||||
do_formula_enrichment: bool = False # True: perform formula OCR, return LaTeX code
|
do_formula_enrichment: bool = False # True: perform formula OCR, return LaTeX code
|
||||||
do_picture_classification: bool = False # True: classify pictures in documents
|
do_picture_classification: bool = False # True: classify pictures in documents
|
||||||
do_picture_description: bool = False # True: describe pictures in documents
|
do_picture_description: bool = False # True: describe pictures in documents
|
||||||
|
force_backend_text: bool = (
|
||||||
|
False # (To be used with vlms, or other generative models)
|
||||||
|
)
|
||||||
|
# If True, text from backend will be used instead of generated text
|
||||||
|
|
||||||
table_structure_options: TableStructureOptions = TableStructureOptions()
|
table_structure_options: TableStructureOptions = TableStructureOptions()
|
||||||
ocr_options: Union[
|
ocr_options: Union[
|
||||||
|
@ -42,10 +42,9 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
super().__init__(pipeline_options)
|
super().__init__(pipeline_options)
|
||||||
self.pipeline_options: PdfPipelineOptions
|
self.pipeline_options: PdfPipelineOptions
|
||||||
|
|
||||||
# TODO: Move "use_backend_text" to pipeline parameters!
|
# force_backend_text = False - use text that is coming from SmolDocling
|
||||||
# use_backend_text = False - use text that is coming from SmolDocling
|
# force_backend_text = True - get text from backend using bounding boxes predicted by SmolDocling
|
||||||
# use_backend_text = True - get text from backend using bounding boxes predicted by SmolDoclingss
|
self.force_backend_text = pipeline_options.force_backend_text
|
||||||
self.use_backend_text = False
|
|
||||||
|
|
||||||
if pipeline_options.artifacts_path is None:
|
if pipeline_options.artifacts_path is None:
|
||||||
self.artifacts_path = self.download_models_hf()
|
self.artifacts_path = self.download_models_hf()
|
||||||
@ -324,7 +323,7 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
line = line.replace("<doc_tag>", "")
|
line = line.replace("<doc_tag>", "")
|
||||||
if line.startswith("<paragraph>"):
|
if line.startswith("<paragraph>"):
|
||||||
prov_item = extract_bounding_box(line)
|
prov_item = extract_bounding_box(line)
|
||||||
if self.use_backend_text:
|
if self.force_backend_text:
|
||||||
content = extract_text_from_backend(page, prov_item)
|
content = extract_text_from_backend(page, prov_item)
|
||||||
else:
|
else:
|
||||||
content = extract_text(line)
|
content = extract_text(line)
|
||||||
@ -345,7 +344,7 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
)
|
)
|
||||||
elif line.startswith("<title>"):
|
elif line.startswith("<title>"):
|
||||||
prov_item = extract_bounding_box(line)
|
prov_item = extract_bounding_box(line)
|
||||||
if self.use_backend_text:
|
if self.force_backend_text:
|
||||||
content = extract_text_from_backend(page, prov_item)
|
content = extract_text_from_backend(page, prov_item)
|
||||||
else:
|
else:
|
||||||
content = extract_text(line)
|
content = extract_text(line)
|
||||||
@ -370,7 +369,7 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
|
|
||||||
elif line.startswith("<section-header>"):
|
elif line.startswith("<section-header>"):
|
||||||
prov_item = extract_bounding_box(line)
|
prov_item = extract_bounding_box(line)
|
||||||
if self.use_backend_text:
|
if self.force_backend_text:
|
||||||
content = extract_text_from_backend(page, prov_item)
|
content = extract_text_from_backend(page, prov_item)
|
||||||
else:
|
else:
|
||||||
content = extract_text(line)
|
content = extract_text(line)
|
||||||
@ -403,7 +402,7 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
|
|
||||||
elif line.startswith("<footnote>"):
|
elif line.startswith("<footnote>"):
|
||||||
prov_item = extract_bounding_box(line)
|
prov_item = extract_bounding_box(line)
|
||||||
if self.use_backend_text:
|
if self.force_backend_text:
|
||||||
content = extract_text_from_backend(page, prov_item)
|
content = extract_text_from_backend(page, prov_item)
|
||||||
else:
|
else:
|
||||||
content = extract_text(line)
|
content = extract_text(line)
|
||||||
@ -424,7 +423,7 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
|
|
||||||
elif line.startswith("<page-header>"):
|
elif line.startswith("<page-header>"):
|
||||||
prov_item = extract_bounding_box(line)
|
prov_item = extract_bounding_box(line)
|
||||||
if self.use_backend_text:
|
if self.force_backend_text:
|
||||||
content = extract_text_from_backend(page, prov_item)
|
content = extract_text_from_backend(page, prov_item)
|
||||||
else:
|
else:
|
||||||
content = extract_text(line)
|
content = extract_text(line)
|
||||||
@ -445,7 +444,7 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
|
|
||||||
elif line.startswith("<page-footer>"):
|
elif line.startswith("<page-footer>"):
|
||||||
prov_item = extract_bounding_box(line)
|
prov_item = extract_bounding_box(line)
|
||||||
if self.use_backend_text:
|
if self.force_backend_text:
|
||||||
content = extract_text_from_backend(page, prov_item)
|
content = extract_text_from_backend(page, prov_item)
|
||||||
else:
|
else:
|
||||||
content = extract_text(line)
|
content = extract_text(line)
|
||||||
@ -496,7 +495,7 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
elif line.startswith("<list>"):
|
elif line.startswith("<list>"):
|
||||||
prov_item_inst = None
|
prov_item_inst = None
|
||||||
prov_item = extract_bounding_box(line)
|
prov_item = extract_bounding_box(line)
|
||||||
if self.use_backend_text:
|
if self.force_backend_text:
|
||||||
content = extract_text_from_backend(page, prov_item)
|
content = extract_text_from_backend(page, prov_item)
|
||||||
else:
|
else:
|
||||||
content = extract_text(line)
|
content = extract_text(line)
|
||||||
@ -515,7 +514,7 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
elif line.startswith("<caption>"):
|
elif line.startswith("<caption>"):
|
||||||
prov_item_inst = None
|
prov_item_inst = None
|
||||||
prov_item = extract_bounding_box(line)
|
prov_item = extract_bounding_box(line)
|
||||||
if self.use_backend_text:
|
if self.force_backend_text:
|
||||||
content = extract_text_from_backend(page, prov_item)
|
content = extract_text_from_backend(page, prov_item)
|
||||||
else:
|
else:
|
||||||
content = extract_text(line)
|
content = extract_text(line)
|
||||||
@ -533,7 +532,7 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
elif line.startswith("<checkbox-unselected>"):
|
elif line.startswith("<checkbox-unselected>"):
|
||||||
prov_item_inst = None
|
prov_item_inst = None
|
||||||
prov_item = extract_bounding_box(line)
|
prov_item = extract_bounding_box(line)
|
||||||
if self.use_backend_text:
|
if self.force_backend_text:
|
||||||
content = extract_text_from_backend(page, prov_item)
|
content = extract_text_from_backend(page, prov_item)
|
||||||
else:
|
else:
|
||||||
content = extract_text(line)
|
content = extract_text(line)
|
||||||
@ -552,7 +551,7 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
elif line.startswith("<checkbox-selected>"):
|
elif line.startswith("<checkbox-selected>"):
|
||||||
prov_item_inst = None
|
prov_item_inst = None
|
||||||
prov_item = extract_bounding_box(line)
|
prov_item = extract_bounding_box(line)
|
||||||
if self.use_backend_text:
|
if self.force_backend_text:
|
||||||
content = extract_text_from_backend(page, prov_item)
|
content = extract_text_from_backend(page, prov_item)
|
||||||
else:
|
else:
|
||||||
content = extract_text(line)
|
content = extract_text(line)
|
||||||
|
@ -21,6 +21,9 @@ sources = [
|
|||||||
|
|
||||||
pipeline_options = PdfPipelineOptions()
|
pipeline_options = PdfPipelineOptions()
|
||||||
pipeline_options.generate_page_images = True
|
pipeline_options.generate_page_images = True
|
||||||
|
pipeline_options.force_backend_text = (
|
||||||
|
False # If True, text from backend will be used instead of generated text
|
||||||
|
)
|
||||||
pipeline_options.artifacts_path = "model_artifacts"
|
pipeline_options.artifacts_path = "model_artifacts"
|
||||||
|
|
||||||
from docling_core.types.doc import DocItemLabel, ImageRefMode
|
from docling_core.types.doc import DocItemLabel, ImageRefMode
|
||||||
|
Loading…
Reference in New Issue
Block a user