use do_ flag in pipeline_options

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2024-10-13 16:54:46 +02:00
parent 7c8d7e222e
commit ddb509628e
4 changed files with 15 additions and 2 deletions

View File

@ -64,6 +64,8 @@ class PipelineOptions(BaseModel):
True # This default will be set to False in a future version of docling
)
do_dummy_picture_classifer: bool = False
class PdfPipelineOptions(PipelineOptions):
artifacts_path: Optional[Union[Path, str]] = None

View File

@ -10,12 +10,19 @@ from docling.models.base_model import BaseEnrichmentModel
class DummyPictureClassifierEnrichmentModel(BaseEnrichmentModel):
def __init__(self, enabled: bool):
self.enabled = enabled
def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
return isinstance(element, PictureItem)
return self.enabled and isinstance(element, PictureItem)
def __call__(
self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
) -> Iterable[Any]:
if not self.enabled:
return
for element in element_batch:
assert isinstance(element, PictureItem)
element.data.classification = PictureClassificationData(

View File

@ -84,7 +84,10 @@ class StandardPdfPipeline(PaginatedPipeline):
self.enrichment_pipe = [
# Other models working on `NodeItem` elements in the DoclingDocument
# DummyPictureClassifierEnrichmentModel()
# TODO Question: should we use the enabled flag, or simply not add the model to the list?
DummyPictureClassifierEnrichmentModel(
enabled=pipeline_options.do_dummy_picture_classifer
)
]
@staticmethod

View File

@ -58,6 +58,7 @@ def main():
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.do_dummy_picture_classifer = True
doc_converter = DocumentConverter(
format_options={