use do_ flag in pipeline_options

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2024-10-13 16:54:46 +02:00
parent 7c8d7e222e
commit ddb509628e
4 changed files with 15 additions and 2 deletions

View File

@ -64,6 +64,8 @@ class PipelineOptions(BaseModel):
True # This default will be set to False in a future version of docling True # This default will be set to False in a future version of docling
) )
do_dummy_picture_classifer: bool = False
class PdfPipelineOptions(PipelineOptions): class PdfPipelineOptions(PipelineOptions):
artifacts_path: Optional[Union[Path, str]] = None artifacts_path: Optional[Union[Path, str]] = None

View File

@ -10,12 +10,19 @@ from docling.models.base_model import BaseEnrichmentModel
class DummyPictureClassifierEnrichmentModel(BaseEnrichmentModel): class DummyPictureClassifierEnrichmentModel(BaseEnrichmentModel):
# Store the on/off switch for this enrichment model; is_processable and
# __call__ both read self.enabled before doing any work.
def __init__(self, enabled: bool):
self.enabled = enabled
def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool: def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
return isinstance(element, PictureItem) return self.enabled and isinstance(element, PictureItem)
def __call__( def __call__(
self, doc: DoclingDocument, element_batch: Iterable[NodeItem] self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
) -> Iterable[Any]: ) -> Iterable[Any]:
if not self.enabled:
return
for element in element_batch: for element in element_batch:
assert isinstance(element, PictureItem) assert isinstance(element, PictureItem)
element.data.classification = PictureClassificationData( element.data.classification = PictureClassificationData(

View File

@ -84,7 +84,10 @@ class StandardPdfPipeline(PaginatedPipeline):
self.enrichment_pipe = [ self.enrichment_pipe = [
# Other models working on `NodeItem` elements in the DoclingDocument # Other models working on `NodeItem` elements in the DoclingDocument
# DummyPictureClassifierEnrichmentModel() # TODO(question): should we use the enabled flag, or simply not add the model to the list?
DummyPictureClassifierEnrichmentModel(
enabled=pipeline_options.do_dummy_picture_classifer
)
] ]
@staticmethod @staticmethod

View File

@ -58,6 +58,7 @@ def main():
pipeline_options.do_ocr = False pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.do_dummy_picture_classifer = True
doc_converter = DocumentConverter( doc_converter = DocumentConverter(
format_options={ format_options={