mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 12:34:22 +00:00
use do_ flag in pipeline_options
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
7c8d7e222e
commit
ddb509628e
@ -64,6 +64,8 @@ class PipelineOptions(BaseModel):
|
|||||||
True # This default will be set to False in a future version of docling
|
True # This default will be set to False in a future version of docling
|
||||||
)
|
)
|
||||||
|
|
||||||
|
do_dummy_picture_classifer: bool = False
|
||||||
|
|
||||||
|
|
||||||
class PdfPipelineOptions(PipelineOptions):
|
class PdfPipelineOptions(PipelineOptions):
|
||||||
artifacts_path: Optional[Union[Path, str]] = None
|
artifacts_path: Optional[Union[Path, str]] = None
|
||||||
|
@ -10,12 +10,19 @@ from docling.models.base_model import BaseEnrichmentModel
|
|||||||
|
|
||||||
|
|
||||||
class DummyPictureClassifierEnrichmentModel(BaseEnrichmentModel):
|
class DummyPictureClassifierEnrichmentModel(BaseEnrichmentModel):
|
||||||
|
|
||||||
|
def __init__(self, enabled: bool):
|
||||||
|
self.enabled = enabled
|
||||||
|
|
||||||
def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
|
def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
|
||||||
return isinstance(element, PictureItem)
|
return self.enabled and isinstance(element, PictureItem)
|
||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
|
self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
|
||||||
) -> Iterable[Any]:
|
) -> Iterable[Any]:
|
||||||
|
if not self.enabled:
|
||||||
|
return
|
||||||
|
|
||||||
for element in element_batch:
|
for element in element_batch:
|
||||||
assert isinstance(element, PictureItem)
|
assert isinstance(element, PictureItem)
|
||||||
element.data.classification = PictureClassificationData(
|
element.data.classification = PictureClassificationData(
|
||||||
|
@ -84,7 +84,10 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
|
|
||||||
self.enrichment_pipe = [
|
self.enrichment_pipe = [
|
||||||
# Other models working on `NodeItem` elements in the DoclingDocument
|
# Other models working on `NodeItem` elements in the DoclingDocument
|
||||||
# DummyPictureClassifierEnrichmentModel()
|
# TODO Question: should we use the enabled flag or simply not add the model in the list?
|
||||||
|
DummyPictureClassifierEnrichmentModel(
|
||||||
|
enabled=pipeline_options.do_dummy_picture_classifer
|
||||||
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@ -58,6 +58,7 @@ def main():
|
|||||||
pipeline_options.do_ocr = False
|
pipeline_options.do_ocr = False
|
||||||
pipeline_options.do_table_structure = True
|
pipeline_options.do_table_structure = True
|
||||||
pipeline_options.table_structure_options.do_cell_matching = True
|
pipeline_options.table_structure_options.do_cell_matching = True
|
||||||
|
pipeline_options.do_dummy_picture_classifer = True
|
||||||
|
|
||||||
doc_converter = DocumentConverter(
|
doc_converter = DocumentConverter(
|
||||||
format_options={
|
format_options={
|
||||||
|
Loading…
Reference in New Issue
Block a user