From edd4356aac25b62c30cae6d2e8c69095f63bd442 Mon Sep 17 00:00:00 2001 From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Date: Mon, 7 Jul 2025 16:23:16 +0200 Subject: [PATCH] fix: use only backend for picture classifier (#1904) use backend for picture classifier Signed-off-by: Michele Dolfi --- docling/models/document_picture_classifier.py | 25 +++++++++---------- docling/pipeline/standard_pdf_pipeline.py | 1 + tests/test_document_picture_classifier.py | 3 ++- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/docling/models/document_picture_classifier.py b/docling/models/document_picture_classifier.py index 73a30203..24e45078 100644 --- a/docling/models/document_picture_classifier.py +++ b/docling/models/document_picture_classifier.py @@ -14,7 +14,8 @@ from PIL import Image from pydantic import BaseModel from docling.datamodel.accelerator_options import AcceleratorOptions -from docling.models.base_model import BaseEnrichmentModel +from docling.datamodel.base_models import ItemAndImageEnrichmentElement +from docling.models.base_model import BaseItemAndImageEnrichmentModel from docling.models.utils.hf_model_download import download_hf_model from docling.utils.accelerator_utils import decide_device @@ -32,7 +33,7 @@ class DocumentPictureClassifierOptions(BaseModel): kind: Literal["document_picture_classifier"] = "document_picture_classifier" -class DocumentPictureClassifier(BaseEnrichmentModel): +class DocumentPictureClassifier(BaseItemAndImageEnrichmentModel): """ A model for classifying pictures in documents. @@ -135,7 +136,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel): def __call__( self, doc: DoclingDocument, - element_batch: Iterable[NodeItem], + element_batch: Iterable[ItemAndImageEnrichmentElement], ) -> Iterable[NodeItem]: """ Processes a batch of elements and enriches them with classification predictions. @@ -144,7 +145,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel): ---------- doc : DoclingDocument The document containing the elements to be processed. - element_batch : Iterable[NodeItem] + element_batch : Iterable[ItemAndImageEnrichmentElement] A batch of pictures to classify. Returns @@ -155,22 +156,20 @@ class DocumentPictureClassifier(BaseEnrichmentModel): """ if not self.enabled: for element in element_batch: - yield element + yield element.item return images: List[Union[Image.Image, np.ndarray]] = [] elements: List[PictureItem] = [] for el in element_batch: - assert isinstance(el, PictureItem) - elements.append(el) - img = el.get_image(doc) - assert img is not None - images.append(img) + assert isinstance(el.item, PictureItem) + elements.append(el.item) + images.append(el.image) outputs = self.document_picture_classifier.predict(images) - for element, output in zip(elements, outputs): - element.annotations.append( + for item, output in zip(elements, outputs): + item.annotations.append( PictureClassificationData( provenance="DocumentPictureClassifier", predicted_classes=[ @@ -183,4 +182,4 @@ class DocumentPictureClassifier(BaseEnrichmentModel): ) ) - yield element + yield item diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index 8861174a..de76ef24 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -129,6 +129,7 @@ class StandardPdfPipeline(PaginatedPipeline): if ( self.pipeline_options.do_formula_enrichment or self.pipeline_options.do_code_enrichment + or self.pipeline_options.do_picture_classification or self.pipeline_options.do_picture_description ): self.keep_backend = True diff --git a/tests/test_document_picture_classifier.py b/tests/test_document_picture_classifier.py index 5dc5e926..3a43a61a 100644 --- a/tests/test_document_picture_classifier.py +++ b/tests/test_document_picture_classifier.py @@ -17,8 +17,9 @@ def get_converter(): pipeline_options.do_table_structure = False pipeline_options.do_code_enrichment = False pipeline_options.do_formula_enrichment = False + pipeline_options.generate_picture_images = False + pipeline_options.generate_page_images = False pipeline_options.do_picture_classification = True - pipeline_options.generate_picture_images = True pipeline_options.images_scale = 2 converter = DocumentConverter(