From e613f7bc6c98d3236b416dc55cb4c89717b9316d Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Mon, 7 Oct 2024 12:35:25 +0200 Subject: [PATCH] Add comments Signed-off-by: Christoph Auer --- docling/pipeline/base_model_pipeline.py | 9 +++++++-- docling/pipeline/simple_model_pipeline.py | 9 +++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/docling/pipeline/base_model_pipeline.py b/docling/pipeline/base_model_pipeline.py index b4898148..8b170db8 100644 --- a/docling/pipeline/base_model_pipeline.py +++ b/docling/pipeline/base_model_pipeline.py @@ -6,6 +6,7 @@ from abc import ABC, abstractmethod from typing import Callable, Iterable, List from docling.backend.abstract_backend import AbstractDocumentBackend +from docling.backend.pdf_backend import PdfDocumentBackend from docling.datamodel.base_models import ( ConversionStatus, DoclingComponentType, @@ -46,7 +47,7 @@ class BaseModelPipeline(ABC): pass -class PaginatedModelPipeline(BaseModelPipeline): +class PaginatedModelPipeline(BaseModelPipeline): # TODO this is a bad name. def apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]: for model in self.model_pipe: @@ -59,6 +60,10 @@ class PaginatedModelPipeline(BaseModelPipeline): _log.info(f"Processing document {in_doc.file.name}") + if not isinstance(in_doc._backend, PdfDocumentBackend): + conv_res.status = ConversionStatus.FAILURE + return conv_res + for i in range(0, in_doc.page_count): conv_res.pages.append(Page(page_no=i)) @@ -75,7 +80,7 @@ class PaginatedModelPipeline(BaseModelPipeline): # 2. Run pipeline stages pipeline_pages = self.apply_on_pages(init_pages) - for p in pipeline_pages: + for p in pipeline_pages: # Must exhaust! 
pass end_pb_time = time.time() - start_pb_time diff --git a/docling/pipeline/simple_model_pipeline.py b/docling/pipeline/simple_model_pipeline.py index efb7439b..70dfc5fd 100644 --- a/docling/pipeline/simple_model_pipeline.py +++ b/docling/pipeline/simple_model_pipeline.py @@ -16,6 +16,11 @@ _log = logging.getLogger(__name__) class SimpleModelPipeline(BaseModelPipeline): + """SimpleModelPipeline. + + This class is used at the moment for formats / backends + which produce DoclingDocument output directly. + """ def __init__(self, pipeline_options: PdfPipelineOptions): super().__init__(pipeline_options) @@ -33,6 +38,10 @@ class SimpleModelPipeline(BaseModelPipeline): conv_res.status = ConversionStatus.FAILURE return conv_res + # Instead of running a page-level pipeline to build up the document structure, + # the backend is expected to be of type DeclarativeDocumentBackend, which can output + # a DoclingDocument directly. + conv_res.experimental = in_doc._backend.convert() # Do other stuff with conv_res.experimental