Add comments

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-10-07 12:35:25 +02:00
parent cefc34e8d8
commit e613f7bc6c
2 changed files with 16 additions and 2 deletions

View File

@ -6,6 +6,7 @@ from abc import ABC, abstractmethod
from typing import Callable, Iterable, List
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import (
ConversionStatus,
DoclingComponentType,
@ -46,7 +47,7 @@ class BaseModelPipeline(ABC):
pass
class PaginatedModelPipeline(BaseModelPipeline):
class PaginatedModelPipeline(BaseModelPipeline): # TODO this is a bad name.
def apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
for model in self.model_pipe:
@ -59,6 +60,10 @@ class PaginatedModelPipeline(BaseModelPipeline):
_log.info(f"Processing document {in_doc.file.name}")
if not isinstance(in_doc._backend, PdfDocumentBackend):
conv_res.status = ConversionStatus.FAILURE
return conv_res
for i in range(0, in_doc.page_count):
conv_res.pages.append(Page(page_no=i))
@ -75,7 +80,7 @@ class PaginatedModelPipeline(BaseModelPipeline):
# 2. Run pipeline stages
pipeline_pages = self.apply_on_pages(init_pages)
for p in pipeline_pages:
for p in pipeline_pages: # Must exhaust!
pass
end_pb_time = time.time() - start_pb_time

View File

@ -16,6 +16,11 @@ _log = logging.getLogger(__name__)
class SimpleModelPipeline(BaseModelPipeline):
"""SimpleModelPipeline.
This class is currently used for formats / backends
which produce DoclingDocument output directly.
"""
def __init__(self, pipeline_options: PdfPipelineOptions):
super().__init__(pipeline_options)
@ -33,6 +38,10 @@ class SimpleModelPipeline(BaseModelPipeline):
conv_res.status = ConversionStatus.FAILURE
return conv_res
# Instead of running a page-level pipeline to build up the document structure,
# the backend is expected to be of type DeclarativeDocumentBackend, which can
# output a DoclingDocument directly.
conv_res.experimental = in_doc._backend.convert()
# Do other stuff with conv_res.experimental