mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
Add comments
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
cefc34e8d8
commit
e613f7bc6c
@ -6,6 +6,7 @@ from abc import ABC, abstractmethod
|
||||
from typing import Callable, Iterable, List
|
||||
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||
from docling.datamodel.base_models import (
|
||||
ConversionStatus,
|
||||
DoclingComponentType,
|
||||
@ -46,7 +47,7 @@ class BaseModelPipeline(ABC):
|
||||
pass
|
||||
|
||||
|
||||
class PaginatedModelPipeline(BaseModelPipeline):
|
||||
class PaginatedModelPipeline(BaseModelPipeline): # TODO this is a bad name.
|
||||
|
||||
def apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||
for model in self.model_pipe:
|
||||
@ -59,6 +60,10 @@ class PaginatedModelPipeline(BaseModelPipeline):
|
||||
|
||||
_log.info(f"Processing document {in_doc.file.name}")
|
||||
|
||||
if not isinstance(in_doc._backend, PdfDocumentBackend):
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
return conv_res
|
||||
|
||||
for i in range(0, in_doc.page_count):
|
||||
conv_res.pages.append(Page(page_no=i))
|
||||
|
||||
@ -75,7 +80,7 @@ class PaginatedModelPipeline(BaseModelPipeline):
|
||||
# 2. Run pipeline stages
|
||||
pipeline_pages = self.apply_on_pages(init_pages)
|
||||
|
||||
for p in pipeline_pages:
|
||||
for p in pipeline_pages: # Must exhaust!
|
||||
pass
|
||||
|
||||
end_pb_time = time.time() - start_pb_time
|
||||
|
@ -16,6 +16,11 @@ _log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SimpleModelPipeline(BaseModelPipeline):
|
||||
"""SimpleModelPipeline.
|
||||
|
||||
This class is used at the moment for formats / backends
|
||||
which directly produce DoclingDocument output.
|
||||
"""
|
||||
|
||||
def __init__(self, pipeline_options: PdfPipelineOptions):
|
||||
super().__init__(pipeline_options)
|
||||
@ -33,6 +38,10 @@ class SimpleModelPipeline(BaseModelPipeline):
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
return conv_res
|
||||
|
||||
# Instead of running a page-level pipeline to build up the document structure,
|
||||
# the backend is expected to be of type DeclarativeDocumentBackend, which can output
|
||||
# a DoclingDocument directly.
|
||||
|
||||
conv_res.experimental = in_doc._backend.convert()
|
||||
|
||||
# Do other stuff with conv_res.experimental
|
||||
|
Loading…
Reference in New Issue
Block a user