mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Add comments
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
cefc34e8d8
commit
e613f7bc6c
@ -6,6 +6,7 @@ from abc import ABC, abstractmethod
|
|||||||
from typing import Callable, Iterable, List
|
from typing import Callable, Iterable, List
|
||||||
|
|
||||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||||
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||||
from docling.datamodel.base_models import (
|
from docling.datamodel.base_models import (
|
||||||
ConversionStatus,
|
ConversionStatus,
|
||||||
DoclingComponentType,
|
DoclingComponentType,
|
||||||
@ -46,7 +47,7 @@ class BaseModelPipeline(ABC):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class PaginatedModelPipeline(BaseModelPipeline):
|
class PaginatedModelPipeline(BaseModelPipeline): # TODO this is a bad name.
|
||||||
|
|
||||||
def apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
def apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||||
for model in self.model_pipe:
|
for model in self.model_pipe:
|
||||||
@ -59,6 +60,10 @@ class PaginatedModelPipeline(BaseModelPipeline):
|
|||||||
|
|
||||||
_log.info(f"Processing document {in_doc.file.name}")
|
_log.info(f"Processing document {in_doc.file.name}")
|
||||||
|
|
||||||
|
if not isinstance(in_doc._backend, PdfDocumentBackend):
|
||||||
|
conv_res.status = ConversionStatus.FAILURE
|
||||||
|
return conv_res
|
||||||
|
|
||||||
for i in range(0, in_doc.page_count):
|
for i in range(0, in_doc.page_count):
|
||||||
conv_res.pages.append(Page(page_no=i))
|
conv_res.pages.append(Page(page_no=i))
|
||||||
|
|
||||||
@ -75,7 +80,7 @@ class PaginatedModelPipeline(BaseModelPipeline):
|
|||||||
# 2. Run pipeline stages
|
# 2. Run pipeline stages
|
||||||
pipeline_pages = self.apply_on_pages(init_pages)
|
pipeline_pages = self.apply_on_pages(init_pages)
|
||||||
|
|
||||||
for p in pipeline_pages:
|
for p in pipeline_pages: # Must exhaust!
|
||||||
pass
|
pass
|
||||||
|
|
||||||
end_pb_time = time.time() - start_pb_time
|
end_pb_time = time.time() - start_pb_time
|
||||||
|
@ -16,6 +16,11 @@ _log = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class SimpleModelPipeline(BaseModelPipeline):
|
class SimpleModelPipeline(BaseModelPipeline):
|
||||||
|
"""SimpleModelPipeline.
|
||||||
|
|
||||||
|
This class is used at the moment for formats / backends
|
||||||
|
which produce straight DoclingDocument output.
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self, pipeline_options: PdfPipelineOptions):
|
def __init__(self, pipeline_options: PdfPipelineOptions):
|
||||||
super().__init__(pipeline_options)
|
super().__init__(pipeline_options)
|
||||||
@ -33,6 +38,10 @@ class SimpleModelPipeline(BaseModelPipeline):
|
|||||||
conv_res.status = ConversionStatus.FAILURE
|
conv_res.status = ConversionStatus.FAILURE
|
||||||
return conv_res
|
return conv_res
|
||||||
|
|
||||||
|
# Instead of running a page-level pipeline to build up the document structure,
|
||||||
|
# the backend is expected to be of type DeclarativeDocumentBackend, which can output
|
||||||
|
# a DoclingDocument straight.
|
||||||
|
|
||||||
conv_res.experimental = in_doc._backend.convert()
|
conv_res.experimental = in_doc._backend.convert()
|
||||||
|
|
||||||
# Do other stuff with conv_res.experimental
|
# Do other stuff with conv_res.experimental
|
||||||
|
Loading…
Reference in New Issue
Block a user