docling/docowling/pipeline/simple_pipeline.py
2024-12-28 14:14:46 -03:00

57 lines
2.2 KiB
Python

import logging
from docowling.backend.abstract_backend import (
AbstractDocumentBackend,
DeclarativeDocumentBackend,
)
from docowling.datamodel.base_models import ConversionStatus
from docowling.datamodel.document import ConversionResult
from docowling.datamodel.pipeline_options import PipelineOptions
from docowling.pipeline.base_pipeline import BasePipeline
from docowling.utils.profiling import ProfilingScope, TimeRecorder
_log = logging.getLogger(__name__)
class SimplePipeline(BasePipeline):
    """Pipeline for backends that emit a DoclingDocument directly.

    Currently used for formats / backends whose output is already a
    complete DoclingDocument, so no page-level processing is required.
    """

    def __init__(self, pipeline_options: PipelineOptions):
        """Initialise the pipeline by forwarding the options to the base class."""
        super().__init__(pipeline_options)

    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
        """Populate ``conv_res.document`` from the input's backend.

        Args:
            conv_res: Conversion result whose ``input._backend`` performs
                the conversion.

        Returns:
            The same ``conv_res`` object, with ``document`` assigned.

        Raises:
            RuntimeError: If the input's backend is not a
                ``DeclarativeDocumentBackend``.
        """
        backend = conv_res.input._backend

        # Only declarative backends can hand back a finished document.
        if not isinstance(backend, DeclarativeDocumentBackend):
            backend_name = type(backend).__name__
            raise RuntimeError(
                f"The selected backend {backend_name} for {conv_res.input.file} "
                "is not a declarative backend. "
                "Can not convert this with simple pipeline. "
                "Please check your format configuration on DocumentConverter."
            )

        # No page-level pipeline is run here: the declarative backend is
        # asked for the whole DoclingDocument in a single call, wrapped in
        # a "doc_build" TimeRecorder with document scope.
        with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
            conv_res.document = backend.convert()

        return conv_res

    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
        """Return the final status for ``conv_res``; always ``SUCCESS``."""
        # Reached only when the earlier steps did not raise, and there is
        # nothing further to evaluate, so the conversion counts as a success.
        return ConversionStatus.SUCCESS

    @classmethod
    def get_default_options(cls) -> PipelineOptions:
        """Return a freshly constructed ``PipelineOptions`` with its defaults."""
        return PipelineOptions()

    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend) -> bool:
        """Return True when ``backend`` is a ``DeclarativeDocumentBackend``."""
        return isinstance(backend, DeclarativeDocumentBackend)