diff --git a/docling/document_converter.py b/docling/document_converter.py index f354d58b..524ffaae 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -19,16 +19,16 @@ from docling.datamodel.document import ( ) from docling.datamodel.pipeline_options import PipelineOptions from docling.datamodel.settings import DocumentLimits, settings -from docling.pipeline.base_model_pipeline import AbstractModelPipeline -from docling.pipeline.simple_model_pipeline import SimpleModelPipeline -from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline +from docling.pipeline.base_pipeline import AbstractPipeline +from docling.pipeline.simple_pipeline import SimplePipeline +from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline from docling.utils.utils import chunkify _log = logging.getLogger(__name__) class FormatOption(BaseModel): - pipeline_cls: Type[AbstractModelPipeline] + pipeline_cls: Type[AbstractPipeline] pipeline_options: Optional[PipelineOptions] = None backend: Type[AbstractDocumentBackend] @@ -42,40 +42,40 @@ class FormatOption(BaseModel): class WordFormatOption(FormatOption): - pipeline_cls: Type = SimpleModelPipeline + pipeline_cls: Type = SimplePipeline backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend class PowerpointFormatOption(FormatOption): - pipeline_cls: Type = SimpleModelPipeline + pipeline_cls: Type = SimplePipeline backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend class HTMLFormatOption(FormatOption): - pipeline_cls: Type = SimpleModelPipeline + pipeline_cls: Type = SimplePipeline backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend class PdfFormatOption(FormatOption): - pipeline_cls: Type = StandardPdfModelPipeline + pipeline_cls: Type = StandardPdfPipeline backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend _format_to_default_options = { InputFormat.DOCX: FormatOption( - pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend + pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend ), InputFormat.PPTX: FormatOption( - pipeline_cls=SimpleModelPipeline, backend=MsPowerpointDocumentBackend + pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend ), InputFormat.HTML: FormatOption( - pipeline_cls=SimpleModelPipeline, backend=HTMLDocumentBackend + pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend ), InputFormat.IMAGE: FormatOption( - pipeline_cls=StandardPdfModelPipeline, backend=DoclingParseDocumentBackend + pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend ), InputFormat.PDF: FormatOption( - pipeline_cls=StandardPdfModelPipeline, backend=DoclingParseDocumentBackend + pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend ), } @@ -85,29 +85,27 @@ class DocumentConverter: def __init__( self, - formats: Optional[List[InputFormat]] = None, + allowed_formats: Optional[List[InputFormat]] = None, format_options: Optional[Dict[InputFormat, FormatOption]] = None, ): - self.formats = formats + self.allowed_formats = allowed_formats self.format_to_options = format_options - if self.formats is None: + if self.allowed_formats is None: if self.format_to_options is not None: - self.formats = self.format_to_options.keys() + self.allowed_formats = self.format_to_options.keys() else: - self.formats = [e for e in InputFormat] # all formats + self.allowed_formats = [e for e in InputFormat] # all formats if self.format_to_options is None: self.format_to_options = _format_to_default_options - for f in self.formats: + for f in self.allowed_formats: if f not in self.format_to_options.keys(): _log.info(f"Requested format {f} will use default options.") self.format_to_options[f] = _format_to_default_options[f] - self.initialized_pipelines: Dict[ - Type[AbstractModelPipeline], AbstractModelPipeline - ] = {} + self.initialized_pipelines: Dict[Type[AbstractPipeline], AbstractPipeline] = {} @validate_call(config=ConfigDict(strict=True)) def convert( @@ -173,7 +171,7 @@ class DocumentConverter: if item is not None: yield item - def _get_pipeline(self, doc: InputDocument) -> Optional[AbstractModelPipeline]: + def _get_pipeline(self, doc: InputDocument) -> Optional[AbstractPipeline]: fopt = self.format_to_options.get(doc.format) if fopt is None: @@ -194,7 +192,7 @@ class DocumentConverter: return self.initialized_pipelines[pipeline_class] def process_document(self, in_doc: InputDocument) -> ConversionResult: - if in_doc.format not in self.formats: + if in_doc.format not in self.allowed_formats: return None else: start_doc_time = time.time() diff --git a/docling/pipeline/base_model_pipeline.py b/docling/pipeline/base_model_pipeline.py deleted file mode 100644 index 7e3f9fea..00000000 --- a/docling/pipeline/base_model_pipeline.py +++ /dev/null @@ -1,167 +0,0 @@ -import functools -import logging -import time -import traceback -from abc import ABC, abstractmethod -from typing import Callable, Iterable, List - -from docling.backend.abstract_backend import AbstractDocumentBackend -from docling.backend.pdf_backend import PdfDocumentBackend -from docling.datamodel.base_models import ( - ConversionStatus, - DoclingComponentType, - ErrorItem, - Page, -) -from docling.datamodel.document import ConversionResult, InputDocument -from docling.datamodel.pipeline_options import PipelineOptions -from docling.datamodel.settings import settings -from docling.utils.utils import chunkify - -_log = logging.getLogger(__name__) - - -class AbstractModelPipeline(ABC): - def __init__(self, pipeline_options: PipelineOptions): - self.pipeline_options = pipeline_options - self.model_pipe: List[Callable] = [] - self.enrichment_pipe: List[Callable] = [] - - def execute(self, in_doc: InputDocument) -> ConversionResult: - conv_res = ConversionResult(input=in_doc) - - _log.info(f"Processing document {in_doc.file.name}") - - if not in_doc.valid: - conv_res.status = ConversionStatus.FAILURE - return conv_res - - # TODO: propagate option for raises_on_error? - try: - conv_res = self._build_document(in_doc, conv_res) - conv_res = self._assemble_document(in_doc, conv_res) - conv_res = self._enrich_document(in_doc, conv_res) - conv_res.status = self._determine_status(in_doc, conv_res) - except Exception as e: - conv_res.status = ConversionStatus.FAILURE - - return conv_res - - @abstractmethod - def _build_document( - self, in_doc: InputDocument, conv_res: ConversionResult - ) -> ConversionResult: - pass - - def _assemble_document( - self, in_doc: InputDocument, conv_res: ConversionResult - ) -> ConversionResult: - return conv_res - - def _enrich_document( - self, in_doc: InputDocument, conv_res: ConversionResult - ) -> ConversionResult: - return conv_res - - @abstractmethod - def _determine_status( - self, in_doc: InputDocument, conv_res: ConversionResult - ) -> ConversionStatus: - pass - - @classmethod - @abstractmethod - def get_default_options(cls) -> PipelineOptions: - pass - - @classmethod - @abstractmethod - def is_backend_supported(cls, backend: AbstractDocumentBackend): - pass - - # def _apply_on_elements(self, element_batch: Iterable[NodeItem]) -> Iterable[Any]: - # for model in self.model_pipe: - # element_batch = model(element_batch) - # - # yield from element_batch - - -class PaginatedModelPipeline(AbstractModelPipeline): # TODO this is a bad name. - - def _apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]: - for model in self.model_pipe: - page_batch = model(page_batch) - - yield from page_batch - - def _build_document( - self, in_doc: InputDocument, conv_res: ConversionResult - ) -> ConversionResult: - - if not isinstance(in_doc._backend, PdfDocumentBackend): - raise RuntimeError( - f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a PDF backend. " - f"Can not convert this with a PDF pipeline. " - f"Please check your format configuration on DocumentConverter." - ) - # conv_res.status = ConversionStatus.FAILURE - # return conv_res - - for i in range(0, in_doc.page_count): - conv_res.pages.append(Page(page_no=i)) - - try: - # Iterate batches of pages (page_batch_size) in the doc - for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size): - start_pb_time = time.time() - - # 1. Initialise the page resources - init_pages = map( - functools.partial(self.initialize_page, in_doc), page_batch - ) - - # 2. Run pipeline stages - pipeline_pages = self._apply_on_pages(init_pages) - - for p in pipeline_pages: # Must exhaust! - pass - - end_pb_time = time.time() - start_pb_time - _log.info(f"Finished converting page batch time={end_pb_time:.3f}") - - except Exception as e: - conv_res.status = ConversionStatus.FAILURE - trace = "\n".join(traceback.format_exception(e)) - _log.warning( - f"Encountered an error during conversion of document {in_doc.document_hash}:\n" - f"{trace}" - ) - # raise e # TODO Debug, should not be here. - finally: - # Always unload the PDF backend, even in case of failure - if in_doc._backend: - in_doc._backend.unload() - - return conv_res - - def _determine_status( - self, in_doc: InputDocument, conv_res: ConversionResult - ) -> ConversionStatus: - status = ConversionStatus.SUCCESS - for page in conv_res.pages: - if not page._backend.is_valid(): - conv_res.errors.append( - ErrorItem( - component_type=DoclingComponentType.DOCUMENT_BACKEND, - module_name=type(page._backend).__name__, - error_message=f"Page {page.page_no} failed to parse.", - ) - ) - status = ConversionStatus.PARTIAL_SUCCESS - - return status - - # Initialise and load resources for a page - @abstractmethod - def initialize_page(self, doc: InputDocument, page: Page) -> Page: - pass diff --git a/docling/pipeline/simple_model_pipeline.py b/docling/pipeline/simple_model_pipeline.py deleted file mode 100644 index ee5febab..00000000 --- a/docling/pipeline/simple_model_pipeline.py +++ /dev/null @@ -1,59 +0,0 @@ -import logging - -from docling.backend.abstract_backend import ( - AbstractDocumentBackend, - DeclarativeDocumentBackend, -) -from docling.datamodel.base_models import ConversionStatus -from docling.datamodel.document import ConversionResult, InputDocument -from docling.datamodel.pipeline_options import PipelineOptions -from docling.pipeline.base_model_pipeline import AbstractModelPipeline - -_log = logging.getLogger(__name__) - - -class SimpleModelPipeline(AbstractModelPipeline): - """SimpleModelPipeline. - - This class is used at the moment for formats / backends - which produce straight DoclingDocument output. - """ - - def __init__(self, pipeline_options: PipelineOptions): - super().__init__(pipeline_options) - - def _build_document( - self, in_doc: InputDocument, conv_res: ConversionResult - ) -> ConversionResult: - - if not isinstance(in_doc._backend, DeclarativeDocumentBackend): - raise RuntimeError( - f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a declarative backend. " - f"Can not convert this with simple pipeline. " - f"Please check your format configuration on DocumentConverter." - ) - # conv_res.status = ConversionStatus.FAILURE - # return conv_res - - # Instead of running a page-level pipeline to build up the document structure, - # the backend is expected to be of type DeclarativeDocumentBackend, which can output - # a DoclingDocument straight. - - conv_res.output = in_doc._backend.convert() - return conv_res - - def _determine_status( - self, in_doc: InputDocument, conv_res: ConversionResult - ) -> ConversionStatus: - # This is called only if the previous steps didn't raise. - # Since we don't have anything else to evaluate, we can - # safely return SUCCESS. - return ConversionStatus.SUCCESS - - @classmethod - def get_default_options(cls) -> PipelineOptions: - return PipelineOptions() - - @classmethod - def is_backend_supported(cls, backend: AbstractDocumentBackend): - return isinstance(backend, DeclarativeDocumentBackend) diff --git a/docling/pipeline/standard_pdf_model_pipeline.py b/docling/pipeline/standard_pdf_model_pipeline.py deleted file mode 100644 index 53558579..00000000 --- a/docling/pipeline/standard_pdf_model_pipeline.py +++ /dev/null @@ -1,157 +0,0 @@ -import logging -from pathlib import Path -from typing import Optional - -from docling.backend.abstract_backend import AbstractDocumentBackend -from docling.backend.pdf_backend import PdfDocumentBackend -from docling.datamodel.base_models import AssembledUnit, Page -from docling.datamodel.document import ConversionResult, InputDocument -from docling.datamodel.pipeline_options import ( - EasyOcrOptions, - PdfPipelineOptions, - TesseractCliOcrOptions, - TesseractOcrOptions, -) -from docling.models.base_ocr_model import BaseOcrModel -from docling.models.ds_glm_model import GlmModel, GlmOptions -from docling.models.easyocr_model import EasyOcrModel -from docling.models.layout_model import LayoutModel -from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions -from docling.models.page_preprocessing_model import ( - PagePreprocessingModel, - PagePreprocessingOptions, -) -from docling.models.table_structure_model import TableStructureModel -from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel -from docling.models.tesseract_ocr_model import TesseractOcrModel -from docling.pipeline.base_model_pipeline import PaginatedModelPipeline - -_log = logging.getLogger(__name__) - - -class StandardPdfModelPipeline(PaginatedModelPipeline): - _layout_model_path = "model_artifacts/layout/beehive_v0.0.5_pt" - _table_model_path = "model_artifacts/tableformer" - - def __init__(self, pipeline_options: PdfPipelineOptions): - super().__init__(pipeline_options) - self.pipeline_options: PdfPipelineOptions - - if not pipeline_options.artifacts_path: - artifacts_path = self.download_models_hf() - - self.artifacts_path = Path(artifacts_path) - self.glm_model = GlmModel( - options=GlmOptions( - create_legacy_output=pipeline_options.create_legacy_output - ) - ) - - if (ocr_model := self.get_ocr_model()) is None: - raise RuntimeError( - f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}." - ) - - self.model_pipe = [ - # Pre-processing - PagePreprocessingModel( - options=PagePreprocessingOptions( - images_scale=pipeline_options.images_scale - ) - ), - # OCR - ocr_model, - # Layout model - LayoutModel( - artifacts_path=artifacts_path - / StandardPdfModelPipeline._layout_model_path - ), - # Table structure model - TableStructureModel( - enabled=pipeline_options.do_table_structure, - artifacts_path=artifacts_path - / StandardPdfModelPipeline._table_model_path, - options=pipeline_options.table_structure_options, - ), - # Page assemble - PageAssembleModel( - options=PageAssembleOptions( - keep_images=pipeline_options.images_scale is not None - ) - ), - ] - - self.enrichment_pipe = [ - # Other models working on `NodeItem` elements in the DoclingDocument - ] - - @staticmethod - def download_models_hf( - local_dir: Optional[Path] = None, force: bool = False - ) -> Path: - from huggingface_hub import snapshot_download - - download_path = snapshot_download( - repo_id="ds4sd/docling-models", - force_download=force, - local_dir=local_dir, - revision="v2.0.1", - ) - - return Path(download_path) - - def get_ocr_model(self) -> Optional[BaseOcrModel]: - if isinstance(self.pipeline_options.ocr_options, EasyOcrOptions): - return EasyOcrModel( - enabled=self.pipeline_options.do_ocr, - options=self.pipeline_options.ocr_options, - ) - elif isinstance(self.pipeline_options.ocr_options, TesseractCliOcrOptions): - return TesseractOcrCliModel( - enabled=self.pipeline_options.do_ocr, - options=self.pipeline_options.ocr_options, - ) - elif isinstance(self.pipeline_options.ocr_options, TesseractOcrOptions): - return TesseractOcrModel( - enabled=self.pipeline_options.do_ocr, - options=self.pipeline_options.ocr_options, - ) - return None - - def initialize_page(self, doc: InputDocument, page: Page) -> Page: - page._backend = doc._backend.load_page(page.page_no) - page.size = page._backend.get_size() - - return page - - def _assemble_document( - self, in_doc: InputDocument, conv_res: ConversionResult - ) -> ConversionResult: - all_elements = [] - all_headers = [] - all_body = [] - - for p in conv_res.pages: - - for el in p.assembled.body: - all_body.append(el) - for el in p.assembled.headers: - all_headers.append(el) - for el in p.assembled.elements: - all_elements.append(el) - - conv_res.assembled = AssembledUnit( - elements=all_elements, headers=all_headers, body=all_body - ) - - conv_res.output, conv_res.legacy_output = self.glm_model(conv_res) - - return conv_res - - @classmethod - def get_default_options(cls) -> PdfPipelineOptions: - return PdfPipelineOptions() - - @classmethod - def is_backend_supported(cls, backend: AbstractDocumentBackend): - return isinstance(backend, PdfDocumentBackend) diff --git a/examples/custom_convert.py b/examples/custom_convert.py index 70d86520..75a578c7 100644 --- a/examples/custom_convert.py +++ b/examples/custom_convert.py @@ -12,7 +12,7 @@ from docling.datamodel.pipeline_options import ( TesseractOcrOptions, ) from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption -from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline +from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline _log = logging.getLogger(__name__) diff --git a/examples/run_with_formats.py b/examples/run_with_formats.py index f93db241..534daf35 100644 --- a/examples/run_with_formats.py +++ b/examples/run_with_formats.py @@ -9,8 +9,8 @@ from docling.document_converter import ( PdfFormatOption, WordFormatOption, ) -from docling.pipeline.simple_model_pipeline import SimpleModelPipeline -from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline +from docling.pipeline.simple_pipeline import SimplePipeline +from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline _log = logging.getLogger(__name__) @@ -30,7 +30,7 @@ input_paths = [ ## to customize use: doc_converter = DocumentConverter( # all of the below is optional, has internal defaults. - formats=[ + allowed_formats=[ InputFormat.PDF, # InputFormat.IMAGE, InputFormat.DOCX, @@ -39,10 +39,10 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal ], # whitelist formats, other files are ignored. format_options={ InputFormat.PDF: PdfFormatOption( - pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend + pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend ), # PdfFormatOption(backend=PyPdfiumDocumentBackend), InputFormat.DOCX: WordFormatOption( - pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend + pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend ), # InputFormat.IMAGE: PdfFormatOption(), }, @@ -51,9 +51,9 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal doc_converter = DocumentConverter( # all of the below is optional, has internal defaults. pdf=None, docx=WordFormatOption( - pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend + pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend ), - formats=[ + allowed_formats=[ InputFormat.PDF, # InputFormat.IMAGE, InputFormat.DOCX, @@ -62,10 +62,10 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal ], # whitelist formats, other files are ignored. format_options={ InputFormat.PDF: PdfFormatOption( - pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend + pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend ), # PdfFormatOption(backend=PyPdfiumDocumentBackend), InputFormat.DOCX: WordFormatOption( - pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend + pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend ), # InputFormat.IMAGE: PdfFormatOption(), },