rename and refactor *model*

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2024-10-11 16:57:40 +02:00
parent 6c9f869dc7
commit 98f1a4597e
6 changed files with 32 additions and 417 deletions

View File

@ -19,16 +19,16 @@ from docling.datamodel.document import (
) )
from docling.datamodel.pipeline_options import PipelineOptions from docling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.settings import DocumentLimits, settings from docling.datamodel.settings import DocumentLimits, settings
from docling.pipeline.base_model_pipeline import AbstractModelPipeline from docling.pipeline.base_pipeline import AbstractPipeline
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.utils.utils import chunkify from docling.utils.utils import chunkify
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
class FormatOption(BaseModel): class FormatOption(BaseModel):
pipeline_cls: Type[AbstractModelPipeline] pipeline_cls: Type[AbstractPipeline]
pipeline_options: Optional[PipelineOptions] = None pipeline_options: Optional[PipelineOptions] = None
backend: Type[AbstractDocumentBackend] backend: Type[AbstractDocumentBackend]
@ -42,40 +42,40 @@ class FormatOption(BaseModel):
class WordFormatOption(FormatOption): class WordFormatOption(FormatOption):
pipeline_cls: Type = SimpleModelPipeline pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend
class PowerpointFormatOption(FormatOption): class PowerpointFormatOption(FormatOption):
pipeline_cls: Type = SimpleModelPipeline pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
class HTMLFormatOption(FormatOption): class HTMLFormatOption(FormatOption):
pipeline_cls: Type = SimpleModelPipeline pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
class PdfFormatOption(FormatOption): class PdfFormatOption(FormatOption):
pipeline_cls: Type = StandardPdfModelPipeline pipeline_cls: Type = StandardPdfPipeline
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
_format_to_default_options = { _format_to_default_options = {
InputFormat.DOCX: FormatOption( InputFormat.DOCX: FormatOption(
pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
), ),
InputFormat.PPTX: FormatOption( InputFormat.PPTX: FormatOption(
pipeline_cls=SimpleModelPipeline, backend=MsPowerpointDocumentBackend pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
), ),
InputFormat.HTML: FormatOption( InputFormat.HTML: FormatOption(
pipeline_cls=SimpleModelPipeline, backend=HTMLDocumentBackend pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
), ),
InputFormat.IMAGE: FormatOption( InputFormat.IMAGE: FormatOption(
pipeline_cls=StandardPdfModelPipeline, backend=DoclingParseDocumentBackend pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
), ),
InputFormat.PDF: FormatOption( InputFormat.PDF: FormatOption(
pipeline_cls=StandardPdfModelPipeline, backend=DoclingParseDocumentBackend pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
), ),
} }
@ -85,29 +85,27 @@ class DocumentConverter:
def __init__( def __init__(
self, self,
formats: Optional[List[InputFormat]] = None, allowed_formats: Optional[List[InputFormat]] = None,
format_options: Optional[Dict[InputFormat, FormatOption]] = None, format_options: Optional[Dict[InputFormat, FormatOption]] = None,
): ):
self.formats = formats self.allowed_formats = allowed_formats
self.format_to_options = format_options self.format_to_options = format_options
if self.formats is None: if self.allowed_formats is None:
if self.format_to_options is not None: if self.format_to_options is not None:
self.formats = self.format_to_options.keys() self.allowed_formats = self.format_to_options.keys()
else: else:
self.formats = [e for e in InputFormat] # all formats self.allowed_formats = [e for e in InputFormat] # all formats
if self.format_to_options is None: if self.format_to_options is None:
self.format_to_options = _format_to_default_options self.format_to_options = _format_to_default_options
for f in self.formats: for f in self.allowed_formats:
if f not in self.format_to_options.keys(): if f not in self.format_to_options.keys():
_log.info(f"Requested format {f} will use default options.") _log.info(f"Requested format {f} will use default options.")
self.format_to_options[f] = _format_to_default_options[f] self.format_to_options[f] = _format_to_default_options[f]
self.initialized_pipelines: Dict[ self.initialized_pipelines: Dict[Type[AbstractPipeline], AbstractPipeline] = {}
Type[AbstractModelPipeline], AbstractModelPipeline
] = {}
@validate_call(config=ConfigDict(strict=True)) @validate_call(config=ConfigDict(strict=True))
def convert( def convert(
@ -173,7 +171,7 @@ class DocumentConverter:
if item is not None: if item is not None:
yield item yield item
def _get_pipeline(self, doc: InputDocument) -> Optional[AbstractModelPipeline]: def _get_pipeline(self, doc: InputDocument) -> Optional[AbstractPipeline]:
fopt = self.format_to_options.get(doc.format) fopt = self.format_to_options.get(doc.format)
if fopt is None: if fopt is None:
@ -194,7 +192,7 @@ class DocumentConverter:
return self.initialized_pipelines[pipeline_class] return self.initialized_pipelines[pipeline_class]
def process_document(self, in_doc: InputDocument) -> ConversionResult: def process_document(self, in_doc: InputDocument) -> ConversionResult:
if in_doc.format not in self.formats: if in_doc.format not in self.allowed_formats:
return None return None
else: else:
start_doc_time = time.time() start_doc_time = time.time()

View File

@ -1,167 +0,0 @@
import functools
import logging
import time
import traceback
from abc import ABC, abstractmethod
from typing import Callable, Iterable, List
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import (
ConversionStatus,
DoclingComponentType,
ErrorItem,
Page,
)
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.settings import settings
from docling.utils.utils import chunkify
_log = logging.getLogger(__name__)
class AbstractModelPipeline(ABC):
    """Base class for document conversion pipelines.

    ``execute`` runs a fixed sequence of hooks (build, assemble, enrich,
    determine status). Subclasses must implement ``_build_document``,
    ``_determine_status``, ``get_default_options`` and
    ``is_backend_supported``; the assemble and enrich hooks default to
    returning the result unchanged.
    """

    def __init__(self, pipeline_options: PipelineOptions):
        """Store the options and start with empty stage lists."""
        self.pipeline_options = pipeline_options
        # Both lists start empty; subclasses fill them with callables.
        self.model_pipe: List[Callable] = []
        self.enrichment_pipe: List[Callable] = []

    def execute(self, in_doc: InputDocument) -> ConversionResult:
        """Convert *in_doc* and return the resulting ``ConversionResult``.

        An invalid input document, or any exception raised by one of the
        hooks, yields a result whose status is ``ConversionStatus.FAILURE``.
        Exceptions are not propagated to the caller; they are logged with
        their traceback instead.
        """
        conv_res = ConversionResult(input=in_doc)

        _log.info(f"Processing document {in_doc.file.name}")

        if not in_doc.valid:
            conv_res.status = ConversionStatus.FAILURE
            return conv_res

        # TODO: propagate option for raises_on_error?
        try:
            conv_res = self._build_document(in_doc, conv_res)
            conv_res = self._assemble_document(in_doc, conv_res)
            conv_res = self._enrich_document(in_doc, conv_res)
            conv_res.status = self._determine_status(in_doc, conv_res)
        except Exception:
            conv_res.status = ConversionStatus.FAILURE
            # Keep the cause visible: the caller only receives a status flag.
            _log.exception(f"Conversion of document {in_doc.file.name} failed")

        return conv_res

    @abstractmethod
    def _build_document(
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionResult:
        """First hook called by ``execute``; must return the result object."""

    def _assemble_document(
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionResult:
        """Second hook called by ``execute``; returns *conv_res* unchanged."""
        return conv_res

    def _enrich_document(
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionResult:
        """Third hook called by ``execute``; returns *conv_res* unchanged."""
        return conv_res

    @abstractmethod
    def _determine_status(
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionStatus:
        """Return the status that ``execute`` stores on the result."""

    @classmethod
    @abstractmethod
    def get_default_options(cls) -> PipelineOptions:
        """Return the default options object for this pipeline class."""

    @classmethod
    @abstractmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend):
        """Return whether this pipeline class can work with *backend*."""
# def _apply_on_elements(self, element_batch: Iterable[NodeItem]) -> Iterable[Any]:
# for model in self.model_pipe:
# element_batch = model(element_batch)
#
# yield from element_batch
class PaginatedModelPipeline(AbstractModelPipeline):  # TODO this is a bad name.
    """Pipeline that builds a document by pushing batches of pages through
    the callables in ``model_pipe``."""

    def _apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        """Chain every entry of ``model_pipe`` over the batch, then yield
        whatever the last entry produces."""
        pages = page_batch
        for stage in self.model_pipe:
            pages = stage(pages)

        yield from pages

    def _build_document(
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionResult:
        """Create one ``Page`` per page of *in_doc* and run the page pipeline.

        Raises ``RuntimeError`` when the document's backend is not a
        ``PdfDocumentBackend``. Errors raised while processing pages are
        logged and mark the result as failed instead of propagating. The
        backend is unloaded afterwards in every case.
        """
        if not isinstance(in_doc._backend, PdfDocumentBackend):
            raise RuntimeError(
                f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a PDF backend. "
                f"Can not convert this with a PDF pipeline. "
                f"Please check your format configuration on DocumentConverter."
            )

        conv_res.pages.extend(
            Page(page_no=page_no) for page_no in range(in_doc.page_count)
        )

        try:
            # Walk the document in chunks of settings.perf.page_batch_size pages.
            for batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
                batch_started = time.time()

                # 1. Lazily attach the page resources
                initialized = (self.initialize_page(in_doc, page) for page in batch)

                # 2. Lazily run the pipeline stages
                processed = self._apply_on_pages(initialized)

                # Everything above is lazy: draining the iterator is what
                # actually performs the work.
                for _ in processed:
                    pass

                elapsed = time.time() - batch_started
                _log.info(f"Finished converting page batch time={elapsed:.3f}")

        except Exception as e:
            conv_res.status = ConversionStatus.FAILURE
            trace = "\n".join(traceback.format_exception(e))
            _log.warning(
                f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
                f"{trace}"
            )
            # raise e  # TODO Debug, should not be here.
        finally:
            # The PDF backend is released whether or not the conversion worked.
            if in_doc._backend:
                in_doc._backend.unload()

        return conv_res

    def _determine_status(
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionStatus:
        """Return ``SUCCESS`` when every page backend is valid, otherwise
        ``PARTIAL_SUCCESS`` with one ``ErrorItem`` recorded per invalid page."""
        outcome = ConversionStatus.SUCCESS
        for page in conv_res.pages:
            if page._backend.is_valid():
                continue
            conv_res.errors.append(
                ErrorItem(
                    component_type=DoclingComponentType.DOCUMENT_BACKEND,
                    module_name=type(page._backend).__name__,
                    error_message=f"Page {page.page_no} failed to parse.",
                )
            )
            outcome = ConversionStatus.PARTIAL_SUCCESS

        return outcome

    # Initialise and load resources for a page
    @abstractmethod
    def initialize_page(self, doc: InputDocument, page: Page) -> Page:
        pass

View File

@ -1,59 +0,0 @@
import logging
from docling.backend.abstract_backend import (
AbstractDocumentBackend,
DeclarativeDocumentBackend,
)
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import PipelineOptions
from docling.pipeline.base_model_pipeline import AbstractModelPipeline
_log = logging.getLogger(__name__)
class SimpleModelPipeline(AbstractModelPipeline):
    """SimpleModelPipeline.

    Currently used for the formats / backends that hand back a
    DoclingDocument directly.
    """

    def __init__(self, pipeline_options: PipelineOptions):
        super().__init__(pipeline_options)

    def _build_document(
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionResult:
        """Store the output of the backend's ``convert()`` on *conv_res*.

        Raises ``RuntimeError`` when the document's backend is not a
        ``DeclarativeDocumentBackend``.
        """
        if isinstance(in_doc._backend, DeclarativeDocumentBackend):
            # No page-level pipeline is needed here: the backend's convert()
            # result is used as the output as-is.
            conv_res.output = in_doc._backend.convert()
            return conv_res

        raise RuntimeError(
            f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a declarative backend. "
            f"Can not convert this with simple pipeline. "
            f"Please check your format configuration on DocumentConverter."
        )

    def _determine_status(
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionStatus:
        """Always report ``SUCCESS``.

        There is nothing further to evaluate here, and this method inspects
        neither argument.
        """
        return ConversionStatus.SUCCESS

    @classmethod
    def get_default_options(cls) -> PipelineOptions:
        """Return a fresh ``PipelineOptions`` built with no arguments."""
        return PipelineOptions()

    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend):
        """Return True when *backend* is a ``DeclarativeDocumentBackend`` instance."""
        return isinstance(backend, DeclarativeDocumentBackend)

View File

@ -1,157 +0,0 @@
import logging
from pathlib import Path
from typing import Optional
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import AssembledUnit, Page
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import (
EasyOcrOptions,
PdfPipelineOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.models.base_ocr_model import BaseOcrModel
from docling.models.ds_glm_model import GlmModel, GlmOptions
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
from docling.models.page_preprocessing_model import (
PagePreprocessingModel,
PagePreprocessingOptions,
)
from docling.models.table_structure_model import TableStructureModel
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
from docling.models.tesseract_ocr_model import TesseractOcrModel
from docling.pipeline.base_model_pipeline import PaginatedModelPipeline
_log = logging.getLogger(__name__)
class StandardPdfModelPipeline(PaginatedModelPipeline):
    """Paginated pipeline whose page stages are pre-processing, OCR, layout,
    table structure and page assembly, followed by a ``GlmModel`` pass when
    the document is assembled."""

    # Locations of the model files, relative to the artifacts directory.
    _layout_model_path = "model_artifacts/layout/beehive_v0.0.5_pt"
    _table_model_path = "model_artifacts/tableformer"

    def __init__(self, pipeline_options: PdfPipelineOptions):
        """Instantiate all models of the pipeline.

        When ``pipeline_options.artifacts_path`` is not set, the artifacts
        directory is obtained from ``download_models_hf()``.

        Raises ``RuntimeError`` when no OCR model matches
        ``pipeline_options.ocr_options``.
        """
        super().__init__(pipeline_options)
        self.pipeline_options: PdfPipelineOptions

        # Bind the local on BOTH branches. It used to be assigned only when
        # no path was configured, so a user-supplied artifacts_path failed
        # with UnboundLocalError as soon as the models were constructed.
        if pipeline_options.artifacts_path:
            artifacts_path = Path(pipeline_options.artifacts_path)
        else:
            artifacts_path = self.download_models_hf()

        self.artifacts_path = artifacts_path

        self.glm_model = GlmModel(
            options=GlmOptions(
                create_legacy_output=pipeline_options.create_legacy_output
            )
        )

        if (ocr_model := self.get_ocr_model()) is None:
            raise RuntimeError(
                f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
            )

        self.model_pipe = [
            # Pre-processing
            PagePreprocessingModel(
                options=PagePreprocessingOptions(
                    images_scale=pipeline_options.images_scale
                )
            ),
            # OCR
            ocr_model,
            # Layout model
            LayoutModel(
                artifacts_path=artifacts_path
                / StandardPdfModelPipeline._layout_model_path
            ),
            # Table structure model
            TableStructureModel(
                enabled=pipeline_options.do_table_structure,
                artifacts_path=artifacts_path
                / StandardPdfModelPipeline._table_model_path,
                options=pipeline_options.table_structure_options,
            ),
            # Page assemble
            PageAssembleModel(
                options=PageAssembleOptions(
                    keep_images=pipeline_options.images_scale is not None
                )
            ),
        ]

        self.enrichment_pipe = [
            # Other models working on `NodeItem` elements in the DoclingDocument
        ]

    @staticmethod
    def download_models_hf(
        local_dir: Optional[Path] = None, force: bool = False
    ) -> Path:
        """Fetch revision ``v2.0.1`` of ``ds4sd/docling-models`` through
        ``huggingface_hub.snapshot_download`` and return the download path.

        *local_dir* and *force* are forwarded as ``local_dir`` and
        ``force_download``.
        """
        # Imported here rather than at module level, as in the original code.
        from huggingface_hub import snapshot_download

        download_path = snapshot_download(
            repo_id="ds4sd/docling-models",
            force_download=force,
            local_dir=local_dir,
            revision="v2.0.1",
        )

        return Path(download_path)

    def get_ocr_model(self) -> Optional[BaseOcrModel]:
        """Return the OCR model matching the type of
        ``self.pipeline_options.ocr_options``, or ``None`` when the type is
        not one of the three handled here."""
        ocr_options = self.pipeline_options.ocr_options
        enabled = self.pipeline_options.do_ocr

        # The checks run in the same order as before; do not reorder them
        # without confirming how the option classes relate to each other.
        if isinstance(ocr_options, EasyOcrOptions):
            return EasyOcrModel(enabled=enabled, options=ocr_options)
        if isinstance(ocr_options, TesseractCliOcrOptions):
            return TesseractOcrCliModel(enabled=enabled, options=ocr_options)
        if isinstance(ocr_options, TesseractOcrOptions):
            return TesseractOcrModel(enabled=enabled, options=ocr_options)
        return None

    def initialize_page(self, doc: InputDocument, page: Page) -> Page:
        """Load the page backend for ``page.page_no`` from the document
        backend and record the page size it reports."""
        page._backend = doc._backend.load_page(page.page_no)
        page.size = page._backend.get_size()

        return page

    def _assemble_document(
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionResult:
        """Concatenate the per-page assembled units in page order, store them
        on *conv_res* and set both outputs from the GLM model."""
        all_elements = []
        all_headers = []
        all_body = []

        for p in conv_res.pages:
            all_body.extend(p.assembled.body)
            all_headers.extend(p.assembled.headers)
            all_elements.extend(p.assembled.elements)

        conv_res.assembled = AssembledUnit(
            elements=all_elements, headers=all_headers, body=all_body
        )

        conv_res.output, conv_res.legacy_output = self.glm_model(conv_res)

        return conv_res

    @classmethod
    def get_default_options(cls) -> PdfPipelineOptions:
        """Return a fresh ``PdfPipelineOptions`` built with no arguments."""
        return PdfPipelineOptions()

    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend):
        """Return True when *backend* is a ``PdfDocumentBackend`` instance."""
        return isinstance(backend, PdfDocumentBackend)

View File

@ -12,7 +12,7 @@ from docling.datamodel.pipeline_options import (
TesseractOcrOptions, TesseractOcrOptions,
) )
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -9,8 +9,8 @@ from docling.document_converter import (
PdfFormatOption, PdfFormatOption,
WordFormatOption, WordFormatOption,
) )
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
@ -30,7 +30,7 @@ input_paths = [
## to customize use: ## to customize use:
doc_converter = DocumentConverter( # all of the below is optional, has internal defaults. doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
formats=[ allowed_formats=[
InputFormat.PDF, InputFormat.PDF,
# InputFormat.IMAGE, # InputFormat.IMAGE,
InputFormat.DOCX, InputFormat.DOCX,
@ -39,10 +39,10 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal
], # whitelist formats, other files are ignored. ], # whitelist formats, other files are ignored.
format_options={ format_options={
InputFormat.PDF: PdfFormatOption( InputFormat.PDF: PdfFormatOption(
pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
), # PdfFormatOption(backend=PyPdfiumDocumentBackend), ), # PdfFormatOption(backend=PyPdfiumDocumentBackend),
InputFormat.DOCX: WordFormatOption( InputFormat.DOCX: WordFormatOption(
pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend
), ),
# InputFormat.IMAGE: PdfFormatOption(), # InputFormat.IMAGE: PdfFormatOption(),
}, },
@ -51,9 +51,9 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal
doc_converter = DocumentConverter( # all of the below is optional, has internal defaults. doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
pdf=None, pdf=None,
docx=WordFormatOption( docx=WordFormatOption(
pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend
), ),
formats=[ allowed_formats=[
InputFormat.PDF, InputFormat.PDF,
# InputFormat.IMAGE, # InputFormat.IMAGE,
InputFormat.DOCX, InputFormat.DOCX,
@ -62,10 +62,10 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal
], # whitelist formats, other files are ignored. ], # whitelist formats, other files are ignored.
format_options={ format_options={
InputFormat.PDF: PdfFormatOption( InputFormat.PDF: PdfFormatOption(
pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
), # PdfFormatOption(backend=PyPdfiumDocumentBackend), ), # PdfFormatOption(backend=PyPdfiumDocumentBackend),
InputFormat.DOCX: WordFormatOption( InputFormat.DOCX: WordFormatOption(
pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend
), ),
# InputFormat.IMAGE: PdfFormatOption(), # InputFormat.IMAGE: PdfFormatOption(),
}, },