mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
rename and refactor *model*
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
6c9f869dc7
commit
98f1a4597e
@ -19,16 +19,16 @@ from docling.datamodel.document import (
|
|||||||
)
|
)
|
||||||
from docling.datamodel.pipeline_options import PipelineOptions
|
from docling.datamodel.pipeline_options import PipelineOptions
|
||||||
from docling.datamodel.settings import DocumentLimits, settings
|
from docling.datamodel.settings import DocumentLimits, settings
|
||||||
from docling.pipeline.base_model_pipeline import AbstractModelPipeline
|
from docling.pipeline.base_pipeline import AbstractPipeline
|
||||||
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
|
from docling.pipeline.simple_pipeline import SimplePipeline
|
||||||
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
|
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
||||||
from docling.utils.utils import chunkify
|
from docling.utils.utils import chunkify
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class FormatOption(BaseModel):
|
class FormatOption(BaseModel):
|
||||||
pipeline_cls: Type[AbstractModelPipeline]
|
pipeline_cls: Type[AbstractPipeline]
|
||||||
pipeline_options: Optional[PipelineOptions] = None
|
pipeline_options: Optional[PipelineOptions] = None
|
||||||
backend: Type[AbstractDocumentBackend]
|
backend: Type[AbstractDocumentBackend]
|
||||||
|
|
||||||
@ -42,40 +42,40 @@ class FormatOption(BaseModel):
|
|||||||
|
|
||||||
|
|
||||||
class WordFormatOption(FormatOption):
|
class WordFormatOption(FormatOption):
|
||||||
pipeline_cls: Type = SimpleModelPipeline
|
pipeline_cls: Type = SimplePipeline
|
||||||
backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend
|
backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend
|
||||||
|
|
||||||
|
|
||||||
class PowerpointFormatOption(FormatOption):
|
class PowerpointFormatOption(FormatOption):
|
||||||
pipeline_cls: Type = SimpleModelPipeline
|
pipeline_cls: Type = SimplePipeline
|
||||||
backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
|
backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
|
||||||
|
|
||||||
|
|
||||||
class HTMLFormatOption(FormatOption):
|
class HTMLFormatOption(FormatOption):
|
||||||
pipeline_cls: Type = SimpleModelPipeline
|
pipeline_cls: Type = SimplePipeline
|
||||||
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
|
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
|
||||||
|
|
||||||
|
|
||||||
class PdfFormatOption(FormatOption):
|
class PdfFormatOption(FormatOption):
|
||||||
pipeline_cls: Type = StandardPdfModelPipeline
|
pipeline_cls: Type = StandardPdfPipeline
|
||||||
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
|
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
|
||||||
|
|
||||||
|
|
||||||
_format_to_default_options = {
|
_format_to_default_options = {
|
||||||
InputFormat.DOCX: FormatOption(
|
InputFormat.DOCX: FormatOption(
|
||||||
pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend
|
pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
|
||||||
),
|
),
|
||||||
InputFormat.PPTX: FormatOption(
|
InputFormat.PPTX: FormatOption(
|
||||||
pipeline_cls=SimpleModelPipeline, backend=MsPowerpointDocumentBackend
|
pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
|
||||||
),
|
),
|
||||||
InputFormat.HTML: FormatOption(
|
InputFormat.HTML: FormatOption(
|
||||||
pipeline_cls=SimpleModelPipeline, backend=HTMLDocumentBackend
|
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
|
||||||
),
|
),
|
||||||
InputFormat.IMAGE: FormatOption(
|
InputFormat.IMAGE: FormatOption(
|
||||||
pipeline_cls=StandardPdfModelPipeline, backend=DoclingParseDocumentBackend
|
pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
|
||||||
),
|
),
|
||||||
InputFormat.PDF: FormatOption(
|
InputFormat.PDF: FormatOption(
|
||||||
pipeline_cls=StandardPdfModelPipeline, backend=DoclingParseDocumentBackend
|
pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
|
||||||
),
|
),
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -85,29 +85,27 @@ class DocumentConverter:
|
|||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
formats: Optional[List[InputFormat]] = None,
|
allowed_formats: Optional[List[InputFormat]] = None,
|
||||||
format_options: Optional[Dict[InputFormat, FormatOption]] = None,
|
format_options: Optional[Dict[InputFormat, FormatOption]] = None,
|
||||||
):
|
):
|
||||||
self.formats = formats
|
self.allowed_formats = allowed_formats
|
||||||
self.format_to_options = format_options
|
self.format_to_options = format_options
|
||||||
|
|
||||||
if self.formats is None:
|
if self.allowed_formats is None:
|
||||||
if self.format_to_options is not None:
|
if self.format_to_options is not None:
|
||||||
self.formats = self.format_to_options.keys()
|
self.allowed_formats = self.format_to_options.keys()
|
||||||
else:
|
else:
|
||||||
self.formats = [e for e in InputFormat] # all formats
|
self.allowed_formats = [e for e in InputFormat] # all formats
|
||||||
|
|
||||||
if self.format_to_options is None:
|
if self.format_to_options is None:
|
||||||
self.format_to_options = _format_to_default_options
|
self.format_to_options = _format_to_default_options
|
||||||
|
|
||||||
for f in self.formats:
|
for f in self.allowed_formats:
|
||||||
if f not in self.format_to_options.keys():
|
if f not in self.format_to_options.keys():
|
||||||
_log.info(f"Requested format {f} will use default options.")
|
_log.info(f"Requested format {f} will use default options.")
|
||||||
self.format_to_options[f] = _format_to_default_options[f]
|
self.format_to_options[f] = _format_to_default_options[f]
|
||||||
|
|
||||||
self.initialized_pipelines: Dict[
|
self.initialized_pipelines: Dict[Type[AbstractPipeline], AbstractPipeline] = {}
|
||||||
Type[AbstractModelPipeline], AbstractModelPipeline
|
|
||||||
] = {}
|
|
||||||
|
|
||||||
@validate_call(config=ConfigDict(strict=True))
|
@validate_call(config=ConfigDict(strict=True))
|
||||||
def convert(
|
def convert(
|
||||||
@ -173,7 +171,7 @@ class DocumentConverter:
|
|||||||
if item is not None:
|
if item is not None:
|
||||||
yield item
|
yield item
|
||||||
|
|
||||||
def _get_pipeline(self, doc: InputDocument) -> Optional[AbstractModelPipeline]:
|
def _get_pipeline(self, doc: InputDocument) -> Optional[AbstractPipeline]:
|
||||||
fopt = self.format_to_options.get(doc.format)
|
fopt = self.format_to_options.get(doc.format)
|
||||||
|
|
||||||
if fopt is None:
|
if fopt is None:
|
||||||
@ -194,7 +192,7 @@ class DocumentConverter:
|
|||||||
return self.initialized_pipelines[pipeline_class]
|
return self.initialized_pipelines[pipeline_class]
|
||||||
|
|
||||||
def process_document(self, in_doc: InputDocument) -> ConversionResult:
|
def process_document(self, in_doc: InputDocument) -> ConversionResult:
|
||||||
if in_doc.format not in self.formats:
|
if in_doc.format not in self.allowed_formats:
|
||||||
return None
|
return None
|
||||||
else:
|
else:
|
||||||
start_doc_time = time.time()
|
start_doc_time = time.time()
|
||||||
|
@ -1,167 +0,0 @@
|
|||||||
import functools
|
|
||||||
import logging
|
|
||||||
import time
|
|
||||||
import traceback
|
|
||||||
from abc import ABC, abstractmethod
|
|
||||||
from typing import Callable, Iterable, List
|
|
||||||
|
|
||||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
|
||||||
from docling.datamodel.base_models import (
|
|
||||||
ConversionStatus,
|
|
||||||
DoclingComponentType,
|
|
||||||
ErrorItem,
|
|
||||||
Page,
|
|
||||||
)
|
|
||||||
from docling.datamodel.document import ConversionResult, InputDocument
|
|
||||||
from docling.datamodel.pipeline_options import PipelineOptions
|
|
||||||
from docling.datamodel.settings import settings
|
|
||||||
from docling.utils.utils import chunkify
|
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class AbstractModelPipeline(ABC):
    """Abstract base class for document conversion pipelines.

    ``execute`` drives a fixed sequence of stages (build, assemble, enrich,
    determine status). Subclasses must implement ``_build_document``,
    ``_determine_status``, ``get_default_options`` and
    ``is_backend_supported``; ``_assemble_document`` and ``_enrich_document``
    default to returning the result unchanged.
    """

    def __init__(self, pipeline_options: PipelineOptions):
        """Store the options and start with empty model/enrichment stage lists."""
        self.pipeline_options = pipeline_options
        self.model_pipe: List[Callable] = []
        self.enrichment_pipe: List[Callable] = []

    def execute(self, in_doc: InputDocument) -> ConversionResult:
        """Run all pipeline stages on ``in_doc`` and return the result.

        An invalid input document, or any exception raised by a stage, yields
        a result whose status is ``ConversionStatus.FAILURE``; exceptions are
        not propagated to the caller.
        """
        conv_res = ConversionResult(input=in_doc)

        _log.info(f"Processing document {in_doc.file.name}")

        if not in_doc.valid:
            conv_res.status = ConversionStatus.FAILURE
            return conv_res

        # TODO: propagate option for raises_on_error?
        try:
            conv_res = self._build_document(in_doc, conv_res)
            conv_res = self._assemble_document(in_doc, conv_res)
            conv_res = self._enrich_document(in_doc, conv_res)
            conv_res.status = self._determine_status(in_doc, conv_res)
        except Exception:
            conv_res.status = ConversionStatus.FAILURE
            # Keep the best-effort contract (no re-raise), but record the
            # traceback so the failure is not silently lost.
            _log.warning(
                f"Encountered an error during conversion of document {in_doc.file.name}",
                exc_info=True,
            )

        return conv_res

    @abstractmethod
    def _build_document(
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionResult:
        """Populate ``conv_res`` from ``in_doc``; first stage of ``execute``."""
        pass

    def _assemble_document(
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionResult:
        """Second stage of ``execute``; the default returns ``conv_res`` unchanged."""
        return conv_res

    def _enrich_document(
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionResult:
        """Third stage of ``execute``; the default returns ``conv_res`` unchanged."""
        return conv_res

    @abstractmethod
    def _determine_status(
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionStatus:
        """Return the final status; called only if the earlier stages did not raise."""
        pass

    @classmethod
    @abstractmethod
    def get_default_options(cls) -> PipelineOptions:
        """Return the default options object for this pipeline class."""
        pass

    @classmethod
    @abstractmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend):
        """Tell whether ``backend`` can be used with this pipeline class."""
        pass
|
|
||||||
|
|
||||||
# def _apply_on_elements(self, element_batch: Iterable[NodeItem]) -> Iterable[Any]:
|
|
||||||
# for model in self.model_pipe:
|
|
||||||
# element_batch = model(element_batch)
|
|
||||||
#
|
|
||||||
# yield from element_batch
|
|
||||||
|
|
||||||
|
|
||||||
class PaginatedModelPipeline(AbstractModelPipeline):  # TODO this is a bad name.
    """Pipeline that builds a document page by page from a PDF backend.

    Pages are processed in batches of ``settings.perf.page_batch_size``; each
    batch is initialised via ``initialize_page`` and then passed through every
    callable in ``self.model_pipe``.
    """

    def _apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        """Chain every model in ``self.model_pipe`` over ``page_batch`` and yield the pages."""
        for model in self.model_pipe:
            page_batch = model(page_batch)

        yield from page_batch

    def _build_document(
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionResult:
        """Create one ``Page`` per document page and run the page pipeline on them.

        Raises:
            RuntimeError: if the document's backend is not a ``PdfDocumentBackend``.

        Errors raised while processing pages are logged and recorded as
        ``ConversionStatus.FAILURE`` on ``conv_res`` instead of propagating.
        The backend is unloaded in all cases.
        """
        if not isinstance(in_doc._backend, PdfDocumentBackend):
            raise RuntimeError(
                f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a PDF backend. "
                f"Can not convert this with a PDF pipeline. "
                f"Please check your format configuration on DocumentConverter."
            )

        conv_res.pages.extend(Page(page_no=i) for i in range(in_doc.page_count))

        try:
            # Iterate batches of pages (page_batch_size) in the doc
            for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
                start_pb_time = time.time()

                # 1. Initialise the page resources
                init_pages = map(
                    functools.partial(self.initialize_page, in_doc), page_batch
                )

                # 2. Run pipeline stages
                pipeline_pages = self._apply_on_pages(init_pages)

                # The stages are lazy (map + generator): the batch must be
                # consumed for any of the work to actually run.
                for _ in pipeline_pages:
                    pass

                end_pb_time = time.time() - start_pb_time
                _log.info(f"Finished converting page batch time={end_pb_time:.3f}")

        except Exception as e:
            conv_res.status = ConversionStatus.FAILURE
            trace = "\n".join(traceback.format_exception(e))
            _log.warning(
                f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
                f"{trace}"
            )
        finally:
            # Always unload the PDF backend, even in case of failure
            if in_doc._backend:
                in_doc._backend.unload()

        return conv_res

    def _determine_status(
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionStatus:
        """Return SUCCESS, or PARTIAL_SUCCESS if any page failed to parse.

        An ``ErrorItem`` is appended to ``conv_res.errors`` for every page
        that has no backend or whose backend reports itself as invalid.
        """
        status = ConversionStatus.SUCCESS
        for page in conv_res.pages:
            # A page may never have gone through initialize_page() when
            # _build_document() aborted mid-way; treat it as failed rather
            # than crashing on the attribute access.
            # NOTE(review): assumes an uninitialised Page has `_backend` set
            # to None - confirm against the Page model.
            if page._backend is None or not page._backend.is_valid():
                conv_res.errors.append(
                    ErrorItem(
                        component_type=DoclingComponentType.DOCUMENT_BACKEND,
                        module_name=type(page._backend).__name__,
                        error_message=f"Page {page.page_no} failed to parse.",
                    )
                )
                status = ConversionStatus.PARTIAL_SUCCESS

        return status

    # Initialise and load resources for a page
    @abstractmethod
    def initialize_page(self, doc: InputDocument, page: Page) -> Page:
        """Load the resources of ``page`` from ``doc`` and return the page."""
        pass
|
|
@ -1,59 +0,0 @@
|
|||||||
import logging
|
|
||||||
|
|
||||||
from docling.backend.abstract_backend import (
|
|
||||||
AbstractDocumentBackend,
|
|
||||||
DeclarativeDocumentBackend,
|
|
||||||
)
|
|
||||||
from docling.datamodel.base_models import ConversionStatus
|
|
||||||
from docling.datamodel.document import ConversionResult, InputDocument
|
|
||||||
from docling.datamodel.pipeline_options import PipelineOptions
|
|
||||||
from docling.pipeline.base_model_pipeline import AbstractModelPipeline
|
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class SimpleModelPipeline(AbstractModelPipeline):
    """Pipeline for backends that produce a DoclingDocument directly.

    No page-level processing takes place: the document's backend must be a
    ``DeclarativeDocumentBackend`` and its ``convert()`` result becomes the
    conversion output.
    """

    def __init__(self, pipeline_options: PipelineOptions):
        """Initialise the pipeline with the given options."""
        super().__init__(pipeline_options)

    def _build_document(
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionResult:
        """Store the backend's converted document on ``conv_res`` and return it.

        Raises:
            RuntimeError: if the document's backend is not declarative.
        """
        if isinstance(in_doc._backend, DeclarativeDocumentBackend):
            # The declarative backend outputs the full document in one call.
            conv_res.output = in_doc._backend.convert()
            return conv_res

        raise RuntimeError(
            f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a declarative backend. "
            f"Can not convert this with simple pipeline. "
            f"Please check your format configuration on DocumentConverter."
        )

    def _determine_status(
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionStatus:
        """Return SUCCESS unconditionally.

        This is reached only when the earlier steps did not raise, and there
        is nothing further to evaluate.
        """
        return ConversionStatus.SUCCESS

    @classmethod
    def get_default_options(cls) -> PipelineOptions:
        """Return a default-constructed ``PipelineOptions``."""
        return PipelineOptions()

    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend):
        """Return whether ``backend`` is a ``DeclarativeDocumentBackend`` instance."""
        return isinstance(backend, DeclarativeDocumentBackend)
|
|
@ -1,157 +0,0 @@
|
|||||||
import logging
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
|
||||||
from docling.datamodel.base_models import AssembledUnit, Page
|
|
||||||
from docling.datamodel.document import ConversionResult, InputDocument
|
|
||||||
from docling.datamodel.pipeline_options import (
|
|
||||||
EasyOcrOptions,
|
|
||||||
PdfPipelineOptions,
|
|
||||||
TesseractCliOcrOptions,
|
|
||||||
TesseractOcrOptions,
|
|
||||||
)
|
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
|
||||||
from docling.models.ds_glm_model import GlmModel, GlmOptions
|
|
||||||
from docling.models.easyocr_model import EasyOcrModel
|
|
||||||
from docling.models.layout_model import LayoutModel
|
|
||||||
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
|
|
||||||
from docling.models.page_preprocessing_model import (
|
|
||||||
PagePreprocessingModel,
|
|
||||||
PagePreprocessingOptions,
|
|
||||||
)
|
|
||||||
from docling.models.table_structure_model import TableStructureModel
|
|
||||||
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
|
||||||
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
|
||||||
from docling.pipeline.base_model_pipeline import PaginatedModelPipeline
|
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class StandardPdfModelPipeline(PaginatedModelPipeline):
    """Paginated PDF pipeline: preprocessing, OCR, layout, table structure, page assembly.

    The per-page models are built in ``__init__`` from the given
    ``PdfPipelineOptions``; the assembled pages are turned into the final
    output by ``GlmModel`` in ``_assemble_document``.
    """

    # Model locations, relative to the artifacts directory.
    _layout_model_path = "model_artifacts/layout/beehive_v0.0.5_pt"
    _table_model_path = "model_artifacts/tableformer"

    def __init__(self, pipeline_options: PdfPipelineOptions):
        """Build the model pipeline.

        Uses ``pipeline_options.artifacts_path`` when it is set; otherwise the
        model artifacts are downloaded via ``download_models_hf``.

        Raises:
            RuntimeError: if no OCR model matches ``pipeline_options.ocr_options``.
        """
        super().__init__(pipeline_options)
        self.pipeline_options: PdfPipelineOptions

        # Bind artifacts_path on both branches: previously it was assigned only
        # when no path was configured, so an explicit path raised
        # UnboundLocalError on the next line.
        if pipeline_options.artifacts_path:
            artifacts_path = Path(pipeline_options.artifacts_path)
        else:
            artifacts_path = self.download_models_hf()

        self.artifacts_path = Path(artifacts_path)
        self.glm_model = GlmModel(
            options=GlmOptions(
                create_legacy_output=pipeline_options.create_legacy_output
            )
        )

        if (ocr_model := self.get_ocr_model()) is None:
            raise RuntimeError(
                f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
            )

        self.model_pipe = [
            # Pre-processing
            PagePreprocessingModel(
                options=PagePreprocessingOptions(
                    images_scale=pipeline_options.images_scale
                )
            ),
            # OCR
            ocr_model,
            # Layout model
            LayoutModel(
                artifacts_path=artifacts_path
                / StandardPdfModelPipeline._layout_model_path
            ),
            # Table structure model
            TableStructureModel(
                enabled=pipeline_options.do_table_structure,
                artifacts_path=artifacts_path
                / StandardPdfModelPipeline._table_model_path,
                options=pipeline_options.table_structure_options,
            ),
            # Page assemble
            PageAssembleModel(
                options=PageAssembleOptions(
                    keep_images=pipeline_options.images_scale is not None
                )
            ),
        ]

        self.enrichment_pipe = [
            # Other models working on `NodeItem` elements in the DoclingDocument
        ]

    @staticmethod
    def download_models_hf(
        local_dir: Optional[Path] = None, force: bool = False
    ) -> Path:
        """Download the ``ds4sd/docling-models`` snapshot (revision v2.0.1) and return its path."""
        # Imported lazily so huggingface_hub is only needed when downloading.
        from huggingface_hub import snapshot_download

        download_path = snapshot_download(
            repo_id="ds4sd/docling-models",
            force_download=force,
            local_dir=local_dir,
            revision="v2.0.1",
        )

        return Path(download_path)

    def get_ocr_model(self) -> Optional[BaseOcrModel]:
        """Return the OCR model matching ``pipeline_options.ocr_options``, or None if none matches."""
        if isinstance(self.pipeline_options.ocr_options, EasyOcrOptions):
            return EasyOcrModel(
                enabled=self.pipeline_options.do_ocr,
                options=self.pipeline_options.ocr_options,
            )
        elif isinstance(self.pipeline_options.ocr_options, TesseractCliOcrOptions):
            return TesseractOcrCliModel(
                enabled=self.pipeline_options.do_ocr,
                options=self.pipeline_options.ocr_options,
            )
        elif isinstance(self.pipeline_options.ocr_options, TesseractOcrOptions):
            return TesseractOcrModel(
                enabled=self.pipeline_options.do_ocr,
                options=self.pipeline_options.ocr_options,
            )
        return None

    def initialize_page(self, doc: InputDocument, page: Page) -> Page:
        """Attach the backend page for ``page.page_no`` to ``page`` and record its size."""
        page._backend = doc._backend.load_page(page.page_no)
        page.size = page._backend.get_size()

        return page

    def _assemble_document(
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionResult:
        """Merge the per-page assembled units and produce the output documents.

        Concatenates ``body``, ``headers`` and ``elements`` of every page in
        page order into one ``AssembledUnit``, then runs ``self.glm_model`` to
        fill ``conv_res.output`` and ``conv_res.legacy_output``.
        """
        all_elements = []
        all_headers = []
        all_body = []

        for p in conv_res.pages:
            all_body.extend(p.assembled.body)
            all_headers.extend(p.assembled.headers)
            all_elements.extend(p.assembled.elements)

        conv_res.assembled = AssembledUnit(
            elements=all_elements, headers=all_headers, body=all_body
        )

        conv_res.output, conv_res.legacy_output = self.glm_model(conv_res)

        return conv_res

    @classmethod
    def get_default_options(cls) -> PdfPipelineOptions:
        """Return a default-constructed ``PdfPipelineOptions``."""
        return PdfPipelineOptions()

    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend):
        """Return whether ``backend`` is a ``PdfDocumentBackend`` instance."""
        return isinstance(backend, PdfDocumentBackend)
|
|
@ -12,7 +12,7 @@ from docling.datamodel.pipeline_options import (
|
|||||||
TesseractOcrOptions,
|
TesseractOcrOptions,
|
||||||
)
|
)
|
||||||
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
||||||
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
|
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@ -9,8 +9,8 @@ from docling.document_converter import (
|
|||||||
PdfFormatOption,
|
PdfFormatOption,
|
||||||
WordFormatOption,
|
WordFormatOption,
|
||||||
)
|
)
|
||||||
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
|
from docling.pipeline.simple_pipeline import SimplePipeline
|
||||||
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
|
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -30,7 +30,7 @@ input_paths = [
|
|||||||
|
|
||||||
## to customize use:
|
## to customize use:
|
||||||
doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
|
doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
|
||||||
formats=[
|
allowed_formats=[
|
||||||
InputFormat.PDF,
|
InputFormat.PDF,
|
||||||
# InputFormat.IMAGE,
|
# InputFormat.IMAGE,
|
||||||
InputFormat.DOCX,
|
InputFormat.DOCX,
|
||||||
@ -39,10 +39,10 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal
|
|||||||
], # whitelist formats, other files are ignored.
|
], # whitelist formats, other files are ignored.
|
||||||
format_options={
|
format_options={
|
||||||
InputFormat.PDF: PdfFormatOption(
|
InputFormat.PDF: PdfFormatOption(
|
||||||
pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend
|
pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
|
||||||
), # PdfFormatOption(backend=PyPdfiumDocumentBackend),
|
), # PdfFormatOption(backend=PyPdfiumDocumentBackend),
|
||||||
InputFormat.DOCX: WordFormatOption(
|
InputFormat.DOCX: WordFormatOption(
|
||||||
pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend
|
pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend
|
||||||
),
|
),
|
||||||
# InputFormat.IMAGE: PdfFormatOption(),
|
# InputFormat.IMAGE: PdfFormatOption(),
|
||||||
},
|
},
|
||||||
@ -51,9 +51,9 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal
|
|||||||
doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
|
doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
|
||||||
pdf=None,
|
pdf=None,
|
||||||
docx=WordFormatOption(
|
docx=WordFormatOption(
|
||||||
pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend
|
pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend
|
||||||
),
|
),
|
||||||
formats=[
|
allowed_formats=[
|
||||||
InputFormat.PDF,
|
InputFormat.PDF,
|
||||||
# InputFormat.IMAGE,
|
# InputFormat.IMAGE,
|
||||||
InputFormat.DOCX,
|
InputFormat.DOCX,
|
||||||
@ -62,10 +62,10 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal
|
|||||||
], # whitelist formats, other files are ignored.
|
], # whitelist formats, other files are ignored.
|
||||||
format_options={
|
format_options={
|
||||||
InputFormat.PDF: PdfFormatOption(
|
InputFormat.PDF: PdfFormatOption(
|
||||||
pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend
|
pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
|
||||||
), # PdfFormatOption(backend=PyPdfiumDocumentBackend),
|
), # PdfFormatOption(backend=PyPdfiumDocumentBackend),
|
||||||
InputFormat.DOCX: WordFormatOption(
|
InputFormat.DOCX: WordFormatOption(
|
||||||
pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend
|
pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend
|
||||||
),
|
),
|
||||||
# InputFormat.IMAGE: PdfFormatOption(),
|
# InputFormat.IMAGE: PdfFormatOption(),
|
||||||
},
|
},
|
||||||
|
Loading…
Reference in New Issue
Block a user