mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 20:58:11 +00:00
feat: enrichment steps on all convert pipelines (incl docx, html, etc) (#2251)
* allow enrichment on all convert pipelines Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * set options in CLI Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> --------- Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
@@ -48,6 +48,7 @@ from docling.datamodel.base_models import (
|
|||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
AsrPipelineOptions,
|
AsrPipelineOptions,
|
||||||
|
ConvertPipelineOptions,
|
||||||
EasyOcrOptions,
|
EasyOcrOptions,
|
||||||
OcrOptions,
|
OcrOptions,
|
||||||
PaginatedPipelineOptions,
|
PaginatedPipelineOptions,
|
||||||
@@ -71,8 +72,13 @@ from docling.datamodel.vlm_model_specs import (
|
|||||||
from docling.document_converter import (
|
from docling.document_converter import (
|
||||||
AudioFormatOption,
|
AudioFormatOption,
|
||||||
DocumentConverter,
|
DocumentConverter,
|
||||||
|
ExcelFormatOption,
|
||||||
FormatOption,
|
FormatOption,
|
||||||
|
HTMLFormatOption,
|
||||||
|
MarkdownFormatOption,
|
||||||
PdfFormatOption,
|
PdfFormatOption,
|
||||||
|
PowerpointFormatOption,
|
||||||
|
WordFormatOption,
|
||||||
)
|
)
|
||||||
from docling.models.factories import get_ocr_factory
|
from docling.models.factories import get_ocr_factory
|
||||||
from docling.pipeline.asr_pipeline import AsrPipeline
|
from docling.pipeline.asr_pipeline import AsrPipeline
|
||||||
@@ -626,10 +632,33 @@ def convert( # noqa: C901
|
|||||||
backend=MetsGbsDocumentBackend,
|
backend=MetsGbsDocumentBackend,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# SimplePipeline options
|
||||||
|
simple_format_option = ConvertPipelineOptions(
|
||||||
|
do_picture_description=enrich_picture_description,
|
||||||
|
do_picture_classification=enrich_picture_classes,
|
||||||
|
)
|
||||||
|
if artifacts_path is not None:
|
||||||
|
simple_format_option.artifacts_path = artifacts_path
|
||||||
|
|
||||||
format_options = {
|
format_options = {
|
||||||
InputFormat.PDF: pdf_format_option,
|
InputFormat.PDF: pdf_format_option,
|
||||||
InputFormat.IMAGE: pdf_format_option,
|
InputFormat.IMAGE: pdf_format_option,
|
||||||
InputFormat.METS_GBS: mets_gbs_format_option,
|
InputFormat.METS_GBS: mets_gbs_format_option,
|
||||||
|
InputFormat.DOCX: WordFormatOption(
|
||||||
|
pipeline_options=simple_format_option
|
||||||
|
),
|
||||||
|
InputFormat.PPTX: PowerpointFormatOption(
|
||||||
|
pipeline_options=simple_format_option
|
||||||
|
),
|
||||||
|
InputFormat.XLSX: ExcelFormatOption(
|
||||||
|
pipeline_options=simple_format_option
|
||||||
|
),
|
||||||
|
InputFormat.HTML: HTMLFormatOption(
|
||||||
|
pipeline_options=simple_format_option
|
||||||
|
),
|
||||||
|
InputFormat.MD: MarkdownFormatOption(
|
||||||
|
pipeline_options=simple_format_option
|
||||||
|
),
|
||||||
}
|
}
|
||||||
|
|
||||||
elif pipeline == ProcessingPipeline.VLM:
|
elif pipeline == ProcessingPipeline.VLM:
|
||||||
|
|||||||
@@ -259,11 +259,21 @@ class PipelineOptions(BaseOptions):
|
|||||||
accelerator_options: AcceleratorOptions = AcceleratorOptions()
|
accelerator_options: AcceleratorOptions = AcceleratorOptions()
|
||||||
enable_remote_services: bool = False
|
enable_remote_services: bool = False
|
||||||
allow_external_plugins: bool = False
|
allow_external_plugins: bool = False
|
||||||
|
|
||||||
|
|
||||||
class PaginatedPipelineOptions(PipelineOptions):
|
|
||||||
artifacts_path: Optional[Union[Path, str]] = None
|
artifacts_path: Optional[Union[Path, str]] = None
|
||||||
|
|
||||||
|
|
||||||
|
class ConvertPipelineOptions(PipelineOptions):
|
||||||
|
"""Base convert pipeline options."""
|
||||||
|
|
||||||
|
do_picture_classification: bool = False # True: classify pictures in documents
|
||||||
|
|
||||||
|
do_picture_description: bool = False # True: run describe pictures in documents
|
||||||
|
picture_description_options: PictureDescriptionBaseOptions = (
|
||||||
|
smolvlm_picture_description
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class PaginatedPipelineOptions(ConvertPipelineOptions):
|
||||||
images_scale: float = 1.0
|
images_scale: float = 1.0
|
||||||
generate_page_images: bool = False
|
generate_page_images: bool = False
|
||||||
generate_picture_images: bool = False
|
generate_picture_images: bool = False
|
||||||
@@ -295,13 +305,11 @@ class LayoutOptions(BaseModel):
|
|||||||
|
|
||||||
class AsrPipelineOptions(PipelineOptions):
|
class AsrPipelineOptions(PipelineOptions):
|
||||||
asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
|
asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
|
||||||
artifacts_path: Optional[Union[Path, str]] = None
|
|
||||||
|
|
||||||
|
|
||||||
class VlmExtractionPipelineOptions(PipelineOptions):
|
class VlmExtractionPipelineOptions(PipelineOptions):
|
||||||
"""Options for extraction pipeline."""
|
"""Options for extraction pipeline."""
|
||||||
|
|
||||||
artifacts_path: Optional[Union[Path, str]] = None
|
|
||||||
vlm_options: Union[InlineVlmOptions] = NU_EXTRACT_2B_TRANSFORMERS
|
vlm_options: Union[InlineVlmOptions] = NU_EXTRACT_2B_TRANSFORMERS
|
||||||
|
|
||||||
|
|
||||||
@@ -312,8 +320,6 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
|
|||||||
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
||||||
do_code_enrichment: bool = False # True: perform code OCR
|
do_code_enrichment: bool = False # True: perform code OCR
|
||||||
do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code
|
do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code
|
||||||
do_picture_classification: bool = False # True: classify pictures in documents
|
|
||||||
do_picture_description: bool = False # True: run describe pictures in documents
|
|
||||||
force_backend_text: bool = (
|
force_backend_text: bool = (
|
||||||
False # (To be used with vlms, or other generative models)
|
False # (To be used with vlms, or other generative models)
|
||||||
)
|
)
|
||||||
@@ -321,9 +327,6 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
|
|||||||
|
|
||||||
table_structure_options: TableStructureOptions = TableStructureOptions()
|
table_structure_options: TableStructureOptions = TableStructureOptions()
|
||||||
ocr_options: OcrOptions = EasyOcrOptions()
|
ocr_options: OcrOptions = EasyOcrOptions()
|
||||||
picture_description_options: PictureDescriptionBaseOptions = (
|
|
||||||
smolvlm_picture_description
|
|
||||||
)
|
|
||||||
layout_options: LayoutOptions = LayoutOptions()
|
layout_options: LayoutOptions = LayoutOptions()
|
||||||
|
|
||||||
images_scale: float = 1.0
|
images_scale: float = 1.0
|
||||||
|
|||||||
@@ -4,7 +4,13 @@ from collections.abc import Iterable
|
|||||||
from typing import Any, Generic, Optional, Protocol, Type, Union
|
from typing import Any, Generic, Optional, Protocol, Type, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
|
from docling_core.types.doc import (
|
||||||
|
BoundingBox,
|
||||||
|
DocItem,
|
||||||
|
DoclingDocument,
|
||||||
|
NodeItem,
|
||||||
|
PictureItem,
|
||||||
|
)
|
||||||
from PIL.Image import Image
|
from PIL.Image import Image
|
||||||
from typing_extensions import TypeVar
|
from typing_extensions import TypeVar
|
||||||
|
|
||||||
@@ -164,8 +170,17 @@ class BaseItemAndImageEnrichmentModel(
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
assert isinstance(element, DocItem)
|
assert isinstance(element, DocItem)
|
||||||
element_prov = element.prov[0]
|
|
||||||
|
|
||||||
|
# Allow the case of documents without page images but embedded images (e.g. Word and HTML docs)
|
||||||
|
if len(element.prov) == 0 and isinstance(element, PictureItem):
|
||||||
|
embedded_im = element.get_image(conv_res.document)
|
||||||
|
if embedded_im is not None:
|
||||||
|
return ItemAndImageEnrichmentElement(item=element, image=embedded_im)
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Crop the image from the page
|
||||||
|
element_prov = element.prov[0]
|
||||||
bbox = element_prov.bbox
|
bbox = element_prov.bbox
|
||||||
width = bbox.r - bbox.l
|
width = bbox.r - bbox.l
|
||||||
height = bbox.t - bbox.b
|
height = bbox.t - bbox.b
|
||||||
@@ -183,4 +198,14 @@ class BaseItemAndImageEnrichmentModel(
|
|||||||
cropped_image = conv_res.pages[page_ix].get_image(
|
cropped_image = conv_res.pages[page_ix].get_image(
|
||||||
scale=self.images_scale, cropbox=expanded_bbox
|
scale=self.images_scale, cropbox=expanded_bbox
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Allow for images being embedded without the page backend or page images
|
||||||
|
if cropped_image is None and isinstance(element, PictureItem):
|
||||||
|
embedded_im = element.get_image(conv_res.document)
|
||||||
|
if embedded_im is not None:
|
||||||
|
return ItemAndImageEnrichmentElement(item=element, image=embedded_im)
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Return the proper cropped image
|
||||||
return ItemAndImageEnrichmentElement(item=element, image=cropped_image)
|
return ItemAndImageEnrichmentElement(item=element, image=cropped_image)
|
||||||
|
|||||||
@@ -208,25 +208,13 @@ class AsrPipeline(BasePipeline):
|
|||||||
|
|
||||||
self.pipeline_options: AsrPipelineOptions = pipeline_options
|
self.pipeline_options: AsrPipelineOptions = pipeline_options
|
||||||
|
|
||||||
artifacts_path: Optional[Path] = None
|
|
||||||
if pipeline_options.artifacts_path is not None:
|
|
||||||
artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
|
|
||||||
elif settings.artifacts_path is not None:
|
|
||||||
artifacts_path = Path(settings.artifacts_path).expanduser()
|
|
||||||
|
|
||||||
if artifacts_path is not None and not artifacts_path.is_dir():
|
|
||||||
raise RuntimeError(
|
|
||||||
f"The value of {artifacts_path=} is not valid. "
|
|
||||||
"When defined, it must point to a folder containing all models required by the pipeline."
|
|
||||||
)
|
|
||||||
|
|
||||||
if isinstance(self.pipeline_options.asr_options, InlineAsrNativeWhisperOptions):
|
if isinstance(self.pipeline_options.asr_options, InlineAsrNativeWhisperOptions):
|
||||||
asr_options: InlineAsrNativeWhisperOptions = (
|
asr_options: InlineAsrNativeWhisperOptions = (
|
||||||
self.pipeline_options.asr_options
|
self.pipeline_options.asr_options
|
||||||
)
|
)
|
||||||
self._model = _NativeWhisperModel(
|
self._model = _NativeWhisperModel(
|
||||||
enabled=True, # must be always enabled for this pipeline to make sense.
|
enabled=True, # must be always enabled for this pipeline to make sense.
|
||||||
artifacts_path=artifacts_path,
|
artifacts_path=self.artifacts_path,
|
||||||
accelerator_options=pipeline_options.accelerator_options,
|
accelerator_options=pipeline_options.accelerator_options,
|
||||||
asr_options=asr_options,
|
asr_options=asr_options,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1,19 +1,33 @@
|
|||||||
import logging
|
import logging
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from docling.datamodel.base_models import ConversionStatus, ErrorItem
|
from docling.datamodel.base_models import ConversionStatus, ErrorItem
|
||||||
from docling.datamodel.document import InputDocument
|
from docling.datamodel.document import InputDocument
|
||||||
from docling.datamodel.extraction import ExtractionResult, ExtractionTemplateType
|
from docling.datamodel.extraction import ExtractionResult, ExtractionTemplateType
|
||||||
from docling.datamodel.pipeline_options import BaseOptions
|
from docling.datamodel.pipeline_options import BaseOptions, PipelineOptions
|
||||||
|
from docling.datamodel.settings import settings
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class BaseExtractionPipeline(ABC):
|
class BaseExtractionPipeline(ABC):
|
||||||
def __init__(self, pipeline_options: BaseOptions):
|
def __init__(self, pipeline_options: PipelineOptions):
|
||||||
self.pipeline_options = pipeline_options
|
self.pipeline_options = pipeline_options
|
||||||
|
|
||||||
|
self.artifacts_path: Optional[Path] = None
|
||||||
|
if pipeline_options.artifacts_path is not None:
|
||||||
|
self.artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
|
||||||
|
elif settings.artifacts_path is not None:
|
||||||
|
self.artifacts_path = Path(settings.artifacts_path).expanduser()
|
||||||
|
|
||||||
|
if self.artifacts_path is not None and not self.artifacts_path.is_dir():
|
||||||
|
raise RuntimeError(
|
||||||
|
f"The value of {self.artifacts_path=} is not valid. "
|
||||||
|
"When defined, it must point to a folder containing all models required by the pipeline."
|
||||||
|
)
|
||||||
|
|
||||||
def execute(
|
def execute(
|
||||||
self,
|
self,
|
||||||
in_doc: InputDocument,
|
in_doc: InputDocument,
|
||||||
@@ -54,5 +68,5 @@ class BaseExtractionPipeline(ABC):
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def get_default_options(cls) -> BaseOptions:
|
def get_default_options(cls) -> PipelineOptions:
|
||||||
pass
|
pass
|
||||||
|
|||||||
@@ -4,7 +4,8 @@ import time
|
|||||||
import traceback
|
import traceback
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from collections.abc import Iterable
|
from collections.abc import Iterable
|
||||||
from typing import Any, Callable, List
|
from pathlib import Path
|
||||||
|
from typing import Any, Callable, List, Optional
|
||||||
|
|
||||||
from docling_core.types.doc import NodeItem
|
from docling_core.types.doc import NodeItem
|
||||||
|
|
||||||
@@ -20,9 +21,19 @@ from docling.datamodel.base_models import (
|
|||||||
Page,
|
Page,
|
||||||
)
|
)
|
||||||
from docling.datamodel.document import ConversionResult, InputDocument
|
from docling.datamodel.document import ConversionResult, InputDocument
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
|
from docling.datamodel.pipeline_options import (
|
||||||
|
ConvertPipelineOptions,
|
||||||
|
PdfPipelineOptions,
|
||||||
|
PipelineOptions,
|
||||||
|
)
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.base_model import GenericEnrichmentModel
|
from docling.models.base_model import GenericEnrichmentModel
|
||||||
|
from docling.models.document_picture_classifier import (
|
||||||
|
DocumentPictureClassifier,
|
||||||
|
DocumentPictureClassifierOptions,
|
||||||
|
)
|
||||||
|
from docling.models.factories import get_picture_description_factory
|
||||||
|
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
|
||||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
||||||
from docling.utils.utils import chunkify
|
from docling.utils.utils import chunkify
|
||||||
|
|
||||||
@@ -36,6 +47,18 @@ class BasePipeline(ABC):
|
|||||||
self.build_pipe: List[Callable] = []
|
self.build_pipe: List[Callable] = []
|
||||||
self.enrichment_pipe: List[GenericEnrichmentModel[Any]] = []
|
self.enrichment_pipe: List[GenericEnrichmentModel[Any]] = []
|
||||||
|
|
||||||
|
self.artifacts_path: Optional[Path] = None
|
||||||
|
if pipeline_options.artifacts_path is not None:
|
||||||
|
self.artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
|
||||||
|
elif settings.artifacts_path is not None:
|
||||||
|
self.artifacts_path = Path(settings.artifacts_path).expanduser()
|
||||||
|
|
||||||
|
if self.artifacts_path is not None and not self.artifacts_path.is_dir():
|
||||||
|
raise RuntimeError(
|
||||||
|
f"The value of {self.artifacts_path=} is not valid. "
|
||||||
|
"When defined, it must point to a folder containing all models required by the pipeline."
|
||||||
|
)
|
||||||
|
|
||||||
def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionResult:
|
def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionResult:
|
||||||
conv_res = ConversionResult(input=in_doc)
|
conv_res = ConversionResult(input=in_doc)
|
||||||
|
|
||||||
@@ -108,15 +131,58 @@ class BasePipeline(ABC):
|
|||||||
def is_backend_supported(cls, backend: AbstractDocumentBackend):
|
def is_backend_supported(cls, backend: AbstractDocumentBackend):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# def _apply_on_elements(self, element_batch: Iterable[NodeItem]) -> Iterable[Any]:
|
|
||||||
# for model in self.build_pipe:
|
class ConvertPipeline(BasePipeline):
|
||||||
# element_batch = model(element_batch)
|
def __init__(self, pipeline_options: ConvertPipelineOptions):
|
||||||
#
|
super().__init__(pipeline_options)
|
||||||
# yield from element_batch
|
self.pipeline_options: ConvertPipelineOptions
|
||||||
|
|
||||||
|
# ------ Common enrichment models working on all backends
|
||||||
|
|
||||||
|
# Picture description model
|
||||||
|
if (
|
||||||
|
picture_description_model := self._get_picture_description_model(
|
||||||
|
artifacts_path=self.artifacts_path
|
||||||
|
)
|
||||||
|
) is None:
|
||||||
|
raise RuntimeError(
|
||||||
|
f"The specified picture description kind is not supported: {pipeline_options.picture_description_options.kind}."
|
||||||
|
)
|
||||||
|
|
||||||
|
self.enrichment_pipe = [
|
||||||
|
# Document Picture Classifier
|
||||||
|
DocumentPictureClassifier(
|
||||||
|
enabled=pipeline_options.do_picture_classification,
|
||||||
|
artifacts_path=self.artifacts_path,
|
||||||
|
options=DocumentPictureClassifierOptions(),
|
||||||
|
accelerator_options=pipeline_options.accelerator_options,
|
||||||
|
),
|
||||||
|
# Document Picture description
|
||||||
|
picture_description_model,
|
||||||
|
]
|
||||||
|
|
||||||
|
def _get_picture_description_model(
|
||||||
|
self, artifacts_path: Optional[Path] = None
|
||||||
|
) -> Optional[PictureDescriptionBaseModel]:
|
||||||
|
factory = get_picture_description_factory(
|
||||||
|
allow_external_plugins=self.pipeline_options.allow_external_plugins
|
||||||
|
)
|
||||||
|
return factory.create_instance(
|
||||||
|
options=self.pipeline_options.picture_description_options,
|
||||||
|
enabled=self.pipeline_options.do_picture_description,
|
||||||
|
enable_remote_services=self.pipeline_options.enable_remote_services,
|
||||||
|
artifacts_path=artifacts_path,
|
||||||
|
accelerator_options=self.pipeline_options.accelerator_options,
|
||||||
|
)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
@abstractmethod
|
||||||
|
def get_default_options(cls) -> ConvertPipelineOptions:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
class PaginatedPipeline(ConvertPipeline): # TODO this is a bad name.
|
||||||
def __init__(self, pipeline_options: PipelineOptions):
|
def __init__(self, pipeline_options: ConvertPipelineOptions):
|
||||||
super().__init__(pipeline_options)
|
super().__init__(pipeline_options)
|
||||||
self.keep_backend = False
|
self.keep_backend = False
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,6 @@
|
|||||||
import inspect
|
import inspect
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from PIL.Image import Image
|
from PIL.Image import Image
|
||||||
@@ -16,7 +15,10 @@ from docling.datamodel.extraction import (
|
|||||||
ExtractionResult,
|
ExtractionResult,
|
||||||
ExtractionTemplateType,
|
ExtractionTemplateType,
|
||||||
)
|
)
|
||||||
from docling.datamodel.pipeline_options import BaseOptions, VlmExtractionPipelineOptions
|
from docling.datamodel.pipeline_options import (
|
||||||
|
PipelineOptions,
|
||||||
|
VlmExtractionPipelineOptions,
|
||||||
|
)
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.vlm_models_inline.nuextract_transformers_model import (
|
from docling.models.vlm_models_inline.nuextract_transformers_model import (
|
||||||
NuExtractTransformersModel,
|
NuExtractTransformersModel,
|
||||||
@@ -35,22 +37,10 @@ class ExtractionVlmPipeline(BaseExtractionPipeline):
|
|||||||
self.accelerator_options = pipeline_options.accelerator_options
|
self.accelerator_options = pipeline_options.accelerator_options
|
||||||
self.pipeline_options: VlmExtractionPipelineOptions
|
self.pipeline_options: VlmExtractionPipelineOptions
|
||||||
|
|
||||||
artifacts_path: Optional[Path] = None
|
|
||||||
if pipeline_options.artifacts_path is not None:
|
|
||||||
artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
|
|
||||||
elif settings.artifacts_path is not None:
|
|
||||||
artifacts_path = Path(settings.artifacts_path).expanduser()
|
|
||||||
|
|
||||||
if artifacts_path is not None and not artifacts_path.is_dir():
|
|
||||||
raise RuntimeError(
|
|
||||||
f"The value of {artifacts_path=} is not valid. "
|
|
||||||
"When defined, it must point to a folder containing all models required by the pipeline."
|
|
||||||
)
|
|
||||||
|
|
||||||
# Create VLM model instance
|
# Create VLM model instance
|
||||||
self.vlm_model = NuExtractTransformersModel(
|
self.vlm_model = NuExtractTransformersModel(
|
||||||
enabled=True,
|
enabled=True,
|
||||||
artifacts_path=artifacts_path, # Will download automatically
|
artifacts_path=self.artifacts_path, # Will download automatically
|
||||||
accelerator_options=self.accelerator_options,
|
accelerator_options=self.accelerator_options,
|
||||||
vlm_options=pipeline_options.vlm_options,
|
vlm_options=pipeline_options.vlm_options,
|
||||||
)
|
)
|
||||||
@@ -203,5 +193,5 @@ class ExtractionVlmPipeline(BaseExtractionPipeline):
|
|||||||
raise ValueError(f"Unsupported template type: {type(template)}")
|
raise ValueError(f"Unsupported template type: {type(template)}")
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_default_options(cls) -> BaseOptions:
|
def get_default_options(cls) -> PipelineOptions:
|
||||||
return VlmExtractionPipelineOptions()
|
return VlmExtractionPipelineOptions()
|
||||||
|
|||||||
@@ -6,21 +6,21 @@ from docling.backend.abstract_backend import (
|
|||||||
)
|
)
|
||||||
from docling.datamodel.base_models import ConversionStatus
|
from docling.datamodel.base_models import ConversionStatus
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import PipelineOptions
|
from docling.datamodel.pipeline_options import ConvertPipelineOptions
|
||||||
from docling.pipeline.base_pipeline import BasePipeline
|
from docling.pipeline.base_pipeline import ConvertPipeline
|
||||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class SimplePipeline(BasePipeline):
|
class SimplePipeline(ConvertPipeline):
|
||||||
"""SimpleModelPipeline.
|
"""SimpleModelPipeline.
|
||||||
|
|
||||||
This class is used at the moment for formats / backends
|
This class is used at the moment for formats / backends
|
||||||
which produce straight DoclingDocument output.
|
which produce straight DoclingDocument output.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, pipeline_options: PipelineOptions):
|
def __init__(self, pipeline_options: ConvertPipelineOptions):
|
||||||
super().__init__(pipeline_options)
|
super().__init__(pipeline_options)
|
||||||
|
|
||||||
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
|
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
|
||||||
@@ -47,8 +47,8 @@ class SimplePipeline(BasePipeline):
|
|||||||
return ConversionStatus.SUCCESS
|
return ConversionStatus.SUCCESS
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_default_options(cls) -> PipelineOptions:
|
def get_default_options(cls) -> ConvertPipelineOptions:
|
||||||
return PipelineOptions()
|
return ConvertPipelineOptions()
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def is_backend_supported(cls, backend: AbstractDocumentBackend):
|
def is_backend_supported(cls, backend: AbstractDocumentBackend):
|
||||||
|
|||||||
@@ -15,18 +15,13 @@ from docling.datamodel.pipeline_options import PdfPipelineOptions
|
|||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
from docling.models.base_ocr_model import BaseOcrModel
|
||||||
from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
|
from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
|
||||||
from docling.models.document_picture_classifier import (
|
from docling.models.factories import get_ocr_factory
|
||||||
DocumentPictureClassifier,
|
|
||||||
DocumentPictureClassifierOptions,
|
|
||||||
)
|
|
||||||
from docling.models.factories import get_ocr_factory, get_picture_description_factory
|
|
||||||
from docling.models.layout_model import LayoutModel
|
from docling.models.layout_model import LayoutModel
|
||||||
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
|
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
|
||||||
from docling.models.page_preprocessing_model import (
|
from docling.models.page_preprocessing_model import (
|
||||||
PagePreprocessingModel,
|
PagePreprocessingModel,
|
||||||
PagePreprocessingOptions,
|
PagePreprocessingOptions,
|
||||||
)
|
)
|
||||||
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
|
|
||||||
from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
|
from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
|
||||||
from docling.models.table_structure_model import TableStructureModel
|
from docling.models.table_structure_model import TableStructureModel
|
||||||
from docling.pipeline.base_pipeline import PaginatedPipeline
|
from docling.pipeline.base_pipeline import PaginatedPipeline
|
||||||
@@ -41,18 +36,6 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
super().__init__(pipeline_options)
|
super().__init__(pipeline_options)
|
||||||
self.pipeline_options: PdfPipelineOptions
|
self.pipeline_options: PdfPipelineOptions
|
||||||
|
|
||||||
artifacts_path: Optional[Path] = None
|
|
||||||
if pipeline_options.artifacts_path is not None:
|
|
||||||
artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
|
|
||||||
elif settings.artifacts_path is not None:
|
|
||||||
artifacts_path = Path(settings.artifacts_path).expanduser()
|
|
||||||
|
|
||||||
if artifacts_path is not None and not artifacts_path.is_dir():
|
|
||||||
raise RuntimeError(
|
|
||||||
f"The value of {artifacts_path=} is not valid. "
|
|
||||||
"When defined, it must point to a folder containing all models required by the pipeline."
|
|
||||||
)
|
|
||||||
|
|
||||||
with warnings.catch_warnings(): # deprecated generate_table_images
|
with warnings.catch_warnings(): # deprecated generate_table_images
|
||||||
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
||||||
self.keep_images = (
|
self.keep_images = (
|
||||||
@@ -63,7 +46,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
|
|
||||||
self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
|
self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
|
||||||
|
|
||||||
ocr_model = self.get_ocr_model(artifacts_path=artifacts_path)
|
ocr_model = self.get_ocr_model(artifacts_path=self.artifacts_path)
|
||||||
|
|
||||||
self.build_pipe = [
|
self.build_pipe = [
|
||||||
# Pre-processing
|
# Pre-processing
|
||||||
@@ -76,14 +59,14 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
ocr_model,
|
ocr_model,
|
||||||
# Layout model
|
# Layout model
|
||||||
LayoutModel(
|
LayoutModel(
|
||||||
artifacts_path=artifacts_path,
|
artifacts_path=self.artifacts_path,
|
||||||
accelerator_options=pipeline_options.accelerator_options,
|
accelerator_options=pipeline_options.accelerator_options,
|
||||||
options=pipeline_options.layout_options,
|
options=pipeline_options.layout_options,
|
||||||
),
|
),
|
||||||
# Table structure model
|
# Table structure model
|
||||||
TableStructureModel(
|
TableStructureModel(
|
||||||
enabled=pipeline_options.do_table_structure,
|
enabled=pipeline_options.do_table_structure,
|
||||||
artifacts_path=artifacts_path,
|
artifacts_path=self.artifacts_path,
|
||||||
options=pipeline_options.table_structure_options,
|
options=pipeline_options.table_structure_options,
|
||||||
accelerator_options=pipeline_options.accelerator_options,
|
accelerator_options=pipeline_options.accelerator_options,
|
||||||
),
|
),
|
||||||
@@ -91,37 +74,19 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
PageAssembleModel(options=PageAssembleOptions()),
|
PageAssembleModel(options=PageAssembleOptions()),
|
||||||
]
|
]
|
||||||
|
|
||||||
# Picture description model
|
|
||||||
if (
|
|
||||||
picture_description_model := self.get_picture_description_model(
|
|
||||||
artifacts_path=artifacts_path
|
|
||||||
)
|
|
||||||
) is None:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"The specified picture description kind is not supported: {pipeline_options.picture_description_options.kind}."
|
|
||||||
)
|
|
||||||
|
|
||||||
self.enrichment_pipe = [
|
self.enrichment_pipe = [
|
||||||
# Code Formula Enrichment Model
|
# Code Formula Enrichment Model
|
||||||
CodeFormulaModel(
|
CodeFormulaModel(
|
||||||
enabled=pipeline_options.do_code_enrichment
|
enabled=pipeline_options.do_code_enrichment
|
||||||
or pipeline_options.do_formula_enrichment,
|
or pipeline_options.do_formula_enrichment,
|
||||||
artifacts_path=artifacts_path,
|
artifacts_path=self.artifacts_path,
|
||||||
options=CodeFormulaModelOptions(
|
options=CodeFormulaModelOptions(
|
||||||
do_code_enrichment=pipeline_options.do_code_enrichment,
|
do_code_enrichment=pipeline_options.do_code_enrichment,
|
||||||
do_formula_enrichment=pipeline_options.do_formula_enrichment,
|
do_formula_enrichment=pipeline_options.do_formula_enrichment,
|
||||||
),
|
),
|
||||||
accelerator_options=pipeline_options.accelerator_options,
|
accelerator_options=pipeline_options.accelerator_options,
|
||||||
),
|
),
|
||||||
# Document Picture Classifier
|
*self.enrichment_pipe,
|
||||||
DocumentPictureClassifier(
|
|
||||||
enabled=pipeline_options.do_picture_classification,
|
|
||||||
artifacts_path=artifacts_path,
|
|
||||||
options=DocumentPictureClassifierOptions(),
|
|
||||||
accelerator_options=pipeline_options.accelerator_options,
|
|
||||||
),
|
|
||||||
# Document Picture description
|
|
||||||
picture_description_model,
|
|
||||||
]
|
]
|
||||||
|
|
||||||
if (
|
if (
|
||||||
@@ -158,20 +123,6 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
accelerator_options=self.pipeline_options.accelerator_options,
|
accelerator_options=self.pipeline_options.accelerator_options,
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_picture_description_model(
|
|
||||||
self, artifacts_path: Optional[Path] = None
|
|
||||||
) -> Optional[PictureDescriptionBaseModel]:
|
|
||||||
factory = get_picture_description_factory(
|
|
||||||
allow_external_plugins=self.pipeline_options.allow_external_plugins
|
|
||||||
)
|
|
||||||
return factory.create_instance(
|
|
||||||
options=self.pipeline_options.picture_description_options,
|
|
||||||
enabled=self.pipeline_options.do_picture_description,
|
|
||||||
enable_remote_services=self.pipeline_options.enable_remote_services,
|
|
||||||
artifacts_path=artifacts_path,
|
|
||||||
accelerator_options=self.pipeline_options.accelerator_options,
|
|
||||||
)
|
|
||||||
|
|
||||||
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
|
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
|
||||||
with TimeRecorder(conv_res, "page_init"):
|
with TimeRecorder(conv_res, "page_init"):
|
||||||
page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore
|
page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore
|
||||||
|
|||||||
@@ -32,21 +32,16 @@ from docling.datamodel.document import ConversionResult
|
|||||||
from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions
|
from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
|
from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
|
||||||
from docling.models.document_picture_classifier import (
|
from docling.models.factories import get_ocr_factory
|
||||||
DocumentPictureClassifier,
|
|
||||||
DocumentPictureClassifierOptions,
|
|
||||||
)
|
|
||||||
from docling.models.factories import get_ocr_factory, get_picture_description_factory
|
|
||||||
from docling.models.layout_model import LayoutModel
|
from docling.models.layout_model import LayoutModel
|
||||||
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
|
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
|
||||||
from docling.models.page_preprocessing_model import (
|
from docling.models.page_preprocessing_model import (
|
||||||
PagePreprocessingModel,
|
PagePreprocessingModel,
|
||||||
PagePreprocessingOptions,
|
PagePreprocessingOptions,
|
||||||
)
|
)
|
||||||
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
|
|
||||||
from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
|
from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
|
||||||
from docling.models.table_structure_model import TableStructureModel
|
from docling.models.table_structure_model import TableStructureModel
|
||||||
from docling.pipeline.base_pipeline import BasePipeline
|
from docling.pipeline.base_pipeline import ConvertPipeline
|
||||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
||||||
from docling.utils.utils import chunkify
|
from docling.utils.utils import chunkify
|
||||||
|
|
||||||
@@ -294,7 +289,7 @@ class RunContext:
|
|||||||
# ──────────────────────────────────────────────────────────────────────────────
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
class ThreadedStandardPdfPipeline(BasePipeline):
|
class ThreadedStandardPdfPipeline(ConvertPipeline):
|
||||||
"""High-performance PDF pipeline with multi-threaded stages."""
|
"""High-performance PDF pipeline with multi-threaded stages."""
|
||||||
|
|
||||||
def __init__(self, pipeline_options: ThreadedPdfPipelineOptions) -> None:
|
def __init__(self, pipeline_options: ThreadedPdfPipelineOptions) -> None:
|
||||||
@@ -310,7 +305,7 @@ class ThreadedStandardPdfPipeline(BasePipeline):
|
|||||||
# ────────────────────────────────────────────────────────────────────────
|
# ────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
def _init_models(self) -> None:
|
def _init_models(self) -> None:
|
||||||
art_path = self._resolve_artifacts_path()
|
art_path = self.artifacts_path
|
||||||
self.keep_images = (
|
self.keep_images = (
|
||||||
self.pipeline_options.generate_page_images
|
self.pipeline_options.generate_page_images
|
||||||
or self.pipeline_options.generate_picture_images
|
or self.pipeline_options.generate_picture_images
|
||||||
@@ -337,32 +332,20 @@ class ThreadedStandardPdfPipeline(BasePipeline):
|
|||||||
self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
|
self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
|
||||||
|
|
||||||
# --- optional enrichment ------------------------------------------------
|
# --- optional enrichment ------------------------------------------------
|
||||||
self.enrichment_pipe = []
|
self.enrichment_pipe = [
|
||||||
code_formula = CodeFormulaModel(
|
# Code Formula Enrichment Model
|
||||||
|
CodeFormulaModel(
|
||||||
enabled=self.pipeline_options.do_code_enrichment
|
enabled=self.pipeline_options.do_code_enrichment
|
||||||
or self.pipeline_options.do_formula_enrichment,
|
or self.pipeline_options.do_formula_enrichment,
|
||||||
artifacts_path=art_path,
|
artifacts_path=self.artifacts_path,
|
||||||
options=CodeFormulaModelOptions(
|
options=CodeFormulaModelOptions(
|
||||||
do_code_enrichment=self.pipeline_options.do_code_enrichment,
|
do_code_enrichment=self.pipeline_options.do_code_enrichment,
|
||||||
do_formula_enrichment=self.pipeline_options.do_formula_enrichment,
|
do_formula_enrichment=self.pipeline_options.do_formula_enrichment,
|
||||||
),
|
),
|
||||||
accelerator_options=self.pipeline_options.accelerator_options,
|
accelerator_options=self.pipeline_options.accelerator_options,
|
||||||
)
|
),
|
||||||
if code_formula.enabled:
|
*self.enrichment_pipe,
|
||||||
self.enrichment_pipe.append(code_formula)
|
]
|
||||||
|
|
||||||
picture_classifier = DocumentPictureClassifier(
|
|
||||||
enabled=self.pipeline_options.do_picture_classification,
|
|
||||||
artifacts_path=art_path,
|
|
||||||
options=DocumentPictureClassifierOptions(),
|
|
||||||
accelerator_options=self.pipeline_options.accelerator_options,
|
|
||||||
)
|
|
||||||
if picture_classifier.enabled:
|
|
||||||
self.enrichment_pipe.append(picture_classifier)
|
|
||||||
|
|
||||||
picture_descr = self._make_picture_description_model(art_path)
|
|
||||||
if picture_descr and picture_descr.enabled:
|
|
||||||
self.enrichment_pipe.append(picture_descr)
|
|
||||||
|
|
||||||
self.keep_backend = any(
|
self.keep_backend = any(
|
||||||
(
|
(
|
||||||
@@ -374,19 +357,6 @@ class ThreadedStandardPdfPipeline(BasePipeline):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# ---------------------------------------------------------------- helpers
|
# ---------------------------------------------------------------- helpers
|
||||||
def _resolve_artifacts_path(self) -> Optional[Path]:
|
|
||||||
if self.pipeline_options.artifacts_path:
|
|
||||||
p = Path(self.pipeline_options.artifacts_path).expanduser()
|
|
||||||
elif settings.artifacts_path:
|
|
||||||
p = Path(settings.artifacts_path).expanduser()
|
|
||||||
else:
|
|
||||||
return None
|
|
||||||
if not p.is_dir():
|
|
||||||
raise RuntimeError(
|
|
||||||
f"{p} does not exist or is not a directory containing the required models"
|
|
||||||
)
|
|
||||||
return p
|
|
||||||
|
|
||||||
def _make_ocr_model(self, art_path: Optional[Path]) -> Any:
|
def _make_ocr_model(self, art_path: Optional[Path]) -> Any:
|
||||||
factory = get_ocr_factory(
|
factory = get_ocr_factory(
|
||||||
allow_external_plugins=self.pipeline_options.allow_external_plugins
|
allow_external_plugins=self.pipeline_options.allow_external_plugins
|
||||||
@@ -398,20 +368,6 @@ class ThreadedStandardPdfPipeline(BasePipeline):
|
|||||||
accelerator_options=self.pipeline_options.accelerator_options,
|
accelerator_options=self.pipeline_options.accelerator_options,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _make_picture_description_model(
|
|
||||||
self, art_path: Optional[Path]
|
|
||||||
) -> Optional[PictureDescriptionBaseModel]:
|
|
||||||
factory = get_picture_description_factory(
|
|
||||||
allow_external_plugins=self.pipeline_options.allow_external_plugins
|
|
||||||
)
|
|
||||||
return factory.create_instance(
|
|
||||||
options=self.pipeline_options.picture_description_options,
|
|
||||||
enabled=self.pipeline_options.do_picture_description,
|
|
||||||
enable_remote_services=self.pipeline_options.enable_remote_services,
|
|
||||||
artifacts_path=art_path,
|
|
||||||
accelerator_options=self.pipeline_options.accelerator_options,
|
|
||||||
)
|
|
||||||
|
|
||||||
# ────────────────────────────────────────────────────────────────────────
|
# ────────────────────────────────────────────────────────────────────────
|
||||||
# Build - thread pipeline
|
# Build - thread pipeline
|
||||||
# ────────────────────────────────────────────────────────────────────────
|
# ────────────────────────────────────────────────────────────────────────
|
||||||
|
|||||||
@@ -54,18 +54,6 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
|
|
||||||
self.pipeline_options: VlmPipelineOptions
|
self.pipeline_options: VlmPipelineOptions
|
||||||
|
|
||||||
artifacts_path: Optional[Path] = None
|
|
||||||
if pipeline_options.artifacts_path is not None:
|
|
||||||
artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
|
|
||||||
elif settings.artifacts_path is not None:
|
|
||||||
artifacts_path = Path(settings.artifacts_path).expanduser()
|
|
||||||
|
|
||||||
if artifacts_path is not None and not artifacts_path.is_dir():
|
|
||||||
raise RuntimeError(
|
|
||||||
f"The value of {artifacts_path=} is not valid. "
|
|
||||||
"When defined, it must point to a folder containing all models required by the pipeline."
|
|
||||||
)
|
|
||||||
|
|
||||||
# force_backend_text = False - use text that is coming from VLM response
|
# force_backend_text = False - use text that is coming from VLM response
|
||||||
# force_backend_text = True - get text from backend using bounding boxes predicted by SmolDocling doctags
|
# force_backend_text = True - get text from backend using bounding boxes predicted by SmolDocling doctags
|
||||||
self.force_backend_text = (
|
self.force_backend_text = (
|
||||||
@@ -89,7 +77,7 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
self.build_pipe = [
|
self.build_pipe = [
|
||||||
HuggingFaceMlxModel(
|
HuggingFaceMlxModel(
|
||||||
enabled=True, # must be always enabled for this pipeline to make sense.
|
enabled=True, # must be always enabled for this pipeline to make sense.
|
||||||
artifacts_path=artifacts_path,
|
artifacts_path=self.artifacts_path,
|
||||||
accelerator_options=pipeline_options.accelerator_options,
|
accelerator_options=pipeline_options.accelerator_options,
|
||||||
vlm_options=vlm_options,
|
vlm_options=vlm_options,
|
||||||
),
|
),
|
||||||
@@ -98,7 +86,7 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
self.build_pipe = [
|
self.build_pipe = [
|
||||||
HuggingFaceTransformersVlmModel(
|
HuggingFaceTransformersVlmModel(
|
||||||
enabled=True, # must be always enabled for this pipeline to make sense.
|
enabled=True, # must be always enabled for this pipeline to make sense.
|
||||||
artifacts_path=artifacts_path,
|
artifacts_path=self.artifacts_path,
|
||||||
accelerator_options=pipeline_options.accelerator_options,
|
accelerator_options=pipeline_options.accelerator_options,
|
||||||
vlm_options=vlm_options,
|
vlm_options=vlm_options,
|
||||||
),
|
),
|
||||||
@@ -109,7 +97,7 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
self.build_pipe = [
|
self.build_pipe = [
|
||||||
VllmVlmModel(
|
VllmVlmModel(
|
||||||
enabled=True, # must be always enabled for this pipeline to make sense.
|
enabled=True, # must be always enabled for this pipeline to make sense.
|
||||||
artifacts_path=artifacts_path,
|
artifacts_path=self.artifacts_path,
|
||||||
accelerator_options=pipeline_options.accelerator_options,
|
accelerator_options=pipeline_options.accelerator_options,
|
||||||
vlm_options=vlm_options,
|
vlm_options=vlm_options,
|
||||||
),
|
),
|
||||||
|
|||||||
35
docs/examples/enrich_simple_pipeline.py
vendored
Normal file
35
docs/examples/enrich_simple_pipeline.py
vendored
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from docling.datamodel.base_models import InputFormat
|
||||||
|
from docling.datamodel.pipeline_options import ConvertPipelineOptions
|
||||||
|
from docling.document_converter import (
|
||||||
|
DocumentConverter,
|
||||||
|
HTMLFormatOption,
|
||||||
|
WordFormatOption,
|
||||||
|
)
|
||||||
|
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
input_path = Path("tests/data/docx/word_sample.docx")
|
||||||
|
|
||||||
|
pipeline_options = ConvertPipelineOptions()
|
||||||
|
pipeline_options.do_picture_classification = True
|
||||||
|
pipeline_options.do_picture_description = True
|
||||||
|
|
||||||
|
doc_converter = DocumentConverter(
|
||||||
|
format_options={
|
||||||
|
InputFormat.DOCX: WordFormatOption(pipeline_options=pipeline_options),
|
||||||
|
InputFormat.HTML: HTMLFormatOption(pipeline_options=pipeline_options),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
res = doc_converter.convert(input_path)
|
||||||
|
|
||||||
|
print(res.document.export_to_markdown())
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
Reference in New Issue
Block a user