mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
Move to pipeline_options.layout_options.model
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
commit
af0461e5b1
17
CHANGELOG.md
17
CHANGELOG.md
@ -1,3 +1,20 @@
|
|||||||
|
## [v2.40.0](https://github.com/docling-project/docling/releases/tag/v2.40.0) - 2025-07-04
|
||||||
|
|
||||||
|
### Feature
|
||||||
|
|
||||||
|
* Introduce LayoutOptions to control layout postprocessing behaviour ([#1870](https://github.com/docling-project/docling/issues/1870)) ([`ec6cf6f`](https://github.com/docling-project/docling/commit/ec6cf6f7e8050db30c14f0625d6d5c6bbfeb6aeb))
|
||||||
|
* Integrate ListItemMarkerProcessor into document assembly ([#1825](https://github.com/docling-project/docling/issues/1825)) ([`56a0e10`](https://github.com/docling-project/docling/commit/56a0e104f76c5ba30ac0fcd247be61f911b560c1))
|
||||||
|
|
||||||
|
### Fix
|
||||||
|
|
||||||
|
* Secure torch model inits with global locks ([#1884](https://github.com/docling-project/docling/issues/1884)) ([`598c9c5`](https://github.com/docling-project/docling/commit/598c9c53d401de6aac89b7c51bccd57160dace1e))
|
||||||
|
* Ensure that TesseractOcrModel does not crash in case OSD is not installed ([#1866](https://github.com/docling-project/docling/issues/1866)) ([`ae39a94`](https://github.com/docling-project/docling/commit/ae39a9411a09b2165ac745af358dea644f868e26))
|
||||||
|
|
||||||
|
### Performance
|
||||||
|
|
||||||
|
* **msexcel:** _find_table_bounds use iter_rows/iter_cols instead of Worksheet.cell ([#1875](https://github.com/docling-project/docling/issues/1875)) ([`13865c0`](https://github.com/docling-project/docling/commit/13865c06f5c564b9e57f3dbb60d26e60c75258b6))
|
||||||
|
* Move expensive imports closer to usage ([#1863](https://github.com/docling-project/docling/issues/1863)) ([`3089cf2`](https://github.com/docling-project/docling/commit/3089cf2d26918eed4007398a528f53971c19f839))
|
||||||
|
|
||||||
## [v2.39.0](https://github.com/docling-project/docling/releases/tag/v2.39.0) - 2025-06-27
|
## [v2.39.0](https://github.com/docling-project/docling/releases/tag/v2.39.0) - 2025-06-27
|
||||||
|
|
||||||
### Feature
|
### Feature
|
||||||
|
@ -22,7 +22,7 @@ WHISPER_TINY = InlineAsrNativeWhisperOptions(
|
|||||||
verbose=True,
|
verbose=True,
|
||||||
timestamps=True,
|
timestamps=True,
|
||||||
word_timestamps=True,
|
word_timestamps=True,
|
||||||
temperatue=0.0,
|
temperature=0.0,
|
||||||
max_new_tokens=256,
|
max_new_tokens=256,
|
||||||
max_time_chunk=30.0,
|
max_time_chunk=30.0,
|
||||||
)
|
)
|
||||||
@ -33,7 +33,7 @@ WHISPER_SMALL = InlineAsrNativeWhisperOptions(
|
|||||||
verbose=True,
|
verbose=True,
|
||||||
timestamps=True,
|
timestamps=True,
|
||||||
word_timestamps=True,
|
word_timestamps=True,
|
||||||
temperatue=0.0,
|
temperature=0.0,
|
||||||
max_new_tokens=256,
|
max_new_tokens=256,
|
||||||
max_time_chunk=30.0,
|
max_time_chunk=30.0,
|
||||||
)
|
)
|
||||||
@ -44,7 +44,7 @@ WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
|
|||||||
verbose=True,
|
verbose=True,
|
||||||
timestamps=True,
|
timestamps=True,
|
||||||
word_timestamps=True,
|
word_timestamps=True,
|
||||||
temperatue=0.0,
|
temperature=0.0,
|
||||||
max_new_tokens=256,
|
max_new_tokens=256,
|
||||||
max_time_chunk=30.0,
|
max_time_chunk=30.0,
|
||||||
)
|
)
|
||||||
@ -55,7 +55,7 @@ WHISPER_BASE = InlineAsrNativeWhisperOptions(
|
|||||||
verbose=True,
|
verbose=True,
|
||||||
timestamps=True,
|
timestamps=True,
|
||||||
word_timestamps=True,
|
word_timestamps=True,
|
||||||
temperatue=0.0,
|
temperature=0.0,
|
||||||
max_new_tokens=256,
|
max_new_tokens=256,
|
||||||
max_time_chunk=30.0,
|
max_time_chunk=30.0,
|
||||||
)
|
)
|
||||||
@ -66,7 +66,7 @@ WHISPER_LARGE = InlineAsrNativeWhisperOptions(
|
|||||||
verbose=True,
|
verbose=True,
|
||||||
timestamps=True,
|
timestamps=True,
|
||||||
word_timestamps=True,
|
word_timestamps=True,
|
||||||
temperatue=0.0,
|
temperature=0.0,
|
||||||
max_new_tokens=256,
|
max_new_tokens=256,
|
||||||
max_time_chunk=30.0,
|
max_time_chunk=30.0,
|
||||||
)
|
)
|
||||||
@ -77,7 +77,7 @@ WHISPER_TURBO = InlineAsrNativeWhisperOptions(
|
|||||||
verbose=True,
|
verbose=True,
|
||||||
timestamps=True,
|
timestamps=True,
|
||||||
word_timestamps=True,
|
word_timestamps=True,
|
||||||
temperatue=0.0,
|
temperature=0.0,
|
||||||
max_new_tokens=256,
|
max_new_tokens=256,
|
||||||
max_time_chunk=30.0,
|
max_time_chunk=30.0,
|
||||||
)
|
)
|
||||||
|
91
docling/datamodel/layout_model_specs.py
Normal file
91
docling/datamodel/layout_model_specs.py
Normal file
@ -0,0 +1,91 @@
|
|||||||
|
import logging
|
||||||
|
from enum import Enum
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from docling.datamodel.accelerator_options import AcceleratorDevice
|
||||||
|
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class LayoutModelConfig(BaseModel):
|
||||||
|
name: str
|
||||||
|
repo_id: str
|
||||||
|
revision: str
|
||||||
|
model_path: str
|
||||||
|
supported_devices: list[AcceleratorDevice] = [
|
||||||
|
AcceleratorDevice.CPU,
|
||||||
|
AcceleratorDevice.CUDA,
|
||||||
|
AcceleratorDevice.MPS,
|
||||||
|
]
|
||||||
|
|
||||||
|
@property
|
||||||
|
def model_repo_folder(self) -> str:
|
||||||
|
return self.repo_id.replace("/", "--")
|
||||||
|
|
||||||
|
|
||||||
|
# HuggingFace Layout Models
|
||||||
|
|
||||||
|
# Default Docling Layout Model
|
||||||
|
DOCLING_LAYOUT_V2 = LayoutModelConfig(
|
||||||
|
name="docling_layout_old",
|
||||||
|
repo_id="ds4sd/docling-layout-old",
|
||||||
|
revision="main",
|
||||||
|
model_path="",
|
||||||
|
)
|
||||||
|
|
||||||
|
DOCLING_LAYOUT_HERON = LayoutModelConfig(
|
||||||
|
name="docling_layout_heron",
|
||||||
|
repo_id="ds4sd/docling-layout-heron",
|
||||||
|
revision="main",
|
||||||
|
model_path="",
|
||||||
|
)
|
||||||
|
|
||||||
|
DOCLING_LAYOUT_HERON_101 = LayoutModelConfig(
|
||||||
|
name="docling_layout_heron_101",
|
||||||
|
repo_id="ds4sd/docling-layout-heron-101",
|
||||||
|
revision="main",
|
||||||
|
model_path="",
|
||||||
|
)
|
||||||
|
|
||||||
|
DOCLING_LAYOUT_EGRET_MEDIUM = LayoutModelConfig(
|
||||||
|
name="docling_layout_egret_medium",
|
||||||
|
repo_id="ds4sd/docling-layout-egret-medium",
|
||||||
|
revision="main",
|
||||||
|
model_path="",
|
||||||
|
)
|
||||||
|
|
||||||
|
DOCLING_LAYOUT_EGRET_LARGE = LayoutModelConfig(
|
||||||
|
name="docling_layout_egret_large",
|
||||||
|
repo_id="ds4sd/docling-layout-egret-large",
|
||||||
|
revision="main",
|
||||||
|
model_path="",
|
||||||
|
)
|
||||||
|
|
||||||
|
DOCLING_LAYOUT_EGRET_XLARGE = LayoutModelConfig(
|
||||||
|
name="docling_layout_egret_xlarge",
|
||||||
|
repo_id="ds4sd/docling-layout-egret-xlarge",
|
||||||
|
revision="main",
|
||||||
|
model_path="",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Example for a hypothetical alternative model
|
||||||
|
# ALTERNATIVE_LAYOUT = LayoutModelConfig(
|
||||||
|
# name="alternative_layout",
|
||||||
|
# repo_id="someorg/alternative-layout",
|
||||||
|
# revision="main",
|
||||||
|
# model_path="model_artifacts/layout_alt",
|
||||||
|
# )
|
||||||
|
|
||||||
|
|
||||||
|
class LayoutModelType(str, Enum):
|
||||||
|
DOCLING_LAYOUT_V2 = "docling_layout_v2"
|
||||||
|
DOCLING_LAYOUT_OLD = "docling_layout_old"
|
||||||
|
DOCLING_LAYOUT_HERON = "docling_layout_heron"
|
||||||
|
DOCLING_LAYOUT_HERON_101 = "docling_layout_heron_101"
|
||||||
|
DOCLING_LAYOUT_EGRET_MEDIUM = "docling_layout_egret_medium"
|
||||||
|
DOCLING_LAYOUT_EGRET_LARGE = "docling_layout_egret_large"
|
||||||
|
DOCLING_LAYOUT_EGRET_XLARGE = "docling_layout_egret_xlarge"
|
||||||
|
# ALTERNATIVE_LAYOUT = "alternative_layout"
|
@ -1,4 +1,5 @@
|
|||||||
import logging
|
import logging
|
||||||
|
from datetime import datetime
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
|
from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
|
||||||
@ -274,6 +275,13 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class LayoutOptions(BaseModel):
|
||||||
|
"""Options for layout processing."""
|
||||||
|
|
||||||
|
create_orphan_clusters: bool = True # Whether to create clusters for orphaned cells
|
||||||
|
model: LayoutModelConfig = DOCLING_LAYOUT_V2
|
||||||
|
|
||||||
|
|
||||||
class AsrPipelineOptions(PipelineOptions):
|
class AsrPipelineOptions(PipelineOptions):
|
||||||
asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
|
asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
|
||||||
artifacts_path: Optional[Union[Path, str]] = None
|
artifacts_path: Optional[Union[Path, str]] = None
|
||||||
@ -298,6 +306,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
|
|||||||
picture_description_options: PictureDescriptionBaseOptions = (
|
picture_description_options: PictureDescriptionBaseOptions = (
|
||||||
smolvlm_picture_description
|
smolvlm_picture_description
|
||||||
)
|
)
|
||||||
|
layout_options: LayoutOptions = LayoutOptions()
|
||||||
|
|
||||||
images_scale: float = 1.0
|
images_scale: float = 1.0
|
||||||
generate_page_images: bool = False
|
generate_page_images: bool = False
|
||||||
@ -315,8 +324,6 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
|
|||||||
True # Always True since parsed_page is now mandatory
|
True # Always True since parsed_page is now mandatory
|
||||||
)
|
)
|
||||||
|
|
||||||
layout_model_config: LayoutModelConfig = DOCLING_LAYOUT_V2
|
|
||||||
|
|
||||||
|
|
||||||
class ProcessingPipeline(str, Enum):
|
class ProcessingPipeline(str, Enum):
|
||||||
STANDARD = "standard"
|
STANDARD = "standard"
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
from enum import Enum
|
from enum import Enum
|
||||||
from typing import Any, Dict, List, Literal, Optional, Union
|
from typing import Any, Callable, Dict, List, Literal, Optional, Union
|
||||||
|
|
||||||
|
from docling_core.types.doc.page import SegmentedPage
|
||||||
from pydantic import AnyUrl, BaseModel
|
from pydantic import AnyUrl, BaseModel
|
||||||
from typing_extensions import deprecated
|
from typing_extensions import deprecated
|
||||||
|
|
||||||
@ -9,9 +10,10 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
|
|||||||
|
|
||||||
class BaseVlmOptions(BaseModel):
|
class BaseVlmOptions(BaseModel):
|
||||||
kind: str
|
kind: str
|
||||||
prompt: str
|
prompt: Union[str, Callable[[Optional[SegmentedPage]], str]]
|
||||||
scale: float = 2.0
|
scale: float = 2.0
|
||||||
max_size: Optional[int] = None
|
max_size: Optional[int] = None
|
||||||
|
temperature: float = 0.0
|
||||||
|
|
||||||
|
|
||||||
class ResponseFormat(str, Enum):
|
class ResponseFormat(str, Enum):
|
||||||
@ -29,6 +31,12 @@ class TransformersModelType(str, Enum):
|
|||||||
AUTOMODEL = "automodel"
|
AUTOMODEL = "automodel"
|
||||||
AUTOMODEL_VISION2SEQ = "automodel-vision2seq"
|
AUTOMODEL_VISION2SEQ = "automodel-vision2seq"
|
||||||
AUTOMODEL_CAUSALLM = "automodel-causallm"
|
AUTOMODEL_CAUSALLM = "automodel-causallm"
|
||||||
|
AUTOMODEL_IMAGETEXTTOTEXT = "automodel-imagetexttotext"
|
||||||
|
|
||||||
|
|
||||||
|
class TransformersPromptStyle(str, Enum):
|
||||||
|
CHAT = "chat"
|
||||||
|
RAW = "raw"
|
||||||
|
|
||||||
|
|
||||||
class InlineVlmOptions(BaseVlmOptions):
|
class InlineVlmOptions(BaseVlmOptions):
|
||||||
@ -42,6 +50,7 @@ class InlineVlmOptions(BaseVlmOptions):
|
|||||||
|
|
||||||
inference_framework: InferenceFramework
|
inference_framework: InferenceFramework
|
||||||
transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
|
transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
|
||||||
|
transformers_prompt_style: TransformersPromptStyle = TransformersPromptStyle.CHAT
|
||||||
response_format: ResponseFormat
|
response_format: ResponseFormat
|
||||||
|
|
||||||
torch_dtype: Optional[str] = None
|
torch_dtype: Optional[str] = None
|
||||||
@ -51,7 +60,6 @@ class InlineVlmOptions(BaseVlmOptions):
|
|||||||
AcceleratorDevice.MPS,
|
AcceleratorDevice.MPS,
|
||||||
]
|
]
|
||||||
|
|
||||||
temperature: float = 0.0
|
|
||||||
stop_strings: List[str] = []
|
stop_strings: List[str] = []
|
||||||
extra_generation_config: Dict[str, Any] = {}
|
extra_generation_config: Dict[str, Any] = {}
|
||||||
|
|
||||||
|
@ -29,12 +29,9 @@ class ApiVlmModel(BasePageModel):
|
|||||||
|
|
||||||
self.timeout = self.vlm_options.timeout
|
self.timeout = self.vlm_options.timeout
|
||||||
self.concurrency = self.vlm_options.concurrency
|
self.concurrency = self.vlm_options.concurrency
|
||||||
self.prompt_content = (
|
|
||||||
f"This is a page from a document.\n{self.vlm_options.prompt}"
|
|
||||||
)
|
|
||||||
self.params = {
|
self.params = {
|
||||||
**self.vlm_options.params,
|
**self.vlm_options.params,
|
||||||
"temperature": 0,
|
"temperature": self.vlm_options.temperature,
|
||||||
}
|
}
|
||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
@ -56,9 +53,14 @@ class ApiVlmModel(BasePageModel):
|
|||||||
if hi_res_image.mode != "RGB":
|
if hi_res_image.mode != "RGB":
|
||||||
hi_res_image = hi_res_image.convert("RGB")
|
hi_res_image = hi_res_image.convert("RGB")
|
||||||
|
|
||||||
|
if callable(self.vlm_options.prompt):
|
||||||
|
prompt = self.vlm_options.prompt(page.parsed_page)
|
||||||
|
else:
|
||||||
|
prompt = self.vlm_options.prompt
|
||||||
|
|
||||||
page_tags = api_image_request(
|
page_tags = api_image_request(
|
||||||
image=hi_res_image,
|
image=hi_res_image,
|
||||||
prompt=self.prompt_content,
|
prompt=prompt,
|
||||||
url=self.vlm_options.url,
|
url=self.vlm_options.url,
|
||||||
timeout=self.timeout,
|
timeout=self.timeout,
|
||||||
headers=self.vlm_options.headers,
|
headers=self.vlm_options.headers,
|
||||||
|
@ -14,7 +14,8 @@ from PIL import Image
|
|||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from docling.datamodel.accelerator_options import AcceleratorOptions
|
from docling.datamodel.accelerator_options import AcceleratorOptions
|
||||||
from docling.models.base_model import BaseEnrichmentModel
|
from docling.datamodel.base_models import ItemAndImageEnrichmentElement
|
||||||
|
from docling.models.base_model import BaseItemAndImageEnrichmentModel
|
||||||
from docling.models.utils.hf_model_download import download_hf_model
|
from docling.models.utils.hf_model_download import download_hf_model
|
||||||
from docling.utils.accelerator_utils import decide_device
|
from docling.utils.accelerator_utils import decide_device
|
||||||
|
|
||||||
@ -32,7 +33,7 @@ class DocumentPictureClassifierOptions(BaseModel):
|
|||||||
kind: Literal["document_picture_classifier"] = "document_picture_classifier"
|
kind: Literal["document_picture_classifier"] = "document_picture_classifier"
|
||||||
|
|
||||||
|
|
||||||
class DocumentPictureClassifier(BaseEnrichmentModel):
|
class DocumentPictureClassifier(BaseItemAndImageEnrichmentModel):
|
||||||
"""
|
"""
|
||||||
A model for classifying pictures in documents.
|
A model for classifying pictures in documents.
|
||||||
|
|
||||||
@ -135,7 +136,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
|
|||||||
def __call__(
|
def __call__(
|
||||||
self,
|
self,
|
||||||
doc: DoclingDocument,
|
doc: DoclingDocument,
|
||||||
element_batch: Iterable[NodeItem],
|
element_batch: Iterable[ItemAndImageEnrichmentElement],
|
||||||
) -> Iterable[NodeItem]:
|
) -> Iterable[NodeItem]:
|
||||||
"""
|
"""
|
||||||
Processes a batch of elements and enriches them with classification predictions.
|
Processes a batch of elements and enriches them with classification predictions.
|
||||||
@ -144,7 +145,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
|
|||||||
----------
|
----------
|
||||||
doc : DoclingDocument
|
doc : DoclingDocument
|
||||||
The document containing the elements to be processed.
|
The document containing the elements to be processed.
|
||||||
element_batch : Iterable[NodeItem]
|
element_batch : Iterable[ItemAndImageEnrichmentElement]
|
||||||
A batch of pictures to classify.
|
A batch of pictures to classify.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
@ -155,22 +156,20 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
|
|||||||
"""
|
"""
|
||||||
if not self.enabled:
|
if not self.enabled:
|
||||||
for element in element_batch:
|
for element in element_batch:
|
||||||
yield element
|
yield element.item
|
||||||
return
|
return
|
||||||
|
|
||||||
images: List[Union[Image.Image, np.ndarray]] = []
|
images: List[Union[Image.Image, np.ndarray]] = []
|
||||||
elements: List[PictureItem] = []
|
elements: List[PictureItem] = []
|
||||||
for el in element_batch:
|
for el in element_batch:
|
||||||
assert isinstance(el, PictureItem)
|
assert isinstance(el.item, PictureItem)
|
||||||
elements.append(el)
|
elements.append(el.item)
|
||||||
img = el.get_image(doc)
|
images.append(el.image)
|
||||||
assert img is not None
|
|
||||||
images.append(img)
|
|
||||||
|
|
||||||
outputs = self.document_picture_classifier.predict(images)
|
outputs = self.document_picture_classifier.predict(images)
|
||||||
|
|
||||||
for element, output in zip(elements, outputs):
|
for item, output in zip(elements, outputs):
|
||||||
element.annotations.append(
|
item.annotations.append(
|
||||||
PictureClassificationData(
|
PictureClassificationData(
|
||||||
provenance="DocumentPictureClassifier",
|
provenance="DocumentPictureClassifier",
|
||||||
predicted_classes=[
|
predicted_classes=[
|
||||||
@ -183,4 +182,4 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
yield element
|
yield item
|
||||||
|
@ -13,6 +13,7 @@ from docling.datamodel.accelerator_options import AcceleratorOptions
|
|||||||
from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
|
from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_V2, LayoutModelConfig
|
from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_V2, LayoutModelConfig
|
||||||
|
from docling.datamodel.pipeline_options import LayoutOptions
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.base_model import BasePageModel
|
from docling.models.base_model import BasePageModel
|
||||||
from docling.models.utils.hf_model_download import download_hf_model
|
from docling.models.utils.hf_model_download import download_hf_model
|
||||||
@ -49,12 +50,14 @@ class LayoutModel(BasePageModel):
|
|||||||
self,
|
self,
|
||||||
artifacts_path: Optional[Path],
|
artifacts_path: Optional[Path],
|
||||||
accelerator_options: AcceleratorOptions,
|
accelerator_options: AcceleratorOptions,
|
||||||
layout_model_config: LayoutModelConfig,
|
options: LayoutOptions,
|
||||||
):
|
):
|
||||||
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
||||||
|
|
||||||
|
self.options = options
|
||||||
|
|
||||||
device = decide_device(accelerator_options.device)
|
device = decide_device(accelerator_options.device)
|
||||||
self.layout_model_config = layout_model_config
|
layout_model_config = options.model
|
||||||
model_repo_folder = layout_model_config.model_repo_folder
|
model_repo_folder = layout_model_config.model_repo_folder
|
||||||
model_path = layout_model_config.model_path
|
model_path = layout_model_config.model_path
|
||||||
|
|
||||||
@ -182,7 +185,7 @@ class LayoutModel(BasePageModel):
|
|||||||
# Apply postprocessing
|
# Apply postprocessing
|
||||||
|
|
||||||
processed_clusters, processed_cells = LayoutPostprocessor(
|
processed_clusters, processed_cells = LayoutPostprocessor(
|
||||||
page, clusters
|
page, clusters, self.options
|
||||||
).postprocess()
|
).postprocess()
|
||||||
# Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
|
# Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
|
||||||
|
|
||||||
|
@ -13,6 +13,7 @@ from docling.datamodel.document import ConversionResult
|
|||||||
from docling.datamodel.pipeline_options_vlm_model import (
|
from docling.datamodel.pipeline_options_vlm_model import (
|
||||||
InlineVlmOptions,
|
InlineVlmOptions,
|
||||||
TransformersModelType,
|
TransformersModelType,
|
||||||
|
TransformersPromptStyle,
|
||||||
)
|
)
|
||||||
from docling.models.base_model import BasePageModel
|
from docling.models.base_model import BasePageModel
|
||||||
from docling.models.utils.hf_model_download import (
|
from docling.models.utils.hf_model_download import (
|
||||||
@ -41,6 +42,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
|
|||||||
from transformers import (
|
from transformers import (
|
||||||
AutoModel,
|
AutoModel,
|
||||||
AutoModelForCausalLM,
|
AutoModelForCausalLM,
|
||||||
|
AutoModelForImageTextToText,
|
||||||
AutoModelForVision2Seq,
|
AutoModelForVision2Seq,
|
||||||
AutoProcessor,
|
AutoProcessor,
|
||||||
BitsAndBytesConfig,
|
BitsAndBytesConfig,
|
||||||
@ -91,6 +93,11 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
|
|||||||
== TransformersModelType.AUTOMODEL_VISION2SEQ
|
== TransformersModelType.AUTOMODEL_VISION2SEQ
|
||||||
):
|
):
|
||||||
model_cls = AutoModelForVision2Seq
|
model_cls = AutoModelForVision2Seq
|
||||||
|
elif (
|
||||||
|
self.vlm_options.transformers_model_type
|
||||||
|
== TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT
|
||||||
|
):
|
||||||
|
model_cls = AutoModelForImageTextToText
|
||||||
|
|
||||||
self.processor = AutoProcessor.from_pretrained(
|
self.processor = AutoProcessor.from_pretrained(
|
||||||
artifacts_path,
|
artifacts_path,
|
||||||
@ -128,7 +135,11 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Define prompt structure
|
# Define prompt structure
|
||||||
prompt = self.formulate_prompt()
|
if callable(self.vlm_options.prompt):
|
||||||
|
user_prompt = self.vlm_options.prompt(page.parsed_page)
|
||||||
|
else:
|
||||||
|
user_prompt = self.vlm_options.prompt
|
||||||
|
prompt = self.formulate_prompt(user_prompt)
|
||||||
|
|
||||||
inputs = self.processor(
|
inputs = self.processor(
|
||||||
text=prompt, images=[hi_res_image], return_tensors="pt"
|
text=prompt, images=[hi_res_image], return_tensors="pt"
|
||||||
@ -162,10 +173,13 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
|
|||||||
|
|
||||||
yield page
|
yield page
|
||||||
|
|
||||||
def formulate_prompt(self) -> str:
|
def formulate_prompt(self, user_prompt: str) -> str:
|
||||||
"""Formulate a prompt for the VLM."""
|
"""Formulate a prompt for the VLM."""
|
||||||
|
|
||||||
if self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
|
if self.vlm_options.transformers_prompt_style == TransformersPromptStyle.RAW:
|
||||||
|
return user_prompt
|
||||||
|
|
||||||
|
elif self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
|
||||||
_log.debug("Using specialized prompt for Phi-4")
|
_log.debug("Using specialized prompt for Phi-4")
|
||||||
# more info here: https://huggingface.co/microsoft/Phi-4-multimodal-instruct#loading-the-model-locally
|
# more info here: https://huggingface.co/microsoft/Phi-4-multimodal-instruct#loading-the-model-locally
|
||||||
|
|
||||||
@ -173,25 +187,30 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
|
|||||||
assistant_prompt = "<|assistant|>"
|
assistant_prompt = "<|assistant|>"
|
||||||
prompt_suffix = "<|end|>"
|
prompt_suffix = "<|end|>"
|
||||||
|
|
||||||
prompt = f"{user_prompt}<|image_1|>{self.vlm_options.prompt}{prompt_suffix}{assistant_prompt}"
|
prompt = f"{user_prompt}<|image_1|>{user_prompt}{prompt_suffix}{assistant_prompt}"
|
||||||
_log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}")
|
_log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}")
|
||||||
|
|
||||||
return prompt
|
return prompt
|
||||||
|
|
||||||
messages = [
|
elif self.vlm_options.transformers_prompt_style == TransformersPromptStyle.CHAT:
|
||||||
{
|
messages = [
|
||||||
"role": "user",
|
{
|
||||||
"content": [
|
"role": "user",
|
||||||
{
|
"content": [
|
||||||
"type": "text",
|
{
|
||||||
"text": "This is a page from a document.",
|
"type": "text",
|
||||||
},
|
"text": "This is a page from a document.",
|
||||||
{"type": "image"},
|
},
|
||||||
{"type": "text", "text": self.vlm_options.prompt},
|
{"type": "image"},
|
||||||
],
|
{"type": "text", "text": user_prompt},
|
||||||
}
|
],
|
||||||
]
|
}
|
||||||
prompt = self.processor.apply_chat_template(
|
]
|
||||||
messages, add_generation_prompt=False
|
prompt = self.processor.apply_chat_template(
|
||||||
|
messages, add_generation_prompt=False
|
||||||
|
)
|
||||||
|
return prompt
|
||||||
|
|
||||||
|
raise RuntimeError(
|
||||||
|
f"Uknown prompt style `{self.vlm_options.transformers_prompt_style}`. Valid values are {', '.join(s.value for s in TransformersPromptStyle)}."
|
||||||
)
|
)
|
||||||
return prompt
|
|
||||||
|
@ -56,8 +56,6 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
|
|||||||
elif (artifacts_path / repo_cache_folder).exists():
|
elif (artifacts_path / repo_cache_folder).exists():
|
||||||
artifacts_path = artifacts_path / repo_cache_folder
|
artifacts_path = artifacts_path / repo_cache_folder
|
||||||
|
|
||||||
self.param_question = vlm_options.prompt
|
|
||||||
|
|
||||||
## Load the model
|
## Load the model
|
||||||
self.vlm_model, self.processor = load(artifacts_path)
|
self.vlm_model, self.processor = load(artifacts_path)
|
||||||
self.config = load_config(artifacts_path)
|
self.config = load_config(artifacts_path)
|
||||||
@ -86,8 +84,12 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
|
|||||||
if hi_res_image.mode != "RGB":
|
if hi_res_image.mode != "RGB":
|
||||||
hi_res_image = hi_res_image.convert("RGB")
|
hi_res_image = hi_res_image.convert("RGB")
|
||||||
|
|
||||||
|
if callable(self.vlm_options.prompt):
|
||||||
|
user_prompt = self.vlm_options.prompt(page.parsed_page)
|
||||||
|
else:
|
||||||
|
user_prompt = self.vlm_options.prompt
|
||||||
prompt = self.apply_chat_template(
|
prompt = self.apply_chat_template(
|
||||||
self.processor, self.config, self.param_question, num_images=1
|
self.processor, self.config, user_prompt, num_images=1
|
||||||
)
|
)
|
||||||
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
@ -81,7 +81,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
LayoutModel(
|
LayoutModel(
|
||||||
artifacts_path=artifacts_path,
|
artifacts_path=artifacts_path,
|
||||||
accelerator_options=pipeline_options.accelerator_options,
|
accelerator_options=pipeline_options.accelerator_options,
|
||||||
layout_model_config=pipeline_options.layout_model_config,
|
options=pipeline_options.layout_options,
|
||||||
),
|
),
|
||||||
# Table structure model
|
# Table structure model
|
||||||
TableStructureModel(
|
TableStructureModel(
|
||||||
@ -130,6 +130,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
if (
|
if (
|
||||||
self.pipeline_options.do_formula_enrichment
|
self.pipeline_options.do_formula_enrichment
|
||||||
or self.pipeline_options.do_code_enrichment
|
or self.pipeline_options.do_code_enrichment
|
||||||
|
or self.pipeline_options.do_picture_classification
|
||||||
or self.pipeline_options.do_picture_description
|
or self.pipeline_options.do_picture_description
|
||||||
):
|
):
|
||||||
self.keep_backend = True
|
self.keep_backend = True
|
||||||
|
@ -117,6 +117,7 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore
|
page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore
|
||||||
if page._backend is not None and page._backend.is_valid():
|
if page._backend is not None and page._backend.is_valid():
|
||||||
page.size = page._backend.get_size()
|
page.size = page._backend.get_size()
|
||||||
|
page.parsed_page = page._backend.get_segmented_page()
|
||||||
|
|
||||||
return page
|
return page
|
||||||
|
|
||||||
|
@ -9,6 +9,7 @@ from docling_core.types.doc.page import TextCell
|
|||||||
from rtree import index
|
from rtree import index
|
||||||
|
|
||||||
from docling.datamodel.base_models import BoundingBox, Cluster, Page
|
from docling.datamodel.base_models import BoundingBox, Cluster, Page
|
||||||
|
from docling.datamodel.pipeline_options import LayoutOptions
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -194,12 +195,16 @@ class LayoutPostprocessor:
|
|||||||
DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
|
DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, page: Page, clusters: List[Cluster]) -> None:
|
def __init__(
|
||||||
|
self, page: Page, clusters: List[Cluster], options: LayoutOptions
|
||||||
|
) -> None:
|
||||||
"""Initialize processor with page and clusters."""
|
"""Initialize processor with page and clusters."""
|
||||||
|
|
||||||
self.cells = page.cells
|
self.cells = page.cells
|
||||||
self.page = page
|
self.page = page
|
||||||
self.page_size = page.size
|
self.page_size = page.size
|
||||||
self.all_clusters = clusters
|
self.all_clusters = clusters
|
||||||
|
self.options = options
|
||||||
self.regular_clusters = [
|
self.regular_clusters = [
|
||||||
c for c in clusters if c.label not in self.SPECIAL_TYPES
|
c for c in clusters if c.label not in self.SPECIAL_TYPES
|
||||||
]
|
]
|
||||||
@ -267,7 +272,7 @@ class LayoutPostprocessor:
|
|||||||
|
|
||||||
# Handle orphaned cells
|
# Handle orphaned cells
|
||||||
unassigned = self._find_unassigned_cells(clusters)
|
unassigned = self._find_unassigned_cells(clusters)
|
||||||
if unassigned:
|
if unassigned and self.options.create_orphan_clusters:
|
||||||
next_id = max((c.id for c in self.all_clusters), default=0) + 1
|
next_id = max((c.id for c in self.all_clusters), default=0) + 1
|
||||||
orphan_clusters = []
|
orphan_clusters = []
|
||||||
for i, cell in enumerate(unassigned):
|
for i, cell in enumerate(unassigned):
|
||||||
|
39
docs/examples/compare_vlm_models.py
vendored
39
docs/examples/compare_vlm_models.py
vendored
@ -14,11 +14,18 @@ from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
|
|||||||
from tabulate import tabulate
|
from tabulate import tabulate
|
||||||
|
|
||||||
from docling.datamodel import vlm_model_specs
|
from docling.datamodel import vlm_model_specs
|
||||||
|
from docling.datamodel.accelerator_options import AcceleratorDevice
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
VlmPipelineOptions,
|
VlmPipelineOptions,
|
||||||
)
|
)
|
||||||
from docling.datamodel.pipeline_options_vlm_model import InferenceFramework
|
from docling.datamodel.pipeline_options_vlm_model import (
|
||||||
|
InferenceFramework,
|
||||||
|
InlineVlmOptions,
|
||||||
|
ResponseFormat,
|
||||||
|
TransformersModelType,
|
||||||
|
TransformersPromptStyle,
|
||||||
|
)
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
from docling.pipeline.vlm_pipeline import VlmPipeline
|
from docling.pipeline.vlm_pipeline import VlmPipeline
|
||||||
|
|
||||||
@ -101,6 +108,33 @@ if __name__ == "__main__":
|
|||||||
out_path = Path("scratch")
|
out_path = Path("scratch")
|
||||||
out_path.mkdir(parents=True, exist_ok=True)
|
out_path.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
## Definition of more inline models
|
||||||
|
llava_qwen = InlineVlmOptions(
|
||||||
|
repo_id="llava-hf/llava-interleave-qwen-0.5b-hf",
|
||||||
|
# prompt="Read text in the image.",
|
||||||
|
prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
|
||||||
|
# prompt="Parse the reading order of this document.",
|
||||||
|
response_format=ResponseFormat.MARKDOWN,
|
||||||
|
inference_framework=InferenceFramework.TRANSFORMERS,
|
||||||
|
transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
|
||||||
|
supported_devices=[AcceleratorDevice.CUDA, AcceleratorDevice.CPU],
|
||||||
|
scale=2.0,
|
||||||
|
temperature=0.0,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Note that this is not the expected way of using the Dolphin model, but it shows the usage of a raw prompt.
|
||||||
|
dolphin_oneshot = InlineVlmOptions(
|
||||||
|
repo_id="ByteDance/Dolphin",
|
||||||
|
prompt="<s>Read text in the image. <Answer/>",
|
||||||
|
response_format=ResponseFormat.MARKDOWN,
|
||||||
|
inference_framework=InferenceFramework.TRANSFORMERS,
|
||||||
|
transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
|
||||||
|
transformers_prompt_style=TransformersPromptStyle.RAW,
|
||||||
|
supported_devices=[AcceleratorDevice.CUDA, AcceleratorDevice.CPU],
|
||||||
|
scale=2.0,
|
||||||
|
temperature=0.0,
|
||||||
|
)
|
||||||
|
|
||||||
## Use VlmPipeline
|
## Use VlmPipeline
|
||||||
pipeline_options = VlmPipelineOptions()
|
pipeline_options = VlmPipelineOptions()
|
||||||
pipeline_options.generate_page_images = True
|
pipeline_options.generate_page_images = True
|
||||||
@ -121,6 +155,9 @@ if __name__ == "__main__":
|
|||||||
vlm_model_specs.GRANITE_VISION_TRANSFORMERS,
|
vlm_model_specs.GRANITE_VISION_TRANSFORMERS,
|
||||||
vlm_model_specs.PHI4_TRANSFORMERS,
|
vlm_model_specs.PHI4_TRANSFORMERS,
|
||||||
vlm_model_specs.PIXTRAL_12B_TRANSFORMERS,
|
vlm_model_specs.PIXTRAL_12B_TRANSFORMERS,
|
||||||
|
## More inline models
|
||||||
|
dolphin_oneshot,
|
||||||
|
llava_qwen,
|
||||||
]
|
]
|
||||||
|
|
||||||
# Remove MLX models if not on Mac
|
# Remove MLX models if not on Mac
|
||||||
|
71
docs/examples/vlm_pipeline_api_model.py
vendored
71
docs/examples/vlm_pipeline_api_model.py
vendored
@ -1,8 +1,10 @@
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
from docling_core.types.doc.page import SegmentedPage
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
@ -32,6 +34,69 @@ def lms_vlm_options(model: str, prompt: str, format: ResponseFormat):
|
|||||||
return options
|
return options
|
||||||
|
|
||||||
|
|
||||||
|
#### Using LM Studio with OlmOcr model
|
||||||
|
|
||||||
|
|
||||||
|
def lms_olmocr_vlm_options(model: str):
|
||||||
|
def _dynamic_olmocr_prompt(page: Optional[SegmentedPage]):
|
||||||
|
if page is None:
|
||||||
|
return (
|
||||||
|
"Below is the image of one page of a document. Just return the plain text"
|
||||||
|
" representation of this document as if you were reading it naturally.\n"
|
||||||
|
"Do not hallucinate.\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
anchor = [
|
||||||
|
f"Page dimensions: {int(page.dimension.width)}x{int(page.dimension.height)}"
|
||||||
|
]
|
||||||
|
|
||||||
|
for text_cell in page.textline_cells:
|
||||||
|
if not text_cell.text.strip():
|
||||||
|
continue
|
||||||
|
bbox = text_cell.rect.to_bounding_box().to_bottom_left_origin(
|
||||||
|
page.dimension.height
|
||||||
|
)
|
||||||
|
anchor.append(f"[{int(bbox.l)}x{int(bbox.b)}] {text_cell.text}")
|
||||||
|
|
||||||
|
for image_cell in page.bitmap_resources:
|
||||||
|
bbox = image_cell.rect.to_bounding_box().to_bottom_left_origin(
|
||||||
|
page.dimension.height
|
||||||
|
)
|
||||||
|
anchor.append(
|
||||||
|
f"[Image {int(bbox.l)}x{int(bbox.b)} to {int(bbox.r)}x{int(bbox.t)}]"
|
||||||
|
)
|
||||||
|
|
||||||
|
if len(anchor) == 1:
|
||||||
|
anchor.append(
|
||||||
|
f"[Image 0x0 to {int(page.dimension.width)}x{int(page.dimension.height)}]"
|
||||||
|
)
|
||||||
|
|
||||||
|
# The original prompt sorts the cells; we skip that step in this demo.
|
||||||
|
|
||||||
|
base_text = "\n".join(anchor)
|
||||||
|
|
||||||
|
return (
|
||||||
|
f"Below is the image of one page of a document, as well as some raw textual"
|
||||||
|
f" content that was previously extracted for it. Just return the plain text"
|
||||||
|
f" representation of this document as if you were reading it naturally.\n"
|
||||||
|
f"Do not hallucinate.\n"
|
||||||
|
f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
|
||||||
|
)
|
||||||
|
|
||||||
|
options = ApiVlmOptions(
|
||||||
|
url="http://localhost:1234/v1/chat/completions",
|
||||||
|
params=dict(
|
||||||
|
model=model,
|
||||||
|
),
|
||||||
|
prompt=_dynamic_olmocr_prompt,
|
||||||
|
timeout=90,
|
||||||
|
scale=1.0,
|
||||||
|
max_size=1024, # from OlmOcr pipeline
|
||||||
|
response_format=ResponseFormat.MARKDOWN,
|
||||||
|
)
|
||||||
|
return options
|
||||||
|
|
||||||
|
|
||||||
#### Using Ollama
|
#### Using Ollama
|
||||||
|
|
||||||
|
|
||||||
@ -123,6 +188,12 @@ def main():
|
|||||||
# format=ResponseFormat.MARKDOWN,
|
# format=ResponseFormat.MARKDOWN,
|
||||||
# )
|
# )
|
||||||
|
|
||||||
|
# Example using the OlmOcr (dynamic prompt) model with LM Studio:
|
||||||
|
# (uncomment the following lines)
|
||||||
|
# pipeline_options.vlm_options = lms_olmocr_vlm_options(
|
||||||
|
# model="hf.co/lmstudio-community/olmOCR-7B-0225-preview-GGUF",
|
||||||
|
# )
|
||||||
|
|
||||||
# Example using the Granite Vision model with Ollama:
|
# Example using the Granite Vision model with Ollama:
|
||||||
# (uncomment the following lines)
|
# (uncomment the following lines)
|
||||||
# pipeline_options.vlm_options = ollama_vlm_options(
|
# pipeline_options.vlm_options = ollama_vlm_options(
|
||||||
|
2
docs/installation/index.md
vendored
2
docs/installation/index.md
vendored
@ -77,7 +77,7 @@ Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 archi
|
|||||||
=== "RHEL"
|
=== "RHEL"
|
||||||
|
|
||||||
```console
|
```console
|
||||||
dnf install tesseract tesseract-devel tesseract-langpack-eng leptonica-devel
|
dnf install tesseract tesseract-devel tesseract-langpack-eng tesseract-osd leptonica-devel
|
||||||
TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
|
TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
|
||||||
echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
|
echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
|
||||||
```
|
```
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
[project]
|
[project]
|
||||||
name = "docling"
|
name = "docling"
|
||||||
version = "2.39.0" # DO NOT EDIT, updated automatically
|
version = "2.40.0" # DO NOT EDIT, updated automatically
|
||||||
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
||||||
license = "MIT"
|
license = "MIT"
|
||||||
keywords = [
|
keywords = [
|
||||||
|
@ -17,8 +17,9 @@ def get_converter():
|
|||||||
pipeline_options.do_table_structure = False
|
pipeline_options.do_table_structure = False
|
||||||
pipeline_options.do_code_enrichment = False
|
pipeline_options.do_code_enrichment = False
|
||||||
pipeline_options.do_formula_enrichment = False
|
pipeline_options.do_formula_enrichment = False
|
||||||
|
pipeline_options.generate_picture_images = False
|
||||||
|
pipeline_options.generate_page_images = False
|
||||||
pipeline_options.do_picture_classification = True
|
pipeline_options.do_picture_classification = True
|
||||||
pipeline_options.generate_picture_images = True
|
|
||||||
pipeline_options.images_scale = 2
|
pipeline_options.images_scale = 2
|
||||||
|
|
||||||
converter = DocumentConverter(
|
converter = DocumentConverter(
|
||||||
|
2
uv.lock
generated
2
uv.lock
generated
@ -805,7 +805,7 @@ wheels = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "docling"
|
name = "docling"
|
||||||
version = "2.39.0"
|
version = "2.40.0"
|
||||||
source = { editable = "." }
|
source = { editable = "." }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "beautifulsoup4" },
|
{ name = "beautifulsoup4" },
|
||||||
|
Loading…
Reference in New Issue
Block a user