Actor: Resolving conflicts with main (pass 2)

Signed-off-by: Václav Vančura <commit@vancura.dev>
This commit is contained in:
Václav Vančura 2025-03-13 11:02:08 +01:00
parent d7b306231e
commit ebd323a5e8
No known key found for this signature in database
GPG Key ID: 33AF230AE454F1FF
4 changed files with 196 additions and 47 deletions

View File

@ -23,23 +23,25 @@
[![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling) [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
[![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling) [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling)
Docling parses documents and exports them to the desired format with ease and speed. Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.
## Features ## Features
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images) * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, XLSX, HTML, images, and more
* 📑 Advanced PDF document understanding including page layout, reading order & table structures * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
* 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
* 🤖 Plug-and-play [integrations](https://ds4sd.github.io/docling/integrations/) incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, and lossless JSON
* 🔍 OCR support for scanned PDFs * 🔒 Local execution capabilities for sensitive data and air-gapped environments
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
* 🔍 Extensive OCR support for scanned PDFs and images
* 💻 Simple and convenient CLI * 💻 Simple and convenient CLI
Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty of examples and unlock the full power of Docling!
### Coming soon ### Coming soon
* ♾️ Equation & code extraction
* 📝 Metadata extraction, including title, authors, references & language * 📝 Metadata extraction, including title, authors, references & language
* 📝 Inclusion of Visual Language Models ([SmolDocling](https://huggingface.co/blog/smolervlm#smoldocling))
* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
* 📝 Complex chemistry understanding (Molecular structures)
## Installation ## Installation
@ -143,3 +145,7 @@ For individual model usage, please refer to the model licenses found in the orig
## IBM ❤️ Open Source AI ## IBM ❤️ Open Source AI
Docling has been brought to you by IBM. Docling has been brought to you by IBM.
[supported_formats]: https://ds4sd.github.io/docling/usage/supported_formats/
[docling_document]: https://ds4sd.github.io/docling/concepts/docling_document/
[integrations]: https://ds4sd.github.io/docling/integrations/

View File

@ -163,7 +163,7 @@ class DoclingParsePageBackend(PdfPageBackend):
l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
) )
else: else:
padbox = cropbox.to_bottom_left_origin(page_size.height) padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
padbox.r = page_size.width - padbox.r padbox.r = page_size.width - padbox.r
padbox.t = page_size.height - padbox.t padbox.t = page_size.height - padbox.t

View File

@ -12,6 +12,7 @@ from pypdfium2 import PdfPage
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Cell, Size from docling.datamodel.base_models import Cell, Size
from docling.utils.locks import pypdfium2_lock
if TYPE_CHECKING: if TYPE_CHECKING:
from docling.datamodel.document import InputDocument from docling.datamodel.document import InputDocument
@ -178,24 +179,28 @@ class DoclingParseV2PageBackend(PdfPageBackend):
l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
) )
else: else:
padbox = cropbox.to_bottom_left_origin(page_size.height) padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
padbox.r = page_size.width - padbox.r padbox.r = page_size.width - padbox.r
padbox.t = page_size.height - padbox.t padbox.t = page_size.height - padbox.t
image = ( with pypdfium2_lock:
self._ppage.render( image = (
scale=scale * 1.5, self._ppage.render(
rotation=0, # no additional rotation scale=scale * 1.5,
crop=padbox.as_tuple(), rotation=0, # no additional rotation
) crop=padbox.as_tuple(),
.to_pil() )
.resize(size=(round(cropbox.width * scale), round(cropbox.height * scale))) .to_pil()
) # We resize the image from 1.5x the given scale to make it sharper. .resize(
size=(round(cropbox.width * scale), round(cropbox.height * scale))
)
) # We resize the image from 1.5x the given scale to make it sharper.
return image return image
def get_size(self) -> Size: def get_size(self) -> Size:
return Size(width=self._ppage.get_width(), height=self._ppage.get_height()) with pypdfium2_lock:
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
def unload(self): def unload(self):
self._ppage = None self._ppage = None
@ -206,23 +211,24 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream) super().__init__(in_doc, path_or_stream)
self._pdoc = pdfium.PdfDocument(self.path_or_stream) with pypdfium2_lock:
self.parser = pdf_parser_v2("fatal") self._pdoc = pdfium.PdfDocument(self.path_or_stream)
self.parser = pdf_parser_v2("fatal")
success = False success = False
if isinstance(self.path_or_stream, BytesIO): if isinstance(self.path_or_stream, BytesIO):
success = self.parser.load_document_from_bytesio( success = self.parser.load_document_from_bytesio(
self.document_hash, self.path_or_stream self.document_hash, self.path_or_stream
) )
elif isinstance(self.path_or_stream, Path): elif isinstance(self.path_or_stream, Path):
success = self.parser.load_document( success = self.parser.load_document(
self.document_hash, str(self.path_or_stream) self.document_hash, str(self.path_or_stream)
) )
if not success: if not success:
raise RuntimeError( raise RuntimeError(
f"docling-parse v2 could not load document {self.document_hash}." f"docling-parse v2 could not load document {self.document_hash}."
) )
def page_count(self) -> int: def page_count(self) -> int:
# return len(self._pdoc) # To be replaced with docling-parse API # return len(self._pdoc) # To be replaced with docling-parse API
@ -236,9 +242,10 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
return len_2 return len_2
def load_page(self, page_no: int) -> DoclingParseV2PageBackend: def load_page(self, page_no: int) -> DoclingParseV2PageBackend:
return DoclingParseV2PageBackend( with pypdfium2_lock:
self.parser, self.document_hash, page_no, self._pdoc[page_no] return DoclingParseV2PageBackend(
) self.parser, self.document_hash, page_no, self._pdoc[page_no]
)
def is_valid(self) -> bool: def is_valid(self) -> bool:
return self.page_count() > 0 return self.page_count() > 0
@ -246,5 +253,6 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
def unload(self): def unload(self):
super().unload() super().unload()
self.parser.unload_document(self.document_hash) self.parser.unload_document(self.document_hash)
self._pdoc.close() with pypdfium2_lock:
self._pdoc = None self._pdoc.close()
self._pdoc = None

View File

@ -1,11 +1,20 @@
import logging import logging
import os import os
import re
import warnings import warnings
from enum import Enum from enum import Enum
from pathlib import Path from pathlib import Path
from typing import Annotated, Any, Dict, List, Literal, Optional, Tuple, Type, Union from typing import Annotated, Any, Dict, List, Literal, Optional, Union
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator from pydantic import (
AnyUrl,
BaseModel,
ConfigDict,
Field,
field_validator,
model_validator,
validator,
)
from pydantic_settings import ( from pydantic_settings import (
BaseSettings, BaseSettings,
PydanticBaseSettingsSource, PydanticBaseSettingsSource,
@ -31,7 +40,19 @@ class AcceleratorOptions(BaseSettings):
) )
num_threads: int = 4 num_threads: int = 4
device: AcceleratorDevice = AcceleratorDevice.AUTO device: Union[str, AcceleratorDevice] = "auto"
cuda_use_flash_attention2: bool = False
@field_validator("device")
def validate_device(cls, value):
# "auto", "cpu", "cuda", "mps", or "cuda:N"
if value in {d.value for d in AcceleratorDevice} or re.match(
r"^cuda(:\d+)?$", value
):
return value
raise ValueError(
"Invalid device option. Use 'auto', 'cpu', 'mps', 'cuda', or 'cuda:N'."
)
@model_validator(mode="before") @model_validator(mode="before")
@classmethod @classmethod
@ -47,7 +68,6 @@ class AcceleratorOptions(BaseSettings):
""" """
if isinstance(data, dict): if isinstance(data, dict):
input_num_threads = data.get("num_threads") input_num_threads = data.get("num_threads")
# Check if to set the num_threads from the alternative envvar # Check if to set the num_threads from the alternative envvar
if input_num_threads is None: if input_num_threads is None:
docling_num_threads = os.getenv("DOCLING_NUM_THREADS") docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
@ -79,7 +99,7 @@ class TableStructureOptions(BaseModel):
# are merged across table columns. # are merged across table columns.
# False: Let table structure model define the text cells, ignore PDF cells. # False: Let table structure model define the text cells, ignore PDF cells.
) )
mode: TableFormerMode = TableFormerMode.FAST mode: TableFormerMode = TableFormerMode.ACCURATE
class OcrOptions(BaseModel): class OcrOptions(BaseModel):
@ -125,6 +145,7 @@ class RapidOcrOptions(OcrOptions):
det_model_path: Optional[str] = None # same default as rapidocr det_model_path: Optional[str] = None # same default as rapidocr
cls_model_path: Optional[str] = None # same default as rapidocr cls_model_path: Optional[str] = None # same default as rapidocr
rec_model_path: Optional[str] = None # same default as rapidocr rec_model_path: Optional[str] = None # same default as rapidocr
rec_keys_path: Optional[str] = None # same default as rapidocr
model_config = ConfigDict( model_config = ConfigDict(
extra="forbid", extra="forbid",
@ -189,6 +210,90 @@ class OcrMacOptions(OcrOptions):
) )
class PictureDescriptionBaseOptions(BaseModel):
kind: str
batch_size: int = 8
scale: float = 2
bitmap_area_threshold: float = (
0.2 # percentage of the area for a bitmap to processed with the models
)
class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
kind: Literal["api"] = "api"
url: AnyUrl = AnyUrl("http://localhost:8000/v1/chat/completions")
headers: Dict[str, str] = {}
params: Dict[str, Any] = {}
timeout: float = 20
prompt: str = "Describe this image in a few sentences."
provenance: str = ""
class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
kind: Literal["vlm"] = "vlm"
repo_id: str
prompt: str = "Describe this image in a few sentences."
# Config from here https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationConfig
generation_config: Dict[str, Any] = dict(max_new_tokens=200, do_sample=False)
@property
def repo_cache_folder(self) -> str:
return self.repo_id.replace("/", "--")
smolvlm_picture_description = PictureDescriptionVlmOptions(
repo_id="HuggingFaceTB/SmolVLM-256M-Instruct"
)
# phi_picture_description = PictureDescriptionVlmOptions(repo_id="microsoft/Phi-3-vision-128k-instruct")
granite_picture_description = PictureDescriptionVlmOptions(
repo_id="ibm-granite/granite-vision-3.1-2b-preview",
prompt="What is shown in this image?",
)
class BaseVlmOptions(BaseModel):
kind: str
prompt: str
class ResponseFormat(str, Enum):
DOCTAGS = "doctags"
MARKDOWN = "markdown"
class HuggingFaceVlmOptions(BaseVlmOptions):
kind: Literal["hf_model_options"] = "hf_model_options"
repo_id: str
load_in_8bit: bool = True
llm_int8_threshold: float = 6.0
quantized: bool = False
response_format: ResponseFormat
@property
def repo_cache_folder(self) -> str:
return self.repo_id.replace("/", "--")
smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
repo_id="ds4sd/SmolDocling-256M-preview",
prompt="Convert this page to docling.",
response_format=ResponseFormat.DOCTAGS,
)
granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
repo_id="ibm-granite/granite-vision-3.1-2b-preview",
# prompt="OCR the full page to markdown.",
prompt="OCR this image.",
response_format=ResponseFormat.MARKDOWN,
)
# Define an enum for the backend options # Define an enum for the backend options
class PdfBackend(str, Enum): class PdfBackend(str, Enum):
"""Enum of valid PDF backends.""" """Enum of valid PDF backends."""
@ -217,14 +322,40 @@ class PipelineOptions(BaseModel):
) )
document_timeout: Optional[float] = None document_timeout: Optional[float] = None
accelerator_options: AcceleratorOptions = AcceleratorOptions() accelerator_options: AcceleratorOptions = AcceleratorOptions()
enable_remote_services: bool = False
class PdfPipelineOptions(PipelineOptions): class PaginatedPipelineOptions(PipelineOptions):
images_scale: float = 1.0
generate_page_images: bool = False
generate_picture_images: bool = False
class VlmPipelineOptions(PaginatedPipelineOptions):
artifacts_path: Optional[Union[Path, str]] = None
generate_page_images: bool = True
force_backend_text: bool = (
False # (To be used with vlms, or other generative models)
)
# If True, text from backend will be used instead of generated text
vlm_options: Union[HuggingFaceVlmOptions] = smoldocling_vlm_conversion_options
class PdfPipelineOptions(PaginatedPipelineOptions):
"""Options for the PDF pipeline.""" """Options for the PDF pipeline."""
artifacts_path: Optional[Union[Path, str]] = None artifacts_path: Optional[Union[Path, str]] = None
do_table_structure: bool = True # True: perform table structure extraction do_table_structure: bool = True # True: perform table structure extraction
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
do_code_enrichment: bool = False # True: perform code OCR
do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code
do_picture_classification: bool = False # True: classify pictures in documents
do_picture_description: bool = False # True: run describe pictures in documents
force_backend_text: bool = (
False # (To be used with vlms, or other generative models)
)
# If True, text from backend will be used instead of generated text
table_structure_options: TableStructureOptions = TableStructureOptions() table_structure_options: TableStructureOptions = TableStructureOptions()
ocr_options: Union[ ocr_options: Union[
@ -234,6 +365,10 @@ class PdfPipelineOptions(PipelineOptions):
OcrMacOptions, OcrMacOptions,
RapidOcrOptions, RapidOcrOptions,
] = Field(EasyOcrOptions(), discriminator="kind") ] = Field(EasyOcrOptions(), discriminator="kind")
picture_description_options: Annotated[
Union[PictureDescriptionApiOptions, PictureDescriptionVlmOptions],
Field(discriminator="kind"),
] = smolvlm_picture_description
images_scale: float = 1.0 images_scale: float = 1.0
generate_page_images: bool = False generate_page_images: bool = False