diff --git a/README.md b/README.md index 7a6653ef..be37a642 100644 --- a/README.md +++ b/README.md @@ -23,23 +23,25 @@ [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling) [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling) -Docling parses documents and exports them to the desired format with ease and speed. +Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem. ## Features -* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images) -* 📑 Advanced PDF document understanding including page layout, reading order & table structures -* 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format -* 🤖 Plug-and-play [integrations](https://ds4sd.github.io/docling/integrations/) incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI -* 🔍 OCR support for scanned PDFs +* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, XLSX, HTML, images, and more +* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more +* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format +* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, and lossless JSON +* 🔒 Local execution capabilities for sensitive data and air-gapped environments +* 🤖 Plug-and-play [integrations][integrations] incl. 
LangChain, LlamaIndex, Crew AI & Haystack for agentic AI +* 🔍 Extensive OCR support for scanned PDFs and images * 💻 Simple and convenient CLI -Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty examples and unlock the full power of Docling! - ### Coming soon -* ♾️ Equation & code extraction * 📝 Metadata extraction, including title, authors, references & language +* 📝 Inclusion of Visual Language Models ([SmolDocling](https://huggingface.co/blog/smolervlm#smoldocling)) +* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc) +* 📝 Complex chemistry understanding (Molecular structures) ## Installation @@ -143,3 +145,7 @@ For individual model usage, please refer to the model licenses found in the orig ## IBM ❤️ Open Source AI Docling has been brought to you by IBM. + +[supported_formats]: https://ds4sd.github.io/docling/usage/supported_formats/ +[docling_document]: https://ds4sd.github.io/docling/concepts/docling_document/ +[integrations]: https://ds4sd.github.io/docling/integrations/ diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py index 89b25ee1..6d22127b 100644 --- a/docling/backend/docling_parse_backend.py +++ b/docling/backend/docling_parse_backend.py @@ -163,7 +163,7 @@ class DoclingParsePageBackend(PdfPageBackend): l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT ) else: - padbox = cropbox.to_bottom_left_origin(page_size.height) + padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy() padbox.r = page_size.width - padbox.r padbox.t = page_size.height - padbox.t diff --git a/docling/backend/docling_parse_v2_backend.py b/docling/backend/docling_parse_v2_backend.py index 366fa6ac..9178883f 100644 --- a/docling/backend/docling_parse_v2_backend.py +++ b/docling/backend/docling_parse_v2_backend.py @@ -12,6 +12,7 @@ from pypdfium2 import PdfPage from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend from 
docling.datamodel.base_models import Cell, Size +from docling.utils.locks import pypdfium2_lock if TYPE_CHECKING: from docling.datamodel.document import InputDocument @@ -178,24 +179,28 @@ class DoclingParseV2PageBackend(PdfPageBackend): l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT ) else: - padbox = cropbox.to_bottom_left_origin(page_size.height) + padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy() padbox.r = page_size.width - padbox.r padbox.t = page_size.height - padbox.t - image = ( - self._ppage.render( - scale=scale * 1.5, - rotation=0, # no additional rotation - crop=padbox.as_tuple(), - ) - .to_pil() - .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale))) - ) # We resize the image from 1.5x the given scale to make it sharper. + with pypdfium2_lock: + image = ( + self._ppage.render( + scale=scale * 1.5, + rotation=0, # no additional rotation + crop=padbox.as_tuple(), + ) + .to_pil() + .resize( + size=(round(cropbox.width * scale), round(cropbox.height * scale)) + ) + ) # We resize the image from 1.5x the given scale to make it sharper. 
return image def get_size(self) -> Size: - return Size(width=self._ppage.get_width(), height=self._ppage.get_height()) + with pypdfium2_lock: + return Size(width=self._ppage.get_width(), height=self._ppage.get_height()) def unload(self): self._ppage = None @@ -206,23 +211,24 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend): def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): super().__init__(in_doc, path_or_stream) - self._pdoc = pdfium.PdfDocument(self.path_or_stream) - self.parser = pdf_parser_v2("fatal") + with pypdfium2_lock: + self._pdoc = pdfium.PdfDocument(self.path_or_stream) + self.parser = pdf_parser_v2("fatal") - success = False - if isinstance(self.path_or_stream, BytesIO): - success = self.parser.load_document_from_bytesio( - self.document_hash, self.path_or_stream - ) - elif isinstance(self.path_or_stream, Path): - success = self.parser.load_document( - self.document_hash, str(self.path_or_stream) - ) + success = False + if isinstance(self.path_or_stream, BytesIO): + success = self.parser.load_document_from_bytesio( + self.document_hash, self.path_or_stream + ) + elif isinstance(self.path_or_stream, Path): + success = self.parser.load_document( + self.document_hash, str(self.path_or_stream) + ) - if not success: - raise RuntimeError( - f"docling-parse v2 could not load document {self.document_hash}." - ) + if not success: + raise RuntimeError( + f"docling-parse v2 could not load document {self.document_hash}." 
+ ) def page_count(self) -> int: # return len(self._pdoc) # To be replaced with docling-parse API @@ -236,9 +242,10 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend): return len_2 def load_page(self, page_no: int) -> DoclingParseV2PageBackend: - return DoclingParseV2PageBackend( - self.parser, self.document_hash, page_no, self._pdoc[page_no] - ) + with pypdfium2_lock: + return DoclingParseV2PageBackend( + self.parser, self.document_hash, page_no, self._pdoc[page_no] + ) def is_valid(self) -> bool: return self.page_count() > 0 @@ -246,5 +253,6 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend): def unload(self): super().unload() self.parser.unload_document(self.document_hash) - self._pdoc.close() - self._pdoc = None + with pypdfium2_lock: + self._pdoc.close() + self._pdoc = None diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index eeec6bab..c90333df 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -1,11 +1,20 @@ import logging import os +import re import warnings from enum import Enum from pathlib import Path -from typing import Annotated, Any, Dict, List, Literal, Optional, Tuple, Type, Union +from typing import Annotated, Any, Dict, List, Literal, Optional, Union -from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator +from pydantic import ( + AnyUrl, + BaseModel, + ConfigDict, + Field, + field_validator, + model_validator, + validator, +) from pydantic_settings import ( BaseSettings, PydanticBaseSettingsSource, @@ -31,7 +40,19 @@ class AcceleratorOptions(BaseSettings): ) num_threads: int = 4 - device: AcceleratorDevice = AcceleratorDevice.AUTO + device: Union[str, AcceleratorDevice] = "auto" + cuda_use_flash_attention2: bool = False + + @field_validator("device") + def validate_device(cls, value): + # "auto", "cpu", "cuda", "mps", or "cuda:N" + if value in {d.value for d in AcceleratorDevice} or re.match( + r"^cuda(:\d+)?$", 
value + ): + return value + raise ValueError( + "Invalid device option. Use 'auto', 'cpu', 'mps', 'cuda', or 'cuda:N'." + ) @model_validator(mode="before") @classmethod @@ -47,7 +68,6 @@ class AcceleratorOptions(BaseSettings): """ if isinstance(data, dict): input_num_threads = data.get("num_threads") - # Check if to set the num_threads from the alternative envvar if input_num_threads is None: docling_num_threads = os.getenv("DOCLING_NUM_THREADS") @@ -79,7 +99,7 @@ class TableStructureOptions(BaseModel): # are merged across table columns. # False: Let table structure model define the text cells, ignore PDF cells. ) - mode: TableFormerMode = TableFormerMode.FAST + mode: TableFormerMode = TableFormerMode.ACCURATE class OcrOptions(BaseModel): @@ -125,6 +145,7 @@ class RapidOcrOptions(OcrOptions): det_model_path: Optional[str] = None # same default as rapidocr cls_model_path: Optional[str] = None # same default as rapidocr rec_model_path: Optional[str] = None # same default as rapidocr + rec_keys_path: Optional[str] = None # same default as rapidocr model_config = ConfigDict( extra="forbid", @@ -189,6 +210,90 @@ class OcrMacOptions(OcrOptions): ) +class PictureDescriptionBaseOptions(BaseModel): + kind: str + batch_size: int = 8 + scale: float = 2 + + bitmap_area_threshold: float = ( + 0.2 # percentage of the area for a bitmap to processed with the models + ) + + +class PictureDescriptionApiOptions(PictureDescriptionBaseOptions): + kind: Literal["api"] = "api" + + url: AnyUrl = AnyUrl("http://localhost:8000/v1/chat/completions") + headers: Dict[str, str] = {} + params: Dict[str, Any] = {} + timeout: float = 20 + + prompt: str = "Describe this image in a few sentences." + provenance: str = "" + + +class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions): + kind: Literal["vlm"] = "vlm" + + repo_id: str + prompt: str = "Describe this image in a few sentences." 
+ # Config from here https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationConfig + generation_config: Dict[str, Any] = dict(max_new_tokens=200, do_sample=False) + + @property + def repo_cache_folder(self) -> str: + return self.repo_id.replace("/", "--") + + +smolvlm_picture_description = PictureDescriptionVlmOptions( + repo_id="HuggingFaceTB/SmolVLM-256M-Instruct" +) +# phi_picture_description = PictureDescriptionVlmOptions(repo_id="microsoft/Phi-3-vision-128k-instruct") +granite_picture_description = PictureDescriptionVlmOptions( + repo_id="ibm-granite/granite-vision-3.1-2b-preview", + prompt="What is shown in this image?", +) + + +class BaseVlmOptions(BaseModel): + kind: str + prompt: str + + +class ResponseFormat(str, Enum): + DOCTAGS = "doctags" + MARKDOWN = "markdown" + + +class HuggingFaceVlmOptions(BaseVlmOptions): + kind: Literal["hf_model_options"] = "hf_model_options" + + repo_id: str + load_in_8bit: bool = True + llm_int8_threshold: float = 6.0 + quantized: bool = False + + response_format: ResponseFormat + + @property + def repo_cache_folder(self) -> str: + return self.repo_id.replace("/", "--") + + +smoldocling_vlm_conversion_options = HuggingFaceVlmOptions( + repo_id="ds4sd/SmolDocling-256M-preview", + prompt="Convert this page to docling.", + response_format=ResponseFormat.DOCTAGS, +) + +granite_vision_vlm_conversion_options = HuggingFaceVlmOptions( + repo_id="ibm-granite/granite-vision-3.1-2b-preview", + # prompt="OCR the full page to markdown.", + prompt="OCR this image.", + response_format=ResponseFormat.MARKDOWN, +) + + # Define an enum for the backend options class PdfBackend(str, Enum): """Enum of valid PDF backends.""" @@ -217,14 +322,40 @@ class PipelineOptions(BaseModel): ) document_timeout: Optional[float] = None accelerator_options: AcceleratorOptions = AcceleratorOptions() + enable_remote_services: bool = False -class PdfPipelineOptions(PipelineOptions): +class 
PaginatedPipelineOptions(PipelineOptions): + images_scale: float = 1.0 + generate_page_images: bool = False + generate_picture_images: bool = False + + +class VlmPipelineOptions(PaginatedPipelineOptions): + artifacts_path: Optional[Union[Path, str]] = None + + generate_page_images: bool = True + force_backend_text: bool = ( + False # (To be used with vlms, or other generative models) + ) + # If True, text from backend will be used instead of generated text + vlm_options: Union[HuggingFaceVlmOptions] = smoldocling_vlm_conversion_options + + +class PdfPipelineOptions(PaginatedPipelineOptions): """Options for the PDF pipeline.""" artifacts_path: Optional[Union[Path, str]] = None do_table_structure: bool = True # True: perform table structure extraction do_ocr: bool = True # True: perform OCR, replace programmatic PDF text + do_code_enrichment: bool = False # True: perform code OCR + do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code + do_picture_classification: bool = False # True: classify pictures in documents + do_picture_description: bool = False # True: run describe pictures in documents + force_backend_text: bool = ( + False # (To be used with vlms, or other generative models) + ) + # If True, text from backend will be used instead of generated text table_structure_options: TableStructureOptions = TableStructureOptions() ocr_options: Union[ @@ -234,6 +365,10 @@ class PdfPipelineOptions(PipelineOptions): OcrMacOptions, RapidOcrOptions, ] = Field(EasyOcrOptions(), discriminator="kind") + picture_description_options: Annotated[ + Union[PictureDescriptionApiOptions, PictureDescriptionVlmOptions], + Field(discriminator="kind"), + ] = smolvlm_picture_description images_scale: float = 1.0 generate_page_images: bool = False