diff --git a/CHANGELOG.md b/CHANGELOG.md
index ffd1bc13..acddae23 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,21 @@
+## [v2.28.0](https://github.com/docling-project/docling/releases/tag/v2.28.0) - 2025-03-19
+
+### Feature
+
+* **SmolDocling:** Support MLX acceleration in VLM pipeline ([#1199](https://github.com/docling-project/docling/issues/1199)) ([`1c26769`](https://github.com/docling-project/docling/commit/1c26769785bcd17c0b8b621c5182ad81134d3915))
+* Add PPTX notes slides ([#474](https://github.com/docling-project/docling/issues/474)) ([`b454aa1`](https://github.com/docling-project/docling/commit/b454aa1551b891644ce4028ed2d7ec8f82c167ab))
+* Updated vlm pipeline (with latest changes from docling-core) ([#1158](https://github.com/docling-project/docling/issues/1158)) ([`2f72167`](https://github.com/docling-project/docling/commit/2f72167ff6421424dea4d93018b0d43af16ec153))
+
+### Fix
+
+* Determine correct page size in DoclingParseV4Backend ([#1196](https://github.com/docling-project/docling/issues/1196)) ([`f5adfb9`](https://github.com/docling-project/docling/commit/f5adfb9724aae1207f23e21d74033f331e6e1ffb))
+* **msword:** Fixing function return in equations handling ([#1194](https://github.com/docling-project/docling/issues/1194)) ([`0b707d0`](https://github.com/docling-project/docling/commit/0b707d0882f5be42505871799387d0b1882bffbf))
+
+### Documentation
+
+* Linux Foundation AI & Data ([#1183](https://github.com/docling-project/docling/issues/1183)) ([`1d680b0`](https://github.com/docling-project/docling/commit/1d680b0a321d95fc6bd65b7bb4d5e15005a0250a))
+* Move apify to docs ([#1182](https://github.com/docling-project/docling/issues/1182)) ([`54a78c3`](https://github.com/docling-project/docling/commit/54a78c307de833b93f9b84cf1f8ed6dace8573cb))
+
 ## [v2.27.0](https://github.com/docling-project/docling/releases/tag/v2.27.0) - 2025-03-18
 
 ### Feature
diff --git a/README.md b/README.md
index 208de0d0..19048b7a 100644
--- a/README.md
+++ b/README.md
@@ -35,7 +35,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
 * 🔒 Local execution capabilities for sensitive data and air-gapped environments
 * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
 * 🔍 Extensive OCR support for scanned PDFs and images
-* 🥚 Support of Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
+* 🥚 Support of Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) 🆕
 * 💻 Simple and convenient CLI
 
 ### Coming soon
@@ -57,7 +57,7 @@ More [detailed installation instructions](https://docling-project.github.io/docl
 
 ## Getting started
 
-To convert individual documents, use `convert()`, for example:
+To convert individual documents with Python, use `convert()`, for example:
 
 ```python
 from docling.document_converter import DocumentConverter
@@ -71,6 +71,22 @@ print(result.document.export_to_markdown()) # output: "## Docling Technical Rep
 
 More [advanced usage options](https://docling-project.github.io/docling/usage/) are available in the docs.
 
+## CLI
+
+Docling has a built-in CLI to run conversions.
+
+```bash
+docling https://arxiv.org/pdf/2206.01062
+```
+
+You can also use 🥚[SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview) and other VLMs via Docling CLI:
+```bash
+docling --pipeline vlm --vlm-model smoldocling https://arxiv.org/pdf/2206.01062
+```
+This will use MLX acceleration on supported Apple Silicon hardware.
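+
+The same `--vlm-model` flag selects other models; for example, Granite Vision:
+```bash
+docling --pipeline vlm --vlm-model granite_vision https://arxiv.org/pdf/2206.01062
+```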
+
+Read more [here](https://docling-project.github.io/docling/usage/).
+
 ## Documentation
 
 Check out Docling's [documentation](https://docling-project.github.io/docling/), for details on
diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py
index 231d6224..a752e8dc 100644
--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@@ -16,6 +16,7 @@ from docling_core.types.doc import (
     TableCell,
     TableData,
 )
+from docling_core.types.doc.document import ContentLayer
 from PIL import Image, UnidentifiedImageError
 from pptx import Presentation
 from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
@@ -421,4 +422,21 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
             for shape in slide.shapes:
                 handle_shapes(shape, parent_slide, slide_ind, doc, slide_size)
 
+            # Handle notes slide
+            if slide.has_notes_slide:
+                notes_slide = slide.notes_slide
+                notes_text = notes_slide.notes_text_frame.text.strip()
+                if notes_text:
+                    bbox = BoundingBox(l=0, t=0, r=0, b=0)
+                    prov = ProvenanceItem(
+                        page_no=slide_ind + 1, charspan=[0, len(notes_text)], bbox=bbox
+                    )
+                    doc.add_text(
+                        label=DocItemLabel.TEXT,
+                        parent=parent_slide,
+                        text=notes_text,
+                        prov=prov,
+                        content_layer=ContentLayer.FURNITURE,
+                    )
+
         return doc
diff --git a/docling/cli/main.py b/docling/cli/main.py
index 7f0f20bf..c85a04f3 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -32,13 +32,21 @@ from docling.datamodel.pipeline_options import (
     AcceleratorOptions,
     EasyOcrOptions,
     OcrOptions,
+    PaginatedPipelineOptions,
     PdfBackend,
+    PdfPipeline,
     PdfPipelineOptions,
     TableFormerMode,
+    VlmModelType,
+    VlmPipelineOptions,
+    granite_vision_vlm_conversion_options,
+    smoldocling_vlm_conversion_options,
+    smoldocling_vlm_mlx_conversion_options,
 )
 from docling.datamodel.settings import settings
 from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
 from docling.models.factories import get_ocr_factory
+from docling.pipeline.vlm_pipeline import VlmPipeline
 
 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
 warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@@ -200,6 +208,14 @@ def convert(
             help="Image export mode for the document (only in case of JSON, Markdown or HTML). With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.",
         ),
     ] = ImageRefMode.EMBEDDED,
+    pipeline: Annotated[
+        PdfPipeline,
+        typer.Option(..., help="Choose the pipeline to process PDF or image files."),
+    ] = PdfPipeline.STANDARD,
+    vlm_model: Annotated[
+        VlmModelType,
+        typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
+    ] = VlmModelType.SMOLDOCLING,
     ocr: Annotated[
         bool,
         typer.Option(
@@ -420,50 +436,77 @@ def convert(
         ocr_options.lang = ocr_lang_list
 
     accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
-    pipeline_options = PdfPipelineOptions(
-        allow_external_plugins=allow_external_plugins,
-        enable_remote_services=enable_remote_services,
-        accelerator_options=accelerator_options,
-        do_ocr=ocr,
-        ocr_options=ocr_options,
-        do_table_structure=True,
-        do_code_enrichment=enrich_code,
-        do_formula_enrichment=enrich_formula,
-        do_picture_description=enrich_picture_description,
-        do_picture_classification=enrich_picture_classes,
-        document_timeout=document_timeout,
-    )
-    pipeline_options.table_structure_options.do_cell_matching = (
-        True  # do_cell_matching
-    )
-    pipeline_options.table_structure_options.mode = table_mode
+    pipeline_options: PaginatedPipelineOptions
 
-    if image_export_mode != ImageRefMode.PLACEHOLDER:
-        pipeline_options.generate_page_images = True
-        pipeline_options.generate_picture_images = (
-            True  # FIXME: to be deprecated in verson 3
+    if pipeline == PdfPipeline.STANDARD:
+        pipeline_options = PdfPipelineOptions(
+            allow_external_plugins=allow_external_plugins,
+            enable_remote_services=enable_remote_services,
+            accelerator_options=accelerator_options,
+            do_ocr=ocr,
+            ocr_options=ocr_options,
+            do_table_structure=True,
+            do_code_enrichment=enrich_code,
+            do_formula_enrichment=enrich_formula,
+            do_picture_description=enrich_picture_description,
+            do_picture_classification=enrich_picture_classes,
+            document_timeout=document_timeout,
+        )
+        pipeline_options.table_structure_options.do_cell_matching = (
+            True  # do_cell_matching
+        )
+        pipeline_options.table_structure_options.mode = table_mode
+
+        if image_export_mode != ImageRefMode.PLACEHOLDER:
+            pipeline_options.generate_page_images = True
+            pipeline_options.generate_picture_images = (
+                True  # FIXME: to be deprecated in version 3
+            )
+            pipeline_options.images_scale = 2
+
+        backend: Type[PdfDocumentBackend]
+        if pdf_backend == PdfBackend.DLPARSE_V1:
+            backend = DoclingParseDocumentBackend
+        elif pdf_backend == PdfBackend.DLPARSE_V2:
+            backend = DoclingParseV2DocumentBackend
+        elif pdf_backend == PdfBackend.DLPARSE_V4:
+            backend = DoclingParseV4DocumentBackend  # type: ignore
+        elif pdf_backend == PdfBackend.PYPDFIUM2:
+            backend = PyPdfiumDocumentBackend  # type: ignore
+        else:
+            raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
+
+        pdf_format_option = PdfFormatOption(
+            pipeline_options=pipeline_options,
+            backend=backend,  # pdf_backend
+        )
+    elif pipeline == PdfPipeline.VLM:
+        pipeline_options = VlmPipelineOptions()
+
+        if vlm_model == VlmModelType.GRANITE_VISION:
+            pipeline_options.vlm_options = granite_vision_vlm_conversion_options
+        elif vlm_model == VlmModelType.SMOLDOCLING:
+            pipeline_options.vlm_options = smoldocling_vlm_conversion_options
+            if sys.platform == "darwin":
+                try:
+                    import mlx_vlm
+
+                    pipeline_options.vlm_options = (
+                        smoldocling_vlm_mlx_conversion_options
+                    )
+                except ImportError:
+                    _log.warning(
+                        "To run SmolDocling faster, please install mlx-vlm:\n"
+                        "pip install mlx-vlm"
+                    )
+
+        pdf_format_option = PdfFormatOption(
+            pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
         )
-        pipeline_options.images_scale = 2
 
     if artifacts_path is not None:
         pipeline_options.artifacts_path = artifacts_path
 
-    backend: Type[PdfDocumentBackend]
-    if pdf_backend == PdfBackend.DLPARSE_V1:
-        backend = DoclingParseDocumentBackend
-    elif pdf_backend == PdfBackend.DLPARSE_V2:
-        backend = DoclingParseV2DocumentBackend
-    elif pdf_backend == PdfBackend.DLPARSE_V4:
-        backend = DoclingParseV4DocumentBackend  # type: ignore
-    elif pdf_backend == PdfBackend.PYPDFIUM2:
-        backend = PyPdfiumDocumentBackend  # type: ignore
-    else:
-        raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
-
-    pdf_format_option = PdfFormatOption(
-        pipeline_options=pipeline_options,
-        backend=backend,  # pdf_backend
-    )
     format_options: Dict[InputFormat, FormatOption] = {
         InputFormat.PDF: pdf_format_option,
         InputFormat.IMAGE: pdf_format_option,
diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index d28b5826..654e04df 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -263,6 +263,11 @@ class ResponseFormat(str, Enum):
     MARKDOWN = "markdown"
 
 
+class InferenceFramework(str, Enum):
+    MLX = "mlx"
+    TRANSFORMERS = "transformers"
+
+
 class HuggingFaceVlmOptions(BaseVlmOptions):
     kind: Literal["hf_model_options"] = "hf_model_options"
 
@@ -271,6 +276,7 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
     llm_int8_threshold: float = 6.0
     quantized: bool = False
 
+    inference_framework: InferenceFramework
     response_format: ResponseFormat
 
     @property
@@ -278,10 +284,19 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
         return self.repo_id.replace("/", "--")
 
 
+smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
+    repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
+    prompt="Convert this page to docling.",
+    response_format=ResponseFormat.DOCTAGS,
+    inference_framework=InferenceFramework.MLX,
+)
+
+
 smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
     repo_id="ds4sd/SmolDocling-256M-preview",
     prompt="Convert this page to docling.",
     response_format=ResponseFormat.DOCTAGS,
+    inference_framework=InferenceFramework.TRANSFORMERS,
 )
 
 granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
@@ -289,9 +304,15 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
     # prompt="OCR the full page to markdown.",
     prompt="OCR this image.",
     response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.TRANSFORMERS,
 )
 
 
+class VlmModelType(str, Enum):
+    SMOLDOCLING = "smoldocling"
+    GRANITE_VISION = "granite_vision"
+
+
 # Define an enum for the backend options
 class PdfBackend(str, Enum):
     """Enum of valid PDF backends."""
@@ -327,13 +348,14 @@ class PipelineOptions(BaseModel):
 
 
 class PaginatedPipelineOptions(PipelineOptions):
+    artifacts_path: Optional[Union[Path, str]] = None
+
     images_scale: float = 1.0
     generate_page_images: bool = False
     generate_picture_images: bool = False
 
 
 class VlmPipelineOptions(PaginatedPipelineOptions):
-    artifacts_path: Optional[Union[Path, str]] = None
 
     generate_page_images: bool = True
     force_backend_text: bool = (
@@ -346,7 +368,6 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
 class PdfPipelineOptions(PaginatedPipelineOptions):
     """Options for the PDF pipeline."""
 
-    artifacts_path: Optional[Union[Path, str]] = None
     do_table_structure: bool = True  # True: perform table structure extraction
     do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
     do_code_enrichment: bool = False  # True: perform code OCR
@@ -377,3 +398,8 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
     )
 
     generate_parsed_pages: bool = False
+
+
+class PdfPipeline(str, Enum):
+    STANDARD = "standard"
+    VLM = "vlm"
diff --git a/docling/models/hf_mlx_model.py b/docling/models/hf_mlx_model.py
new file mode 100644
index 00000000..762a6557
--- /dev/null
+++ b/docling/models/hf_mlx_model.py
@@ -0,0 +1,137 @@
+import logging
+import time
+from pathlib import Path
+from typing import Iterable, List, Optional
+
+from docling.datamodel.base_models import Page, VlmPrediction
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import (
+    AcceleratorDevice,
+    AcceleratorOptions,
+    HuggingFaceVlmOptions,
+)
+from docling.datamodel.settings import settings
+from docling.models.base_model import BasePageModel
+from docling.utils.accelerator_utils import decide_device
+from docling.utils.profiling import TimeRecorder
+
+_log = logging.getLogger(__name__)
+
+
+class HuggingFaceMlxModel(BasePageModel):
+
+    def __init__(
+        self,
+        enabled: bool,
+        artifacts_path: Optional[Path],
+        accelerator_options: AcceleratorOptions,
+        vlm_options: HuggingFaceVlmOptions,
+    ):
+        self.enabled = enabled
+
+        self.vlm_options = vlm_options
+
+        if self.enabled:
+
+            try:
+                from mlx_vlm import generate, load  # type: ignore
+                from mlx_vlm.prompt_utils import apply_chat_template  # type: ignore
+                from mlx_vlm.utils import load_config, stream_generate  # type: ignore
+            except ImportError:
+                raise ImportError(
+                    "mlx-vlm is not installed. Please install it via `pip install mlx-vlm` to use MLX VLM models."
+                )
+
+            repo_cache_folder = vlm_options.repo_id.replace("/", "--")
+            self.apply_chat_template = apply_chat_template
+            self.stream_generate = stream_generate
+
+            # PARAMETERS:
+            if artifacts_path is None:
+                artifacts_path = self.download_models(self.vlm_options.repo_id)
+            elif (artifacts_path / repo_cache_folder).exists():
+                artifacts_path = artifacts_path / repo_cache_folder
+
+            self.param_question = vlm_options.prompt  # "Perform Layout Analysis."
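+
+            # `load` is expected to return the MLX model weights together with
+            # their processor; `load_config` reads the repo configuration that
+            # `apply_chat_template` consumes at inference time.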
+
+            ## Load the model
+            self.vlm_model, self.processor = load(artifacts_path)
+            self.config = load_config(artifacts_path)
+
+    @staticmethod
+    def download_models(
+        repo_id: str,
+        local_dir: Optional[Path] = None,
+        force: bool = False,
+        progress: bool = False,
+    ) -> Path:
+        from huggingface_hub import snapshot_download
+        from huggingface_hub.utils import disable_progress_bars
+
+        if not progress:
+            disable_progress_bars()
+        download_path = snapshot_download(
+            repo_id=repo_id,
+            force_download=force,
+            local_dir=local_dir,
+            # revision="v0.0.1",
+        )
+
+        return Path(download_path)
+
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        for page in page_batch:
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                with TimeRecorder(conv_res, "vlm"):
+                    assert page.size is not None
+
+                    hi_res_image = page.get_image(scale=2.0)  # 144dpi
+                    # hi_res_image = page.get_image(scale=1.0)  # 72dpi
+
+                    if hi_res_image is not None:
+                        im_width, im_height = hi_res_image.size
+
+                    # populate page_tags with predicted doc tags
+                    page_tags = ""
+
+                    if hi_res_image:
+                        if hi_res_image.mode != "RGB":
+                            hi_res_image = hi_res_image.convert("RGB")
+
+                    prompt = self.apply_chat_template(
+                        self.processor, self.config, self.param_question, num_images=1
+                    )
+
+                    start_time = time.time()
+                    # Call model to generate:
+                    output = ""
+                    for token in self.stream_generate(
+                        self.vlm_model,
+                        self.processor,
+                        prompt,
+                        [hi_res_image],
+                        max_tokens=4096,
+                        verbose=False,
+                    ):
+                        output += token.text
+                        if "</doctag>" in token.text:
+                            break
+
+                    generation_time = time.time() - start_time
+                    page_tags = output
+
+                    # inference_time = time.time() - start_time
+                    # tokens_per_second = num_tokens / generation_time
+                    # print("")
+                    # print(f"Page Inference Time: {inference_time:.2f} seconds")
+                    # print(f"Total tokens on page: {num_tokens:.2f}")
+                    # print(f"Tokens/sec: {tokens_per_second:.2f}")
+                    # print("")
+
+                    page.predictions.vlm_response = VlmPrediction(text=page_tags)
+
+                yield page
diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py
index 4afb918d..d4defa89 100644
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@@ -14,8 +14,13 @@ from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.datamodel.base_models import InputFormat, Page
 from docling.datamodel.document import ConversionResult, InputDocument
-from docling.datamodel.pipeline_options import ResponseFormat, VlmPipelineOptions
+from docling.datamodel.pipeline_options import (
+    InferenceFramework,
+    ResponseFormat,
+    VlmPipelineOptions,
+)
 from docling.datamodel.settings import settings
+from docling.models.hf_mlx_model import HuggingFaceMlxModel
 from docling.models.hf_vlm_model import HuggingFaceVlmModel
 from docling.pipeline.base_pipeline import PaginatedPipeline
 from docling.utils.profiling import ProfilingScope, TimeRecorder
@@ -29,12 +34,6 @@ class VlmPipeline(PaginatedPipeline):
         super().__init__(pipeline_options)
         self.keep_backend = True
 
-        warnings.warn(
-            "The VlmPipeline is currently experimental and may change in upcoming versions without notice.",
-            category=UserWarning,
-            stacklevel=2,
-        )
-
         self.pipeline_options: VlmPipelineOptions
 
         artifacts_path: Optional[Path] = None
@@ -58,14 +57,27 @@ class VlmPipeline(PaginatedPipeline):
 
         self.keep_images = self.pipeline_options.generate_page_images
 
-        self.build_pipe = [
-            HuggingFaceVlmModel(
-                enabled=True,  # must be always enabled for this pipeline to make sense.
-                artifacts_path=artifacts_path,
-                accelerator_options=pipeline_options.accelerator_options,
-                vlm_options=self.pipeline_options.vlm_options,
-            ),
-        ]
+        if (
+            self.pipeline_options.vlm_options.inference_framework
+            == InferenceFramework.MLX
+        ):
+            self.build_pipe = [
+                HuggingFaceMlxModel(
+                    enabled=True,  # must be always enabled for this pipeline to make sense.
+                    artifacts_path=artifacts_path,
+                    accelerator_options=pipeline_options.accelerator_options,
+                    vlm_options=self.pipeline_options.vlm_options,
+                ),
+            ]
+        else:
+            self.build_pipe = [
+                HuggingFaceVlmModel(
+                    enabled=True,  # must be always enabled for this pipeline to make sense.
+                    artifacts_path=artifacts_path,
+                    accelerator_options=pipeline_options.accelerator_options,
+                    vlm_options=self.pipeline_options.vlm_options,
+                ),
+            ]
 
         self.enrichment_pipe = [
             # Other models working on `NodeItem` elements in the DoclingDocument
@@ -79,7 +91,9 @@ class VlmPipeline(PaginatedPipeline):
 
         return page
 
-    def extract_text_from_backend(self, page: Page, bbox: BoundingBox | None) -> str:
+    def extract_text_from_backend(
+        self, page: Page, bbox: Union[BoundingBox, None]
+    ) -> str:
         # Convert bounding box normalized to 0-100 into page coordinates for cropping
         text = ""
         if bbox:
diff --git a/docs/examples/minimal_vlm_pipeline.py b/docs/examples/minimal_vlm_pipeline.py
index 948ecc64..6a15fe42 100644
--- a/docs/examples/minimal_vlm_pipeline.py
+++ b/docs/examples/minimal_vlm_pipeline.py
@@ -10,13 +10,15 @@ from docling.datamodel.pipeline_options import (
     VlmPipelineOptions,
     granite_vision_vlm_conversion_options,
     smoldocling_vlm_conversion_options,
+    smoldocling_vlm_mlx_conversion_options,
 )
 from docling.datamodel.settings import settings
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline
 
 sources = [
-    "tests/data/2305.03393v1-pg9-img.png",
+    # "tests/data/2305.03393v1-pg9-img.png",
+    "tests/data/pdf/2305.03393v1-pg9.pdf",
 ]
 
 ## Use experimental VlmPipeline
@@ -29,7 +31,10 @@ pipeline_options.force_backend_text = False
 # pipeline_options.accelerator_options.cuda_use_flash_attention2 = True
 
 ## Pick a VLM model. We choose SmolDocling-256M by default
-pipeline_options.vlm_options = smoldocling_vlm_conversion_options
+# pipeline_options.vlm_options = smoldocling_vlm_conversion_options
+
+## Pick a VLM model. Fast Apple-Silicon-friendly implementation for SmolDocling-256M via MLX
+pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
 
 ## Alternative VLM models:
 # pipeline_options.vlm_options = granite_vision_vlm_conversion_options
@@ -63,9 +68,6 @@ for source in sources:
 
     res = converter.convert(source)
 
-    print("------------------------------------------------")
-    print("MD:")
-    print("------------------------------------------------")
     print("")
     print(res.document.export_to_markdown())
 
@@ -83,8 +85,17 @@ for source in sources:
     with (out_path / f"{res.input.file.stem}.json").open("w") as fp:
         fp.write(json.dumps(res.document.export_to_dict()))
 
-    pg_num = res.document.num_pages()
+    res.document.save_as_json(
+        out_path / f"{res.input.file.stem}.json",
+        image_mode=ImageRefMode.PLACEHOLDER,
+    )
 
+    res.document.save_as_markdown(
+        out_path / f"{res.input.file.stem}.md",
+        image_mode=ImageRefMode.PLACEHOLDER,
+    )
+
+    pg_num = res.document.num_pages()
     print("")
     inference_time = time.time() - start_time
     print(
diff --git a/docs/index.md b/docs/index.md
index 789dae8f..acc9933f 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -26,7 +26,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
 * 🔒 Local execution capabilities for sensitive data and air-gapped environments
 * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
 * 🔍 Extensive OCR support for scanned PDFs and images
-* 🥚 Support of Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
+* 🥚 Support of Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) 🆕🔥
 * 💻 Simple and convenient CLI
 
 ### Coming soon
diff --git a/docs/usage/index.md b/docs/usage/index.md
index 1ab7842c..acf33976 100644
--- a/docs/usage/index.md
+++ b/docs/usage/index.md
@@ -17,10 +17,15 @@ print(result.document.export_to_markdown()) # output: "### Docling Technical Re
 
 You can also use Docling directly from your command line to convert individual files —be it local or by URL— or whole directories.
 
-A simple example would look like this:
 ```console
 docling https://arxiv.org/pdf/2206.01062
 ```
 
+You can also use 🥚[SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview) and other VLMs via Docling CLI:
+```bash
+docling --pipeline vlm --vlm-model smoldocling https://arxiv.org/pdf/2206.01062
+```
+This will use MLX acceleration on supported Apple Silicon hardware.
+
 To see all available options (export formats etc.) run `docling --help`. More details in the [CLI reference page](../reference/cli.md).
diff --git a/pyproject.toml b/pyproject.toml
index 0f85915f..8ed5a558 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "2.27.0" # DO NOT EDIT, updated automatically
+version = "2.28.0" # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 authors = [
     "Christoph Auer ",
@@ -192,6 +192,7 @@ module = [
     "docling_ibm_models.*",
     "easyocr.*",
     "ocrmac.*",
+    "mlx_vlm.*",
     "lxml.*",
     "huggingface_hub.*",
     "transformers.*",
diff --git a/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.json b/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.json
index b24c46ed..fb441563 100644
--- a/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.json
+++ b/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.json
@@ -4,7 +4,7 @@
   "name": "powerpoint_sample",
   "origin": {
     "mimetype": "application/vnd.ms-powerpoint",
-    "binary_hash": 1640759611026400292,
+    "binary_hash": 15572290240354948364,
     "filename": "powerpoint_sample.pptx"
   },
   "furniture": {
@@ -75,6 +75,9 @@
       },
       {
         "$ref": "#/texts/7"
+      },
+      {
+        "$ref": "#/texts/8"
       }
     ],
     "content_layer": "body",
@@ -94,19 +97,22 @@
       {
         "$ref": "#/groups/4"
       },
      {
-        "$ref": "#/texts/15"
+        "$ref": "#/texts/16"
       },
       {
         "$ref": "#/groups/5"
       },
       {
-        "$ref": "#/texts/18"
+        "$ref": "#/texts/19"
       },
       {
         "$ref": "#/groups/6"
       },
       {
         "$ref": "#/groups/7"
+      },
+      {
+        "$ref": "#/texts/26"
       }
     ],
     "content_layer": "body",
@@ -119,14 +125,14 @@
       "$ref": "#/groups/2"
     },
     "children": [
-      {
-        "$ref": "#/texts/8"
-      },
       {
         "$ref": "#/texts/9"
       },
       {
         "$ref": "#/texts/10"
+      },
+      {
+        "$ref": "#/texts/11"
       }
     ],
     "content_layer": "body",
@@ -139,17 +145,17 @@
       "$ref": "#/groups/2"
     },
     "children": [
-      {
-        "$ref": "#/texts/11"
-      },
       {
         "$ref": "#/texts/12"
       },
       {
         "$ref": "#/texts/13"
      },
       {
         "$ref": "#/texts/14"
+      },
+      {
+        "$ref": "#/texts/15"
       }
     ],
     "content_layer": "body",
@@ -163,10 +169,10 @@
     },
     "children": [
       {
-        "$ref": "#/texts/16"
+        "$ref": "#/texts/17"
       },
       {
-        "$ref": "#/texts/17"
+        "$ref": "#/texts/18"
       }
     ],
     "content_layer": "body",
@@ -179,14 +185,14 @@
       "$ref": "#/groups/2"
     },
     "children": [
-      {
-        "$ref": "#/texts/19"
-      },
       {
         "$ref": "#/texts/20"
       },
       {
         "$ref": "#/texts/21"
+      },
+      {
+        "$ref": "#/texts/22"
       }
     ],
     "content_layer": "body",
@@ -199,14 +205,14 @@
       "$ref": "#/groups/2"
     },
     "children": [
-      {
-        "$ref": "#/texts/22"
-      },
       {
         "$ref": "#/texts/23"
       },
       {
         "$ref": "#/texts/24"
+      },
+      {
+        "$ref": "#/texts/25"
       }
     ],
     "content_layer": "body",
@@ -433,6 +439,33 @@
   },
   {
     "self_ref": "#/texts/8",
+    "parent": {
+      "$ref": "#/groups/1"
+    },
+    "children": [],
+    "content_layer": "furniture",
+    "label": "text",
+    "prov": [
+      {
+        "page_no": 2,
+        "bbox": {
+          "l": 0.0,
+          "t": 0.0,
+          "r": 0.0,
+          "b": 0.0,
+          "coord_origin": "TOPLEFT"
+        },
+        "charspan": [
+          0,
+          31
+        ]
+      }
+    ],
+    "orig": "Some notes on the second slide.",
+    "text": "Some notes on the second slide."
+  },
+  {
+    "self_ref": "#/texts/9",
     "parent": {
       "$ref": "#/groups/3"
     },
@@ -461,7 +494,7 @@
       "marker": "1."
   },
   {
-    "self_ref": "#/texts/9",
+    "self_ref": "#/texts/10",
     "parent": {
       "$ref": "#/groups/3"
     },
@@ -490,7 +523,7 @@
       "marker": "2."
   },
   {
-    "self_ref": "#/texts/10",
+    "self_ref": "#/texts/11",
     "parent": {
       "$ref": "#/groups/3"
     },
@@ -519,7 +552,7 @@
       "marker": "3."
   },
   {
-    "self_ref": "#/texts/11",
+    "self_ref": "#/texts/12",
     "parent": {
       "$ref": "#/groups/4"
     },
@@ -548,7 +581,7 @@
       "marker": "-"
   },
   {
-    "self_ref": "#/texts/12",
+    "self_ref": "#/texts/13",
     "parent": {
       "$ref": "#/groups/4"
     },
@@ -577,7 +610,7 @@
       "marker": "-"
   },
   {
-    "self_ref": "#/texts/13",
+    "self_ref": "#/texts/14",
     "parent": {
       "$ref": "#/groups/4"
     },
@@ -606,7 +639,7 @@
       "marker": "-"
   },
   {
-    "self_ref": "#/texts/14",
+    "self_ref": "#/texts/15",
     "parent": {
       "$ref": "#/groups/4"
     },
@@ -635,7 +668,7 @@
       "marker": "-"
   },
   {
-    "self_ref": "#/texts/15",
+    "self_ref": "#/texts/16",
     "parent": {
       "$ref": "#/groups/2"
     },
@@ -662,7 +695,7 @@
       "text": "Some info:"
   },
   {
-    "self_ref": "#/texts/16",
+    "self_ref": "#/texts/17",
     "parent": {
       "$ref": "#/groups/5"
     },
@@ -691,7 +724,7 @@
       "marker": "-"
   },
   {
-    "self_ref": "#/texts/17",
+    "self_ref": "#/texts/18",
     "parent": {
       "$ref": "#/groups/5"
     },
@@ -720,7 +753,7 @@
       "marker": "-"
   },
   {
-    "self_ref": "#/texts/18",
+    "self_ref": "#/texts/19",
     "parent": {
       "$ref": "#/groups/2"
     },
@@ -747,7 +780,7 @@
       "text": "Maybe a list?"
   },
   {
-    "self_ref": "#/texts/19",
+    "self_ref": "#/texts/20",
    "parent": {
       "$ref": "#/groups/6"
     },
@@ -776,7 +809,7 @@
       "marker": "1."
   },
   {
-    "self_ref": "#/texts/20",
+    "self_ref": "#/texts/21",
     "parent": {
       "$ref": "#/groups/6"
     },
@@ -805,7 +838,7 @@
       "marker": "2."
   },
   {
-    "self_ref": "#/texts/21",
+    "self_ref": "#/texts/22",
     "parent": {
       "$ref": "#/groups/6"
     },
@@ -834,7 +867,7 @@
       "marker": "3."
   },
   {
-    "self_ref": "#/texts/22",
+    "self_ref": "#/texts/23",
     "parent": {
       "$ref": "#/groups/7"
     },
@@ -863,7 +896,7 @@
       "marker": "-"
   },
   {
-    "self_ref": "#/texts/23",
+    "self_ref": "#/texts/24",
     "parent": {
       "$ref": "#/groups/7"
     },
@@ -892,7 +925,7 @@
       "marker": "-"
   },
   {
-    "self_ref": "#/texts/24",
+    "self_ref": "#/texts/25",
     "parent": {
       "$ref": "#/groups/7"
     },
@@ -919,6 +952,33 @@
       "text": "l3",
       "enumerated": false,
       "marker": "-"
+  },
+  {
+    "self_ref": "#/texts/26",
+    "parent": {
+      "$ref": "#/groups/2"
+    },
+    "children": [],
+    "content_layer": "furniture",
+    "label": "text",
+    "prov": [
+      {
+        "page_no": 3,
+        "bbox": {
+          "l": 0.0,
+          "t": 0.0,
+          "r": 0.0,
+          "b": 0.0,
+          "coord_origin": "TOPLEFT"
+        },
+        "charspan": [
+          0,
+          53
+        ]
+      }
+    ],
+    "orig": "Final notes on the third slide.\nSecond line of notes.",
+    "text": "Final notes on the third slide.\nSecond line of notes."
   }
 ],
 "pictures": [],
diff --git a/tests/data/pptx/powerpoint_sample.pptx b/tests/data/pptx/powerpoint_sample.pptx
index acabf415..0818f283 100644
Binary files a/tests/data/pptx/powerpoint_sample.pptx and b/tests/data/pptx/powerpoint_sample.pptx differ
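A minimal end-to-end sketch of the MLX-accelerated VLM pipeline wired up above, condensed from `docs/examples/minimal_vlm_pipeline.py`; it assumes an Apple Silicon machine with `mlx-vlm` installed, and any local or remote PDF works as input:

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    VlmPipelineOptions,
    smoldocling_vlm_mlx_conversion_options,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# Route PDFs through the VLM pipeline and pick the MLX build of SmolDocling.
pipeline_options = VlmPipelineOptions()
pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
        )
    }
)

result = converter.convert("https://arxiv.org/pdf/2206.01062")
print(result.document.export_to_markdown())
```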