docling/usage/index.html

3691 lines
82 KiB
HTML
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!doctype html>
<html lang="en" class="no-js">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<link rel="canonical" href="https://docling-project.github.io/docling/usage/">
<link rel="prev" href="../installation/">
<link rel="next" href="supported_formats/">
<link rel="icon" href="../assets/logo.png">
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.6.15">
<title>Usage - Docling</title>
<link rel="stylesheet" href="../assets/stylesheets/main.342714a4.min.css">
<link rel="stylesheet" href="../assets/stylesheets/palette.06af60db.min.css">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
<link rel="stylesheet" href="../assets/_mkdocstrings.css">
<link rel="stylesheet" href="../stylesheets/extra.css">
<script>
  // Bootstrap helpers, assigned without declarations so they become globals
  // that scripts further down the page (e.g. the palette restore) can call.

  // URL obtained by resolving ".." against the current location.
  __md_scope = new URL("..", location);

  // Fold the characters of a string into a number:
  // acc = (acc << 5) - acc + charCode, starting from 0.
  __md_hash = (text) => {
    let acc = 0;
    for (const ch of text) {
      acc = (acc << 5) - acc + ch.charCodeAt(0);
    }
    return acc;
  };

  // Read and JSON-decode the entry stored under "<scope pathname>.<key>".
  // Not guarded: a storage or JSON.parse error propagates to the caller.
  __md_get = (key, storage = localStorage, scope = __md_scope) => {
    const raw = storage.getItem(scope.pathname + "." + key);
    return JSON.parse(raw);
  };

  // JSON-encode a value and store it under "<scope pathname>.<key>".
  // Errors raised inside the try block are deliberately ignored.
  __md_set = (key, value, storage = localStorage, scope = __md_scope) => {
    try {
      storage.setItem(scope.pathname + "." + key, JSON.stringify(value));
    } catch (ignored) {}
  };
</script>
</head>
<body dir="ltr" data-md-color-scheme="default" data-md-color-primary="black" data-md-color-accent="indigo">
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
<label class="md-overlay" for="__drawer"></label>
<!-- Skip link: jumps keyboard users straight to the #conversion anchor. -->
<div data-md-component="skip">
  <a href="#conversion" class="md-skip">
    Skip to content
  </a>
</div>
<div data-md-component="announce">
</div>
<header class="md-header" data-md-component="header">
<nav class="md-header__inner md-grid" aria-label="Header">
<a href=".." title="Docling" class="md-header__button md-logo" aria-label="Docling" data-md-component="logo">
<img src="../assets/logo.png" alt="logo">
</a>
<label class="md-header__button md-icon" for="__drawer">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
</label>
<div class="md-header__title" data-md-component="header-title">
<div class="md-header__ellipsis">
<div class="md-header__topic">
<span class="md-ellipsis">
Docling
</span>
</div>
<div class="md-header__topic" data-md-component="header-topic">
<span class="md-ellipsis">
Usage
</span>
</div>
</div>
</div>
<form class="md-header__option" data-md-component="palette">
<input class="md-option" data-md-color-media="(prefers-color-scheme)" data-md-color-scheme="default" data-md-color-primary="black" data-md-color-accent="indigo" aria-label="Switch to light mode" type="radio" name="__palette" id="__palette_0">
<label class="md-header__button md-icon" title="Switch to light mode" for="__palette_1" hidden>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="m14.3 16-.7-2h-3.2l-.7 2H7.8L11 7h2l3.2 9zM20 8.69V4h-4.69L12 .69 8.69 4H4v4.69L.69 12 4 15.31V20h4.69L12 23.31 15.31 20H20v-4.69L23.31 12zm-9.15 3.96h2.3L12 9z"/></svg>
</label>
<input class="md-option" data-md-color-media="(prefers-color-scheme: light)" data-md-color-scheme="default" data-md-color-primary="black" data-md-color-accent="indigo" aria-label="Switch to dark mode" type="radio" name="__palette" id="__palette_1">
<label class="md-header__button md-icon" title="Switch to dark mode" for="__palette_2" hidden>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a4 4 0 0 0-4 4 4 4 0 0 0 4 4 4 4 0 0 0 4-4 4 4 0 0 0-4-4m0 10a6 6 0 0 1-6-6 6 6 0 0 1 6-6 6 6 0 0 1 6 6 6 6 0 0 1-6 6m8-9.31V4h-4.69L12 .69 8.69 4H4v4.69L.69 12 4 15.31V20h4.69L12 23.31 15.31 20H20v-4.69L23.31 12z"/></svg>
</label>
<input class="md-option" data-md-color-media="(prefers-color-scheme: dark)" data-md-color-scheme="slate" data-md-color-primary="black" data-md-color-accent="indigo" aria-label="Switch to system preference" type="radio" name="__palette" id="__palette_2">
<label class="md-header__button md-icon" title="Switch to system preference" for="__palette_0" hidden>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 18c-.89 0-1.74-.2-2.5-.55C11.56 16.5 13 14.42 13 12s-1.44-4.5-3.5-5.45C10.26 6.2 11.11 6 12 6a6 6 0 0 1 6 6 6 6 0 0 1-6 6m8-9.31V4h-4.69L12 .69 8.69 4H4v4.69L.69 12 4 15.31V20h4.69L12 23.31 15.31 20H20v-4.69L23.31 12z"/></svg>
</label>
</form>
<script>
  // Re-apply the stored colour palette to <body> as data-md-color-* attributes.
  // The top-level `var` names are kept because they are visible as globals.
  var palette = __md_get("__palette");

  if (palette && palette.color) {
    // A stored "(prefers-color-scheme)" media value is replaced by the values
    // of whichever palette input matches the current light/dark media query.
    if (palette.color.media === "(prefers-color-scheme)") {
      var media = matchMedia("(prefers-color-scheme: light)");
      var input = document.querySelector(
        media.matches
          ? "[data-md-color-media='(prefers-color-scheme: light)']"
          : "[data-md-color-media='(prefers-color-scheme: dark)']"
      );

      palette.color.media = input.getAttribute("data-md-color-media");
      palette.color.scheme = input.getAttribute("data-md-color-scheme");
      palette.color.primary = input.getAttribute("data-md-color-primary");
      palette.color.accent = input.getAttribute("data-md-color-accent");
    }

    // Copy every palette.color entry onto <body>.
    for (var [key, value] of Object.entries(palette.color)) {
      document.body.setAttribute("data-md-color-" + key, value);
    }
  }
</script>
<label class="md-header__button md-icon" for="__search">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
</label>
<!--
  Search panel. The labels with for="__search" toggle the #__search checkbox.
  The dialog carries an explicit accessible name, and the icon-only SVGs are
  hidden from assistive technology because they convey no text of their own.
-->
<div class="md-search" data-md-component="search" role="dialog" aria-label="Search">
  <label class="md-search__overlay" for="__search"></label>
  <div class="md-search__inner" role="search">
    <form class="md-search__form" name="search">
      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
      <label class="md-search__icon md-icon" for="__search">
        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" aria-hidden="true"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" aria-hidden="true"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11z"/></svg>
      </label>
      <nav class="md-search__options" aria-label="Search">
        <!-- The button already has its own aria-label, so its icon is decorative. -->
        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" aria-hidden="true"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12z"/></svg>
        </button>
      </nav>
      <div class="md-search__suggest" data-md-component="search-suggest"></div>
    </form>
    <div class="md-search__output">
      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
        <div class="md-search-result" data-md-component="search-result">
          <div class="md-search-result__meta">
            Initializing search
          </div>
          <ol class="md-search-result__list" role="presentation"></ol>
        </div>
      </div>
    </div>
  </div>
</div>
<div class="md-header__source">
<a href="https://github.com/docling-project/docling" title="Go to repository" class="md-source" data-md-component="source">
<div class="md-source__icon md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.7.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81"/></svg>
</div>
<div class="md-source__repository">
docling-project/docling
</div>
</a>
</div>
</nav>
</header>
<div class="md-container" data-md-component="container">
<!-- Top-level section tabs; the md-tabs__item--active modifier marks the "Home" entry. -->
<nav class="md-tabs" aria-label="Tabs" data-md-component="tabs">
  <div class="md-grid">
    <ul class="md-tabs__list">
      <li class="md-tabs__item md-tabs__item--active">
        <a href=".." class="md-tabs__link">
          Home
        </a>
      </li>
      <li class="md-tabs__item">
        <a href="../concepts/" class="md-tabs__link">
          Concepts
        </a>
      </li>
      <li class="md-tabs__item">
        <a href="../examples/" class="md-tabs__link">
          Examples
        </a>
      </li>
      <li class="md-tabs__item">
        <a href="../integrations/" class="md-tabs__link">
          Integrations
        </a>
      </li>
      <li class="md-tabs__item">
        <a href="../reference/document_converter/" class="md-tabs__link">
          Reference
        </a>
      </li>
    </ul>
  </div>
</nav>
<main class="md-main" data-md-component="main">
<div class="md-main__inner md-grid">
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--primary md-nav--lifted" aria-label="Navigation" data-md-level="0">
<label class="md-nav__title" for="__drawer">
<a href=".." title="Docling" class="md-nav__button md-logo" aria-label="Docling" data-md-component="logo">
<img src="../assets/logo.png" alt="logo">
</a>
Docling
</label>
<div class="md-nav__source">
<a href="https://github.com/docling-project/docling" title="Go to repository" class="md-source" data-md-component="source">
<div class="md-source__icon md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.7.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81"/></svg>
</div>
<div class="md-source__repository">
docling-project/docling
</div>
</a>
</div>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item md-nav__item--active md-nav__item--section md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_1" checked>
<div class="md-nav__link md-nav__container">
<a href=".." class="md-nav__link ">
<span class="md-ellipsis">
Home
</span>
</a>
<label class="md-nav__link " for="__nav_1" id="__nav_1_label" tabindex="">
<span class="md-nav__icon md-icon"></span>
</label>
</div>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_1_label" aria-expanded="true">
<label class="md-nav__title" for="__nav_1">
<span class="md-nav__icon md-icon"></span>
Home
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item md-nav__item--section md-nav__item--nested">
  <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_1_2" >
  <div class="md-nav__link md-nav__container">
    <a href="../installation/" class="md-nav__link ">
      <span class="md-ellipsis">
        Installation
      </span>
    </a>
  </div>
  <!--
    This item has no toggle label, so no element carries the id
    "__nav_1_2_label". The nested nav is named directly with aria-label
    rather than pointing aria-labelledby at a missing id.
  -->
  <nav class="md-nav" data-md-level="2" aria-label="Installation" aria-expanded="false">
    <label class="md-nav__title" for="__nav_1_2">
      <span class="md-nav__icon md-icon"></span>
      Installation
    </label>
    <ul class="md-nav__list" data-md-scrollfix>
    </ul>
  </nav>
</li>
<li class="md-nav__item md-nav__item--active md-nav__item--section md-nav__item--nested">
  <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_1_3" checked>
  <div class="md-nav__link md-nav__container">
    <!-- aria-current exposes the current page to assistive technology; the --active class alone is only visual. -->
    <a href="./" class="md-nav__link md-nav__link--active" aria-current="page">
      <span class="md-ellipsis">
        Usage
      </span>
    </a>
    <label class="md-nav__link md-nav__link--active" for="__nav_1_3" id="__nav_1_3_label" tabindex="">
      <span class="md-nav__icon md-icon"></span>
    </label>
  </div>
  <nav class="md-nav" data-md-level="2" aria-labelledby="__nav_1_3_label" aria-expanded="true">
    <label class="md-nav__title" for="__nav_1_3">
      <span class="md-nav__icon md-icon"></span>
      Usage
    </label>
    <ul class="md-nav__list" data-md-scrollfix>
      <li class="md-nav__item">
        <a href="supported_formats/" class="md-nav__link">
          <span class="md-ellipsis">
            Supported formats
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="enrichments/" class="md-nav__link">
          <span class="md-ellipsis">
            Enrichment features
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="vision_models/" class="md-nav__link">
          <span class="md-ellipsis">
            Vision models
          </span>
        </a>
      </li>
    </ul>
  </nav>
</li>
<li class="md-nav__item md-nav__item--section md-nav__item--nested">
  <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_1_4" >
  <div class="md-nav__link md-nav__container">
    <a href="../faq/" class="md-nav__link ">
      <span class="md-ellipsis">
        FAQ
      </span>
    </a>
  </div>
  <!--
    This item has no toggle label, so no element carries the id
    "__nav_1_4_label". The nested nav is named directly with aria-label
    rather than pointing aria-labelledby at a missing id.
  -->
  <nav class="md-nav" data-md-level="2" aria-label="FAQ" aria-expanded="false">
    <label class="md-nav__title" for="__nav_1_4">
      <span class="md-nav__icon md-icon"></span>
      FAQ
    </label>
    <ul class="md-nav__list" data-md-scrollfix>
    </ul>
  </nav>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
<div class="md-nav__link md-nav__container">
<a href="../concepts/" class="md-nav__link ">
<span class="md-ellipsis">
Concepts
</span>
</a>
<label class="md-nav__link " for="__nav_2" id="__nav_2_label" tabindex="0">
<span class="md-nav__icon md-icon"></span>
</label>
</div>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_2">
<span class="md-nav__icon md-icon"></span>
Concepts
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../concepts/architecture/" class="md-nav__link">
<span class="md-ellipsis">
Architecture
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../concepts/docling_document/" class="md-nav__link">
<span class="md-ellipsis">
Docling Document
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../concepts/serialization/" class="md-nav__link">
<span class="md-ellipsis">
Serialization
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../concepts/confidence_scores/" class="md-nav__link">
<span class="md-ellipsis">
Confidence Scores
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../concepts/chunking/" class="md-nav__link">
<span class="md-ellipsis">
Chunking
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../concepts/plugins/" class="md-nav__link">
<span class="md-ellipsis">
Plugins
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" >
<div class="md-nav__link md-nav__container">
<a href="../examples/" class="md-nav__link ">
<span class="md-ellipsis">
Examples
</span>
</a>
<label class="md-nav__link " for="__nav_3" id="__nav_3_label" tabindex="0">
<span class="md-nav__icon md-icon"></span>
</label>
</div>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_3">
<span class="md-nav__icon md-icon"></span>
Examples
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3_2" >
<label class="md-nav__link" for="__nav_3_2" id="__nav_3_2_label" tabindex="0">
<span class="md-ellipsis">
🔀 Conversion
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="2" aria-labelledby="__nav_3_2_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_3_2">
<span class="md-nav__icon md-icon"></span>
🔀 Conversion
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../examples/minimal/" class="md-nav__link">
<span class="md-ellipsis">
Simple conversion
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../examples/custom_convert/" class="md-nav__link">
<span class="md-ellipsis">
Custom conversion
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../examples/batch_convert/" class="md-nav__link">
<span class="md-ellipsis">
Batch conversion
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../examples/run_with_formats/" class="md-nav__link">
<span class="md-ellipsis">
Multi-format conversion
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../examples/minimal_vlm_pipeline/" class="md-nav__link">
<span class="md-ellipsis">
VLM pipeline with SmolDocling
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../examples/vlm_pipeline_api_model/" class="md-nav__link">
<span class="md-ellipsis">
VLM pipeline with remote model
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../examples/compare_vlm_models/" class="md-nav__link">
<span class="md-ellipsis">
Compare VLM models
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../examples/minimal_asr_pipeline/" class="md-nav__link">
<span class="md-ellipsis">
ASR pipeline with Whisper
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../examples/export_figures/" class="md-nav__link">
<span class="md-ellipsis">
Figure export
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../examples/export_tables/" class="md-nav__link">
<span class="md-ellipsis">
Table export
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../examples/export_multimodal/" class="md-nav__link">
<span class="md-ellipsis">
Multimodal export
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../examples/full_page_ocr/" class="md-nav__link">
<span class="md-ellipsis">
Force full page OCR
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../examples/tesseract_lang_detection/" class="md-nav__link">
<span class="md-ellipsis">
Automatic OCR language detection with tesseract
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../examples/rapidocr_with_custom_models/" class="md-nav__link">
<span class="md-ellipsis">
RapidOCR with custom OCR models
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../examples/run_with_accelerator/" class="md-nav__link">
<span class="md-ellipsis">
Accelerator options
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../examples/translate/" class="md-nav__link">
<span class="md-ellipsis">
Simple translation
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../examples/backend_csv/" class="md-nav__link">
<span class="md-ellipsis">
Conversion of CSV files
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../examples/backend_xml_rag/" class="md-nav__link">
<span class="md-ellipsis">
Conversion of custom XML
</span>
</a>
</li>
</ul>
</nav>
</li>
<!-- Ampersands in the group title are written as &amp;, matching the escaped link text below. -->
<li class="md-nav__item md-nav__item--nested">
  <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3_3" >
  <label class="md-nav__link" for="__nav_3_3" id="__nav_3_3_label" tabindex="0">
    <span class="md-ellipsis">
      ✂️ Serialization &amp; chunking
    </span>
    <span class="md-nav__icon md-icon"></span>
  </label>
  <nav class="md-nav" data-md-level="2" aria-labelledby="__nav_3_3_label" aria-expanded="false">
    <label class="md-nav__title" for="__nav_3_3">
      <span class="md-nav__icon md-icon"></span>
      ✂️ Serialization &amp; chunking
    </label>
    <ul class="md-nav__list" data-md-scrollfix>
      <li class="md-nav__item">
        <a href="../examples/serialization/" class="md-nav__link">
          <span class="md-ellipsis">
            Serialization
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../examples/hybrid_chunking/" class="md-nav__link">
          <span class="md-ellipsis">
            Hybrid chunking
          </span>
        </a>
      </li>
      <li class="md-nav__item">
        <a href="../examples/advanced_chunking_and_serialization/" class="md-nav__link">
          <span class="md-ellipsis">
            Advanced chunking &amp; serialization
          </span>
        </a>
      </li>
    </ul>
  </nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3_4" >
<label class="md-nav__link" for="__nav_3_4" id="__nav_3_4_label" tabindex="0">
<span class="md-ellipsis">
🤖 RAG with AI dev frameworks
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="2" aria-labelledby="__nav_3_4_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_3_4">
<span class="md-nav__icon md-icon"></span>
🤖 RAG with AI dev frameworks
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../examples/rag_haystack/" class="md-nav__link">
<span class="md-ellipsis">
RAG with Haystack
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../examples/rag_langchain/" class="md-nav__link">
<span class="md-ellipsis">
RAG with LangChain
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../examples/rag_llamaindex/" class="md-nav__link">
<span class="md-ellipsis">
RAG with LlamaIndex
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../examples/visual_grounding/" class="md-nav__link">
<span class="md-ellipsis">
Visual grounding
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3_5" >
<label class="md-nav__link" for="__nav_3_5" id="__nav_3_5_label" tabindex="0">
<span class="md-ellipsis">
🖼️ Picture annotation
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="2" aria-labelledby="__nav_3_5_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_3_5">
<span class="md-nav__icon md-icon"></span>
🖼️ Picture annotation
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../examples/pictures_description/" class="md-nav__link">
<span class="md-ellipsis">
Annotate picture with local VLM
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../examples/pictures_description_api/" class="md-nav__link">
<span class="md-ellipsis">
Annotate picture with remote VLM
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3_6" >
<label class="md-nav__link" for="__nav_3_6" id="__nav_3_6_label" tabindex="0">
<span class="md-ellipsis">
✨ Enrichment development
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="2" aria-labelledby="__nav_3_6_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_3_6">
<span class="md-nav__icon md-icon"></span>
✨ Enrichment development
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../examples/develop_picture_enrichment/" class="md-nav__link">
<span class="md-ellipsis">
Figure enrichment
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../examples/develop_formula_understanding/" class="md-nav__link">
<span class="md-ellipsis">
Formula enrichment
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../examples/enrich_doclingdocument/" class="md-nav__link">
<span class="md-ellipsis">
Enrich DoclingDocument
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3_7" >
<label class="md-nav__link" for="__nav_3_7" id="__nav_3_7_label" tabindex="0">
<span class="md-ellipsis">
🗂️ More examples
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="2" aria-labelledby="__nav_3_7_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_3_7">
<span class="md-nav__icon md-icon"></span>
🗂️ More examples
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../examples/rag_milvus/" class="md-nav__link">
<span class="md-ellipsis">
RAG with Milvus
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../examples/rag_weaviate/" class="md-nav__link">
<span class="md-ellipsis">
RAG with Weaviate
</span>
</a>
</li>
<li class="md-nav__item">
<a href="https://github.com/ibm-granite-community/granite-snack-cookbook/blob/main/recipes/RAG/Granite_Docling_RAG.ipynb" class="md-nav__link">
<span class="md-ellipsis">
RAG with Granite [↗]
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../examples/rag_azuresearch/" class="md-nav__link">
<span class="md-ellipsis">
RAG with Azure AI Search
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../examples/retrieval_qdrant/" class="md-nav__link">
<span class="md-ellipsis">
Retrieval with Qdrant
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
<div class="md-nav__link md-nav__container">
<a href="../integrations/" class="md-nav__link ">
<span class="md-ellipsis">
Integrations
</span>
</a>
<label class="md-nav__link " for="__nav_4" id="__nav_4_label" tabindex="0">
<span class="md-nav__icon md-icon"></span>
</label>
</div>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_4">
<span class="md-nav__icon md-icon"></span>
Integrations
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4_2" >
<label class="md-nav__link" for="__nav_4_2" id="__nav_4_2_label" tabindex="0">
<span class="md-ellipsis">
🤖 Agentic / AI dev frameworks
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="2" aria-labelledby="__nav_4_2_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_4_2">
<span class="md-nav__icon md-icon"></span>
🤖 Agentic / AI dev frameworks
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../integrations/bee/" class="md-nav__link">
<span class="md-ellipsis">
Bee Agent Framework
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../integrations/crewai/" class="md-nav__link">
<span class="md-ellipsis">
Crew AI
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../integrations/haystack/" class="md-nav__link">
<span class="md-ellipsis">
Haystack
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../integrations/langchain/" class="md-nav__link">
<span class="md-ellipsis">
LangChain
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../integrations/llamaindex/" class="md-nav__link">
<span class="md-ellipsis">
LlamaIndex
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../integrations/txtai/" class="md-nav__link">
<span class="md-ellipsis">
txtai
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4_3" >
<label class="md-nav__link" for="__nav_4_3" id="__nav_4_3_label" tabindex="0">
<span class="md-ellipsis">
⭐️ Featured
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="2" aria-labelledby="__nav_4_3_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_4_3">
<span class="md-nav__icon md-icon"></span>
⭐️ Featured
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../integrations/apify/" class="md-nav__link">
<span class="md-ellipsis">
Apify
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../integrations/data_prep_kit/" class="md-nav__link">
<span class="md-ellipsis">
Data Prep Kit
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../integrations/instructlab/" class="md-nav__link">
<span class="md-ellipsis">
InstructLab
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../integrations/nvidia/" class="md-nav__link">
<span class="md-ellipsis">
NVIDIA
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../integrations/prodigy/" class="md-nav__link">
<span class="md-ellipsis">
Prodigy
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../integrations/rhel_ai/" class="md-nav__link">
<span class="md-ellipsis">
RHEL AI
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../integrations/spacy/" class="md-nav__link">
<span class="md-ellipsis">
spaCy
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4_4" >
<label class="md-nav__link" for="__nav_4_4" id="__nav_4_4_label" tabindex="0">
<span class="md-ellipsis">
🗂️ More integrations
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="2" aria-labelledby="__nav_4_4_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_4_4">
<span class="md-nav__icon md-icon"></span>
🗂️ More integrations
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../integrations/cloudera/" class="md-nav__link">
<span class="md-ellipsis">
Cloudera
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../integrations/docetl/" class="md-nav__link">
<span class="md-ellipsis">
DocETL
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../integrations/kotaemon/" class="md-nav__link">
<span class="md-ellipsis">
Kotaemon
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../integrations/opencontracts/" class="md-nav__link">
<span class="md-ellipsis">
OpenContracts
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../integrations/openwebui/" class="md-nav__link">
<span class="md-ellipsis">
Open WebUI
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../integrations/vectara/" class="md-nav__link">
<span class="md-ellipsis">
Vectara
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_5" >
<label class="md-nav__link" for="__nav_5" id="__nav_5_label" tabindex="0">
<span class="md-ellipsis">
Reference
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_5_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_5">
<span class="md-nav__icon md-icon"></span>
Reference
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_5_1" >
<label class="md-nav__link" for="__nav_5_1" id="__nav_5_1_label" tabindex="0">
<span class="md-ellipsis">
Python API
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="2" aria-labelledby="__nav_5_1_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_5_1">
<span class="md-nav__icon md-icon"></span>
Python API
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../reference/document_converter/" class="md-nav__link">
<span class="md-ellipsis">
Document Converter
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../reference/pipeline_options/" class="md-nav__link">
<span class="md-ellipsis">
Pipeline options
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../reference/docling_document/" class="md-nav__link">
<span class="md-ellipsis">
Docling Document
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_5_2" >
<label class="md-nav__link" for="__nav_5_2" id="__nav_5_2_label" tabindex="0">
<span class="md-ellipsis">
CLI
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="2" aria-labelledby="__nav_5_2_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_5_2">
<span class="md-nav__icon md-icon"></span>
CLI
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../reference/cli/" class="md-nav__link">
<span class="md-ellipsis">
CLI reference
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#conversion" class="md-nav__link">
<span class="md-ellipsis">
Conversion
</span>
</a>
<nav class="md-nav" aria-label="Conversion">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#convert-a-single-document" class="md-nav__link">
<span class="md-ellipsis">
Convert a single document
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#cli" class="md-nav__link">
<span class="md-ellipsis">
CLI
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#advanced-options" class="md-nav__link">
<span class="md-ellipsis">
Advanced options
</span>
</a>
<nav class="md-nav" aria-label="Advanced options">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#model-prefetching-and-offline-usage" class="md-nav__link">
<span class="md-ellipsis">
Model prefetching and offline usage
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#using-remote-services" class="md-nav__link">
<span class="md-ellipsis">
Using remote services
</span>
</a>
<nav class="md-nav" aria-label="Using remote services">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#list-of-remote-model-services" class="md-nav__link">
<span class="md-ellipsis">
List of remote model services
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#adjust-pipeline-features" class="md-nav__link">
<span class="md-ellipsis">
Adjust pipeline features
</span>
</a>
<nav class="md-nav" aria-label="Adjust pipeline features">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#control-pdf-table-extraction-options" class="md-nav__link">
<span class="md-ellipsis">
Control PDF table extraction options
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#impose-limits-on-the-document-size" class="md-nav__link">
<span class="md-ellipsis">
Impose limits on the document size
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#convert-from-binary-pdf-streams" class="md-nav__link">
<span class="md-ellipsis">
Convert from binary PDF streams
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#limit-resource-usage" class="md-nav__link">
<span class="md-ellipsis">
Limit resource usage
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#use-specific-backend-converters" class="md-nav__link">
<span class="md-ellipsis">
Use specific backend converters
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#chunking" class="md-nav__link">
<span class="md-ellipsis">
Chunking
</span>
</a>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-content" data-md-component="content">
<article class="md-content__inner md-typeset">
<h1>Usage</h1>
<h2 id="conversion">Conversion</h2>
<h3 id="convert-a-single-document">Convert a single document</h3>
<p>To convert individual PDF documents, use <code>convert()</code>, for example:</p>
<div class="highlight"><pre><span></span><code><span class="kn">from</span><span class="w"> </span><span class="nn">docling.document_converter</span><span class="w"> </span><span class="kn">import</span> <span class="n">DocumentConverter</span>
<span class="n">source</span> <span class="o">=</span> <span class="s2">&quot;https://arxiv.org/pdf/2408.09869&quot;</span> <span class="c1"># PDF path or URL</span>
<span class="n">converter</span> <span class="o">=</span> <span class="n">DocumentConverter</span><span class="p">()</span>
<span class="n">result</span> <span class="o">=</span> <span class="n">converter</span><span class="o">.</span><span class="n">convert</span><span class="p">(</span><span class="n">source</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="n">result</span><span class="o">.</span><span class="n">document</span><span class="o">.</span><span class="n">export_to_markdown</span><span class="p">())</span> <span class="c1"># output: &quot;### Docling Technical Report[...]&quot;</span>
</code></pre></div>
<h3 id="cli">CLI</h3>
<p>You can also use Docling directly from your command line to convert individual files (local or by URL) or whole directories.</p>
<div class="highlight"><pre><span></span><code><span class="go">docling https://arxiv.org/pdf/2206.01062</span>
</code></pre></div>
<p>You can also use 🥚<a href="https://huggingface.co/ds4sd/SmolDocling-256M-preview">SmolDocling</a> and other VLMs via Docling CLI:</p>
<div class="highlight"><pre><span></span><code>docling<span class="w"> </span>--pipeline<span class="w"> </span>vlm<span class="w"> </span>--vlm-model<span class="w"> </span>smoldocling<span class="w"> </span>https://arxiv.org/pdf/2206.01062
</code></pre></div>
<p>This will use MLX acceleration on supported Apple Silicon hardware.</p>
<p>To see all available options (export formats etc.) run <code>docling --help</code>. More details in the <a href="../reference/cli/">CLI reference page</a>.</p>
<h3 id="advanced-options">Advanced options</h3>
<h4 id="model-prefetching-and-offline-usage">Model prefetching and offline usage</h4>
<p>By default, models are downloaded automatically upon first usage. If you would prefer
to explicitly prefetch them for offline use (e.g. in air-gapped environments) you can do
that as follows:</p>
<p><strong>Step 1: Prefetch the models</strong></p>
<p>Use the <code>docling-tools models download</code> utility:</p>
<div class="highlight"><pre><span></span><code>$<span class="w"> </span>docling-tools<span class="w"> </span>models<span class="w"> </span>download
Downloading<span class="w"> </span>layout<span class="w"> </span>model...
Downloading<span class="w"> </span>tableformer<span class="w"> </span>model...
Downloading<span class="w"> </span>picture<span class="w"> </span>classifier<span class="w"> </span>model...
Downloading<span class="w"> </span>code<span class="w"> </span>formula<span class="w"> </span>model...
Downloading<span class="w"> </span>easyocr<span class="w"> </span>models...
Models<span class="w"> </span>downloaded<span class="w"> </span>into<span class="w"> </span><span class="nv">$HOME</span>/.cache/docling/models.
</code></pre></div>
<p>Alternatively, models can be programmatically downloaded using <code>docling.utils.model_downloader.download_models()</code>.</p>
<p><strong>Step 2: Use the prefetched models</strong></p>
<div class="highlight"><pre><span></span><code><span class="kn">from</span><span class="w"> </span><span class="nn">docling.datamodel.base_models</span><span class="w"> </span><span class="kn">import</span> <span class="n">InputFormat</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">docling.datamodel.pipeline_options</span><span class="w"> </span><span class="kn">import</span> <span class="n">EasyOcrOptions</span><span class="p">,</span> <span class="n">PdfPipelineOptions</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">docling.document_converter</span><span class="w"> </span><span class="kn">import</span> <span class="n">DocumentConverter</span><span class="p">,</span> <span class="n">PdfFormatOption</span>
<span class="n">artifacts_path</span> <span class="o">=</span> <span class="s2">&quot;/local/path/to/models&quot;</span>
<span class="n">pipeline_options</span> <span class="o">=</span> <span class="n">PdfPipelineOptions</span><span class="p">(</span><span class="n">artifacts_path</span><span class="o">=</span><span class="n">artifacts_path</span><span class="p">)</span>
<span class="n">doc_converter</span> <span class="o">=</span> <span class="n">DocumentConverter</span><span class="p">(</span>
<span class="n">format_options</span><span class="o">=</span><span class="p">{</span>
<span class="n">InputFormat</span><span class="o">.</span><span class="n">PDF</span><span class="p">:</span> <span class="n">PdfFormatOption</span><span class="p">(</span><span class="n">pipeline_options</span><span class="o">=</span><span class="n">pipeline_options</span><span class="p">)</span>
<span class="p">}</span>
<span class="p">)</span>
</code></pre></div>
<p>Or using the CLI:</p>
<div class="highlight"><pre><span></span><code>docling<span class="w"> </span>--artifacts-path<span class="o">=</span><span class="s2">&quot;/local/path/to/models&quot;</span><span class="w"> </span>FILE
</code></pre></div>
<p>Or using the <code>DOCLING_ARTIFACTS_PATH</code> environment variable:</p>
<div class="highlight"><pre><span></span><code><span class="nb">export</span><span class="w"> </span><span class="nv">DOCLING_ARTIFACTS_PATH</span><span class="o">=</span><span class="s2">&quot;/local/path/to/models&quot;</span>
python<span class="w"> </span>my_docling_script.py
</code></pre></div>
<h4 id="using-remote-services">Using remote services</h4>
<p>The main purpose of Docling is to run local models which do not share any user data with remote services.
However, there are valid use cases for processing part of the pipeline using remote services, for example invoking OCR engines from cloud vendors or using hosted LLMs.</p>
<p>In Docling we decided to allow such models, but we require the user to explicitly opt in to communicating with external services.</p>
<div class="highlight"><pre><span></span><code><span class="kn">from</span><span class="w"> </span><span class="nn">docling.datamodel.base_models</span><span class="w"> </span><span class="kn">import</span> <span class="n">InputFormat</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">docling.datamodel.pipeline_options</span><span class="w"> </span><span class="kn">import</span> <span class="n">PdfPipelineOptions</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">docling.document_converter</span><span class="w"> </span><span class="kn">import</span> <span class="n">DocumentConverter</span><span class="p">,</span> <span class="n">PdfFormatOption</span>
<span class="n">pipeline_options</span> <span class="o">=</span> <span class="n">PdfPipelineOptions</span><span class="p">(</span><span class="n">enable_remote_services</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="n">doc_converter</span> <span class="o">=</span> <span class="n">DocumentConverter</span><span class="p">(</span>
<span class="n">format_options</span><span class="o">=</span><span class="p">{</span>
<span class="n">InputFormat</span><span class="o">.</span><span class="n">PDF</span><span class="p">:</span> <span class="n">PdfFormatOption</span><span class="p">(</span><span class="n">pipeline_options</span><span class="o">=</span><span class="n">pipeline_options</span><span class="p">)</span>
<span class="p">}</span>
<span class="p">)</span>
</code></pre></div>
<p>If an option that relies on a remote service is used without setting <code>enable_remote_services=True</code>, the system will raise an <code>OperationNotAllowed()</code> exception.</p>
<p><em>Note: This option is only related to the system sending user data to remote services. Control of pulling data (e.g. model weights) follows the logic described in <a href="#model-prefetching-and-offline-usage">Model prefetching and offline usage</a>.</em></p>
<h5 id="list-of-remote-model-services">List of remote model services</h5>
<p>The options in this list require explicitly setting <code>enable_remote_services=True</code> when processing documents.</p>
<ul>
<li><code>PictureDescriptionApiOptions</code>: Using vision models via API calls.</li>
</ul>
<h4 id="adjust-pipeline-features">Adjust pipeline features</h4>
<p>The example file <a href="../examples/custom_convert/">custom_convert.py</a> contains multiple ways
one can adjust the conversion pipeline and features.</p>
<h5 id="control-pdf-table-extraction-options">Control PDF table extraction options</h5>
<p>You can control whether table structure recognition should map the recognized structure back to PDF cells (default) or use text cells from the structure prediction itself.
This can improve output quality if you find that multiple columns in extracted tables are erroneously merged into one.</p>
<div class="highlight"><pre><span></span><code><span class="kn">from</span><span class="w"> </span><span class="nn">docling.datamodel.base_models</span><span class="w"> </span><span class="kn">import</span> <span class="n">InputFormat</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">docling.document_converter</span><span class="w"> </span><span class="kn">import</span> <span class="n">DocumentConverter</span><span class="p">,</span> <span class="n">PdfFormatOption</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">docling.datamodel.pipeline_options</span><span class="w"> </span><span class="kn">import</span> <span class="n">PdfPipelineOptions</span>
<span class="n">pipeline_options</span> <span class="o">=</span> <span class="n">PdfPipelineOptions</span><span class="p">(</span><span class="n">do_table_structure</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="n">pipeline_options</span><span class="o">.</span><span class="n">table_structure_options</span><span class="o">.</span><span class="n">do_cell_matching</span> <span class="o">=</span> <span class="kc">False</span> <span class="c1"># uses text cells predicted from table structure model</span>
<span class="n">doc_converter</span> <span class="o">=</span> <span class="n">DocumentConverter</span><span class="p">(</span>
<span class="n">format_options</span><span class="o">=</span><span class="p">{</span>
<span class="n">InputFormat</span><span class="o">.</span><span class="n">PDF</span><span class="p">:</span> <span class="n">PdfFormatOption</span><span class="p">(</span><span class="n">pipeline_options</span><span class="o">=</span><span class="n">pipeline_options</span><span class="p">)</span>
<span class="p">}</span>
<span class="p">)</span>
</code></pre></div>
<p>Since docling 1.16.0, you can choose which TableFormer mode to use: <code>TableFormerMode.FAST</code> (faster but less accurate) or <code>TableFormerMode.ACCURATE</code> (default), which gives better quality on difficult table structures.</p>
<div class="highlight"><pre><span></span><code><span class="kn">from</span><span class="w"> </span><span class="nn">docling.datamodel.base_models</span><span class="w"> </span><span class="kn">import</span> <span class="n">InputFormat</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">docling.document_converter</span><span class="w"> </span><span class="kn">import</span> <span class="n">DocumentConverter</span><span class="p">,</span> <span class="n">PdfFormatOption</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">docling.datamodel.pipeline_options</span><span class="w"> </span><span class="kn">import</span> <span class="n">PdfPipelineOptions</span><span class="p">,</span> <span class="n">TableFormerMode</span>
<span class="n">pipeline_options</span> <span class="o">=</span> <span class="n">PdfPipelineOptions</span><span class="p">(</span><span class="n">do_table_structure</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="n">pipeline_options</span><span class="o">.</span><span class="n">table_structure_options</span><span class="o">.</span><span class="n">mode</span> <span class="o">=</span> <span class="n">TableFormerMode</span><span class="o">.</span><span class="n">ACCURATE</span> <span class="c1"># use more accurate TableFormer model</span>
<span class="n">doc_converter</span> <span class="o">=</span> <span class="n">DocumentConverter</span><span class="p">(</span>
<span class="n">format_options</span><span class="o">=</span><span class="p">{</span>
<span class="n">InputFormat</span><span class="o">.</span><span class="n">PDF</span><span class="p">:</span> <span class="n">PdfFormatOption</span><span class="p">(</span><span class="n">pipeline_options</span><span class="o">=</span><span class="n">pipeline_options</span><span class="p">)</span>
<span class="p">}</span>
<span class="p">)</span>
</code></pre></div>
<h4 id="impose-limits-on-the-document-size">Impose limits on the document size</h4>
<p>You can limit the file size and the number of pages allowed per document:</p>
<div class="highlight"><pre><span></span><code><span class="kn">from</span><span class="w"> </span><span class="nn">pathlib</span><span class="w"> </span><span class="kn">import</span> <span class="n">Path</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">docling.document_converter</span><span class="w"> </span><span class="kn">import</span> <span class="n">DocumentConverter</span>
<span class="n">source</span> <span class="o">=</span> <span class="s2">&quot;https://arxiv.org/pdf/2408.09869&quot;</span>
<span class="n">converter</span> <span class="o">=</span> <span class="n">DocumentConverter</span><span class="p">()</span>
<span class="n">result</span> <span class="o">=</span> <span class="n">converter</span><span class="o">.</span><span class="n">convert</span><span class="p">(</span><span class="n">source</span><span class="p">,</span> <span class="n">max_num_pages</span><span class="o">=</span><span class="mi">100</span><span class="p">,</span> <span class="n">max_file_size</span><span class="o">=</span><span class="mi">20971520</span><span class="p">)</span>
</code></pre></div>
<h4 id="convert-from-binary-pdf-streams">Convert from binary PDF streams</h4>
<p>You can convert PDFs from a binary stream instead of from the filesystem as follows:</p>
<div class="highlight"><pre><span></span><code><span class="kn">from</span><span class="w"> </span><span class="nn">io</span><span class="w"> </span><span class="kn">import</span> <span class="n">BytesIO</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">docling.datamodel.base_models</span><span class="w"> </span><span class="kn">import</span> <span class="n">DocumentStream</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">docling.document_converter</span><span class="w"> </span><span class="kn">import</span> <span class="n">DocumentConverter</span>
<span class="n">buf</span> <span class="o">=</span> <span class="n">BytesIO</span><span class="p">(</span><span class="n">your_binary_stream</span><span class="p">)</span>
<span class="n">source</span> <span class="o">=</span> <span class="n">DocumentStream</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="s2">&quot;my_doc.pdf&quot;</span><span class="p">,</span> <span class="n">stream</span><span class="o">=</span><span class="n">buf</span><span class="p">)</span>
<span class="n">converter</span> <span class="o">=</span> <span class="n">DocumentConverter</span><span class="p">()</span>
<span class="n">result</span> <span class="o">=</span> <span class="n">converter</span><span class="o">.</span><span class="n">convert</span><span class="p">(</span><span class="n">source</span><span class="p">)</span>
</code></pre></div>
<h4 id="limit-resource-usage">Limit resource usage</h4>
<p>You can limit the number of CPU threads used by Docling by setting the environment variable <code>OMP_NUM_THREADS</code> accordingly. By default, 4 CPU threads are used.</p>
<h4 id="use-specific-backend-converters">Use specific backend converters</h4>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>This section discusses directly invoking a <a href="../concepts/architecture/">backend</a>,
i.e. using a low-level API. This should only be done when necessary. For most cases,
using a <code>DocumentConverter</code> (high-level API) as discussed in the sections above
should suffice and is the recommended way.</p>
</div>
<p>By default, Docling will try to identify the document format to apply the appropriate conversion backend (see the list of <a href="supported_formats/">supported formats</a>).
You can restrict the <code>DocumentConverter</code> to a set of allowed document formats, as shown in the <a href="../examples/run_with_formats/">Multi-format conversion</a> example.
Alternatively, you can also use the specific backend that matches your document content. For instance, you can use <code>HTMLDocumentBackend</code> for HTML pages:</p>
<div class="highlight"><pre><span></span><code><span class="kn">import</span><span class="w"> </span><span class="nn">urllib.request</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">io</span><span class="w"> </span><span class="kn">import</span> <span class="n">BytesIO</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">docling.backend.html_backend</span><span class="w"> </span><span class="kn">import</span> <span class="n">HTMLDocumentBackend</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">docling.datamodel.base_models</span><span class="w"> </span><span class="kn">import</span> <span class="n">InputFormat</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">docling.datamodel.document</span><span class="w"> </span><span class="kn">import</span> <span class="n">InputDocument</span>
<span class="n">url</span> <span class="o">=</span> <span class="s2">&quot;https://en.wikipedia.org/wiki/Duck&quot;</span>
<span class="n">text</span> <span class="o">=</span> <span class="n">urllib</span><span class="o">.</span><span class="n">request</span><span class="o">.</span><span class="n">urlopen</span><span class="p">(</span><span class="n">url</span><span class="p">)</span><span class="o">.</span><span class="n">read</span><span class="p">()</span>
<span class="n">in_doc</span> <span class="o">=</span> <span class="n">InputDocument</span><span class="p">(</span>
<span class="n">path_or_stream</span><span class="o">=</span><span class="n">BytesIO</span><span class="p">(</span><span class="n">text</span><span class="p">),</span>
<span class="nb">format</span><span class="o">=</span><span class="n">InputFormat</span><span class="o">.</span><span class="n">HTML</span><span class="p">,</span>
<span class="n">backend</span><span class="o">=</span><span class="n">HTMLDocumentBackend</span><span class="p">,</span>
<span class="n">filename</span><span class="o">=</span><span class="s2">&quot;duck.html&quot;</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">backend</span> <span class="o">=</span> <span class="n">HTMLDocumentBackend</span><span class="p">(</span><span class="n">in_doc</span><span class="o">=</span><span class="n">in_doc</span><span class="p">,</span> <span class="n">path_or_stream</span><span class="o">=</span><span class="n">BytesIO</span><span class="p">(</span><span class="n">text</span><span class="p">))</span>
<span class="n">dl_doc</span> <span class="o">=</span> <span class="n">backend</span><span class="o">.</span><span class="n">convert</span><span class="p">()</span>
<span class="nb">print</span><span class="p">(</span><span class="n">dl_doc</span><span class="o">.</span><span class="n">export_to_markdown</span><span class="p">())</span>
</code></pre></div>
<h2 id="chunking">Chunking</h2>
<p>You can chunk a Docling document using a <a href="../concepts/chunking/">chunker</a>, such as a
<code>HybridChunker</code>, as shown below (for more details check out
<a href="../examples/hybrid_chunking/">this example</a>):</p>
<div class="highlight"><pre><span></span><code><span class="kn">from</span><span class="w"> </span><span class="nn">docling.document_converter</span><span class="w"> </span><span class="kn">import</span> <span class="n">DocumentConverter</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">docling.chunking</span><span class="w"> </span><span class="kn">import</span> <span class="n">HybridChunker</span>
<span class="n">conv_res</span> <span class="o">=</span> <span class="n">DocumentConverter</span><span class="p">()</span><span class="o">.</span><span class="n">convert</span><span class="p">(</span><span class="s2">&quot;https://arxiv.org/pdf/2206.01062&quot;</span><span class="p">)</span>
<span class="n">doc</span> <span class="o">=</span> <span class="n">conv_res</span><span class="o">.</span><span class="n">document</span>
<span class="n">chunker</span> <span class="o">=</span> <span class="n">HybridChunker</span><span class="p">(</span><span class="n">tokenizer</span><span class="o">=</span><span class="s2">&quot;BAAI/bge-small-en-v1.5&quot;</span><span class="p">)</span> <span class="c1"># set tokenizer as needed</span>
<span class="n">chunk_iter</span> <span class="o">=</span> <span class="n">chunker</span><span class="o">.</span><span class="n">chunk</span><span class="p">(</span><span class="n">doc</span><span class="p">)</span>
</code></pre></div>
<p>An example chunk would look like this:</p>
<div class="highlight"><pre><span></span><code><span class="nb">print</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">chunk_iter</span><span class="p">)[</span><span class="mi">11</span><span class="p">])</span>
<span class="c1"># {</span>
<span class="c1"># &quot;text&quot;: &quot;In this paper, we present the DocLayNet dataset. [...]&quot;,</span>
<span class="c1"># &quot;meta&quot;: {</span>
<span class="c1"># &quot;doc_items&quot;: [{</span>
<span class="c1"># &quot;self_ref&quot;: &quot;#/texts/28&quot;,</span>
<span class="c1"># &quot;label&quot;: &quot;text&quot;,</span>
<span class="c1"># &quot;prov&quot;: [{</span>
<span class="c1"># &quot;page_no&quot;: 2,</span>
<span class="c1"># &quot;bbox&quot;: {&quot;l&quot;: 53.29, &quot;t&quot;: 287.14, &quot;r&quot;: 295.56, &quot;b&quot;: 212.37, ...},</span>
<span class="c1"># }], ...,</span>
<span class="c1"># }, ...],</span>
<span class="c1"># &quot;headings&quot;: [&quot;1 INTRODUCTION&quot;],</span>
<span class="c1"># }</span>
<span class="c1"># }</span>
</code></pre></div>
</article>
</div>
<script>var tabs=__md_get("__tabs");if(Array.isArray(tabs))e:for(var set of document.querySelectorAll(".tabbed-set")){var labels=set.querySelector(".tabbed-labels");for(var tab of tabs)for(var label of labels.getElementsByTagName("label"))if(label.innerText.trim()===tab){var input=document.getElementById(label.htmlFor);input.checked=!0;continue e}}</script>
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
</div>
<button type="button" class="md-top md-icon" data-md-component="top" hidden>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M13 20h-2V8l-5.5 5.5-1.42-1.42L12 4.16l7.92 7.92-1.42 1.42L13 8z"/></svg>
Back to top
</button>
</main>
<footer class="md-footer">
<nav class="md-footer__inner md-grid" aria-label="Footer" >
<a href="../installation/" class="md-footer__link md-footer__link--prev" aria-label="Previous: Installation">
<div class="md-footer__button md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11z"/></svg>
</div>
<div class="md-footer__title">
<span class="md-footer__direction">
Previous
</span>
<div class="md-ellipsis">
Installation
</div>
</div>
</a>
<a href="supported_formats/" class="md-footer__link md-footer__link--next" aria-label="Next: Supported formats">
<div class="md-footer__title">
<span class="md-footer__direction">
Next
</span>
<div class="md-ellipsis">
Supported formats
</div>
</div>
<div class="md-footer__button md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M4 11v2h12l-5.5 5.5 1.42 1.42L19.84 12l-7.92-7.92L10.5 5.5 16 11z"/></svg>
</div>
</a>
</nav>
<div class="md-footer-meta md-typeset">
<div class="md-footer-meta__inner md-grid">
<div class="md-copyright">
Made with
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
Material for MkDocs
</a>
</div>
</div>
</div>
</footer>
</div>
<div class="md-dialog" data-md-component="dialog">
<div class="md-dialog__inner md-typeset"></div>
</div>
<div class="md-progress" data-md-component="progress" role="progressbar"></div>
<script id="__config" type="application/json">{"base": "..", "features": ["content.tabs.link", "content.code.annotate", "content.code.copy", "announce.dismiss", "navigation.footer", "navigation.tabs", "navigation.indexes", "navigation.instant", "navigation.instant.prefetch", "navigation.instant.progress", "navigation.path", "navigation.sections", "navigation.top", "navigation.tracking", "search.suggest", "toc.follow"], "search": "../assets/javascripts/workers/search.d50fe291.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
<script src="../assets/javascripts/bundle.56ea9cef.min.js"></script>
</body>
</html>