<!-- Mirror of https://github.com/DS4SD/docling.git (synced 2025-07-26 20:14:47 +00:00). File: 3569 lines, 76 KiB, HTML. -->
|
|
<!doctype html>
|
|
<html lang="en" class="no-js">
|
|
<head>
|
|
|
|
<meta charset="utf-8">
|
|
<meta name="viewport" content="width=device-width,initial-scale=1">
|
|
|
|
|
|
|
|
<link rel="canonical" href="https://docling-project.github.io/docling/v2/">
|
|
|
|
|
|
|
|
|
|
<link rel="icon" href="../assets/logo.png">
|
|
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.6.15">
|
|
|
|
|
|
|
|
<title>V2 - Docling</title>
|
|
|
|
|
|
|
|
<link rel="stylesheet" href="../assets/stylesheets/main.342714a4.min.css">
|
|
|
|
|
|
<link rel="stylesheet" href="../assets/stylesheets/palette.06af60db.min.css">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
|
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
|
|
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
|
|
|
|
|
|
|
|
<link rel="stylesheet" href="../assets/_mkdocstrings.css">
|
|
|
|
<link rel="stylesheet" href="../stylesheets/extra.css">
|
|
|
|
<script>
  // Page bootstrap helpers. Each name is assigned without a declaration
  // keyword, so it becomes a property of the global object and is available
  // to scripts that run later on this page (the palette script calls __md_get).

  // URL one directory level above the current page.
  __md_scope = new URL("..", location);

  // String hash: folds the characters of `text` left to right, starting at 0,
  // with hash = (hash << 5) - hash + charCodeAt(0) of each character.
  __md_hash = text =>
    [...text].reduce((hash, ch) => (hash << 5) - hash + ch.charCodeAt(0), 0);

  // Reads and JSON-parses the entry stored under "<scope.pathname>.<key>".
  //   key     - entry name (appended to the scope path)
  //   storage - Storage-like object; localStorage when omitted
  //   scope   - URL whose pathname prefixes the key; __md_scope when omitted
  // Returns the parsed value, or null when nothing is stored. Also returns
  // null when the storage cannot be accessed or the stored text is not valid
  // JSON, so a blocked or corrupted store cannot abort the calling script.
  // This matches __md_set, which likewise ignores storage failures.
  __md_get = (key, storage, scope = __md_scope) => {
    try {
      // Resolve the default inside the try block: merely touching
      // localStorage can throw when storage access is denied.
      const store = storage === undefined ? localStorage : storage;
      return JSON.parse(store.getItem(scope.pathname + "." + key));
    } catch (err) {
      return null;
    }
  };

  // JSON-serialises `value` and stores it under "<scope.pathname>.<key>".
  //   storage - Storage-like object; localStorage when omitted
  //   scope   - URL whose pathname prefixes the key; __md_scope when omitted
  // Failures (access denied, quota exceeded, unserialisable value) are ignored.
  __md_set = (key, value, storage, scope = __md_scope) => {
    try {
      const store = storage === undefined ? localStorage : storage;
      store.setItem(scope.pathname + "." + key, JSON.stringify(value));
    } catch (err) {}
  };
</script>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
</head>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<body dir="ltr" data-md-color-scheme="default" data-md-color-primary="black" data-md-color-accent="indigo">
|
|
|
|
|
|
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
|
|
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
|
|
<label class="md-overlay" for="__drawer"></label>
|
|
<div data-md-component="skip">
  <!-- Skip link: jumps to the in-page anchor #whats-new. The link contains only its label text, so its accessible name is exactly "Skip to content". -->
  <a href="#whats-new" class="md-skip">
    Skip to content
  </a>
</div>
|
|
<div data-md-component="announce">
|
|
|
|
</div>
|
|
|
|
|
|
|
|
|
|
<header class="md-header" data-md-component="header">
|
|
<nav class="md-header__inner md-grid" aria-label="Header">
|
|
<a href=".." title="Docling" class="md-header__button md-logo" aria-label="Docling" data-md-component="logo">
|
|
|
|
<img src="../assets/logo.png" alt="logo">
|
|
|
|
</a>
|
|
<label class="md-header__button md-icon" for="__drawer">
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
|
|
</label>
|
|
<div class="md-header__title" data-md-component="header-title">
|
|
<div class="md-header__ellipsis">
|
|
<div class="md-header__topic">
|
|
<span class="md-ellipsis">
|
|
Docling
|
|
</span>
|
|
</div>
|
|
<div class="md-header__topic" data-md-component="header-topic">
|
|
<span class="md-ellipsis">
|
|
|
|
V2
|
|
|
|
</span>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
|
|
<form class="md-header__option" data-md-component="palette">
|
|
|
|
|
|
|
|
|
|
<input class="md-option" data-md-color-media="(prefers-color-scheme)" data-md-color-scheme="default" data-md-color-primary="black" data-md-color-accent="indigo" aria-label="Switch to light mode" type="radio" name="__palette" id="__palette_0">
|
|
|
|
<label class="md-header__button md-icon" title="Switch to light mode" for="__palette_1" hidden>
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="m14.3 16-.7-2h-3.2l-.7 2H7.8L11 7h2l3.2 9zM20 8.69V4h-4.69L12 .69 8.69 4H4v4.69L.69 12 4 15.31V20h4.69L12 23.31 15.31 20H20v-4.69L23.31 12zm-9.15 3.96h2.3L12 9z"/></svg>
|
|
</label>
|
|
|
|
|
|
|
|
|
|
|
|
<input class="md-option" data-md-color-media="(prefers-color-scheme: light)" data-md-color-scheme="default" data-md-color-primary="black" data-md-color-accent="indigo" aria-label="Switch to dark mode" type="radio" name="__palette" id="__palette_1">
|
|
|
|
<label class="md-header__button md-icon" title="Switch to dark mode" for="__palette_2" hidden>
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a4 4 0 0 0-4 4 4 4 0 0 0 4 4 4 4 0 0 0 4-4 4 4 0 0 0-4-4m0 10a6 6 0 0 1-6-6 6 6 0 0 1 6-6 6 6 0 0 1 6 6 6 6 0 0 1-6 6m8-9.31V4h-4.69L12 .69 8.69 4H4v4.69L.69 12 4 15.31V20h4.69L12 23.31 15.31 20H20v-4.69L23.31 12z"/></svg>
|
|
</label>
|
|
|
|
|
|
|
|
|
|
|
|
<input class="md-option" data-md-color-media="(prefers-color-scheme: dark)" data-md-color-scheme="slate" data-md-color-primary="black" data-md-color-accent="indigo" aria-label="Switch to system preference" type="radio" name="__palette" id="__palette_2">
|
|
|
|
<label class="md-header__button md-icon" title="Switch to system preference" for="__palette_0" hidden>
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 18c-.89 0-1.74-.2-2.5-.55C11.56 16.5 13 14.42 13 12s-1.44-4.5-3.5-5.45C10.26 6.2 11.11 6 12 6a6 6 0 0 1 6 6 6 6 0 0 1-6 6m8-9.31V4h-4.69L12 .69 8.69 4H4v4.69L.69 12 4 15.31V20h4.69L12 23.31 15.31 20H20v-4.69L23.31 12z"/></svg>
|
|
</label>
|
|
|
|
|
|
</form>
|
|
|
|
|
|
|
|
<script>
  // Applies the stored colour palette to <body>.
  // The declarations below use `var` at script top level, so `palette`,
  // `media`, `input`, `key` and `value` are global names; they are kept as-is.

  // Stored palette entry, read through the __md_get helper.
  var palette = __md_get("__palette");

  if (palette && palette.color) {
    // A stored media query of "(prefers-color-scheme)" means the concrete
    // scheme is taken from whichever palette input matches the current
    // light/dark preference.
    if (palette.color.media === "(prefers-color-scheme)") {
      var media = matchMedia("(prefers-color-scheme: light)");
      var input = document.querySelector(
        media.matches
          ? "[data-md-color-media='(prefers-color-scheme: light)']"
          : "[data-md-color-media='(prefers-color-scheme: dark)']"
      );

      // Copy the matched input's colour settings into the palette object.
      palette.color.media = input.getAttribute("data-md-color-media");
      palette.color.scheme = input.getAttribute("data-md-color-scheme");
      palette.color.primary = input.getAttribute("data-md-color-primary");
      palette.color.accent = input.getAttribute("data-md-color-accent");
    }

    // Mirror every palette.color entry onto <body> as data-md-color-<key>.
    for (var [key, value] of Object.entries(palette.color)) {
      document.body.setAttribute("data-md-color-" + key, value);
    }
  }
</script>
|
|
|
|
|
|
|
|
|
|
|
|
<label class="md-header__button md-icon" for="__search">
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
|
|
</label>
|
|
<div class="md-search" data-md-component="search" role="dialog" aria-label="Search">
  <!-- Search UI: an overlay label and the search form (query input, icon label, reset button, suggestion slot), followed by the result area. The dialog carries an aria-label so the role="dialog" container has an accessible name. -->
  <label class="md-search__overlay" for="__search"></label>
  <div class="md-search__inner" role="search">
    <form class="md-search__form" name="search">
      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
      <!-- Icon label bound to the #__search checkbox; holds a magnifier icon and a back-arrow icon. -->
      <label class="md-search__icon md-icon" for="__search">
        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11z"/></svg>
      </label>
      <nav class="md-search__options" aria-label="Search">
        <!-- Reset button: clears the form; tabindex="-1" keeps it out of the tab order. -->
        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12z"/></svg>
        </button>
      </nav>
      <div class="md-search__suggest" data-md-component="search-suggest"></div>
    </form>
    <div class="md-search__output">
      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
        <div class="md-search-result" data-md-component="search-result">
          <div class="md-search-result__meta">
            Initializing search
          </div>
          <ol class="md-search-result__list" role="presentation"></ol>
        </div>
      </div>
    </div>
  </div>
</div>
|
|
|
|
|
|
|
|
<div class="md-header__source">
|
|
<a href="https://github.com/docling-project/docling" title="Go to repository" class="md-source" data-md-component="source">
|
|
<div class="md-source__icon md-icon">
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.7.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81"/></svg>
|
|
</div>
|
|
<div class="md-source__repository">
|
|
docling-project/docling
|
|
</div>
|
|
</a>
|
|
</div>
|
|
|
|
</nav>
|
|
|
|
</header>
|
|
|
|
<div class="md-container" data-md-component="container">
|
|
|
|
|
|
|
|
|
|
|
|
<nav class="md-tabs" aria-label="Tabs" data-md-component="tabs">
  <!-- Tab list with five links: Home, Concepts, Examples, Integrations, Reference. The list contains only <li> children, as the <ul> content model requires. -->
  <div class="md-grid">
    <ul class="md-tabs__list">
      <li class="md-tabs__item">
        <a href=".." class="md-tabs__link">
          Home
        </a>
      </li>
      <li class="md-tabs__item">
        <a href="../concepts/" class="md-tabs__link">
          Concepts
        </a>
      </li>
      <li class="md-tabs__item">
        <a href="../examples/" class="md-tabs__link">
          Examples
        </a>
      </li>
      <li class="md-tabs__item">
        <a href="../integrations/" class="md-tabs__link">
          Integrations
        </a>
      </li>
      <li class="md-tabs__item">
        <a href="../reference/document_converter/" class="md-tabs__link">
          Reference
        </a>
      </li>
    </ul>
  </div>
</nav>
|
|
|
|
|
|
|
|
<main class="md-main" data-md-component="main">
|
|
<div class="md-main__inner md-grid">
|
|
|
|
|
|
|
|
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
|
|
<div class="md-sidebar__scrollwrap">
|
|
<div class="md-sidebar__inner">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<nav class="md-nav md-nav--primary md-nav--lifted" aria-label="Navigation" data-md-level="0">
|
|
<label class="md-nav__title" for="__drawer">
|
|
<a href=".." title="Docling" class="md-nav__button md-logo" aria-label="Docling" data-md-component="logo">
|
|
|
|
<img src="../assets/logo.png" alt="logo">
|
|
|
|
</a>
|
|
Docling
|
|
</label>
|
|
|
|
<div class="md-nav__source">
|
|
<a href="https://github.com/docling-project/docling" title="Go to repository" class="md-source" data-md-component="source">
|
|
<div class="md-source__icon md-icon">
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.7.2 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81"/></svg>
|
|
</div>
|
|
<div class="md-source__repository">
|
|
docling-project/docling
|
|
</div>
|
|
</a>
|
|
</div>
|
|
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_1" >
|
|
|
|
|
|
<div class="md-nav__link md-nav__container">
|
|
<a href=".." class="md-nav__link ">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Home
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
|
|
|
|
<label class="md-nav__link " for="__nav_1" id="__nav_1_label" tabindex="0">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
</div>
|
|
|
|
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_1_label" aria-expanded="false">
|
|
<label class="md-nav__title" for="__nav_1">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
Home
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
  <!-- "Installation" entry: a checkbox toggle, the section link, and a nested sub-navigation whose list has no items. -->
  <input class="md-nav__toggle md-toggle" type="checkbox" id="__nav_1_2">
  <div class="md-nav__link md-nav__container">
    <a href="../installation/" class="md-nav__link">
      <span class="md-ellipsis">
        Installation
      </span>
    </a>
  </div>
  <!-- The nested nav is named through aria-labelledby. None of the markup visible for this item defined the referenced id, so the title label below now carries it. -->
  <nav class="md-nav" data-md-level="2" aria-labelledby="__nav_1_2_label" aria-expanded="false">
    <label class="md-nav__title" for="__nav_1_2" id="__nav_1_2_label">
      <span class="md-nav__icon md-icon"></span>
      Installation
    </label>
    <ul class="md-nav__list" data-md-scrollfix>
    </ul>
  </nav>
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_1_3" >
|
|
|
|
|
|
<div class="md-nav__link md-nav__container">
|
|
<a href="../usage/" class="md-nav__link ">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Usage
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
|
|
|
|
<label class="md-nav__link " for="__nav_1_3" id="__nav_1_3_label" tabindex="0">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
</div>
|
|
|
|
<nav class="md-nav" data-md-level="2" aria-labelledby="__nav_1_3_label" aria-expanded="false">
|
|
<label class="md-nav__title" for="__nav_1_3">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
Usage
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../usage/supported_formats/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Supported formats
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../usage/enrichments/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Enrichment features
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../usage/vision_models/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Vision models
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
  <!-- "FAQ" entry: a checkbox toggle, the section link, and a nested sub-navigation whose list has no items. -->
  <input class="md-nav__toggle md-toggle" type="checkbox" id="__nav_1_4">
  <div class="md-nav__link md-nav__container">
    <a href="../faq/" class="md-nav__link">
      <span class="md-ellipsis">
        FAQ
      </span>
    </a>
  </div>
  <!-- The nested nav is named through aria-labelledby. None of the markup visible for this item defined the referenced id, so the title label below now carries it. -->
  <nav class="md-nav" data-md-level="2" aria-labelledby="__nav_1_4_label" aria-expanded="false">
    <label class="md-nav__title" for="__nav_1_4" id="__nav_1_4_label">
      <span class="md-nav__icon md-icon"></span>
      FAQ
    </label>
    <ul class="md-nav__list" data-md-scrollfix>
    </ul>
  </nav>
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
|
|
|
|
|
|
<div class="md-nav__link md-nav__container">
|
|
<a href="../concepts/" class="md-nav__link ">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Concepts
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
|
|
|
|
<label class="md-nav__link " for="__nav_2" id="__nav_2_label" tabindex="0">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
</div>
|
|
|
|
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
|
|
<label class="md-nav__title" for="__nav_2">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
Concepts
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../concepts/architecture/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Architecture
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../concepts/docling_document/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Docling Document
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../concepts/serialization/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Serialization
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../concepts/confidence_scores/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Confidence Scores
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../concepts/chunking/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Chunking
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../concepts/plugins/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Plugins
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" >
|
|
|
|
|
|
<div class="md-nav__link md-nav__container">
|
|
<a href="../examples/" class="md-nav__link ">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Examples
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
|
|
|
|
<label class="md-nav__link " for="__nav_3" id="__nav_3_label" tabindex="0">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
</div>
|
|
|
|
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="false">
|
|
<label class="md-nav__title" for="__nav_3">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
Examples
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3_2" >
|
|
|
|
|
|
<label class="md-nav__link" for="__nav_3_2" id="__nav_3_2_label" tabindex="0">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
🔀 Conversion
|
|
|
|
</span>
|
|
|
|
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<nav class="md-nav" data-md-level="2" aria-labelledby="__nav_3_2_label" aria-expanded="false">
|
|
<label class="md-nav__title" for="__nav_3_2">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
🔀 Conversion
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/minimal/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Simple conversion
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/custom_convert/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Custom conversion
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/batch_convert/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Batch conversion
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/run_with_formats/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Multi-format conversion
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/minimal_vlm_pipeline/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
VLM pipeline with SmolDocling
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/vlm_pipeline_api_model/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
VLM pipeline with remote model
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/compare_vlm_models/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Compare VLM models
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/minimal_asr_pipeline/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
ASR pipeline with Whisper
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/export_figures/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Figure export
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/export_tables/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Table export
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/export_multimodal/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Multimodal export
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/full_page_ocr/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Force full page OCR
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/tesseract_lang_detection/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Automatic OCR language detection with tesseract
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/rapidocr_with_custom_models/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
RapidOCR with custom OCR models
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/run_with_accelerator/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Accelerator options
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/translate/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Simple translation
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/backend_csv/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Conversion of CSV files
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/backend_xml_rag/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Conversion of custom XML
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3_3" >
|
|
|
|
|
|
<label class="md-nav__link" for="__nav_3_3" id="__nav_3_3_label" tabindex="0">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
✂️ Serialization & chunking
|
|
|
|
</span>
|
|
|
|
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<nav class="md-nav" data-md-level="2" aria-labelledby="__nav_3_3_label" aria-expanded="false">
|
|
<label class="md-nav__title" for="__nav_3_3">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
✂️ Serialization & chunking
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/serialization/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Serialization
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/hybrid_chunking/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Hybrid chunking
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/advanced_chunking_and_serialization/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Advanced chunking & serialization
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3_4" >
|
|
|
|
|
|
<label class="md-nav__link" for="__nav_3_4" id="__nav_3_4_label" tabindex="0">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
🤖 RAG with AI dev frameworks
|
|
|
|
</span>
|
|
|
|
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<nav class="md-nav" data-md-level="2" aria-labelledby="__nav_3_4_label" aria-expanded="false">
|
|
<label class="md-nav__title" for="__nav_3_4">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
🤖 RAG with AI dev frameworks
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/rag_haystack/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
RAG with Haystack
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/rag_langchain/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
RAG with LangChain
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/rag_llamaindex/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
RAG with LlamaIndex
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/visual_grounding/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Visual grounding
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3_5" >
|
|
|
|
|
|
<label class="md-nav__link" for="__nav_3_5" id="__nav_3_5_label" tabindex="0">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
🖼️ Picture annotation
|
|
|
|
</span>
|
|
|
|
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<nav class="md-nav" data-md-level="2" aria-labelledby="__nav_3_5_label" aria-expanded="false">
|
|
<label class="md-nav__title" for="__nav_3_5">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
🖼️ Picture annotation
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/pictures_description/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Annotate picture with local VLM
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/pictures_description_api/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Annotate picture with remote VLM
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3_6" >
|
|
|
|
|
|
<label class="md-nav__link" for="__nav_3_6" id="__nav_3_6_label" tabindex="0">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
✨ Enrichment development
|
|
|
|
</span>
|
|
|
|
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<nav class="md-nav" data-md-level="2" aria-labelledby="__nav_3_6_label" aria-expanded="false">
|
|
<label class="md-nav__title" for="__nav_3_6">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
✨ Enrichment development
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/develop_picture_enrichment/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Figure enrichment
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/develop_formula_understanding/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Formula enrichment
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/enrich_doclingdocument/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Enrich DoclingDocument
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3_7" >
|
|
|
|
|
|
<label class="md-nav__link" for="__nav_3_7" id="__nav_3_7_label" tabindex="0">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
🗂️ More examples
|
|
|
|
</span>
|
|
|
|
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<nav class="md-nav" data-md-level="2" aria-labelledby="__nav_3_7_label" aria-expanded="false">
|
|
<label class="md-nav__title" for="__nav_3_7">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
🗂️ More examples
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/rag_milvus/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
RAG with Milvus
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/rag_weaviate/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
RAG with Weaviate
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="https://github.com/ibm-granite-community/granite-snack-cookbook/blob/main/recipes/RAG/Granite_Docling_RAG.ipynb" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
RAG with Granite [↗]
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/rag_azuresearch/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
RAG with Azure AI Search
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../examples/retrieval_qdrant/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Retrieval with Qdrant
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
|
|
|
|
|
|
<div class="md-nav__link md-nav__container">
|
|
<a href="../integrations/" class="md-nav__link ">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Integrations
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
|
|
|
|
<label class="md-nav__link " for="__nav_4" id="__nav_4_label" tabindex="0">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
</div>
|
|
|
|
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
|
|
<label class="md-nav__title" for="__nav_4">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
Integrations
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4_2" >
|
|
|
|
|
|
<label class="md-nav__link" for="__nav_4_2" id="__nav_4_2_label" tabindex="0">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
🤖 Agentic / AI dev frameworks
|
|
|
|
</span>
|
|
|
|
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<nav class="md-nav" data-md-level="2" aria-labelledby="__nav_4_2_label" aria-expanded="false">
|
|
<label class="md-nav__title" for="__nav_4_2">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
🤖 Agentic / AI dev frameworks
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../integrations/bee/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Bee Agent Framework
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../integrations/crewai/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Crew AI
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../integrations/haystack/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Haystack
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../integrations/langchain/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
LangChain
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../integrations/llamaindex/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
LlamaIndex
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../integrations/txtai/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
txtai
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4_3" >
|
|
|
|
|
|
<label class="md-nav__link" for="__nav_4_3" id="__nav_4_3_label" tabindex="0">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
⭐️ Featured
|
|
|
|
</span>
|
|
|
|
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<nav class="md-nav" data-md-level="2" aria-labelledby="__nav_4_3_label" aria-expanded="false">
|
|
<label class="md-nav__title" for="__nav_4_3">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
⭐️ Featured
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../integrations/apify/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Apify
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../integrations/data_prep_kit/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Data Prep Kit
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../integrations/instructlab/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
InstructLab
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../integrations/nvidia/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
NVIDIA
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../integrations/prodigy/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Prodigy
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../integrations/rhel_ai/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
RHEL AI
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../integrations/spacy/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
spaCy
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4_4" >
|
|
|
|
|
|
<label class="md-nav__link" for="__nav_4_4" id="__nav_4_4_label" tabindex="0">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
🗂️ More integrations
|
|
|
|
</span>
|
|
|
|
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<nav class="md-nav" data-md-level="2" aria-labelledby="__nav_4_4_label" aria-expanded="false">
|
|
<label class="md-nav__title" for="__nav_4_4">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
🗂️ More integrations
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../integrations/cloudera/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Cloudera
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../integrations/docetl/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
DocETL
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../integrations/kotaemon/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Kotaemon
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../integrations/opencontracts/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
OpenContracts
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../integrations/openwebui/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Open WebUI
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../integrations/vectara/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Vectara
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_5" >
|
|
|
|
|
|
<label class="md-nav__link" for="__nav_5" id="__nav_5_label" tabindex="0">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Reference
|
|
|
|
</span>
|
|
|
|
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_5_label" aria-expanded="false">
|
|
<label class="md-nav__title" for="__nav_5">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
Reference
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_5_1" >
|
|
|
|
|
|
<label class="md-nav__link" for="__nav_5_1" id="__nav_5_1_label" tabindex="0">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Python API
|
|
|
|
</span>
|
|
|
|
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<nav class="md-nav" data-md-level="2" aria-labelledby="__nav_5_1_label" aria-expanded="false">
|
|
<label class="md-nav__title" for="__nav_5_1">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
Python API
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../reference/document_converter/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Document Converter
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../reference/pipeline_options/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Pipeline options
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../reference/docling_document/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
Docling Document
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_5_2" >
|
|
|
|
|
|
<label class="md-nav__link" for="__nav_5_2" id="__nav_5_2_label" tabindex="0">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
CLI
|
|
|
|
</span>
|
|
|
|
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<nav class="md-nav" data-md-level="2" aria-labelledby="__nav_5_2_label" aria-expanded="false">
|
|
<label class="md-nav__title" for="__nav_5_2">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
CLI
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../reference/cli/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
CLI reference
|
|
|
|
</span>
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
|
|
|
|
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
|
|
<div class="md-sidebar__scrollwrap">
|
|
<div class="md-sidebar__inner">
|
|
|
|
|
|
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
|
|
|
|
|
|
|
|
|
<label class="md-nav__title" for="__toc">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
Table of contents
|
|
</label>
|
|
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#whats-new" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
What's new
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#changes-in-docling-v2" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Changes in Docling v2
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Changes in Docling v2">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#cli" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
CLI
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#setting-up-a-documentconverter" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Setting up a DocumentConverter
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#converting-documents" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Converting documents
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#access-document-structures" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Access document structures
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#export-into-json-markdown-doctags" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Export into JSON, Markdown, Doctags
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#reload-a-doclingdocument-stored-as-json" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Reload a DoclingDocument stored as JSON
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#chunking" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
Chunking
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
|
|
</nav>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
|
|
|
|
<div class="md-content" data-md-component="content">
|
|
<article class="md-content__inner md-typeset">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<h1>V2</h1>
|
|
|
|
<h2 id="whats-new">What's new</h2>
|
|
<p>Docling v2 introduces several new features:</p>
|
|
<ul>
|
|
<li>Understands and converts PDF, MS Word, MS PowerPoint, HTML and several image formats</li>
|
|
<li>Produces a new, universal document representation which can encapsulate document hierarchy</li>
|
|
<li>Comes with a fresh new API and CLI</li>
|
|
</ul>
|
|
<h2 id="changes-in-docling-v2">Changes in Docling v2</h2>
|
|
<h3 id="cli">CLI</h3>
|
|
<p>We updated the command line syntax of Docling v2 to support many formats. Examples are shown below.
|
|
<div class="highlight"><pre><span></span><code><span class="c1"># Convert a single file to Markdown (default)</span>
|
|
docling<span class="w"> </span>myfile.pdf
|
|
|
|
<span class="c1"># Convert a single file to Markdown and JSON, without OCR</span>
|
|
docling<span class="w"> </span>myfile.pdf<span class="w"> </span>--to<span class="w"> </span>json<span class="w"> </span>--to<span class="w"> </span>md<span class="w"> </span>--no-ocr
|
|
|
|
<span class="c1"># Convert PDF files in input directory to Markdown (default)</span>
|
|
docling<span class="w"> </span>./input/dir<span class="w"> </span>--from<span class="w"> </span>pdf
|
|
|
|
<span class="c1"># Convert PDF and Word files in input directory to Markdown and JSON</span>
|
|
docling<span class="w"> </span>./input/dir<span class="w"> </span>--from<span class="w"> </span>pdf<span class="w"> </span>--from<span class="w"> </span>docx<span class="w"> </span>--to<span class="w"> </span>md<span class="w"> </span>--to<span class="w"> </span>json<span class="w"> </span>--output<span class="w"> </span>./scratch
|
|
|
|
<span class="c1"># Convert all supported files in input directory to Markdown, but abort on first error</span>
|
|
docling<span class="w"> </span>./input/dir<span class="w"> </span>--output<span class="w"> </span>./scratch<span class="w"> </span>--abort-on-error
|
|
</code></pre></div>
|
|
<p><strong>Notable changes from Docling v1:</strong></p>
|
|
<ul>
|
|
<li>The standalone switches for different export formats are removed, and replaced with <code>--from</code> and <code>--to</code> arguments, to define input and output formats respectively.</li>
|
|
<li>The new <code>--abort-on-error</code> will abort any batch conversion as soon as an error is encountered</li>
|
|
<li>The <code>--backend</code> option for PDFs was removed</li>
|
|
</ul>
|
|
<h3 id="setting-up-a-documentconverter">Setting up a <code>DocumentConverter</code></h3>
|
|
<p>To accommodate many input formats, we changed the way you need to set up your <code>DocumentConverter</code> object.
|
|
You can now define a list of allowed formats on the <code>DocumentConverter</code> initialization, and specify custom options
|
|
per-format if desired. By default, all supported formats are allowed. If you don't provide <code>format_options</code>, defaults
|
|
will be used for all <code>allowed_formats</code>.</p>
|
|
<p>Format options can include the pipeline class to use, the options to provide to the pipeline, and the document backend.
|
|
They are provided as format-specific types, such as <code>PdfFormatOption</code> or <code>WordFormatOption</code>, as seen below.</p>
|
|
<div class="highlight"><pre><span></span><code><span class="kn">from</span><span class="w"> </span><span class="nn">docling.document_converter</span><span class="w"> </span><span class="kn">import</span> <span class="n">DocumentConverter</span>
|
|
<span class="kn">from</span><span class="w"> </span><span class="nn">docling.datamodel.base_models</span><span class="w"> </span><span class="kn">import</span> <span class="n">InputFormat</span>
|
|
<span class="kn">from</span><span class="w"> </span><span class="nn">docling.document_converter</span><span class="w"> </span><span class="kn">import</span> <span class="p">(</span>
|
|
<span class="n">DocumentConverter</span><span class="p">,</span>
|
|
<span class="n">PdfFormatOption</span><span class="p">,</span>
|
|
<span class="n">WordFormatOption</span><span class="p">,</span>
|
|
<span class="p">)</span>
|
|
<span class="kn">from</span><span class="w"> </span><span class="nn">docling.pipeline.simple_pipeline</span><span class="w"> </span><span class="kn">import</span> <span class="n">SimplePipeline</span>
|
|
<span class="kn">from</span><span class="w"> </span><span class="nn">docling.pipeline.standard_pdf_pipeline</span><span class="w"> </span><span class="kn">import</span> <span class="n">StandardPdfPipeline</span>
|
|
<span class="kn">from</span><span class="w"> </span><span class="nn">docling.datamodel.pipeline_options</span><span class="w"> </span><span class="kn">import</span> <span class="n">PdfPipelineOptions</span>
|
|
<span class="kn">from</span><span class="w"> </span><span class="nn">docling.backend.pypdfium2_backend</span><span class="w"> </span><span class="kn">import</span> <span class="n">PyPdfiumDocumentBackend</span>
|
|
|
|
<span class="c1">## Default initialization still works as before:</span>
|
|
<span class="c1"># doc_converter = DocumentConverter()</span>
|
|
|
|
|
|
<span class="c1"># previous `PipelineOptions` is now `PdfPipelineOptions`</span>
|
|
<span class="n">pipeline_options</span> <span class="o">=</span> <span class="n">PdfPipelineOptions</span><span class="p">()</span>
|
|
<span class="n">pipeline_options</span><span class="o">.</span><span class="n">do_ocr</span> <span class="o">=</span> <span class="kc">False</span>
|
|
<span class="n">pipeline_options</span><span class="o">.</span><span class="n">do_table_structure</span> <span class="o">=</span> <span class="kc">True</span>
|
|
<span class="c1">#...</span>
|
|
|
|
<span class="c1">## Custom options are now defined per format.</span>
|
|
<span class="n">doc_converter</span> <span class="o">=</span> <span class="p">(</span>
|
|
<span class="n">DocumentConverter</span><span class="p">(</span> <span class="c1"># all of the below is optional, has internal defaults.</span>
|
|
<span class="n">allowed_formats</span><span class="o">=</span><span class="p">[</span>
|
|
<span class="n">InputFormat</span><span class="o">.</span><span class="n">PDF</span><span class="p">,</span>
|
|
<span class="n">InputFormat</span><span class="o">.</span><span class="n">IMAGE</span><span class="p">,</span>
|
|
<span class="n">InputFormat</span><span class="o">.</span><span class="n">DOCX</span><span class="p">,</span>
|
|
<span class="n">InputFormat</span><span class="o">.</span><span class="n">HTML</span><span class="p">,</span>
|
|
<span class="n">InputFormat</span><span class="o">.</span><span class="n">PPTX</span><span class="p">,</span>
|
|
<span class="p">],</span> <span class="c1"># whitelist formats, non-matching files are ignored.</span>
|
|
<span class="n">format_options</span><span class="o">=</span><span class="p">{</span>
|
|
<span class="n">InputFormat</span><span class="o">.</span><span class="n">PDF</span><span class="p">:</span> <span class="n">PdfFormatOption</span><span class="p">(</span>
|
|
<span class="n">pipeline_options</span><span class="o">=</span><span class="n">pipeline_options</span><span class="p">,</span> <span class="c1"># pipeline options go here.</span>
|
|
<span class="n">backend</span><span class="o">=</span><span class="n">PyPdfiumDocumentBackend</span> <span class="c1"># optional: pick an alternative backend</span>
|
|
<span class="p">),</span>
|
|
<span class="n">InputFormat</span><span class="o">.</span><span class="n">DOCX</span><span class="p">:</span> <span class="n">WordFormatOption</span><span class="p">(</span>
|
|
<span class="n">pipeline_cls</span><span class="o">=</span><span class="n">SimplePipeline</span> <span class="c1"># default for office formats and HTML</span>
|
|
<span class="p">),</span>
|
|
<span class="p">},</span>
|
|
<span class="p">)</span>
|
|
<span class="p">)</span>
|
|
</code></pre></div>
|
|
<p><strong>Note</strong>: If you work only with defaults, all remains the same as in Docling v1.</p>
|
|
<p>More options are shown in the following example units:</p>
|
|
<ul>
|
|
<li><a href="../examples/run_with_formats/">run_with_formats.py</a></li>
|
|
<li><a href="../examples/custom_convert/">custom_convert.py</a></li>
|
|
</ul>
|
|
<h3 id="converting-documents">Converting documents</h3>
|
|
<p>We have simplified the way you can feed input to the <code>DocumentConverter</code> and renamed the conversion methods for
|
|
better semantics. You can now call the conversion directly with a single file, or a list of input files,
|
|
or <code>DocumentStream</code> objects, without constructing a <code>DocumentConversionInput</code> object first.</p>
|
|
<ul>
|
|
<li><code>DocumentConverter.convert</code> now converts a single file input (previously <code>DocumentConverter.convert_single</code>).</li>
|
|
<li><code>DocumentConverter.convert_all</code> now converts many files at once (previously <code>DocumentConverter.convert</code>).</li>
|
|
</ul>
|
|
<div class="highlight"><pre><span></span><code><span class="o">...</span>
|
|
<span class="kn">from</span><span class="w"> </span><span class="nn">docling.datamodel.document</span><span class="w"> </span><span class="kn">import</span> <span class="n">ConversionResult</span>
|
|
<span class="c1">## Convert a single file (from URL or local path)</span>
|
|
<span class="n">conv_result</span><span class="p">:</span> <span class="n">ConversionResult</span> <span class="o">=</span> <span class="n">doc_converter</span><span class="o">.</span><span class="n">convert</span><span class="p">(</span><span class="s2">"https://arxiv.org/pdf/2408.09869"</span><span class="p">)</span> <span class="c1"># previously `convert_single`</span>
|
|
|
|
<span class="c1">## Convert several files at once:</span>
|
|
|
|
<span class="n">input_files</span> <span class="o">=</span> <span class="p">[</span>
|
|
<span class="s2">"tests/data/html/wiki_duck.html"</span><span class="p">,</span>
|
|
<span class="s2">"tests/data/docx/word_sample.docx"</span><span class="p">,</span>
|
|
<span class="s2">"tests/data/docx/lorem_ipsum.docx"</span><span class="p">,</span>
|
|
<span class="s2">"tests/data/pptx/powerpoint_sample.pptx"</span><span class="p">,</span>
|
|
<span class="s2">"tests/data/2305.03393v1-pg9-img.png"</span><span class="p">,</span>
|
|
<span class="s2">"tests/data/pdf/2206.01062.pdf"</span><span class="p">,</span>
|
|
<span class="p">]</span>
|
|
|
|
<span class="c1"># Directly pass list of files or streams to `convert_all`</span>
|
|
<span class="n">conv_results_iter</span> <span class="o">=</span> <span class="n">doc_converter</span><span class="o">.</span><span class="n">convert_all</span><span class="p">(</span><span class="n">input_files</span><span class="p">)</span> <span class="c1"># previously `convert`</span>
|
|
</code></pre></div>
|
|
<p>Through the <code>raises_on_error</code> argument, you can also control if the conversion should raise exceptions when first
|
|
encountering a problem, or resiliently convert all files first and reflect errors in each file's conversion status.
|
|
By default, any error is immediately raised and the conversion aborts (previously, exceptions were swallowed).</p>
|
|
<div class="highlight"><pre><span></span><code><span class="o">...</span>
|
|
<span class="n">conv_results_iter</span> <span class="o">=</span> <span class="n">doc_converter</span><span class="o">.</span><span class="n">convert_all</span><span class="p">(</span><span class="n">input_files</span><span class="p">,</span> <span class="n">raises_on_error</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> <span class="c1"># previously `convert`</span>
|
|
</code></pre></div>
|
|
<h3 id="access-document-structures">Access document structures</h3>
|
|
<p>We have simplified how you can access and export the converted document data, too. Our universal document representation
|
|
is now available in conversion results as a <code>DoclingDocument</code> object.
|
|
<code>DoclingDocument</code> provides a neat set of APIs to construct, iterate and export content in the document, as shown below.</p>
|
|
<div class="highlight"><pre><span></span><code><span class="n">conv_result</span><span class="p">:</span> <span class="n">ConversionResult</span> <span class="o">=</span> <span class="n">doc_converter</span><span class="o">.</span><span class="n">convert</span><span class="p">(</span><span class="s2">"https://arxiv.org/pdf/2408.09869"</span><span class="p">)</span> <span class="c1"># previously `convert_single`</span>
|
|
|
|
<span class="c1">## Inspect the converted document:</span>
|
|
<span class="n">conv_result</span><span class="o">.</span><span class="n">document</span><span class="o">.</span><span class="n">print_element_tree</span><span class="p">()</span>
|
|
|
|
<span class="c1">## Iterate the elements in reading order, including hierarchy level:</span>
|
|
<span class="k">for</span> <span class="n">item</span><span class="p">,</span> <span class="n">level</span> <span class="ow">in</span> <span class="n">conv_result</span><span class="o">.</span><span class="n">document</span><span class="o">.</span><span class="n">iterate_items</span><span class="p">():</span>
|
|
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">item</span><span class="p">,</span> <span class="n">TextItem</span><span class="p">):</span>
|
|
<span class="nb">print</span><span class="p">(</span><span class="n">item</span><span class="o">.</span><span class="n">text</span><span class="p">)</span>
|
|
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">item</span><span class="p">,</span> <span class="n">TableItem</span><span class="p">):</span>
|
|
<span class="n">table_df</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span> <span class="o">=</span> <span class="n">item</span><span class="o">.</span><span class="n">export_to_dataframe</span><span class="p">()</span>
|
|
<span class="nb">print</span><span class="p">(</span><span class="n">table_df</span><span class="o">.</span><span class="n">to_markdown</span><span class="p">())</span>
|
|
<span class="k">elif</span> <span class="o">...</span><span class="p">:</span>
|
|
<span class="c1">#...</span>
|
|
</code></pre></div>
|
|
<p><strong>Note</strong>: While it is deprecated, you can <em>still</em> work with the Docling v1 document representation; it is available as:
|
|
<div class="highlight"><pre><span></span><code>conv_result.legacy_document<span class="w"> </span><span class="c1"># provides the representation in previous ExportedCCSDocument type</span>
|
|
</code></pre></div>
|
|
<h3 id="export-into-json-markdown-doctags">Export into JSON, Markdown, Doctags</h3>
|
|
<p><strong>Note</strong>: All <code>render_...</code> methods in <code>ConversionResult</code> have been removed in Docling v2,
|
|
and are now available on <code>DoclingDocument</code> as:</p>
|
|
<ul>
|
|
<li><code>DoclingDocument.export_to_dict</code></li>
|
|
<li><code>DoclingDocument.export_to_markdown</code></li>
|
|
<li><code>DoclingDocument.export_to_document_tokens</code></li>
|
|
</ul>
|
|
<div class="highlight"><pre><span></span><code><span class="n">conv_res</span><span class="p">:</span> <span class="n">ConversionResult</span> <span class="o">=</span> <span class="n">doc_converter</span><span class="o">.</span><span class="n">convert</span><span class="p">(</span><span class="s2">"https://arxiv.org/pdf/2408.09869"</span><span class="p">)</span> <span class="c1"># previously `convert_single`</span>
|
|
|
|
<span class="c1">## Export to desired format:</span>
|
|
<span class="nb">print</span><span class="p">(</span><span class="n">json</span><span class="o">.</span><span class="n">dumps</span><span class="p">(</span><span class="n">conv_res</span><span class="o">.</span><span class="n">document</span><span class="o">.</span><span class="n">export_to_dict</span><span class="p">()))</span>
|
|
<span class="nb">print</span><span class="p">(</span><span class="n">conv_res</span><span class="o">.</span><span class="n">document</span><span class="o">.</span><span class="n">export_to_markdown</span><span class="p">())</span>
|
|
<span class="nb">print</span><span class="p">(</span><span class="n">conv_res</span><span class="o">.</span><span class="n">document</span><span class="o">.</span><span class="n">export_to_document_tokens</span><span class="p">())</span>
|
|
</code></pre></div>
|
|
<p><strong>Note</strong>: While it is deprecated, you can <em>still</em> export Docling v1 JSON format. This is available through the same
|
|
methods as on the <code>DoclingDocument</code> type:</p>
|
|
<div class="highlight"><pre><span></span><code><span class="c1">## Export legacy document representation to desired format, for v1 compatibility:</span>
|
|
print<span class="o">(</span>json.dumps<span class="o">(</span>conv_res.legacy_document.export_to_dict<span class="o">()))</span>
|
|
print<span class="o">(</span>conv_res.legacy_document.export_to_markdown<span class="o">())</span>
|
|
print<span class="o">(</span>conv_res.legacy_document.export_to_document_tokens<span class="o">())</span>
|
|
</code></pre></div>
|
|
<h3 id="reload-a-doclingdocument-stored-as-json">Reload a <code>DoclingDocument</code> stored as JSON</h3>
|
|
<p>You can save a <code>DoclingDocument</code> to disk in JSON format and reload it using the following code:</p>
|
|
<div class="highlight"><pre><span></span><code><span class="c1"># Save to disk:</span>
|
|
<span class="n">doc</span><span class="p">:</span> <span class="n">DoclingDocument</span> <span class="o">=</span> <span class="n">conv_res</span><span class="o">.</span><span class="n">document</span> <span class="c1"># produced from conversion result...</span>
|
|
|
|
<span class="k">with</span> <span class="n">Path</span><span class="p">(</span><span class="s2">"./doc.json"</span><span class="p">)</span><span class="o">.</span><span class="n">open</span><span class="p">(</span><span class="s2">"w"</span><span class="p">)</span> <span class="k">as</span> <span class="n">fp</span><span class="p">:</span>
|
|
    <span class="n">fp</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="n">json</span><span class="o">.</span><span class="n">dumps</span><span class="p">(</span><span class="n">doc</span><span class="o">.</span><span class="n">export_to_dict</span><span class="p">()))</span> <span class="c1"># use `export_to_dict` to ensure consistency</span>
|
|
|
|
<span class="c1"># Load from disk:</span>
|
|
<span class="k">with</span> <span class="n">Path</span><span class="p">(</span><span class="s2">"./doc.json"</span><span class="p">)</span><span class="o">.</span><span class="n">open</span><span class="p">(</span><span class="s2">"r"</span><span class="p">)</span> <span class="k">as</span> <span class="n">fp</span><span class="p">:</span>
|
|
    <span class="n">doc_dict</span> <span class="o">=</span> <span class="n">json</span><span class="o">.</span><span class="n">loads</span><span class="p">(</span><span class="n">fp</span><span class="o">.</span><span class="n">read</span><span class="p">())</span>
|
|
<span class="n">doc</span> <span class="o">=</span> <span class="n">DoclingDocument</span><span class="o">.</span><span class="n">model_validate</span><span class="p">(</span><span class="n">doc_dict</span><span class="p">)</span> <span class="c1"># use standard pydantic API to populate doc</span>
|
|
</code></pre></div>
|
|
<h3 id="chunking">Chunking</h3>
|
|
<p>Docling v2 defines new base classes for chunking:</p>
|
|
<ul>
|
|
<li><code>BaseMeta</code> for chunk metadata</li>
|
|
<li><code>BaseChunk</code> containing the chunk text and metadata, and</li>
|
|
<li><code>BaseChunker</code> for chunkers, producing chunks out of a <code>DoclingDocument</code>.</li>
|
|
</ul>
|
|
<p>Additionally, it provides an updated <code>HierarchicalChunker</code> implementation, which
|
|
leverages the new <code>DoclingDocument</code> and provides a new, richer chunk output format, including:</p>
|
|
<ul>
|
|
<li>the respective doc items for grounding</li>
|
|
<li>any applicable headings for context</li>
|
|
<li>any applicable captions for context</li>
|
|
</ul>
|
|
<p>For an example, check out <a href="../usage/#chunking">Chunking usage</a>.</p>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
</article>
|
|
</div>
|
|
|
|
|
|
<script>var tabs=__md_get("__tabs");if(Array.isArray(tabs))e:for(var set of document.querySelectorAll(".tabbed-set")){var labels=set.querySelector(".tabbed-labels");for(var tab of tabs)for(var label of labels.getElementsByTagName("label"))if(label.innerText.trim()===tab){var input=document.getElementById(label.htmlFor);input.checked=!0;continue e}}</script>
|
|
|
|
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
|
|
</div>
|
|
|
|
<button type="button" class="md-top md-icon" data-md-component="top" hidden>
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M13 20h-2V8l-5.5 5.5-1.42-1.42L12 4.16l7.92 7.92-1.42 1.42L13 8z"/></svg>
|
|
Back to top
|
|
</button>
|
|
|
|
</main>
|
|
|
|
<footer class="md-footer">
|
|
|
|
|
|
|
|
<div class="md-footer-meta md-typeset">
|
|
<div class="md-footer-meta__inner md-grid">
|
|
<div class="md-copyright">
|
|
|
|
|
|
Made with
|
|
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
|
|
Material for MkDocs
|
|
</a>
|
|
|
|
</div>
|
|
|
|
</div>
|
|
</div>
|
|
</footer>
|
|
|
|
</div>
|
|
<div class="md-dialog" data-md-component="dialog">
|
|
<div class="md-dialog__inner md-typeset"></div>
|
|
</div>
|
|
|
|
<div class="md-progress" data-md-component="progress" role="progressbar"></div>
|
|
|
|
|
|
|
|
|
|
<script id="__config" type="application/json">{"base": "..", "features": ["content.tabs.link", "content.code.annotate", "content.code.copy", "announce.dismiss", "navigation.footer", "navigation.tabs", "navigation.indexes", "navigation.instant", "navigation.instant.prefetch", "navigation.instant.progress", "navigation.path", "navigation.sections", "navigation.top", "navigation.tracking", "search.suggest", "toc.follow"], "search": "../assets/javascripts/workers/search.d50fe291.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
|
|
|
|
|
|
<script src="../assets/javascripts/bundle.56ea9cef.min.js"></script>
|
|
|
|
|
|
</body>
|
|
</html> |