Merge branch 'release_v3' of github.com:DS4SD/docling into cau/layout-postprocessing

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-12-04 14:21:09 +01:00
commit e97688cd3d
27 changed files with 1581 additions and 835 deletions

View File

@ -1,3 +1,38 @@
## [v2.8.3](https://github.com/DS4SD/docling/releases/tag/v2.8.3) - 2024-12-03
### Fix
* Improve handling of disallowed formats ([#429](https://github.com/DS4SD/docling/issues/429)) ([`34c7c79`](https://github.com/DS4SD/docling/commit/34c7c798580476a86ce8abec30b1115fbb36fdd8))
## [v2.8.2](https://github.com/DS4SD/docling/releases/tag/v2.8.2) - 2024-12-03
### Fix
* ParserError EOF inside string (#470) ([#472](https://github.com/DS4SD/docling/issues/472)) ([`c90c41c`](https://github.com/DS4SD/docling/commit/c90c41c391de4366db554d7a71ce9a35467c981e))
* PermissionError when using tesseract_ocr_cli_model ([#496](https://github.com/DS4SD/docling/issues/496)) ([`d3f84b2`](https://github.com/DS4SD/docling/commit/d3f84b2457125feacd0c21d6513e7ae69a308ea5))
### Documentation
* Add styling for faq ([#502](https://github.com/DS4SD/docling/issues/502)) ([`5ba3807`](https://github.com/DS4SD/docling/commit/5ba3807f315a01b1a4e8df9bab40e34a4238205a))
* Typo in faq ([#484](https://github.com/DS4SD/docling/issues/484)) ([`33cff98`](https://github.com/DS4SD/docling/commit/33cff98d360c02a382a66850c696a0cf511659ac))
* Add automatic api reference ([#475](https://github.com/DS4SD/docling/issues/475)) ([`d487210`](https://github.com/DS4SD/docling/commit/d4872103b8f24e38b37a8cd3ac414d3e02e7d6e8))
* Introduce faq section ([#468](https://github.com/DS4SD/docling/issues/468)) ([`8ccb3c6`](https://github.com/DS4SD/docling/commit/8ccb3c6db69318789af7deec26cfa2a3fd71302e))
### Performance
* Prevent temp file leftovers, reuse core type ([#487](https://github.com/DS4SD/docling/issues/487)) ([`051789d`](https://github.com/DS4SD/docling/commit/051789d01706d3823dd6307eca4dc5faacd1b7ce))
## [v2.8.1](https://github.com/DS4SD/docling/releases/tag/v2.8.1) - 2024-11-29
### Fix
* **cli:** Expose debug options ([#467](https://github.com/DS4SD/docling/issues/467)) ([`dd8de46`](https://github.com/DS4SD/docling/commit/dd8de462676993b81926610fd573d51d3272cbaf))
* Remove unused deps ([#466](https://github.com/DS4SD/docling/issues/466)) ([`af63818`](https://github.com/DS4SD/docling/commit/af63818df5636c4cbe77c0a01e6dcc0d47c4bfdb))
### Documentation
* Extend integration docs & README ([#456](https://github.com/DS4SD/docling/issues/456)) ([`84c46fd`](https://github.com/DS4SD/docling/commit/84c46fdeb344502edf9647c610c4828ab0ffb9dd))
## [v2.8.0](https://github.com/DS4SD/docling/releases/tag/v2.8.0) - 2024-11-27 ## [v2.8.0](https://github.com/DS4SD/docling/releases/tag/v2.8.0) - 2024-11-27
### Feature ### Feature

View File

@ -4,7 +4,7 @@
</a> </a>
</p> </p>
# Docling # 🦆 Docling
<p align="center"> <p align="center">
<a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a> <a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
@ -29,7 +29,7 @@ Docling parses documents and exports them to the desired format with ease and sp
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON
* 📑 Advanced PDF document understanding including page layout, reading order & table structures * 📑 Advanced PDF document understanding including page layout, reading order & table structures
* 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
* 🤖 Easy integration with LlamaIndex 🦙 & LangChain 🦜🔗 for powerful RAG / QA applications * 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications
* 🔍 OCR support for scanned PDFs * 🔍 OCR support for scanned PDFs
* 💻 Simple and convenient CLI * 💻 Simple and convenient CLI
@ -65,8 +65,24 @@ result = converter.convert(source)
print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]" print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
``` ```
Check out [Getting started](https://ds4sd.github.io/docling/). More [advanced usage options](https://ds4sd.github.io/docling/usage/) are available in
You will find lots of tuning options to leverage all the advanced capabilities. the docs.
## Documentation
Check out Docling's [documentation](https://ds4sd.github.io/docling/), for details on
installation, usage, concepts, recipes, extensions, and more.
## Examples
Go hands-on with our [examples](https://ds4sd.github.io/docling/examples/),
demonstrating how to address different application use cases with Docling.
## Integrations
To further accelerate your AI application development, check out Docling's native
[integrations](https://ds4sd.github.io/docling/integrations/) with popular frameworks
and tools.
## Get help and support ## Get help and support

View File

@ -2,6 +2,7 @@ import importlib
import json import json
import logging import logging
import re import re
import tempfile
import time import time
import warnings import warnings
from enum import Enum from enum import Enum
@ -9,7 +10,7 @@ from pathlib import Path
from typing import Annotated, Dict, Iterable, List, Optional, Type from typing import Annotated, Dict, Iterable, List, Optional, Type
import typer import typer
from docling_core.utils.file import resolve_file_source from docling_core.utils.file import resolve_source_to_path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
@ -32,6 +33,7 @@ from docling.datamodel.pipeline_options import (
TesseractCliOcrOptions, TesseractCliOcrOptions,
TesseractOcrOptions, TesseractOcrOptions,
) )
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch") warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
@ -212,6 +214,24 @@ def convert(
help="Set the verbosity level. -v for info logging, -vv for debug logging.", help="Set the verbosity level. -v for info logging, -vv for debug logging.",
), ),
] = 0, ] = 0,
debug_visualize_cells: Annotated[
bool,
typer.Option(..., help="Enable debug output which visualizes the PDF cells"),
] = False,
debug_visualize_ocr: Annotated[
bool,
typer.Option(..., help="Enable debug output which visualizes the OCR cells"),
] = False,
debug_visualize_layout: Annotated[
bool,
typer.Option(
..., help="Enable debug output which visualizes the layour clusters"
),
] = False,
debug_visualize_tables: Annotated[
bool,
typer.Option(..., help="Enable debug output which visualizes the table cells"),
] = False,
version: Annotated[ version: Annotated[
Optional[bool], Optional[bool],
typer.Option( typer.Option(
@ -229,98 +249,106 @@ def convert(
elif verbose == 2: elif verbose == 2:
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG)
settings.debug.visualize_cells = debug_visualize_cells
settings.debug.visualize_layout = debug_visualize_layout
settings.debug.visualize_tables = debug_visualize_tables
settings.debug.visualize_ocr = debug_visualize_ocr
if from_formats is None: if from_formats is None:
from_formats = [e for e in InputFormat] from_formats = [e for e in InputFormat]
input_doc_paths: List[Path] = [] with tempfile.TemporaryDirectory() as tempdir:
for src in input_sources: input_doc_paths: List[Path] = []
source = resolve_file_source(source=src) for src in input_sources:
if not source.exists(): source = resolve_source_to_path(source=src, workdir=Path(tempdir))
err_console.print( if not source.exists():
f"[red]Error: The input file {source} does not exist.[/red]" err_console.print(
) f"[red]Error: The input file {source} does not exist.[/red]"
raise typer.Abort() )
elif source.is_dir(): raise typer.Abort()
for fmt in from_formats: elif source.is_dir():
for ext in FormatToExtensions[fmt]: for fmt in from_formats:
input_doc_paths.extend(list(source.glob(f"**/*.{ext}"))) for ext in FormatToExtensions[fmt]:
input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}"))) input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
else:
input_doc_paths.append(source)
if to_formats is None:
to_formats = [OutputFormat.MARKDOWN]
export_json = OutputFormat.JSON in to_formats
export_md = OutputFormat.MARKDOWN in to_formats
export_txt = OutputFormat.TEXT in to_formats
export_doctags = OutputFormat.DOCTAGS in to_formats
if ocr_engine == OcrEngine.EASYOCR:
ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
elif ocr_engine == OcrEngine.TESSERACT_CLI:
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
elif ocr_engine == OcrEngine.TESSERACT:
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
elif ocr_engine == OcrEngine.OCRMAC:
ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
elif ocr_engine == OcrEngine.RAPIDOCR:
ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
else: else:
input_doc_paths.append(source) raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
if to_formats is None: ocr_lang_list = _split_list(ocr_lang)
to_formats = [OutputFormat.MARKDOWN] if ocr_lang_list is not None:
ocr_options.lang = ocr_lang_list
export_json = OutputFormat.JSON in to_formats pipeline_options = PdfPipelineOptions(
export_md = OutputFormat.MARKDOWN in to_formats do_ocr=ocr,
export_txt = OutputFormat.TEXT in to_formats ocr_options=ocr_options,
export_doctags = OutputFormat.DOCTAGS in to_formats do_table_structure=True,
if ocr_engine == OcrEngine.EASYOCR:
ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
elif ocr_engine == OcrEngine.TESSERACT_CLI:
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
elif ocr_engine == OcrEngine.TESSERACT:
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
elif ocr_engine == OcrEngine.OCRMAC:
ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
elif ocr_engine == OcrEngine.RAPIDOCR:
ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
else:
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
ocr_lang_list = _split_list(ocr_lang)
if ocr_lang_list is not None:
ocr_options.lang = ocr_lang_list
pipeline_options = PdfPipelineOptions(
do_ocr=ocr,
ocr_options=ocr_options,
do_table_structure=True,
)
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
pipeline_options.table_structure_options.mode = table_mode
if artifacts_path is not None:
pipeline_options.artifacts_path = artifacts_path
if pdf_backend == PdfBackend.DLPARSE_V1:
backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
elif pdf_backend == PdfBackend.DLPARSE_V2:
backend = DoclingParseV2DocumentBackend
elif pdf_backend == PdfBackend.PYPDFIUM2:
backend = PyPdfiumDocumentBackend
else:
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
format_options: Dict[InputFormat, FormatOption] = {
InputFormat.PDF: PdfFormatOption(
pipeline_options=pipeline_options,
backend=backend, # pdf_backend
) )
} pipeline_options.table_structure_options.do_cell_matching = (
doc_converter = DocumentConverter( True # do_cell_matching
allowed_formats=from_formats, )
format_options=format_options, pipeline_options.table_structure_options.mode = table_mode
)
start_time = time.time() if artifacts_path is not None:
pipeline_options.artifacts_path = artifacts_path
conv_results = doc_converter.convert_all( if pdf_backend == PdfBackend.DLPARSE_V1:
input_doc_paths, raises_on_error=abort_on_error backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
) elif pdf_backend == PdfBackend.DLPARSE_V2:
backend = DoclingParseV2DocumentBackend
elif pdf_backend == PdfBackend.PYPDFIUM2:
backend = PyPdfiumDocumentBackend
else:
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
output.mkdir(parents=True, exist_ok=True) format_options: Dict[InputFormat, FormatOption] = {
export_documents( InputFormat.PDF: PdfFormatOption(
conv_results, pipeline_options=pipeline_options,
output_dir=output, backend=backend, # pdf_backend
export_json=export_json, )
export_md=export_md, }
export_txt=export_txt, doc_converter = DocumentConverter(
export_doctags=export_doctags, allowed_formats=from_formats,
) format_options=format_options,
)
end_time = time.time() - start_time start_time = time.time()
conv_results = doc_converter.convert_all(
input_doc_paths, raises_on_error=abort_on_error
)
output.mkdir(parents=True, exist_ok=True)
export_documents(
conv_results,
output_dir=output,
export_json=export_json,
export_md=export_md,
export_txt=export_txt,
export_doctags=export_doctags,
)
end_time = time.time() - start_time
_log.info(f"All documents were converted in {end_time:.2f} seconds.") _log.info(f"All documents were converted in {end_time:.2f} seconds.")

View File

@ -1,5 +1,4 @@
from enum import Enum, auto from enum import Enum, auto
from io import BytesIO
from typing import TYPE_CHECKING, Dict, List, Optional, Union from typing import TYPE_CHECKING, Dict, List, Optional, Union
from docling_core.types.doc import ( from docling_core.types.doc import (
@ -9,6 +8,9 @@ from docling_core.types.doc import (
Size, Size,
TableCell, TableCell,
) )
from docling_core.types.io import ( # DO ΝΟΤ REMOVE; explicitly exposed from this location
DocumentStream,
)
from PIL.Image import Image from PIL.Image import Image
from pydantic import BaseModel, ConfigDict from pydantic import BaseModel, ConfigDict
@ -22,6 +24,7 @@ class ConversionStatus(str, Enum):
FAILURE = auto() FAILURE = auto()
SUCCESS = auto() SUCCESS = auto()
PARTIAL_SUCCESS = auto() PARTIAL_SUCCESS = auto()
SKIPPED = auto()
class InputFormat(str, Enum): class InputFormat(str, Enum):
@ -93,6 +96,7 @@ class DoclingComponentType(str, Enum):
DOCUMENT_BACKEND = auto() DOCUMENT_BACKEND = auto()
MODEL = auto() MODEL = auto()
DOC_ASSEMBLER = auto() DOC_ASSEMBLER = auto()
USER_INPUT = auto()
class ErrorItem(BaseModel): class ErrorItem(BaseModel):
@ -214,10 +218,3 @@ class Page(BaseModel):
@property @property
def image(self) -> Optional[Image]: def image(self) -> Optional[Image]:
return self.get_image(scale=self._default_image_scale) return self.get_image(scale=self._default_image_scale)
class DocumentStream(BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True)
name: str
stream: BytesIO

View File

@ -3,7 +3,7 @@ import re
from enum import Enum from enum import Enum
from io import BytesIO from io import BytesIO
from pathlib import Path, PurePath from pathlib import Path, PurePath
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Type, Union
import filetype import filetype
from docling_core.types.doc import ( from docling_core.types.doc import (
@ -32,7 +32,7 @@ from docling_core.types.legacy_doc.document import (
) )
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
from docling_core.utils.file import resolve_file_source from docling_core.utils.file import resolve_source_to_stream
from pydantic import BaseModel from pydantic import BaseModel
from typing_extensions import deprecated from typing_extensions import deprecated
@ -166,12 +166,6 @@ class InputDocument(BaseModel):
backend: Type[AbstractDocumentBackend], backend: Type[AbstractDocumentBackend],
path_or_stream: Union[BytesIO, Path], path_or_stream: Union[BytesIO, Path],
) -> None: ) -> None:
if backend is None:
raise RuntimeError(
f"No backend configuration provided for file {self.file.name} with format {self.format}. "
f"Please check your format configuration on DocumentConverter."
)
self._backend = backend(self, path_or_stream=path_or_stream) self._backend = backend(self, path_or_stream=path_or_stream)
if not self._backend.is_valid(): if not self._backend.is_valid():
self.valid = False self.valid = False
@ -452,6 +446,25 @@ class ConversionResult(BaseModel):
return ds_doc return ds_doc
class _DummyBackend(AbstractDocumentBackend):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def is_valid(self) -> bool:
return False
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
return set()
@classmethod
def supports_pagination(cls) -> bool:
return False
def unload(self):
return super().unload()
class _DocumentConversionInput(BaseModel): class _DocumentConversionInput(BaseModel):
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]] path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
@ -461,13 +474,14 @@ class _DocumentConversionInput(BaseModel):
self, format_options: Dict[InputFormat, "FormatOption"] self, format_options: Dict[InputFormat, "FormatOption"]
) -> Iterable[InputDocument]: ) -> Iterable[InputDocument]:
for item in self.path_or_stream_iterator: for item in self.path_or_stream_iterator:
obj = resolve_file_source(item) if isinstance(item, str) else item obj = resolve_source_to_stream(item) if isinstance(item, str) else item
format = self._guess_format(obj) format = self._guess_format(obj)
backend: Type[AbstractDocumentBackend]
if format not in format_options.keys(): if format not in format_options.keys():
_log.info( _log.error(
f"Skipping input document {obj.name} because it isn't matching any of the allowed formats." f"Input document {obj.name} does not match any allowed format."
) )
continue backend = _DummyBackend
else: else:
backend = format_options[format].backend backend = format_options[format].backend

View File

@ -6,11 +6,15 @@ from pydantic import BaseModel, ConfigDict, Field
class TableFormerMode(str, Enum): class TableFormerMode(str, Enum):
"""Modes for the TableFormer model."""
FAST = "fast" FAST = "fast"
ACCURATE = "accurate" ACCURATE = "accurate"
class TableStructureOptions(BaseModel): class TableStructureOptions(BaseModel):
"""Options for the table structure."""
do_cell_matching: bool = ( do_cell_matching: bool = (
True True
# True: Matches predictions back to PDF cells. Can break table output if PDF cells # True: Matches predictions back to PDF cells. Can break table output if PDF cells
@ -21,6 +25,8 @@ class TableStructureOptions(BaseModel):
class OcrOptions(BaseModel): class OcrOptions(BaseModel):
"""OCR options."""
kind: str kind: str
lang: List[str] lang: List[str]
force_full_page_ocr: bool = False # If enabled a full page OCR is always applied force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
@ -30,6 +36,8 @@ class OcrOptions(BaseModel):
class RapidOcrOptions(OcrOptions): class RapidOcrOptions(OcrOptions):
"""Options for the RapidOCR engine."""
kind: Literal["rapidocr"] = "rapidocr" kind: Literal["rapidocr"] = "rapidocr"
# English and chinese are the most commly used models and have been tested with RapidOCR. # English and chinese are the most commly used models and have been tested with RapidOCR.
@ -66,6 +74,8 @@ class RapidOcrOptions(OcrOptions):
class EasyOcrOptions(OcrOptions): class EasyOcrOptions(OcrOptions):
"""Options for the EasyOCR engine."""
kind: Literal["easyocr"] = "easyocr" kind: Literal["easyocr"] = "easyocr"
lang: List[str] = ["fr", "de", "es", "en"] lang: List[str] = ["fr", "de", "es", "en"]
use_gpu: bool = True # same default as easyocr.Reader use_gpu: bool = True # same default as easyocr.Reader
@ -79,6 +89,8 @@ class EasyOcrOptions(OcrOptions):
class TesseractCliOcrOptions(OcrOptions): class TesseractCliOcrOptions(OcrOptions):
"""Options for the TesseractCli engine."""
kind: Literal["tesseract"] = "tesseract" kind: Literal["tesseract"] = "tesseract"
lang: List[str] = ["fra", "deu", "spa", "eng"] lang: List[str] = ["fra", "deu", "spa", "eng"]
tesseract_cmd: str = "tesseract" tesseract_cmd: str = "tesseract"
@ -90,6 +102,8 @@ class TesseractCliOcrOptions(OcrOptions):
class TesseractOcrOptions(OcrOptions): class TesseractOcrOptions(OcrOptions):
"""Options for the Tesseract engine."""
kind: Literal["tesserocr"] = "tesserocr" kind: Literal["tesserocr"] = "tesserocr"
lang: List[str] = ["fra", "deu", "spa", "eng"] lang: List[str] = ["fra", "deu", "spa", "eng"]
path: Optional[str] = None path: Optional[str] = None
@ -100,6 +114,8 @@ class TesseractOcrOptions(OcrOptions):
class OcrMacOptions(OcrOptions): class OcrMacOptions(OcrOptions):
"""Options for the Mac OCR engine."""
kind: Literal["ocrmac"] = "ocrmac" kind: Literal["ocrmac"] = "ocrmac"
lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"] lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
recognition: str = "accurate" recognition: str = "accurate"
@ -111,12 +127,16 @@ class OcrMacOptions(OcrOptions):
class PipelineOptions(BaseModel): class PipelineOptions(BaseModel):
"""Base pipeline options."""
create_legacy_output: bool = ( create_legacy_output: bool = (
True # This defautl will be set to False on a future version of docling True # This defautl will be set to False on a future version of docling
) )
class PdfPipelineOptions(PipelineOptions): class PdfPipelineOptions(PipelineOptions):
"""Options for the PDF pipeline."""
artifacts_path: Optional[Union[Path, str]] = None artifacts_path: Optional[Union[Path, str]] = None
do_table_structure: bool = True # True: perform table structure extraction do_table_structure: bool = True # True: perform table structure extraction
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text do_ocr: bool = True # True: perform OCR, replace programmatic PDF text

View File

@ -15,7 +15,13 @@ from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.msexcel_backend import MsExcelDocumentBackend from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend from docling.backend.msword_backend import MsWordDocumentBackend
from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat from docling.datamodel.base_models import (
ConversionStatus,
DoclingComponentType,
DocumentStream,
ErrorItem,
InputFormat,
)
from docling.datamodel.document import ( from docling.datamodel.document import (
ConversionResult, ConversionResult,
InputDocument, InputDocument,
@ -23,6 +29,7 @@ from docling.datamodel.document import (
) )
from docling.datamodel.pipeline_options import PipelineOptions from docling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.settings import DocumentLimits, settings from docling.datamodel.settings import DocumentLimits, settings
from docling.exceptions import ConversionError
from docling.pipeline.base_pipeline import BasePipeline from docling.pipeline.base_pipeline import BasePipeline
from docling.pipeline.simple_pipeline import SimplePipeline from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
@ -85,32 +92,37 @@ class ImageFormatOption(FormatOption):
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
_format_to_default_options = { def _get_default_option(format: InputFormat) -> FormatOption:
InputFormat.XLSX: FormatOption( format_to_default_options = {
pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend InputFormat.XLSX: FormatOption(
), pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
InputFormat.DOCX: FormatOption( ),
pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend InputFormat.DOCX: FormatOption(
), pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
InputFormat.PPTX: FormatOption( ),
pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend InputFormat.PPTX: FormatOption(
), pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
InputFormat.MD: FormatOption( ),
pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend InputFormat.MD: FormatOption(
), pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
InputFormat.ASCIIDOC: FormatOption( ),
pipeline_cls=SimplePipeline, backend=AsciiDocBackend InputFormat.ASCIIDOC: FormatOption(
), pipeline_cls=SimplePipeline, backend=AsciiDocBackend
InputFormat.HTML: FormatOption( ),
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend InputFormat.HTML: FormatOption(
), pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
InputFormat.IMAGE: FormatOption( ),
pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend InputFormat.IMAGE: FormatOption(
), pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
InputFormat.PDF: FormatOption( ),
pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend InputFormat.PDF: FormatOption(
), pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
} ),
}
if (options := format_to_default_options.get(format)) is not None:
return options
else:
raise RuntimeError(f"No default options configured for {format}")
class DocumentConverter: class DocumentConverter:
@ -121,36 +133,26 @@ class DocumentConverter:
allowed_formats: Optional[List[InputFormat]] = None, allowed_formats: Optional[List[InputFormat]] = None,
format_options: Optional[Dict[InputFormat, FormatOption]] = None, format_options: Optional[Dict[InputFormat, FormatOption]] = None,
): ):
self.allowed_formats = allowed_formats self.allowed_formats = (
self.format_to_options = format_options allowed_formats if allowed_formats is not None else [e for e in InputFormat]
)
if self.allowed_formats is None: self.format_to_options = {
# if self.format_to_options is not None: format: (
# self.allowed_formats = self.format_to_options.keys() _get_default_option(format=format)
# else: if (custom_option := (format_options or {}).get(format)) is None
self.allowed_formats = [e for e in InputFormat] # all formats else custom_option
)
if self.format_to_options is None: for format in self.allowed_formats
self.format_to_options = _format_to_default_options }
else:
for f in self.allowed_formats:
if f not in self.format_to_options.keys():
_log.debug(f"Requested format {f} will use default options.")
self.format_to_options[f] = _format_to_default_options[f]
remove_keys = []
for f in self.format_to_options.keys():
if f not in self.allowed_formats:
remove_keys.append(f)
for f in remove_keys:
self.format_to_options.pop(f)
self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {} self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
def initialize_pipeline(self, format: InputFormat): def initialize_pipeline(self, format: InputFormat):
"""Initialize the conversion pipeline for the selected format.""" """Initialize the conversion pipeline for the selected format."""
self._get_pipeline(doc_format=format) pipeline = self._get_pipeline(doc_format=format)
if pipeline is None:
raise ConversionError(
f"No pipeline could be initialized for format {format}"
)
@validate_call(config=ConfigDict(strict=True)) @validate_call(config=ConfigDict(strict=True))
def convert( def convert(
@ -186,22 +188,28 @@ class DocumentConverter:
limits=limits, limits=limits,
) )
conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error) conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
had_result = False
for conv_res in conv_res_iter: for conv_res in conv_res_iter:
had_result = True
if raises_on_error and conv_res.status not in { if raises_on_error and conv_res.status not in {
ConversionStatus.SUCCESS, ConversionStatus.SUCCESS,
ConversionStatus.PARTIAL_SUCCESS, ConversionStatus.PARTIAL_SUCCESS,
}: }:
raise RuntimeError( raise ConversionError(
f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}" f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
) )
else: else:
yield conv_res yield conv_res
if not had_result and raises_on_error:
raise ConversionError(
f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
)
def _convert( def _convert(
self, conv_input: _DocumentConversionInput, raises_on_error: bool self, conv_input: _DocumentConversionInput, raises_on_error: bool
) -> Iterator[ConversionResult]: ) -> Iterator[ConversionResult]:
assert self.format_to_options is not None
start_time = time.monotonic() start_time = time.monotonic()
for input_batch in chunkify( for input_batch in chunkify(
@ -223,27 +231,22 @@ class DocumentConverter:
): ):
elapsed = time.monotonic() - start_time elapsed = time.monotonic() - start_time
start_time = time.monotonic() start_time = time.monotonic()
_log.info(
if item is not None: f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
_log.info( )
f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec." yield item
)
yield item
else:
_log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]: def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
assert self.format_to_options is not None
fopt = self.format_to_options.get(doc_format) fopt = self.format_to_options.get(doc_format)
if fopt is None: if fopt is None:
raise RuntimeError(f"Could not get pipeline for {doc_format}") return None
else: else:
pipeline_class = fopt.pipeline_cls pipeline_class = fopt.pipeline_cls
pipeline_options = fopt.pipeline_options pipeline_options = fopt.pipeline_options
assert pipeline_options is not None if pipeline_options is None:
return None
# TODO this will ignore if different options have been defined for the same pipeline class. # TODO this will ignore if different options have been defined for the same pipeline class.
if ( if (
pipeline_class not in self.initialized_pipelines pipeline_class not in self.initialized_pipelines
@ -257,11 +260,26 @@ class DocumentConverter:
def _process_document( def _process_document(
self, in_doc: InputDocument, raises_on_error: bool self, in_doc: InputDocument, raises_on_error: bool
) -> Optional[ConversionResult]: ) -> ConversionResult:
assert self.allowed_formats is not None
assert in_doc.format in self.allowed_formats
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error) valid = (
self.allowed_formats is not None and in_doc.format in self.allowed_formats
)
if valid:
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
else:
error_message = f"File format not allowed: {in_doc.file}"
if raises_on_error:
raise ConversionError(error_message)
else:
error_item = ErrorItem(
component_type=DoclingComponentType.USER_INPUT,
module_name="",
error_message=error_message,
)
conv_res = ConversionResult(
input=in_doc, status=ConversionStatus.SKIPPED, errors=[error_item]
)
return conv_res return conv_res
@ -270,26 +288,28 @@ class DocumentConverter:
) -> ConversionResult: ) -> ConversionResult:
if in_doc.valid: if in_doc.valid:
pipeline = self._get_pipeline(in_doc.format) pipeline = self._get_pipeline(in_doc.format)
if pipeline is None: # Can't find a default pipeline. Should this raise? if pipeline is not None:
conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
else:
if raises_on_error: if raises_on_error:
raise RuntimeError( raise ConversionError(
f"No pipeline could be initialized for {in_doc.file}." f"No pipeline could be initialized for {in_doc.file}."
) )
else: else:
conv_res = ConversionResult(input=in_doc) conv_res = ConversionResult(
conv_res.status = ConversionStatus.FAILURE input=in_doc,
return conv_res status=ConversionStatus.FAILURE,
)
conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
else: else:
if raises_on_error: if raises_on_error:
raise RuntimeError(f"Input document {in_doc.file} is not valid.") raise ConversionError(f"Input document {in_doc.file} is not valid.")
else: else:
# invalid doc or not of desired format # invalid doc or not of desired format
conv_res = ConversionResult(input=in_doc) conv_res = ConversionResult(
conv_res.status = ConversionStatus.FAILURE input=in_doc,
status=ConversionStatus.FAILURE,
)
# TODO add error log why it failed. # TODO add error log why it failed.
return conv_res return conv_res

6
docling/exceptions.py Normal file
View File

@ -0,0 +1,6 @@
class BaseError(RuntimeError):
pass
class ConversionError(BaseError):
pass

View File

@ -1,5 +1,7 @@
import csv
import io import io
import logging import logging
import os
import tempfile import tempfile
from subprocess import DEVNULL, PIPE, Popen from subprocess import DEVNULL, PIPE, Popen
from typing import Iterable, Optional, Tuple from typing import Iterable, Optional, Tuple
@ -95,7 +97,7 @@ class TesseractOcrCliModel(BaseOcrModel):
# _log.info(decoded_data) # _log.info(decoded_data)
# Read the TSV file generated by Tesseract # Read the TSV file generated by Tesseract
df = pd.read_csv(io.StringIO(decoded_data), sep="\t") df = pd.read_csv(io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t")
# Display the dataframe (optional) # Display the dataframe (optional)
# _log.info("df: ", df.head()) # _log.info("df: ", df.head())
@ -130,14 +132,17 @@ class TesseractOcrCliModel(BaseOcrModel):
high_res_image = page._backend.get_page_image( high_res_image = page._backend.get_page_image(
scale=self.scale, cropbox=ocr_rect scale=self.scale, cropbox=ocr_rect
) )
try:
with tempfile.NamedTemporaryFile( with tempfile.NamedTemporaryFile(
suffix=".png", mode="w" suffix=".png", mode="w+b", delete=False
) as image_file: ) as image_file:
fname = image_file.name fname = image_file.name
high_res_image.save(fname) high_res_image.save(image_file)
df = self._run_tesseract(fname) df = self._run_tesseract(fname)
finally:
if os.path.exists(fname):
os.remove(fname)
# _log.info(df) # _log.info(df)

View File

@ -0,0 +1,52 @@
# Docling Document
This is an automatically generated API reference of the DoclingDocument type.
::: docling_core.types.doc
handler: python
options:
members:
- DoclingDocument
- DocumentOrigin
- DocItem
- DocItemLabel
- ProvenanceItem
- GroupItem
- GroupLabel
- NodeItem
- PageItem
- FloatingItem
- TextItem
- TableItem
- TableCell
- TableData
- TableCellLabel
- KeyValueItem
- SectionHeaderItem
- PictureItem
- ImageRef
- PictureClassificationClass
- PictureClassificationData
- RefItem
- BoundingBox
- CoordOrigin
- ImageRefMode
- Size
show_if_no_docstring: true
show_submodules: true
docstring_section_style: list
filters: ["!^_"]
heading_level: 2
show_root_toc_entry: true
inherited_members: true
merge_init_into_class: true
separate_signature: true
show_root_heading: true
show_root_full_path: false
show_signature_annotations: true
show_source: false
show_symbol_type_heading: true
show_symbol_type_toc: true
show_labels: false
signature_crossrefs: true
summary: true

View File

@ -0,0 +1,38 @@
# Document converter
This is an automatically generated API reference of the main components of Docling.
::: docling.document_converter
handler: python
options:
members:
- DocumentConverter
- ConversionResult
- ConversionStatus
- FormatOption
- InputFormat
- PdfFormatOption
- ImageFormatOption
- StandardPdfPipeline
- WordFormatOption
- PowerpointFormatOption
- MarkdownFormatOption
- AsciiDocFormatOption
- HTMLFormatOption
- SimplePipeline
show_if_no_docstring: true
show_submodules: true
docstring_section_style: list
filters: ["!^_"]
heading_level: 2
inherited_members: true
merge_init_into_class: true
separate_signature: true
show_root_heading: true
show_root_full_path: false
show_signature_annotations: true
show_source: false
show_symbol_type_heading: true
show_symbol_type_toc: true
signature_crossrefs: true
summary: true

View File

@ -0,0 +1,36 @@
# Pipeline options
Pipeline options allow you to customize the execution of the models during the conversion pipeline.
This includes options for the OCR engines, the table model as well as enrichment options which
can be enabled with `do_xyz = True`.
This is an automatically generated API reference of all the pipeline options available in Docling.
::: docling.datamodel.pipeline_options
handler: python
options:
show_if_no_docstring: true
show_submodules: true
docstring_section_style: list
filters: ["!^_"]
heading_level: 2
inherited_members: true
merge_init_into_class: true
separate_signature: true
show_root_heading: true
show_root_full_path: false
show_signature_annotations: true
show_source: false
show_symbol_type_heading: true
show_symbol_type_toc: true
signature_crossrefs: true
summary: true
<!-- ::: docling.document_converter.DocumentConverter
handler: python
options:
show_if_no_docstring: true
show_submodules: true -->

Binary file not shown.

After

Width:  |  Height:  |  Size: 233 KiB

Binary file not shown.

147
docs/faq.md Normal file
View File

@ -0,0 +1,147 @@
# FAQ
This is a collection of FAQs compiled from user questions on <https://github.com/DS4SD/docling/discussions>.
??? question "Is Python 3.13 supported?"
### Is Python 3.13 supported?
Full support for Python 3.13 is currently waiting for [pytorch](https://github.com/pytorch/pytorch).
At the moment, no release has full support, but nightly builds are available. Docling was tested on Python 3.13 with the following steps:
```sh
# Create a python 3.13 virtualenv
python3.13 -m venv venv
source ./venv/bin/activate
# Install torch nightly builds, see https://pytorch.org/
pip3 install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu
# Install docling
pip3 install docling
# Run docling
docling --no-ocr https://arxiv.org/pdf/2408.09869
```
_Note: we are disabling OCR since easyocr and the nightly torch builds have some conflicts._
Source: Issue [#136](https://github.com/DS4SD/docling/issues/136)
??? question "Install conflicts with numpy (python 3.13)"
### Install conflicts with numpy (python 3.13)
When using `docling-ibm-models>=2.0.7` and `deepsearch-glm>=0.26.2` these issues should not show up anymore.
Docling supports numpy versions `>=1.24.4,<3.0.0` which should match all usages.
**For older versions**
This has been observed installing docling and langchain via poetry.
```
...
Thus, docling (>=2.7.0,<3.0.0) requires numpy (>=1.26.4,<2.0.0).
So, because ... depends on both numpy (>=2.0.2,<3.0.0) and docling (^2.7.0), version solving failed.
```
Numpy is only adding Python 3.13 support starting in some 2.x.y version. In order to prepare for 3.13, Docling depends on a 2.x.y for 3.13, otherwise depending an 1.x.y version. If you are allowing 3.13 in your pyproject.toml, Poetry will try to find some way to reconcile Docling's numpy version for 3.13 (some 2.x.y) with LangChain's version for that (some 1.x.y) — leading to the error above.
Check if Python 3.13 is among the Python versions allowed by your pyproject.toml and if so, remove it and try again.
E.g., if you have python = "^3.10", use python = ">=3.10,<3.13" instead.
If you want to retain compatibility with python 3.9-3.13, you can also use a selector in pyproject.toml similar to the following
```toml
numpy = [
{ version = "^2.1.0", markers = 'python_version >= "3.13"' },
{ version = "^1.24.4", markers = 'python_version < "3.13"' },
]
```
Source: Issue [#283](https://github.com/DS4SD/docling/issues/283#issuecomment-2465035868)
??? question "Are text styles (bold, underline, etc) supported?"
### Are text styles (bold, underline, etc) supported?
Currently text styles are not supported in the `DoclingDocument` format.
If you are interested in contributing this feature, please open a discussion topic to brainstorm on the design.
_Note: this is not a simple topic_
??? question "How do I run completely offline?"
### How do I run completely offline?
Docling is not using any remote service, hence it can run in completely isolated air-gapped environments.
The only requirement is pointing the Docling runtime to the location where the model artifacts have been stored.
For example
```py
pipeline_options = PdfPipelineOptions(artifacts_path="your location")
converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
}
)
```
Source: Issue [#326](https://github.com/DS4SD/docling/issues/326)
??? question "Which model weights are needed to run Docling?"
### Which model weights are needed to run Docling?
Model weights are needed for the AI models used in the PDF pipeline. Other document types (docx, pptx, etc) do not have any such requirement.
For processing PDF documents, Docling requires the model weights from <https://huggingface.co/ds4sd/docling-models>.
When OCR is enabled, some engines also require model artifacts. For example EasyOCR, for which Docling has [special pipeline options](https://github.com/DS4SD/docling/blob/main/docling/datamodel/pipeline_options.py#L68) to control the runtime behavior.
??? question "SSL error downloading model weights"
### SSL error downloading model weights
```
URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1000)>
```
Similar SSL download errors have been observed by some users. This happens when model weights are fetched from Hugging Face.
The error could happen when the python environment doesn't have an up-to-date list of trusted certificates.
Possible solutions were
- Update to the latest version of [certifi](https://pypi.org/project/certifi/), i.e. `pip install --upgrade certifi`
- Use [pip-system-certs](https://pypi.org/project/pip-system-certs/) to use the latest trusted certificates on your system.
??? question "Which OCR languages are supported?"
### Which OCR languages are supported?
Docling supports multiple OCR engines, each of which has its own list of supported languages.
Here is a collection of links to the original OCR engine's documentation listing the OCR languages.
- [EasyOCR](https://www.jaided.ai/easyocr/)
- [Tesseract](https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html)
- [RapidOCR](https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/)
- [Mac OCR](https://github.com/straussmaximilian/ocrmac/tree/main?tab=readme-ov-file#example-select-language-preference)
Setting the OCR language in Docling is done via the OCR pipeline options:
```py
from docling.datamodel.pipeline_options import PdfPipelineOptions
pipeline_options = PdfPipelineOptions()
pipeline_options.ocr_options.lang = ["fr", "de", "es", "en"] # example of languages for EasyOCR
```

View File

@ -1,5 +1,3 @@
# Docling
<p align="center"> <p align="center">
<img loading="lazy" alt="Docling" src="assets/docling_processing.png" width="100%" /> <img loading="lazy" alt="Docling" src="assets/docling_processing.png" width="100%" />
<a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a> <a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
@ -23,7 +21,7 @@ Docling parses documents and exports them to the desired format with ease and sp
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON
* 📑 Advanced PDF document understanding incl. page layout, reading order & table structures * 📑 Advanced PDF document understanding incl. page layout, reading order & table structures
* 🧩 Unified, expressive [DoclingDocument](./concepts/docling_document.md) representation format * 🧩 Unified, expressive [DoclingDocument](./concepts/docling_document.md) representation format
* 🤖 Easy integration with LlamaIndex 🦙 & LangChain 🦜🔗 for powerful RAG / QA applications * 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications
* 🔍 OCR support for scanned PDFs * 🔍 OCR support for scanned PDFs
* 💻 Simple and convenient CLI * 💻 Simple and convenient CLI

9
docs/integrations/bee.md Normal file
View File

@ -0,0 +1,9 @@
Docling is available as an extraction backend in the [Bee][github] framework.
- 💻 [Bee GitHub][github]
- 📖 [Bee Docs][docs]
- 📦 [Bee NPM][package]
[github]: https://github.com/i-am-bee
[docs]: https://i-am-bee.github.io/bee-agent-framework/
[package]: https://www.npmjs.com/package/bee-agent-framework

View File

@ -1 +1,6 @@
Use the navigation on the left to browse through Docling integrations with popular frameworks and tools. Use the navigation on the left to browse through Docling integrations with popular frameworks and tools.
<p align="center">
<img loading="lazy" alt="Docling" src="../assets/docling_ecosystem.png" width="100%" />
</p>

View File

@ -0,0 +1,17 @@
Docling is powering document processing in [InstructLab](https://instructlab.ai/),
enabling users to unlock the knowledge hidden in documents and present it to
InstructLab's fine-tuning for aligning AI models to the user's specific data.
More details can be found in this [blog post][blog].
- 🏠 [InstructLab Home][home]
- 💻 [InstructLab GitHub][github]
- 🧑🏻‍💻 [InstructLab UI][ui]
- 📖 [InstructLab Docs][docs]
<!-- - 📝 [Blog post]() -->
[home]: https://instructlab.ai
[github]: https://github.com/instructlab
[ui]: https://ui.instructlab.ai/
[docs]: https://docs.instructlab.ai/
[blog]: https://www.redhat.com/en/blog/docling-missing-document-processing-companion-generative-ai

View File

@ -0,0 +1,9 @@
Docling is available in [Prodigy][home] as a [Prodigy-PDF plugin][plugin] recipe.
- 🌐 [Prodigy Home][home]
- 🔌 [Prodigy-PDF Plugin][plugin]
- 🧑🏽‍🍳 [pdf-spans.manual Recipe][recipe]
[home]: https://prodi.gy/
[plugin]: https://prodi.gy/docs/plugins#pdf
[recipe]: https://prodi.gy/docs/plugins#pdf-spans.manual

View File

@ -1,3 +1,5 @@
# spaCy
Docling is available in [spaCy](https://spacy.io/) as the "SpaCy Layout" plugin: Docling is available in [spaCy](https://spacy.io/) as the "SpaCy Layout" plugin:
- 💻 [SpacyLayout GitHub][github] - 💻 [SpacyLayout GitHub][github]

View File

@ -1,5 +1,7 @@
{% extends "base.html" %} {% extends "base.html" %}
{#
{% block announce %} {% block announce %}
<p>🎉 Docling has gone v2! <a href="{{ 'v2' | url }}">Check out</a> what's new and how to get started!</p> <p>🎉 Docling has gone v2! <a href="{{ 'v2' | url }}">Check out</a> what's new and how to get started!</p>
{% endblock %} {% endblock %}
#}

View File

@ -52,11 +52,12 @@ theme:
- search.suggest - search.suggest
- toc.follow - toc.follow
nav: nav:
- Get started: - Home:
- Home: index.md - "🦆 Docling": index.md
- Installation: installation.md - Installation: installation.md
- Usage: usage.md - Usage: usage.md
- CLI: cli.md - CLI: cli.md
- FAQ: faq.md
- Docling v2: v2.md - Docling v2: v2.md
- Concepts: - Concepts:
- Concepts: concepts/index.md - Concepts: concepts/index.md
@ -85,14 +86,19 @@ nav:
# - CLI: examples/cli.md # - CLI: examples/cli.md
- Integrations: - Integrations:
- Integrations: integrations/index.md - Integrations: integrations/index.md
- "🐝 Bee": integrations/bee.md
- "Data Prep Kit": integrations/data_prep_kit.md - "Data Prep Kit": integrations/data_prep_kit.md
- "DocETL": integrations/docetl.md - "DocETL": integrations/docetl.md
- "🐶 InstructLab": integrations/instructlab.md
- "Kotaemon": integrations/kotaemon.md - "Kotaemon": integrations/kotaemon.md
- "LlamaIndex 🦙": integrations/llamaindex.md - "🦙 LlamaIndex": integrations/llamaindex.md
- "Prodigy": integrations/prodigy.md
- "spaCy": integrations/spacy.md - "spaCy": integrations/spacy.md
# - "LangChain 🦜🔗": integrations/langchain.md # - "LangChain 🦜🔗": integrations/langchain.md
# - API reference: - API reference:
# - API reference: api_reference/index.md - Document Converter: api_reference/document_converter.md
- Pipeline options: api_reference/pipeline_options.md
- Docling Document: api_reference/docling_document.md
markdown_extensions: markdown_extensions:
- pymdownx.superfences - pymdownx.superfences
@ -108,12 +114,15 @@ markdown_extensions:
plugins: plugins:
- search - search
- mkdocs-jupyter - mkdocs-jupyter
# - mkdocstrings: - mkdocstrings:
# default_handler: python default_handler: python
# options: options:
# preload_modules: extensions:
# - docling - griffe_pydantic:
# - docling_core schema: true
preload_modules:
- docling
- docling_core
extra_css: extra_css:
- stylesheets/extra.css - stylesheets/extra.css

1470
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "docling" name = "docling"
version = "2.8.0" # DO NOT EDIT, updated automatically version = "2.8.3" # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications." description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"] authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
license = "MIT" license = "MIT"
@ -26,7 +26,7 @@ packages = [{include = "docling"}]
###################### ######################
python = "^3.9" python = "^3.9"
pydantic = ">=2.0.0,<2.10" pydantic = ">=2.0.0,<2.10"
docling-core = "^2.5.1" docling-core = "^2.6.1"
docling-ibm-models = "^2.0.6" docling-ibm-models = "^2.0.6"
deepsearch-glm = "^0.26.1" deepsearch-glm = "^0.26.1"
filetype = "^1.2.0" filetype = "^1.2.0"
@ -40,7 +40,6 @@ docling-parse = "^2.0.5"
certifi = ">=2024.7.4" certifi = ">=2024.7.4"
rtree = "^1.3.0" rtree = "^1.3.0"
scipy = "^1.6.0" scipy = "^1.6.0"
pyarrow = "^16.1.0"
typer = "^0.12.5" typer = "^0.12.5"
python-docx = "^1.1.2" python-docx = "^1.1.2"
python-pptx = "^1.0.2" python-pptx = "^1.0.2"
@ -81,6 +80,8 @@ types-openpyxl = "^3.1.5.20241114"
mkdocs-material = "^9.5.40" mkdocs-material = "^9.5.40"
mkdocs-jupyter = "^0.25.0" mkdocs-jupyter = "^0.25.0"
mkdocs-click = "^0.8.1" mkdocs-click = "^0.8.1"
mkdocstrings = {extras = ["python"], version = "^0.27.0"}
griffe-pydantic = "^1.1.0"
[tool.poetry.group.examples.dependencies] [tool.poetry.group.examples.dependencies]
datasets = "^2.21.0" datasets = "^2.21.0"
@ -89,10 +90,13 @@ langchain-huggingface = "^0.0.3"
langchain-milvus = "^0.1.4" langchain-milvus = "^0.1.4"
langchain-text-splitters = "^0.2.4" langchain-text-splitters = "^0.2.4"
[tool.poetry.group.constraints]
optional = true
[tool.poetry.group.constraints.dependencies] [tool.poetry.group.constraints.dependencies]
numpy = [ numpy = [
{ version = "^2.1.0", markers = 'python_version >= "3.13"' }, { version = ">=1.24.4,<3.0.0", markers = 'python_version >= "3.10"' },
{ version = "^1.24.4", markers = 'python_version < "3.13"' }, { version = ">=1.24.4,<2.1.0", markers = 'python_version < "3.10"' },
] ]
[tool.poetry.group.mac_intel] [tool.poetry.group.mac_intel]

View File

@ -10,7 +10,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2 from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
GENERATE = True GENERATE = False
def get_pdf_path(): def get_pdf_path():

View File

@ -0,0 +1,45 @@
from io import BytesIO
from pathlib import Path
import pytest
from docling.datamodel.base_models import ConversionStatus, DocumentStream
from docling.document_converter import ConversionError, DocumentConverter
def get_pdf_path():
pdf_path = Path("./tests/data/2305.03393v1-pg9.pdf")
return pdf_path
@pytest.fixture
def converter():
converter = DocumentConverter()
return converter
def test_convert_unsupported_doc_format_wout_exception(converter: DocumentConverter):
result = converter.convert(
DocumentStream(name="input.xyz", stream=BytesIO(b"xyz")), raises_on_error=False
)
assert result.status == ConversionStatus.SKIPPED
def test_convert_unsupported_doc_format_with_exception(converter: DocumentConverter):
with pytest.raises(ConversionError):
converter.convert(
DocumentStream(name="input.xyz", stream=BytesIO(b"xyz")),
raises_on_error=True,
)
def test_convert_too_small_filesize_limit_wout_exception(converter: DocumentConverter):
result = converter.convert(get_pdf_path(), max_file_size=1, raises_on_error=False)
assert result.status == ConversionStatus.FAILURE
def test_convert_too_small_filesize_limit_with_exception(converter: DocumentConverter):
with pytest.raises(ConversionError):
converter.convert(get_pdf_path(), max_file_size=1, raises_on_error=True)