Merge branch 'main' into nli/performance_main

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
Nikos Livathinos 2024-12-13 13:58:37 +01:00
commit 6209cf3bc5
17 changed files with 575 additions and 884 deletions

View File

@@ -1,3 +1,30 @@
## [v2.11.0](https://github.com/DS4SD/docling/releases/tag/v2.11.0) - 2024-12-12
### Feature
* Add timeout limit to document parsing job. DS4SD#270 ([#552](https://github.com/DS4SD/docling/issues/552)) ([`3da166e`](https://github.com/DS4SD/docling/commit/3da166eafa3c119de961510341cb92397652c222))
### Fix
* Do not import python modules from deepsearch-glm ([#569](https://github.com/DS4SD/docling/issues/569)) ([`aee9c0b`](https://github.com/DS4SD/docling/commit/aee9c0b324a07190ad03ad3a6266e76c465d4cdf))
* Handle no result from RapidOcr reader ([#558](https://github.com/DS4SD/docling/issues/558)) ([`f45499c`](https://github.com/DS4SD/docling/commit/f45499ce9349fe55538dfb36d74c395e9193d9b1))
* Make enum serializable with human-readable value ([#555](https://github.com/DS4SD/docling/issues/555)) ([`a7df337`](https://github.com/DS4SD/docling/commit/a7df337654fa5fa7633af8740fb5e4cc4a06f250))
### Documentation
* Update chunking usage docs, minor reorg ([#550](https://github.com/DS4SD/docling/issues/550)) ([`d0c9e8e`](https://github.com/DS4SD/docling/commit/d0c9e8e508d7edef5e733be6cdea2cea0a9a0695))
## [v2.10.0](https://github.com/DS4SD/docling/releases/tag/v2.10.0) - 2024-12-09
### Feature
* Docling-parse v2 as default PDF backend ([#549](https://github.com/DS4SD/docling/issues/549)) ([`aca57f0`](https://github.com/DS4SD/docling/commit/aca57f0527dddcc027dc1ee840e2e492ab997170))
### Fix
* Call into docling-core for legacy document transform ([#551](https://github.com/DS4SD/docling/issues/551)) ([`7972d47`](https://github.com/DS4SD/docling/commit/7972d47f88604f02d6a32527116c4d78eb1005e2))
* Introduce Image format options in CLI. Silence the tqdm downloading messages. ([#544](https://github.com/DS4SD/docling/issues/544)) ([`78f61a8`](https://github.com/DS4SD/docling/commit/78f61a8522d3a19ecc1d605e8441fb543ca0fa96))
## [v2.9.0](https://github.com/DS4SD/docling/releases/tag/v2.9.0) - 2024-12-09
### Feature

View File

@@ -29,8 +29,10 @@ from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
EasyOcrOptions,
OcrEngine,
OcrMacOptions,
OcrOptions,
PdfBackend,
PdfPipelineOptions,
RapidOcrOptions,
TableFormerMode,
@@ -70,22 +72,6 @@ def version_callback(value: bool):
raise typer.Exit()
# Define an enum for the backend options
class PdfBackend(str, Enum):
PYPDFIUM2 = "pypdfium2"
DLPARSE_V1 = "dlparse_v1"
DLPARSE_V2 = "dlparse_v2"
# Define an enum for the ocr engines
class OcrEngine(str, Enum):
EASYOCR = "easyocr"
TESSERACT_CLI = "tesseract_cli"
TESSERACT = "tesseract"
OCRMAC = "ocrmac"
RAPIDOCR = "rapidocr"
def export_documents(
conv_results: Iterable[ConversionResult],
output_dir: Path,
@@ -266,6 +252,13 @@ def convert(
help="Show version information.",
),
] = None,
document_timeout: Annotated[
Optional[float],
typer.Option(
...,
help="The timeout for processing each document, in seconds.",
),
] = None,
num_threads: Annotated[int, typer.Option(..., help="Number of threads")] = 4,
device: Annotated[
AcceleratorDevice, typer.Option(..., help="Accelerator device")
@@ -355,6 +348,7 @@ def convert(
do_ocr=ocr,
ocr_options=ocr_options,
do_table_structure=True,
document_timeout=document_timeout,
)
pipeline_options.table_structure_options.do_cell_matching = (
True # do_cell_matching
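
The new `document_timeout` option above follows typer's `Annotated` pattern, from which typer derives the CLI flag `--document-timeout`. A minimal sketch of the same pattern as a standalone toy app (hypothetical, not the actual docling CLI):

```python
# Toy app illustrating the typer option pattern used in the diff above;
# typer derives the flag name "--document-timeout" from the parameter name.
from typing import Optional

import typer
from typing_extensions import Annotated

app = typer.Typer()


@app.command()
def convert(
    document_timeout: Annotated[
        Optional[float],
        typer.Option(
            ...,
            help="The timeout for processing each document, in seconds.",
        ),
    ] = None,
):
    typer.echo(f"document_timeout={document_timeout}")


if __name__ == "__main__":
    app()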

View File

@@ -19,12 +19,12 @@ if TYPE_CHECKING:
class ConversionStatus(str, Enum):
PENDING = auto()
STARTED = auto()
FAILURE = auto()
SUCCESS = auto()
PARTIAL_SUCCESS = auto()
SKIPPED = auto()
PENDING = "pending"
STARTED = "started"
FAILURE = "failure"
SUCCESS = "success"
PARTIAL_SUCCESS = "partial_success"
SKIPPED = "skipped"
class InputFormat(str, Enum):
@@ -89,15 +89,15 @@ MimeTypeToFormat = {
class DocInputType(str, Enum):
PATH = auto()
STREAM = auto()
PATH = "path"
STREAM = "stream"
class DoclingComponentType(str, Enum):
DOCUMENT_BACKEND = auto()
MODEL = auto()
DOC_ASSEMBLER = auto()
USER_INPUT = auto()
DOCUMENT_BACKEND = "document_backend"
MODEL = "model"
DOC_ASSEMBLER = "doc_assembler"
USER_INPUT = "user_input"
class ErrorItem(BaseModel):
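
Replacing `auto()` with explicit strings is what makes these enums serialize with human-readable values (#555): on a `str`-mixin enum, `auto()` yields the stringified member counter, e.g. `"1"`. A minimal sketch of the difference:

```python
import json
from enum import Enum, auto


class OldStatus(str, Enum):
    PENDING = auto()  # on a str-mixin enum, the value becomes "1"


class NewStatus(str, Enum):
    PENDING = "pending"


print(json.dumps({"status": OldStatus.PENDING}))  # {"status": "1"}
print(json.dumps({"status": NewStatus.PENDING}))  # {"status": "pending"}
```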

View File

@@ -33,6 +33,7 @@ from docling_core.types.legacy_doc.document import (
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
from docling_core.utils.file import resolve_source_to_stream
from docling_core.utils.legacy import docling_document_to_legacy
from pydantic import BaseModel
from typing_extensions import deprecated
@@ -191,259 +192,7 @@ class ConversionResult(BaseModel):
@property
@deprecated("Use document instead.")
def legacy_document(self):
reverse_label_mapping = {
DocItemLabel.CAPTION.value: "Caption",
DocItemLabel.FOOTNOTE.value: "Footnote",
DocItemLabel.FORMULA.value: "Formula",
DocItemLabel.LIST_ITEM.value: "List-item",
DocItemLabel.PAGE_FOOTER.value: "Page-footer",
DocItemLabel.PAGE_HEADER.value: "Page-header",
DocItemLabel.PICTURE.value: "Picture", # low threshold, adjusted to capture e.g. chemical structures.
DocItemLabel.SECTION_HEADER.value: "Section-header",
DocItemLabel.TABLE.value: "Table",
DocItemLabel.TEXT.value: "Text",
DocItemLabel.TITLE.value: "Title",
DocItemLabel.DOCUMENT_INDEX.value: "Document Index",
DocItemLabel.CODE.value: "Code",
DocItemLabel.CHECKBOX_SELECTED.value: "Checkbox-Selected",
DocItemLabel.CHECKBOX_UNSELECTED.value: "Checkbox-Unselected",
DocItemLabel.FORM.value: "Form",
DocItemLabel.KEY_VALUE_REGION.value: "Key-Value Region",
DocItemLabel.PARAGRAPH.value: "paragraph",
}
title = ""
desc = DsDocumentDescription(logs=[])
page_hashes = [
PageReference(
hash=create_hash(self.input.document_hash + ":" + str(p.page_no - 1)),
page=p.page_no,
model="default",
)
for p in self.document.pages.values()
]
file_info = DsFileInfoObject(
filename=self.input.file.name,
document_hash=self.input.document_hash,
num_pages=self.input.page_count,
page_hashes=page_hashes,
)
main_text = []
tables = []
figures = []
equations = []
footnotes = []
page_headers = []
page_footers = []
embedded_captions = set()
for ix, (item, level) in enumerate(
self.document.iterate_items(self.document.body)
):
if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0:
caption = item.caption_text(self.document)
if caption:
embedded_captions.add(caption)
for item, level in self.document.iterate_items():
if isinstance(item, DocItem):
item_type = item.label
if isinstance(item, (TextItem, ListItem, SectionHeaderItem)):
if isinstance(item, ListItem) and item.marker:
text = f"{item.marker} {item.text}"
else:
text = item.text
# Can be empty.
prov = [
Prov(
bbox=p.bbox.as_tuple(),
page=p.page_no,
span=[0, len(item.text)],
)
for p in item.prov
]
main_text.append(
BaseText(
text=text,
obj_type=layout_label_to_ds_type.get(item.label),
name=reverse_label_mapping[item.label],
prov=prov,
)
)
# skip captions if they are embedded in the actual
# floating object
if item_type == DocItemLabel.CAPTION and text in embedded_captions:
continue
elif isinstance(item, TableItem) and item.data:
index = len(tables)
ref_str = f"#/tables/{index}"
main_text.append(
Ref(
name=reverse_label_mapping[item.label],
obj_type=layout_label_to_ds_type.get(item.label),
ref=ref_str,
),
)
# Initialise empty table data grid (only empty cells)
table_data = [
[
TableCell(
text="",
# bbox=[0,0,0,0],
spans=[[i, j]],
obj_type="body",
)
for j in range(item.data.num_cols)
]
for i in range(item.data.num_rows)
]
# Overwrite cells in table data for which there is actual cell content.
for cell in item.data.table_cells:
for i in range(
min(cell.start_row_offset_idx, item.data.num_rows),
min(cell.end_row_offset_idx, item.data.num_rows),
):
for j in range(
min(cell.start_col_offset_idx, item.data.num_cols),
min(cell.end_col_offset_idx, item.data.num_cols),
):
celltype = "body"
if cell.column_header:
celltype = "col_header"
elif cell.row_header:
celltype = "row_header"
elif cell.row_section:
celltype = "row_section"
def make_spans(cell):
for rspan in range(
min(
cell.start_row_offset_idx,
item.data.num_rows,
),
min(
cell.end_row_offset_idx, item.data.num_rows
),
):
for cspan in range(
min(
cell.start_col_offset_idx,
item.data.num_cols,
),
min(
cell.end_col_offset_idx,
item.data.num_cols,
),
):
yield [rspan, cspan]
spans = list(make_spans(cell))
table_data[i][j] = GlmTableCell(
text=cell.text,
bbox=(
cell.bbox.as_tuple()
if cell.bbox is not None
else None
), # check if this is bottom-left
spans=spans,
obj_type=celltype,
col=j,
row=i,
row_header=cell.row_header,
row_section=cell.row_section,
col_header=cell.column_header,
row_span=[
cell.start_row_offset_idx,
cell.end_row_offset_idx,
],
col_span=[
cell.start_col_offset_idx,
cell.end_col_offset_idx,
],
)
# Compute the caption
caption = item.caption_text(self.document)
tables.append(
DsSchemaTable(
text=caption,
num_cols=item.data.num_cols,
num_rows=item.data.num_rows,
obj_type=layout_label_to_ds_type.get(item.label),
data=table_data,
prov=[
Prov(
bbox=p.bbox.as_tuple(),
page=p.page_no,
span=[0, 0],
)
for p in item.prov
],
)
)
elif isinstance(item, PictureItem):
index = len(figures)
ref_str = f"#/figures/{index}"
main_text.append(
Ref(
name=reverse_label_mapping[item.label],
obj_type=layout_label_to_ds_type.get(item.label),
ref=ref_str,
),
)
# Compute the caption
caption = item.caption_text(self.document)
figures.append(
Figure(
prov=[
Prov(
bbox=p.bbox.as_tuple(),
page=p.page_no,
span=[0, len(caption)],
)
for p in item.prov
],
obj_type=layout_label_to_ds_type.get(item.label),
text=caption,
# data=[[]],
)
)
page_dimensions = [
PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
for p in self.document.pages.values()
]
ds_doc = DsDocument(
name=title,
description=desc,
file_info=file_info,
main_text=main_text,
equations=equations,
footnotes=footnotes,
page_headers=page_headers,
page_footers=page_footers,
tables=tables,
figures=figures,
page_dimensions=page_dimensions,
)
return ds_doc
return docling_document_to_legacy(self.document)
class _DummyBackend(AbstractDocumentBackend):
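
The inlined legacy-conversion logic above is replaced by a single call into docling-core (#551), using the `docling_document_to_legacy` import added at the top of the file. A sketch of equivalent usage (the URL is just an example document):

```python
from docling.document_converter import DocumentConverter
from docling_core.utils.legacy import docling_document_to_legacy

conv_res = DocumentConverter().convert("https://arxiv.org/pdf/2206.01062")

# Equivalent results; the property is deprecated in favor of the explicit call.
legacy_via_property = conv_res.legacy_document
legacy_via_core = docling_document_to_legacy(conv_res.document)
```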

View File

@@ -190,12 +190,33 @@ class OcrMacOptions(OcrOptions):
)
# Define an enum for the backend options
class PdfBackend(str, Enum):
"""Enum of valid PDF backends."""
PYPDFIUM2 = "pypdfium2"
DLPARSE_V1 = "dlparse_v1"
DLPARSE_V2 = "dlparse_v2"
# Define an enum for the ocr engines
class OcrEngine(str, Enum):
"""Enum of valid OCR engines."""
EASYOCR = "easyocr"
TESSERACT_CLI = "tesseract_cli"
TESSERACT = "tesseract"
OCRMAC = "ocrmac"
RAPIDOCR = "rapidocr"
class PipelineOptions(BaseModel):
"""Base pipeline options."""
create_legacy_output: bool = (
True # This defautl will be set to False on a future version of docling
True # This default will be set to False on a future version of docling
)
document_timeout: Optional[float] = None
accelerator_options: AcceleratorOptions = AcceleratorOptions()
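
With `document_timeout` promoted to a `PipelineOptions` field, the timeout is reachable from the Python API as well as the CLI. A sketch along the lines of docling's converter API (the URL is just an example document):

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions(document_timeout=60.0)  # seconds per document

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
conv_res = converter.convert("https://arxiv.org/pdf/2206.01062")

# If the timeout was hit, the pipeline stops early and reports partial success.
print(conv_res.status)  # e.g. ConversionStatus.PARTIAL_SUCCESS
```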

View File

@@ -3,8 +3,7 @@ import random
from pathlib import Path
from typing import List, Union
from deepsearch_glm.nlp_utils import init_nlp_model
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
from deepsearch_glm.andromeda_nlp import nlp_model
from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
from docling_core.types.legacy_doc.base import (
@@ -49,11 +48,7 @@ class GlmModel:
def __init__(self, options: GlmOptions):
self.options = options
if self.options.model_names != "":
load_pretrained_nlp_models()
self.model = init_nlp_model(
model_names=self.options.model_names, loglevel="ERROR"
)
self.model = nlp_model(loglevel="error", text_ordering=True)
def _to_legacy_document(self, conv_res) -> DsDocument:
title = ""

View File

@@ -97,24 +97,25 @@ class RapidOcrModel(BaseOcrModel):
del high_res_image
del im
cells = [
OcrCell(
id=ix,
text=line[1],
confidence=line[2],
bbox=BoundingBox.from_tuple(
coord=(
(line[0][0][0] / self.scale) + ocr_rect.l,
(line[0][0][1] / self.scale) + ocr_rect.t,
(line[0][2][0] / self.scale) + ocr_rect.l,
(line[0][2][1] / self.scale) + ocr_rect.t,
if result is not None:
cells = [
OcrCell(
id=ix,
text=line[1],
confidence=line[2],
bbox=BoundingBox.from_tuple(
coord=(
(line[0][0][0] / self.scale) + ocr_rect.l,
(line[0][0][1] / self.scale) + ocr_rect.t,
(line[0][2][0] / self.scale) + ocr_rect.l,
(line[0][2][1] / self.scale) + ocr_rect.t,
),
origin=CoordOrigin.TOPLEFT,
),
origin=CoordOrigin.TOPLEFT,
),
)
for ix, line in enumerate(result)
]
all_ocr_cells.extend(cells)
)
for ix, line in enumerate(result)
]
all_ocr_cells.extend(cells)
# Post-process the cells
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
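
The guard reflects that the RapidOcr reader returns `None`, not an empty list, when it detects no text (#558), which previously crashed the list comprehension. A minimal, engine-agnostic sketch of the same pattern, with a hypothetical `extract_texts` helper mirroring the `[box, text, confidence]` line layout above:

```python
from typing import List, Optional


def extract_texts(result: Optional[list]) -> List[str]:
    # Readers may signal "no text found" with None instead of [].
    if result is None:
        return []
    return [line[1] for line in result]  # each line is [box, text, confidence]


print(extract_texts(None))                             # []
print(extract_texts([[[(0, 0)] * 4, "hello", 0.98]]))  # ['hello']
```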

View File

@@ -126,6 +126,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
# conv_res.status = ConversionStatus.FAILURE
# return conv_res
total_elapsed_time = 0.0
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
for i in range(0, conv_res.input.page_count):
@@ -136,7 +137,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
for page_batch in chunkify(
conv_res.pages, settings.perf.page_batch_size
):
start_pb_time = time.time()
start_batch_time = time.monotonic()
# 1. Initialise the page resources
init_pages = map(
@@ -149,8 +150,21 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
for p in pipeline_pages: # Must exhaust!
pass
end_pb_time = time.time() - start_pb_time
_log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
end_batch_time = time.monotonic()
total_elapsed_time += end_batch_time - start_batch_time
if (
self.pipeline_options.document_timeout is not None
and total_elapsed_time > self.pipeline_options.document_timeout
):
_log.warning(
f"Document processing time ({total_elapsed_time:.3f} seconds) exceeded the specified timeout of {self.pipeline_options.document_timeout:.3f} seconds"
)
conv_res.status = ConversionStatus.PARTIAL_SUCCESS
break
_log.debug(
f"Finished converting page batch time={end_batch_time - start_batch_time:.3f}"
)
except Exception as e:
conv_res.status = ConversionStatus.FAILURE
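
Beyond the timeout itself, the hunk swaps `time.time()` for `time.monotonic()`: monotonic clocks cannot jump backwards on system clock adjustments, so the accumulated per-batch durations stay consistent. A small self-contained sketch of the accumulate-and-break pattern (stand-in workload and threshold):

```python
import time

DOCUMENT_TIMEOUT = 0.05  # seconds, stand-in for pipeline_options.document_timeout

total_elapsed = 0.0
for batch in range(100):
    start = time.monotonic()
    sum(i * i for i in range(200_000))  # stand-in for converting one page batch
    total_elapsed += time.monotonic() - start
    if total_elapsed > DOCUMENT_TIMEOUT:
        print(f"timeout after batch {batch}: {total_elapsed:.3f}s")
        break  # the pipeline marks the result PARTIAL_SUCCESS and stops
```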

View File

@@ -10,7 +10,7 @@ For each document format, the *document converter* knows which format-specific *
The *conversion result* contains the [*Docling document*](./docling_document.md), Docling's fundamental document representation.
Some typical scenarios for using a Docling document include directly calling its *export methods*, such as for markdown, dictionary etc., or having it chunked by a *chunker*.
Some typical scenarios for using a Docling document include directly calling its *export methods*, such as for markdown, dictionary etc., or having it chunked by a [*chunker*](./chunking.md).
For more details on Docling's architecture, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).

View File

@@ -1,4 +1,4 @@
# CLI Reference
# CLI reference
This page provides documentation for our command line tools.
@@ -6,4 +6,4 @@ This page provides documentation for our command line tools.
:module: docling.cli.main
:command: click_app
:prog_name: docling
:style: table
:style: table

View File

@@ -22,9 +22,7 @@ A simple example would look like this:
docling https://arxiv.org/pdf/2206.01062
```
To see all available options (export formats etc.) run `docling --help`. More details in the [CLI reference page](./cli.md).
To see all available options (export formats etc.) run `docling --help`. More details in the [CLI reference page](./reference/cli.md).
### Advanced options
@@ -130,29 +128,37 @@ You can limit the CPU threads used by Docling by setting the environment variabl
## Chunking
You can perform a hierarchy-aware chunking of a Docling document as follows:
You can chunk a Docling document using a [chunker](concepts/chunking.md), such as a
`HybridChunker`, as shown below (for more details check out
[this example](examples/hybrid_chunking.ipynb)):
```python
from docling.document_converter import DocumentConverter
from docling_core.transforms.chunker import HierarchicalChunker
from docling.chunking import HybridChunker
conv_res = DocumentConverter().convert("https://arxiv.org/pdf/2206.01062")
doc = conv_res.document
chunks = list(HierarchicalChunker().chunk(doc))
print(chunks[30])
chunker = HybridChunker(tokenizer="BAAI/bge-small-en-v1.5") # set tokenizer as needed
chunk_iter = chunker.chunk(doc)
```
An example chunk would look like this:
```python
print(list(chunk_iter)[11])
# {
# "text": "Lately, new types of ML models for document-layout analysis have emerged [...]",
# "text": "In this paper, we present the DocLayNet dataset. [...]",
# "meta": {
# "doc_items": [{
# "self_ref": "#/texts/40",
# "self_ref": "#/texts/28",
# "label": "text",
# "prov": [{
# "page_no": 2,
# "bbox": {"l": 317.06, "t": 325.81, "r": 559.18, "b": 239.97, ...},
# }]
# }],
# "headings": ["2 RELATED WORK"],
# "bbox": {"l": 53.29, "t": 287.14, "r": 295.56, "b": 212.37, ...},
# }], ...,
# }, ...],
# "headings": ["1 INTRODUCTION"],
# }
# }
```
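
Since `chunk_iter` is a plain iterator (and consumed by the `list(...)` call above), it can be re-created and looped over directly; each chunk exposes the `text` and `meta` fields shown:

```python
chunk_iter = chunker.chunk(doc)  # re-create the iterator if already consumed
for i, chunk in enumerate(chunk_iter):
    print(f"=== chunk {i} ===")
    print(chunk.text[:120])
```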

View File

@@ -56,7 +56,6 @@ nav:
- "Docling": index.md
- Installation: installation.md
- Usage: usage.md
- CLI: cli.md
- FAQ: faq.md
- Docling v2: v2.md
- Concepts:
@@ -77,15 +76,12 @@ nav:
- "Multimodal export": examples/export_multimodal.py
- "Force full page OCR": examples/full_page_ocr.py
- "Accelerator options": examples/run_with_acclerators.py
- Chunking:
- "Hybrid chunking": examples/hybrid_chunking.ipynb
- RAG / QA:
- "RAG with LlamaIndex 🦙": examples/rag_llamaindex.ipynb
- "RAG with LangChain 🦜🔗": examples/rag_langchain.ipynb
- "Hybrid RAG with Qdrant": examples/hybrid_rag_qdrant.ipynb
- Chunking:
- "Hybrid chunking": examples/hybrid_chunking.ipynb
# - Chunking: examples/chunking.md
# - CLI:
# - CLI: examples/cli.md
- Integrations:
- Integrations: integrations/index.md
- "🐝 Bee": integrations/bee.md
@@ -100,10 +96,13 @@ nav:
- "spaCy": integrations/spacy.md
- "txtai": integrations/txtai.md
# - "LangChain 🦜🔗": integrations/langchain.md
- API reference:
- Document Converter: api_reference/document_converter.md
- Pipeline options: api_reference/pipeline_options.md
- Docling Document: api_reference/docling_document.md
- Reference:
- Python API:
- Document Converter: reference/document_converter.md
- Pipeline options: reference/pipeline_options.md
- Docling Document: reference/docling_document.md
- CLI:
- CLI reference: reference/cli.md
markdown_extensions:
- pymdownx.superfences

poetry.lock generated

File diff suppressed because it is too large.

View File

@@ -1,6 +1,6 @@
[tool.poetry]
name = "docling"
version = "2.9.0" # DO NOT EDIT, updated automatically
version = "2.11.0" # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
license = "MIT"
@@ -25,11 +25,11 @@ packages = [{include = "docling"}]
# actual dependencies:
######################
python = "^3.9"
docling-ibm-models = { git = "ssh://git@github.com/DS4SD/docling-ibm-models.git", branch = "nli/performance" }
deepsearch-glm = "^1.0.0"
docling-parse = "^3.0.0"
docling-core = { version = "^2.9.0", extras = ["chunking"] }
pydantic = "^2.0.0"
docling-ibm-models = "^3.0.0"
deepsearch-glm = "^1.0.0"
docling-parse = "^3.0.0"
filetype = "^1.2.0"
pypdfium2 = "^4.30.0"
pydantic-settings = "^2.3.0"