Mirror of https://github.com/DS4SD/docling.git, synced 2025-08-02 07:22:14 +00:00

Rebase from main

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

Commit 4aecf689aa
CHANGELOG.md (11 changes)

@@ -1,3 +1,14 @@
+## [v2.10.0](https://github.com/DS4SD/docling/releases/tag/v2.10.0) - 2024-12-09
+
+### Feature
+
+* Docling-parse v2 as default PDF backend ([#549](https://github.com/DS4SD/docling/issues/549)) ([`aca57f0`](https://github.com/DS4SD/docling/commit/aca57f0527dddcc027dc1ee840e2e492ab997170))
+
+### Fix
+
+* Call into docling-core for legacy document transform ([#551](https://github.com/DS4SD/docling/issues/551)) ([`7972d47`](https://github.com/DS4SD/docling/commit/7972d47f88604f02d6a32527116c4d78eb1005e2))
+* Introduce Image format options in CLI. Silence the tqdm downloading messages. ([#544](https://github.com/DS4SD/docling/issues/544)) ([`78f61a8`](https://github.com/DS4SD/docling/commit/78f61a8522d3a19ecc1d605e8441fb543ca0fa96))
+
 ## [v2.9.0](https://github.com/DS4SD/docling/releases/tag/v2.9.0) - 2024-12-09
 
 ### Feature
@@ -29,8 +29,10 @@ from docling.datamodel.pipeline_options import (
     AcceleratorDevice,
     AcceleratorOptions,
     EasyOcrOptions,
+    OcrEngine,
     OcrMacOptions,
     OcrOptions,
+    PdfBackend,
     PdfPipelineOptions,
     RapidOcrOptions,
     TableFormerMode,
@@ -70,22 +72,6 @@ def version_callback(value: bool):
         raise typer.Exit()
 
 
-# Define an enum for the backend options
-class PdfBackend(str, Enum):
-    PYPDFIUM2 = "pypdfium2"
-    DLPARSE_V1 = "dlparse_v1"
-    DLPARSE_V2 = "dlparse_v2"
-
-
-# Define an enum for the ocr engines
-class OcrEngine(str, Enum):
-    EASYOCR = "easyocr"
-    TESSERACT_CLI = "tesseract_cli"
-    TESSERACT = "tesseract"
-    OCRMAC = "ocrmac"
-    RAPIDOCR = "rapidocr"
-
-
 def export_documents(
     conv_results: Iterable[ConversionResult],
     output_dir: Path,
@@ -19,12 +19,12 @@ if TYPE_CHECKING:
 
 
 class ConversionStatus(str, Enum):
-    PENDING = auto()
-    STARTED = auto()
-    FAILURE = auto()
-    SUCCESS = auto()
-    PARTIAL_SUCCESS = auto()
-    SKIPPED = auto()
+    PENDING = "pending"
+    STARTED = "started"
+    FAILURE = "failure"
+    SUCCESS = "success"
+    PARTIAL_SUCCESS = "partial_success"
+    SKIPPED = "skipped"
 
 
 class InputFormat(str, Enum):
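For context on why the hunk above swaps `auto()` for explicit strings: in a `(str, Enum)` mixin, `auto()` feeds the integer counter through `str()`, so the serialized values become "1", "2", and so on rather than readable names. A minimal sketch of the difference (illustrative, not part of the commit):

```python
from enum import Enum, auto


class StatusAuto(str, Enum):
    PENDING = auto()


class StatusExplicit(str, Enum):
    PENDING = "pending"


# auto() with a str mixin yields the stringified counter, not the member name:
assert StatusAuto.PENDING.value == "1"
# explicit strings give stable, self-describing wire values, as in the hunk above:
assert StatusExplicit.PENDING.value == "pending"
```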
@@ -89,15 +89,15 @@ MimeTypeToFormat = {
 
 
 class DocInputType(str, Enum):
-    PATH = auto()
-    STREAM = auto()
+    PATH = "path"
+    STREAM = "stream"
 
 
 class DoclingComponentType(str, Enum):
-    DOCUMENT_BACKEND = auto()
-    MODEL = auto()
-    DOC_ASSEMBLER = auto()
-    USER_INPUT = auto()
+    DOCUMENT_BACKEND = "document_backend"
+    MODEL = "model"
+    DOC_ASSEMBLER = "doc_assembler"
+    USER_INPUT = "user_input"
 
 
 class ErrorItem(BaseModel):
@@ -33,6 +33,7 @@ from docling_core.types.legacy_doc.document import (
 from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
 from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
 from docling_core.utils.file import resolve_source_to_stream
+from docling_core.utils.legacy import docling_document_to_legacy
 from pydantic import BaseModel
 from typing_extensions import deprecated
 
@@ -191,259 +192,7 @@ class ConversionResult(BaseModel):
     @property
     @deprecated("Use document instead.")
     def legacy_document(self):
-        reverse_label_mapping = {
-            DocItemLabel.CAPTION.value: "Caption",
-            DocItemLabel.FOOTNOTE.value: "Footnote",
-            DocItemLabel.FORMULA.value: "Formula",
-            DocItemLabel.LIST_ITEM.value: "List-item",
-            DocItemLabel.PAGE_FOOTER.value: "Page-footer",
-            DocItemLabel.PAGE_HEADER.value: "Page-header",
-            DocItemLabel.PICTURE.value: "Picture",  # low threshold adjust to capture chemical structures for examples.
-            DocItemLabel.SECTION_HEADER.value: "Section-header",
-            DocItemLabel.TABLE.value: "Table",
-            DocItemLabel.TEXT.value: "Text",
-            DocItemLabel.TITLE.value: "Title",
-            DocItemLabel.DOCUMENT_INDEX.value: "Document Index",
-            DocItemLabel.CODE.value: "Code",
-            DocItemLabel.CHECKBOX_SELECTED.value: "Checkbox-Selected",
-            DocItemLabel.CHECKBOX_UNSELECTED.value: "Checkbox-Unselected",
-            DocItemLabel.FORM.value: "Form",
-            DocItemLabel.KEY_VALUE_REGION.value: "Key-Value Region",
-            DocItemLabel.PARAGRAPH.value: "paragraph",
-        }
-
-        title = ""
-        desc = DsDocumentDescription(logs=[])
-
-        page_hashes = [
-            PageReference(
-                hash=create_hash(self.input.document_hash + ":" + str(p.page_no - 1)),
-                page=p.page_no,
-                model="default",
-            )
-            for p in self.document.pages.values()
-        ]
-
-        file_info = DsFileInfoObject(
-            filename=self.input.file.name,
-            document_hash=self.input.document_hash,
-            num_pages=self.input.page_count,
-            page_hashes=page_hashes,
-        )
-
-        main_text = []
-        tables = []
-        figures = []
-        equations = []
-        footnotes = []
-        page_headers = []
-        page_footers = []
-
-        embedded_captions = set()
-        for ix, (item, level) in enumerate(
-            self.document.iterate_items(self.document.body)
-        ):
-            if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0:
-                caption = item.caption_text(self.document)
-                if caption:
-                    embedded_captions.add(caption)
-
-        for item, level in self.document.iterate_items():
-            if isinstance(item, DocItem):
-                item_type = item.label
-
-                if isinstance(item, (TextItem, ListItem, SectionHeaderItem)):
-                    if isinstance(item, ListItem) and item.marker:
-                        text = f"{item.marker} {item.text}"
-                    else:
-                        text = item.text
-
-                    # Can be empty.
-                    prov = [
-                        Prov(
-                            bbox=p.bbox.as_tuple(),
-                            page=p.page_no,
-                            span=[0, len(item.text)],
-                        )
-                        for p in item.prov
-                    ]
-                    main_text.append(
-                        BaseText(
-                            text=text,
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            name=reverse_label_mapping[item.label],
-                            prov=prov,
-                        )
-                    )
-
-                    # skip captions of they are embedded in the actual
-                    # floating object
-                    if item_type == DocItemLabel.CAPTION and text in embedded_captions:
-                        continue
-
-                elif isinstance(item, TableItem) and item.data:
-                    index = len(tables)
-                    ref_str = f"#/tables/{index}"
-                    main_text.append(
-                        Ref(
-                            name=reverse_label_mapping[item.label],
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            ref=ref_str,
-                        ),
-                    )
-
-                    # Initialise empty table data grid (only empty cells)
-                    table_data = [
-                        [
-                            TableCell(
-                                text="",
-                                # bbox=[0,0,0,0],
-                                spans=[[i, j]],
-                                obj_type="body",
-                            )
-                            for j in range(item.data.num_cols)
-                        ]
-                        for i in range(item.data.num_rows)
-                    ]
-
-                    # Overwrite cells in table data for which there is actual cell content.
-                    for cell in item.data.table_cells:
-                        for i in range(
-                            min(cell.start_row_offset_idx, item.data.num_rows),
-                            min(cell.end_row_offset_idx, item.data.num_rows),
-                        ):
-                            for j in range(
-                                min(cell.start_col_offset_idx, item.data.num_cols),
-                                min(cell.end_col_offset_idx, item.data.num_cols),
-                            ):
-                                celltype = "body"
-                                if cell.column_header:
-                                    celltype = "col_header"
-                                elif cell.row_header:
-                                    celltype = "row_header"
-                                elif cell.row_section:
-                                    celltype = "row_section"
-
-                                def make_spans(cell):
-                                    for rspan in range(
-                                        min(
-                                            cell.start_row_offset_idx,
-                                            item.data.num_rows,
-                                        ),
-                                        min(
-                                            cell.end_row_offset_idx, item.data.num_rows
-                                        ),
-                                    ):
-                                        for cspan in range(
-                                            min(
-                                                cell.start_col_offset_idx,
-                                                item.data.num_cols,
-                                            ),
-                                            min(
-                                                cell.end_col_offset_idx,
-                                                item.data.num_cols,
-                                            ),
-                                        ):
-                                            yield [rspan, cspan]
-
-                                spans = list(make_spans(cell))
-                                table_data[i][j] = GlmTableCell(
-                                    text=cell.text,
-                                    bbox=(
-                                        cell.bbox.as_tuple()
-                                        if cell.bbox is not None
-                                        else None
-                                    ),  # check if this is bottom-left
-                                    spans=spans,
-                                    obj_type=celltype,
-                                    col=j,
-                                    row=i,
-                                    row_header=cell.row_header,
-                                    row_section=cell.row_section,
-                                    col_header=cell.column_header,
-                                    row_span=[
-                                        cell.start_row_offset_idx,
-                                        cell.end_row_offset_idx,
-                                    ],
-                                    col_span=[
-                                        cell.start_col_offset_idx,
-                                        cell.end_col_offset_idx,
-                                    ],
-                                )
-
-                    # Compute the caption
-                    caption = item.caption_text(self.document)
-
-                    tables.append(
-                        DsSchemaTable(
-                            text=caption,
-                            num_cols=item.data.num_cols,
-                            num_rows=item.data.num_rows,
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            data=table_data,
-                            prov=[
-                                Prov(
-                                    bbox=p.bbox.as_tuple(),
-                                    page=p.page_no,
-                                    span=[0, 0],
-                                )
-                                for p in item.prov
-                            ],
-                        )
-                    )
-
-                elif isinstance(item, PictureItem):
-                    index = len(figures)
-                    ref_str = f"#/figures/{index}"
-                    main_text.append(
-                        Ref(
-                            name=reverse_label_mapping[item.label],
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            ref=ref_str,
-                        ),
-                    )
-
-                    # Compute the caption
-                    caption = item.caption_text(self.document)
-
-                    figures.append(
-                        Figure(
-                            prov=[
-                                Prov(
-                                    bbox=p.bbox.as_tuple(),
-                                    page=p.page_no,
-                                    span=[0, len(caption)],
-                                )
-                                for p in item.prov
-                            ],
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            text=caption,
-                            # data=[[]],
-                        )
-                    )
-
-        page_dimensions = [
-            PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
-            for p in self.document.pages.values()
-        ]
-
-        ds_doc = DsDocument(
-            name=title,
-            description=desc,
-            file_info=file_info,
-            main_text=main_text,
-            equations=equations,
-            footnotes=footnotes,
-            page_headers=page_headers,
-            page_footers=page_footers,
-            tables=tables,
-            figures=figures,
-            page_dimensions=page_dimensions,
-        )
-
-        return ds_doc
+        return docling_document_to_legacy(self.document)
 
 
 class _DummyBackend(AbstractDocumentBackend):
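The net effect of the hunk above: the deprecated `legacy_document` property now delegates the whole transform to docling-core. A sketch of the two equivalent call paths (the converter call is illustrative; both names are taken from the diff):

```python
from docling.document_converter import DocumentConverter
from docling_core.utils.legacy import docling_document_to_legacy

result = DocumentConverter().convert("https://arxiv.org/pdf/2206.01062")

legacy_doc = result.legacy_document  # deprecated accessor, kept for compatibility
legacy_doc = docling_document_to_legacy(result.document)  # direct call into docling-core
```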
@@ -190,6 +190,26 @@ class OcrMacOptions(OcrOptions):
     )
 
 
+# Define an enum for the backend options
+class PdfBackend(str, Enum):
+    """Enum of valid PDF backends."""
+
+    PYPDFIUM2 = "pypdfium2"
+    DLPARSE_V1 = "dlparse_v1"
+    DLPARSE_V2 = "dlparse_v2"
+
+
+# Define an enum for the ocr engines
+class OcrEngine(str, Enum):
+    """Enum of valid OCR engines."""
+
+    EASYOCR = "easyocr"
+    TESSERACT_CLI = "tesseract_cli"
+    TESSERACT = "tesseract"
+    OCRMAC = "ocrmac"
+    RAPIDOCR = "rapidocr"
+
+
 class PipelineOptions(BaseModel):
     """Base pipeline options."""
 
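Because both enums mix in `str`, members compare equal to their raw values, which is what lets the CLI map an argument such as `--pdf-backend dlparse_v2` straight onto a member (the flag spelling is an assumption; the enum behavior is standard Python):

```python
from docling.datamodel.pipeline_options import OcrEngine, PdfBackend

# a raw CLI string resolves to the corresponding member...
assert PdfBackend("dlparse_v2") is PdfBackend.DLPARSE_V2
# ...and members compare equal to plain strings
assert OcrEngine.EASYOCR == "easyocr"
```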
@@ -97,24 +97,25 @@ class RapidOcrModel(BaseOcrModel):
                 del high_res_image
                 del im
 
-                cells = [
-                    OcrCell(
-                        id=ix,
-                        text=line[1],
-                        confidence=line[2],
-                        bbox=BoundingBox.from_tuple(
-                            coord=(
-                                (line[0][0][0] / self.scale) + ocr_rect.l,
-                                (line[0][0][1] / self.scale) + ocr_rect.t,
-                                (line[0][2][0] / self.scale) + ocr_rect.l,
-                                (line[0][2][1] / self.scale) + ocr_rect.t,
-                            ),
-                            origin=CoordOrigin.TOPLEFT,
-                        ),
-                    )
-                    for ix, line in enumerate(result)
-                ]
-                all_ocr_cells.extend(cells)
+                if result is not None:
+                    cells = [
+                        OcrCell(
+                            id=ix,
+                            text=line[1],
+                            confidence=line[2],
+                            bbox=BoundingBox.from_tuple(
+                                coord=(
+                                    (line[0][0][0] / self.scale) + ocr_rect.l,
+                                    (line[0][0][1] / self.scale) + ocr_rect.t,
+                                    (line[0][2][0] / self.scale) + ocr_rect.l,
+                                    (line[0][2][1] / self.scale) + ocr_rect.t,
+                                ),
+                                origin=CoordOrigin.TOPLEFT,
+                            ),
+                        )
+                        for ix, line in enumerate(result)
+                    ]
+                    all_ocr_cells.extend(cells)
 
         # Post-process the cells
         page.cells = self.post_process_cells(all_ocr_cells, page.cells)
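Background for the guard added above (an assumption based on the RapidOCR API as I understand it): the engine returns `None` rather than an empty list when it recognizes no text, so iterating the result unguarded raises `TypeError` on blank regions. A minimal sketch of the pattern:

```python
from rapidocr_onnxruntime import RapidOCR  # assumed import path for the engine docling wraps

engine = RapidOCR()
result, elapse = engine("blank_region.png")  # result may be None when nothing is found
cells = [] if result is None else [line[1] for line in result]  # mirror the guard above
```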
@@ -10,7 +10,7 @@ For each document format, the *document converter* knows which format-specific *
 
 The *conversion result* contains the [*Docling document*](./docling_document.md), Docling's fundamental document representation.
 
-Some typical scenarios for using a Docling document include directly calling its *export methods*, such as for markdown, dictionary etc., or having it chunked by a *chunker*.
+Some typical scenarios for using a Docling document include directly calling its *export methods*, such as for markdown, dictionary etc., or having it chunked by a [*chunker*](./chunking.md).
 
 For more details on Docling's architecture, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
 
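As a hedged illustration of the *export methods* referenced above (method names per the docling-core `DoclingDocument` API, to the best of my knowledge):

```python
from docling.document_converter import DocumentConverter

doc = DocumentConverter().convert("https://arxiv.org/pdf/2408.09869").document
markdown = doc.export_to_markdown()  # markdown export
as_dict = doc.export_to_dict()       # dictionary export
```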
@@ -1,4 +1,4 @@
-# CLI Reference
+# CLI reference
 
 This page provides documentation for our command line tools.
 
@@ -6,4 +6,4 @@ This page provides documentation for our command line tools.
     :module: docling.cli.main
     :command: click_app
     :prog_name: docling
-    :style: table
+    :style: table
@@ -22,9 +22,7 @@ A simple example would look like this:
 docling https://arxiv.org/pdf/2206.01062
 ```
 
-To see all available options (export formats etc.) run `docling --help`. More details in the [CLI reference page](./cli.md).
-
-
+To see all available options (export formats etc.) run `docling --help`. More details in the [CLI reference page](./reference/cli.md).
 
 ### Advanced options
 
@@ -130,29 +128,37 @@ You can limit the CPU threads used by Docling by setting the environment variabl
 
 ## Chunking
 
-You can perform a hierarchy-aware chunking of a Docling document as follows:
+You can chunk a Docling document using a [chunker](concepts/chunking.md), such as a
+`HybridChunker`, as shown below (for more details check out
+[this example](examples/hybrid_chunking.ipynb)):
 
 ```python
 from docling.document_converter import DocumentConverter
-from docling_core.transforms.chunker import HierarchicalChunker
+from docling.chunking import HybridChunker
 
 conv_res = DocumentConverter().convert("https://arxiv.org/pdf/2206.01062")
 doc = conv_res.document
-chunks = list(HierarchicalChunker().chunk(doc))
 
-print(chunks[30])
+chunker = HybridChunker(tokenizer="BAAI/bge-small-en-v1.5")  # set tokenizer as needed
+chunk_iter = chunker.chunk(doc)
+```
+
+An example chunk would look like this:
+
+```python
+print(list(chunk_iter)[11])
 # {
-#   "text": "Lately, new types of ML models for document-layout analysis have emerged [...]",
+#   "text": "In this paper, we present the DocLayNet dataset. [...]",
 #   "meta": {
 #     "doc_items": [{
-#       "self_ref": "#/texts/40",
+#       "self_ref": "#/texts/28",
 #       "label": "text",
 #       "prov": [{
 #         "page_no": 2,
-#         "bbox": {"l": 317.06, "t": 325.81, "r": 559.18, "b": 239.97, ...},
-#       }]
-#     }],
-#     "headings": ["2 RELATED WORK"],
+#         "bbox": {"l": 53.29, "t": 287.14, "r": 295.56, "b": 212.37, ...},
+#       }], ...,
+#     }, ...],
+#     "headings": ["1 INTRODUCTION"],
 #   }
 # }
 ```
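A usage note on the updated snippet (illustrative; field names follow the dump shown in the diff): the chunker yields chunk objects exposing `text` and `meta`, so the iterator can be consumed directly:

```python
for i, chunk in enumerate(chunk_iter):  # chunk_iter from the snippet above
    print(f"chunk {i} under {chunk.meta.headings}: {chunk.text[:60]}...")
```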
mkdocs.yml (19 changes)

@@ -56,7 +56,6 @@ nav:
   - "Docling": index.md
   - Installation: installation.md
   - Usage: usage.md
-  - CLI: cli.md
   - FAQ: faq.md
   - Docling v2: v2.md
   - Concepts:
@@ -77,15 +76,12 @@ nav:
     - "Multimodal export": examples/export_multimodal.py
     - "Force full page OCR": examples/full_page_ocr.py
     - "Accelerator options": examples/run_with_acclerators.py
+  - Chunking:
+    - "Hybrid chunking": examples/hybrid_chunking.ipynb
   - RAG / QA:
     - "RAG with LlamaIndex 🦙": examples/rag_llamaindex.ipynb
     - "RAG with LangChain 🦜🔗": examples/rag_langchain.ipynb
     - "Hybrid RAG with Qdrant": examples/hybrid_rag_qdrant.ipynb
-  - Chunking:
-    - "Hybrid chunking": examples/hybrid_chunking.ipynb
-  # - Chunking: examples/chunking.md
-  # - CLI:
-  #   - CLI: examples/cli.md
   - Integrations:
     - Integrations: integrations/index.md
     - "🐝 Bee": integrations/bee.md
@@ -100,10 +96,13 @@ nav:
     - "spaCy": integrations/spacy.md
     - "txtai": integrations/txtai.md
   # - "LangChain 🦜🔗": integrations/langchain.md
-  - API reference:
-    - Document Converter: api_reference/document_converter.md
-    - Pipeline options: api_reference/pipeline_options.md
-    - Docling Document: api_reference/docling_document.md
+  - Reference:
+    - Python API:
+      - Document Converter: reference/document_converter.md
+      - Pipeline options: reference/pipeline_options.md
+      - Docling Document: reference/docling_document.md
+    - CLI:
+      - CLI reference: reference/cli.md
 
 markdown_extensions:
   - pymdownx.superfences
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "2.9.0" # DO NOT EDIT, updated automatically
+version = "2.10.0" # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
 license = "MIT"