Mirror of https://github.com/DS4SD/docling.git (synced 2025-07-27 04:24:45 +00:00)
docs: update chunking docs, rename page
[skip ci]
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
parent
08b0cc7aed
commit
c23916488b
@ -139,14 +139,23 @@ You can perform a hierarchy-aware chunking of a Docling document as follows:
|
|||||||
from docling.document_converter import DocumentConverter
|
from docling.document_converter import DocumentConverter
|
||||||
from docling_core.transforms.chunker import HierarchicalChunker
|
from docling_core.transforms.chunker import HierarchicalChunker
|
||||||
|
|
||||||
doc = DocumentConverter().convert("https://arxiv.org/pdf/2206.01062").legacy_document
|
conv_res = DocumentConverter().convert("https://arxiv.org/pdf/2206.01062")
|
||||||
|
doc = conv_res.document
|
||||||
chunks = list(HierarchicalChunker().chunk(doc))
|
chunks = list(HierarchicalChunker().chunk(doc))
|
||||||
print(chunks[0])
|
|
||||||
# ChunkWithMetadata(
|
print(chunks[30])
|
||||||
# path='#/main-text/1',
|
# {
|
||||||
# text='DocLayNet: A Large Human-Annotated Dataset [...]',
|
# "text": "Lately, new types of ML models for document-layout analysis have emerged [...]",
|
||||||
# page=1,
|
# "meta": {
|
||||||
# bbox=[107.30, 672.38, 505.19, 709.08],
|
# "doc_items": [{
|
||||||
# [...]
|
# "self_ref": "#/texts/40",
|
||||||
# )
|
# "label": "text",
|
||||||
|
# "prov": [{
|
||||||
|
# "page_no": 2,
|
||||||
|
# "bbox": {"l": 317.06, "t": 325.81, "r": 559.18, "b": 239.97, ...},
|
||||||
|
# }]
|
||||||
|
# }],
|
||||||
|
# "headings": ["2 RELATED WORK"],
|
||||||
|
# }
|
||||||
|
# }
|
||||||
```
|
```
|
||||||
|
@ -54,7 +54,7 @@ nav:
|
|||||||
- Get started:
|
- Get started:
|
||||||
- Home: index.md
|
- Home: index.md
|
||||||
- Installation: installation.md
|
- Installation: installation.md
|
||||||
- Use Docling: use_docling.md
|
- Usage: use_docling.md
|
||||||
- Docling v2: v2.md
|
- Docling v2: v2.md
|
||||||
- Concepts:
|
- Concepts:
|
||||||
- The Docling Document format: concepts/docling_format.md
|
- The Docling Document format: concepts/docling_format.md
|
||||||
|
Loading…
Reference in New Issue
Block a user