From c23916488bd9169c3f80bcad29c73b1dac3ff566 Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Wed, 16 Oct 2024 22:12:34 +0200 Subject: [PATCH] docs: update chunking docs, rename page [skip ci] Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- docs/use_docling.md | 27 ++++++++++++++++++--------- mkdocs.yml | 2 +- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/docs/use_docling.md b/docs/use_docling.md index 733af19d..84e0df0d 100644 --- a/docs/use_docling.md +++ b/docs/use_docling.md @@ -139,14 +139,23 @@ You can perform a hierarchy-aware chunking of a Docling document as follows: from docling.document_converter import DocumentConverter from docling_core.transforms.chunker import HierarchicalChunker -doc = DocumentConverter().convert("https://arxiv.org/pdf/2206.01062").legacy_document +conv_res = DocumentConverter().convert("https://arxiv.org/pdf/2206.01062") +doc = conv_res.document chunks = list(HierarchicalChunker().chunk(doc)) -print(chunks[0]) -# ChunkWithMetadata( -# path='#/main-text/1', -# text='DocLayNet: A Large Human-Annotated Dataset [...]', -# page=1, -# bbox=[107.30, 672.38, 505.19, 709.08], -# [...] -# ) + +print(chunks[30]) +# { +# "text": "Lately, new types of ML models for document-layout analysis have emerged [...]", +# "meta": { +# "doc_items": [{ +# "self_ref": "#/texts/40", +# "label": "text", +# "prov": [{ +# "page_no": 2, +# "bbox": {"l": 317.06, "t": 325.81, "r": 559.18, "b": 239.97, ...}, +# }] +# }], +# "headings": ["2 RELATED WORK"], +# } +# } ``` diff --git a/mkdocs.yml b/mkdocs.yml index 5fd180a4..f844b075 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -54,7 +54,7 @@ nav: - Get started: - Home: index.md - Installation: installation.md - - Use Docling: use_docling.md + - Usage: use_docling.md - Docling v2: v2.md - Concepts: - The Docling Document format: concepts/docling_format.md