From f587e3cee819c18ef76ae7bd9a71d5e870371dec Mon Sep 17 00:00:00 2001 From: Panos Vagenas Date: Mon, 3 Mar 2025 11:39:57 +0100 Subject: [PATCH] minor reorg of top-level docs Signed-off-by: Panos Vagenas --- README.md | 2 +- .../examples/develop_formula_understanding.py | 2 +- docs/examples/develop_picture_enrichment.py | 2 +- docs/{faq.md => faq/index.md} | 2 +- docs/index.md | 2 +- .../index.md} | 0 docs/{ => usage}/enrichments.md | 14 +++++------ docs/{usage.md => usage/index.md} | 15 ++++++------ docs/{ => usage}/supported_formats.md | 4 ++-- mkdocs.yml | 23 +++++++++++-------- 10 files changed, 36 insertions(+), 30 deletions(-) rename docs/{faq.md => faq/index.md} (98%) rename docs/{installation.md => installation/index.md} (100%) rename docs/{ => usage}/enrichments.md (95%) rename docs/{usage.md => usage/index.md} (93%) rename docs/{ => usage}/supported_formats.md (87%) diff --git a/README.md b/README.md index 5a957d60..842253e9 100644 --- a/README.md +++ b/README.md @@ -123,6 +123,6 @@ For individual model usage, please refer to the model licenses found in the orig Docling has been brought to you by IBM. -[supported_formats]: https://ds4sd.github.io/docling/supported_formats/ +[supported_formats]: https://ds4sd.github.io/docling/usage/supported_formats/ [docling_document]: https://ds4sd.github.io/docling/concepts/docling_document/ [integrations]: https://ds4sd.github.io/docling/integrations/ diff --git a/docs/examples/develop_formula_understanding.py b/docs/examples/develop_formula_understanding.py index 3cdef489..ca24d95d 100644 --- a/docs/examples/develop_formula_understanding.py +++ b/docs/examples/develop_formula_understanding.py @@ -1,6 +1,6 @@ # WARNING # This example demonstrates only how to develop a new enrichment model. -# It does not run thr actual formula understanding model. +# It does not run the actual formula understanding model. 
import logging from pathlib import Path diff --git a/docs/examples/develop_picture_enrichment.py b/docs/examples/develop_picture_enrichment.py index 362ec615..9991afe9 100644 --- a/docs/examples/develop_picture_enrichment.py +++ b/docs/examples/develop_picture_enrichment.py @@ -1,6 +1,6 @@ # WARNING # This example demonstrates only how to develop a new enrichment model. -# It does not run thr actual picture classifier model. +# It does not run the actual picture classifier model. import logging from pathlib import Path diff --git a/docs/faq.md b/docs/faq/index.md similarity index 98% rename from docs/faq.md rename to docs/faq/index.md index ae57446f..d1d05a8f 100644 --- a/docs/faq.md +++ b/docs/faq/index.md @@ -149,7 +149,7 @@ This is a collection of FAQ collected from the user questions on Token indices sequence length is longer than the specified maximum sequence length for this model (531 > 512). Running this sequence through the model will result in indexing errors This is a warning that is emitted by transformers, saying that actually *running this sequence through the model* will result in indexing errors, i.e. the problematic case is only if one indeed passes the particular sequence through the (embedding) model. diff --git a/docs/index.md b/docs/index.md index 18c12352..a0a1bb16 100644 --- a/docs/index.md +++ b/docs/index.md @@ -47,6 +47,6 @@ Docling simplifies document processing, parsing diverse formats — including ad Docling has been brought to you by IBM. 
-[supported_formats]: ./supported_formats.md +[supported_formats]: ./usage/supported_formats.md [docling_document]: ./concepts/docling_document.md [integrations]: ./integrations/index.md diff --git a/docs/installation.md b/docs/installation/index.md similarity index 100% rename from docs/installation.md rename to docs/installation/index.md diff --git a/docs/enrichments.md b/docs/usage/enrichments.md similarity index 95% rename from docs/enrichments.md rename to docs/usage/enrichments.md index 8ea5f176..96e10f16 100644 --- a/docs/enrichments.md +++ b/docs/usage/enrichments.md @@ -6,10 +6,10 @@ The following table provides an overview of the default enrichment models availa | Feature | Parameter | Processed item | Description | | ------- | --------- | ---------------| ----------- | -| Code understanding | `do_code_enrichment` | `CodeItem` | See [docs below](#code-understanding). | -| Formula understanding | `do_formula_enrichment` | `TextItem` with label `FORMULA` | See [docs below](#formula-understanding). | -| Picrure classification | `do_picture_classification` | `PictureItem` | See [docs below](#picture-classification). | -| Picture description | `do_picture_description` | `PictureItem` | See [docs below](#picture-description). | +| Code understanding | `do_code_enrichment` | `CodeItem` | See [docs below](#code-understanding). | +| Formula understanding | `do_formula_enrichment` | `TextItem` with label `FORMULA` | See [docs below](#formula-understanding). | +| Picture classification | `do_picture_classification` | `PictureItem` | See [docs below](#picture-classification). | +| Picture description | `do_picture_description` | `PictureItem` | See [docs below](#picture-description). 
| ## Enrichments details @@ -204,7 +204,7 @@ pipeline_options.picture_description_options = PictureDescriptionApiOptions( End-to-end code snippets for cloud providers are available in the examples section: -- [IBM watsonx.ai](./examples/pictures_description_api.py) +- [IBM watsonx.ai](../examples/pictures_description_api.py) ## Develop new enrichment models @@ -212,5 +212,5 @@ End-to-end code snippets for cloud providers are available in the examples secti Beside looking at the implementation of all the models listed above, the Docling documentation has a few examples dedicated to the implementation of enrichment models. -- [Develop picture enrichment](./examples/develop_picture_enrichment.py) -- [Develop formula enrichment](./examples/develop_formula_understanding.py) +- [Develop picture enrichment](../examples/develop_picture_enrichment.py) +- [Develop formula enrichment](../examples/develop_formula_understanding.py) diff --git a/docs/usage.md b/docs/usage/index.md similarity index 93% rename from docs/usage.md rename to docs/usage/index.md index 4b5e4ba1..cd01335e 100644 --- a/docs/usage.md +++ b/docs/usage/index.md @@ -1,3 +1,4 @@ + ## Conversion ### Convert a single document @@ -22,7 +23,7 @@ A simple example would look like this: docling https://arxiv.org/pdf/2206.01062 ``` -To see all available options (export formats etc.) run `docling --help`. More details in the [CLI reference page](./reference/cli.md). +To see all available options (export formats etc.) run `docling --help`. More details in the [CLI reference page](../reference/cli.md). ### Advanced options @@ -104,7 +105,7 @@ The options in this list require the explicit `enable_remote_services=True` when #### Adjust pipeline features -The example file [custom_convert.py](./examples/custom_convert.py) contains multiple ways +The example file [custom_convert.py](../examples/custom_convert.py) contains multiple ways one can adjust the conversion pipeline and features. 
##### Control PDF table extraction options @@ -183,13 +184,13 @@ You can limit the CPU threads used by Docling by setting the environment variabl !!! note - This section discusses directly invoking a [backend](./concepts/architecture.md), + This section discusses directly invoking a [backend](../concepts/architecture.md), i.e. using a low-level API. This should only be done when necessary. For most cases, using a `DocumentConverter` (high-level API) as discussed in the sections above should suffice — and is the recommended way. -By default, Docling will try to identify the document format to apply the appropriate conversion backend (see the list of [supported formats](./supported_formats.md)). -You can restrict the `DocumentConverter` to a set of allowed document formats, as shown in the [Multi-format conversion](./examples/run_with_formats.py) example. +By default, Docling will try to identify the document format to apply the appropriate conversion backend (see the list of [supported formats](./supported_formats.md)). +You can restrict the `DocumentConverter` to a set of allowed document formats, as shown in the [Multi-format conversion](../examples/run_with_formats.py) example. Alternatively, you can also use the specific backend that matches your document content. 
For instance, you can use `HTMLDocumentBackend` for HTML pages: ```python @@ -214,9 +215,9 @@ print(dl_doc.export_to_markdown()) ## Chunking -You can chunk a Docling document using a [chunker](concepts/chunking.md), such as a +You can chunk a Docling document using a [chunker](../concepts/chunking.md), such as a `HybridChunker`, as shown below (for more details check out -[this example](examples/hybrid_chunking.ipynb)): +[this example](../examples/hybrid_chunking.ipynb)): ```python from docling.document_converter import DocumentConverter diff --git a/docs/supported_formats.md b/docs/usage/supported_formats.md similarity index 87% rename from docs/supported_formats.md rename to docs/usage/supported_formats.md index 0892ae1a..4d1ca4f9 100644 --- a/docs/supported_formats.md +++ b/docs/usage/supported_formats.md @@ -1,6 +1,6 @@ Docling can parse various documents formats into a unified representation (Docling Document), which it can export to different formats too — check out -[Architecture](./concepts/architecture.md) for more details. +[Architecture](../concepts/architecture.md) for more details. Below you can find a listing of all supported input and output formats. 
@@ -22,7 +22,7 @@ Schema-specific support: |--------|-------------| | USPTO XML | XML format followed by [USPTO](https://www.uspto.gov/patents) patents | | JATS XML | XML format followed by [JATS](https://jats.nlm.nih.gov/) articles | -| Docling JSON | JSON-serialized [Docling Document](./concepts/docling_document.md) | +| Docling JSON | JSON-serialized [Docling Document](../concepts/docling_document.md) | ## Supported output formats diff --git a/mkdocs.yml b/mkdocs.yml index 4813762c..b0c01c5e 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -54,12 +54,14 @@ theme: nav: - Home: - "Docling": index.md - - Installation: installation.md - - Usage: usage.md - - Supported formats: supported_formats.md - - Enrichment features: enrichments.md - - FAQ: faq.md - - Docling v2: v2.md + - Installation: + - Installation: installation/index.md + - Usage: + - Usage: usage/index.md + - Supported formats: usage/supported_formats.md + - Enrichment features: usage/enrichments.md + - FAQ: + - FAQ: faq/index.md - Concepts: - Concepts: concepts/index.md - Architecture: concepts/architecture.md @@ -73,11 +75,8 @@ nav: - "Batch conversion": examples/batch_convert.py - "Multi-format conversion": examples/run_with_formats.py - "Figure export": examples/export_figures.py - - "Figure enrichment": examples/develop_picture_enrichment.py - "Table export": examples/export_tables.py - "Multimodal export": examples/export_multimodal.py - - "Annotate picture with local vlm": examples/pictures_description.ipynb - - "Annotate picture with remote vlm": examples/pictures_description_api.py - "Force full page OCR": examples/full_page_ocr.py - "Automatic OCR language detection with tesseract": examples/tesseract_lang_detection.py - "RapidOCR with custom OCR models": examples/rapidocr_with_custom_models.py @@ -91,6 +90,12 @@ nav: - examples/rag_haystack.ipynb - examples/rag_langchain.ipynb - examples/rag_llamaindex.ipynb + - 🖼️ Picture annotation: + - "Annotate picture with local VLM": 
examples/pictures_description.ipynb + - "Annotate picture with remote VLM": examples/pictures_description_api.py + - ✨ Enrichment development: + - "Figure enrichment": examples/develop_picture_enrichment.py + - "Formula enrichment": examples/develop_formula_understanding.py - 🗂️ More examples: - examples/rag_weaviate.ipynb - RAG with Granite [↗]: https://github.com/ibm-granite-community/granite-snack-cookbook/blob/main/recipes/RAG/Granite_Docling_RAG.ipynb