mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 07:22:14 +00:00
Merge remote-tracking branch 'origin/main' into mao1/code_equation_model
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
commit
ecc715d506
5
.github/workflows/docs.yml
vendored
5
.github/workflows/docs.yml
vendored
@ -14,7 +14,10 @@ jobs:
|
||||
- uses: ./.github/actions/setup-poetry
|
||||
- name: Build docs
|
||||
run: poetry run mkdocs build --verbose --clean
|
||||
- name: Make docs LLM ready
|
||||
if: inputs.deploy
|
||||
uses: demodrive-ai/llms-txt-action@ad720693843126e6a73910a667d0eba37c1dea4b
|
||||
- name: Build and push docs
|
||||
if: inputs.deploy
|
||||
run: poetry run mkdocs gh-deploy --force
|
||||
run: poetry run mkdocs gh-deploy --force --dirty
|
||||
|
@ -54,12 +54,12 @@ tokens), &
|
||||
chunks with same headings & captions) — users can opt out of this step via param
|
||||
`merge_peers` (by default `True`)
|
||||
|
||||
👉 Example: see [here](../../examples/hybrid_chunking).
|
||||
👉 Example: see [here](../examples/hybrid_chunking.ipynb).
|
||||
|
||||
## Hierarchical Chunker
|
||||
|
||||
The `HierarchicalChunker` implementation uses the document structure information from
|
||||
the [`DoclingDocument`](../docling_document) to create one chunk for each individual
|
||||
the [`DoclingDocument`](./docling_document.md) to create one chunk for each individual
|
||||
detected document element, by default only merging together list items (can be opted out
|
||||
via param `merge_list_items`). It also takes care of attaching all relevant document
|
||||
metadata, including headers and captions.
|
||||
|
@ -5,7 +5,11 @@ from pathlib import Path
|
||||
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.datamodel.pipeline_options import (
|
||||
AcceleratorDevice,
|
||||
AcceleratorOptions,
|
||||
PdfPipelineOptions,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.models.ocr_mac_model import OcrMacOptions
|
||||
from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
|
||||
@ -76,7 +80,7 @@ def main():
|
||||
pipeline_options.table_structure_options.do_cell_matching = True
|
||||
pipeline_options.ocr_options.lang = ["es"]
|
||||
pipeline_options.accelerator_options = AcceleratorOptions(
|
||||
num_threads=4, device=Device.AUTO
|
||||
num_threads=4, device=AcceleratorDevice.AUTO
|
||||
)
|
||||
|
||||
doc_converter = DocumentConverter(
|
||||
|
@ -95,8 +95,8 @@ doc_converter = (
|
||||
|
||||
More options are shown in the following example units:
|
||||
|
||||
- [run_with_formats.py](../examples/run_with_formats/)
|
||||
- [custom_convert.py](../examples/custom_convert/)
|
||||
- [run_with_formats.py](examples/run_with_formats.py)
|
||||
- [custom_convert.py](examples/custom_convert.py)
|
||||
|
||||
### Converting documents
|
||||
|
||||
@ -226,4 +226,4 @@ leverages the new `DoclingDocument` and provides a new, richer chunk output form
|
||||
- any applicable headings for context
|
||||
- any applicable captions for context
|
||||
|
||||
For an example, check out [Chunking usage](../usage/#chunking).
|
||||
For an example, check out [Chunking usage](usage.md#chunking).
|
||||
|
882
poetry.lock
generated
882
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -29,7 +29,7 @@ pydantic = "^2.0.0"
|
||||
docling-core = { version = "^2.14.0", extras = ["chunking"] }
|
||||
docling-ibm-models = "^3.1.0"
|
||||
deepsearch-glm = "^1.0.0"
|
||||
docling-parse = "^3.0.0"
|
||||
docling-parse = "^3.1.0"
|
||||
filetype = "^1.2.0"
|
||||
pypdfium2 = "^4.30.0"
|
||||
pydantic-settings = "^2.3.0"
|
||||
|
Loading…
Reference in New Issue
Block a user