mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 07:22:14 +00:00
Merge branch 'main' of github.com:DS4SD/docling into cau/handle-furniture
This commit is contained in:
commit
e730e59d1d
2
.github/workflows/checks.yml
vendored
2
.github/workflows/checks.yml
vendored
@ -28,7 +28,7 @@ jobs:
|
||||
run: |
|
||||
for file in docs/examples/*.py; do
|
||||
# Skip the example scripts that are excluded from this check
|
||||
if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment).py ]]; then
|
||||
if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models).py ]]; then
|
||||
echo "Skipping $file"
|
||||
continue
|
||||
fi
|
||||
|
6
.github/workflows/docs.yml
vendored
6
.github/workflows/docs.yml
vendored
@ -14,10 +14,6 @@ jobs:
|
||||
- uses: ./.github/actions/setup-poetry
|
||||
- name: Build docs
|
||||
run: poetry run mkdocs build --verbose --clean
|
||||
- name: Make docs LLM ready
|
||||
if: inputs.deploy
|
||||
uses: demodrive-ai/llms-txt-action@ad720693843126e6a73910a667d0eba37c1dea4b
|
||||
- name: Build and push docs
|
||||
if: inputs.deploy
|
||||
run: poetry run mkdocs gh-deploy --force --dirty
|
||||
|
||||
run: poetry run mkdocs gh-deploy --force
|
||||
|
26
CHANGELOG.md
26
CHANGELOG.md
@ -1,3 +1,29 @@
|
||||
## [v2.18.0](https://github.com/DS4SD/docling/releases/tag/v2.18.0) - 2025-02-03
|
||||
|
||||
### Feature
|
||||
|
||||
* Expose equation exports ([#869](https://github.com/DS4SD/docling/issues/869)) ([`6a76b49`](https://github.com/DS4SD/docling/commit/6a76b49a4756fd00503d0baec5db8d23be8207e8))
|
||||
* Add option to define page range ([#852](https://github.com/DS4SD/docling/issues/852)) ([`70d68b6`](https://github.com/DS4SD/docling/commit/70d68b6164c6c7029b39dd65c5a278278768c381))
|
||||
* **docx:** Support of SDTs in docx backend ([#853](https://github.com/DS4SD/docling/issues/853)) ([`d727b04`](https://github.com/DS4SD/docling/commit/d727b04ad080df0b3811902059e0fe0539f7037e))
|
||||
* Python 3.13 support ([#841](https://github.com/DS4SD/docling/issues/841)) ([`4df085a`](https://github.com/DS4SD/docling/commit/4df085aa6c6f5cc043f4f7a9f0c1b4af43f95e8f))
|
||||
|
||||
### Fix
|
||||
|
||||
* **markdown:** Fix parsing if doc ending with table ([#873](https://github.com/DS4SD/docling/issues/873)) ([`5ac2887`](https://github.com/DS4SD/docling/commit/5ac2887e4ad52ed6e7147e3af1e3ee5eb0006a70))
|
||||
* **markdown:** Add support for HTML content ([#855](https://github.com/DS4SD/docling/issues/855)) ([`94751a7`](https://github.com/DS4SD/docling/commit/94751a78f4f61b78f64952190717440ec6d84c62))
|
||||
* **docx:** Merged table cells not properly converted ([#857](https://github.com/DS4SD/docling/issues/857)) ([`0cd81a8`](https://github.com/DS4SD/docling/commit/0cd81a81226c0d4aa4f20e4e58c3b33e4fe50ce0))
|
||||
* Processing of placeholder shapes in pptx that have text but no bbox ([#868](https://github.com/DS4SD/docling/issues/868)) ([`eff16b6`](https://github.com/DS4SD/docling/commit/eff16b62ccdb0eb764eeacee550563898784dd6a))
|
||||
* KeyError in tableformer prediction ([#854](https://github.com/DS4SD/docling/issues/854)) ([`b1cf796`](https://github.com/DS4SD/docling/commit/b1cf796730901222ad0882ff44efa0ef43a743ee))
|
||||
* Fixed docx import with headers that are also lists ([#842](https://github.com/DS4SD/docling/issues/842)) ([`2c037ae`](https://github.com/DS4SD/docling/commit/2c037ae62e123967eddf065ccb2abbaf78cdcab3))
|
||||
* Use new add_code in html backend and add more typing hints ([#850](https://github.com/DS4SD/docling/issues/850)) ([`2a1f8af`](https://github.com/DS4SD/docling/commit/2a1f8afe7e8d9d508aebcfd3998ee1625c938933))
|
||||
* **markdown:** Fix empty block handling ([#843](https://github.com/DS4SD/docling/issues/843)) ([`bccb022`](https://github.com/DS4SD/docling/commit/bccb022fc82d4d0ef2ed2d8bea5f5d8e6400c1d9))
|
||||
* Fix for the crash when encountering WMF images in pptx and docx ([#837](https://github.com/DS4SD/docling/issues/837)) ([`fea0a99`](https://github.com/DS4SD/docling/commit/fea0a99a95d97e72687f48f8174d31102655483e))
|
||||
|
||||
### Documentation
|
||||
|
||||
* Updated the readme with upcoming features ([#831](https://github.com/DS4SD/docling/issues/831)) ([`d7c0828`](https://github.com/DS4SD/docling/commit/d7c082894e3ef85881665d20167198adcbc1becd))
|
||||
* Add example for inspection of picture content ([#624](https://github.com/DS4SD/docling/issues/624)) ([`f9144f2`](https://github.com/DS4SD/docling/commit/f9144f2bb6b322244c9d37683dca1e537ec6d781))
|
||||
|
||||
## [v2.17.0](https://github.com/DS4SD/docling/releases/tag/v2.17.0) - 2025-01-28
|
||||
|
||||
### Feature
|
||||
|
@ -368,6 +368,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
# Start iterating from the root of the AST
|
||||
self.iterate_elements(parsed_ast, 0, doc, None)
|
||||
self.process_inline_text(None, doc) # handle last hanging inline text
|
||||
self.close_table(doc=doc) # handle any last hanging table
|
||||
|
||||
# if HTML blocks were detected, export to HTML and delegate to HTML backend
|
||||
if self._html_blocks > 0:
|
||||
|
@ -219,6 +219,13 @@ def convert(
|
||||
bool,
|
||||
typer.Option(..., help="Enable the formula enrichment model in the pipeline."),
|
||||
] = False,
|
||||
enrich_picture_classes: Annotated[
|
||||
bool,
|
||||
typer.Option(
|
||||
...,
|
||||
help="Enable the picture classification enrichment model in the pipeline.",
|
||||
),
|
||||
] = False,
|
||||
artifacts_path: Annotated[
|
||||
Optional[Path],
|
||||
typer.Option(..., help="If provided, the location of the model artifacts."),
|
||||
@ -375,6 +382,7 @@ def convert(
|
||||
do_table_structure=True,
|
||||
do_code_enrichment=enrich_code,
|
||||
do_formula_enrichment=enrich_formula,
|
||||
do_picture_classification=enrich_picture_classes,
|
||||
document_timeout=document_timeout,
|
||||
)
|
||||
pipeline_options.table_structure_options.do_cell_matching = (
|
||||
|
@ -6,6 +6,7 @@ from typing_extensions import TypeVar
|
||||
|
||||
from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.settings import settings
|
||||
|
||||
|
||||
class BasePageModel(ABC):
|
||||
@ -21,6 +22,8 @@ EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)
|
||||
|
||||
class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):
|
||||
|
||||
elements_batch_size: int = settings.perf.elements_batch_size
|
||||
|
||||
@abstractmethod
|
||||
def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
|
||||
pass
|
||||
|
@ -61,6 +61,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
|
||||
Processes the given batch of elements and enriches them with predictions.
|
||||
"""
|
||||
|
||||
elements_batch_size = 5
|
||||
images_scale = 1.66 # = 120 dpi, aligned with training data resolution
|
||||
expansion_factor = 0.03
|
||||
|
||||
|
@ -79,7 +79,7 @@ class BasePipeline(ABC):
|
||||
for model in self.enrichment_pipe:
|
||||
for element_batch in chunkify(
|
||||
_prepare_elements(conv_res, model),
|
||||
settings.perf.elements_batch_size,
|
||||
model.elements_batch_size,
|
||||
):
|
||||
for element in model(
|
||||
doc=conv_res.document, element_batch=element_batch
|
||||
|
58
docs/examples/rapidocr_with_custom_models.py
Normal file
58
docs/examples/rapidocr_with_custom_models.py
Normal file
@ -0,0 +1,58 @@
|
||||
import os
|
||||
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions, RapidOcrOptions
|
||||
from docling.document_converter import (
|
||||
ConversionResult,
|
||||
DocumentConverter,
|
||||
InputFormat,
|
||||
PdfFormatOption,
|
||||
)
|
||||
|
||||
|
||||
def main():
    """Convert a PDF with RapidOCR using model files fetched from HuggingFace.

    Downloads a snapshot of the ``SWHL/RapidOCR`` repository, points
    ``RapidOcrOptions`` at the detection, recognition and classification
    ONNX files inside that snapshot, converts the source document with a
    ``DocumentConverter`` configured for PDF input, and prints the result
    as Markdown.
    """
    # Document to be converted
    source = "https://arxiv.org/pdf/2408.09869v4"

    # Fetch the RapidOCR models from HuggingFace
    print("Downloading RapidOCR models")
    models_root = snapshot_download(repo_id="SWHL/RapidOCR")

    # Locate the individual model files inside the downloaded snapshot
    # (RapidOcrOptions set up for English detection).
    detector_file = os.path.join(
        models_root, "PP-OCRv4", "en_PP-OCRv3_det_infer.onnx"
    )
    recognizer_file = os.path.join(
        models_root, "PP-OCRv4", "ch_PP-OCRv4_rec_server_infer.onnx"
    )
    classifier_file = os.path.join(
        models_root, "PP-OCRv3", "ch_ppocr_mobile_v2.0_cls_train.onnx"
    )

    rapid_ocr = RapidOcrOptions(
        det_model_path=detector_file,
        rec_model_path=recognizer_file,
        cls_model_path=classifier_file,
    )
    pdf_pipeline = PdfPipelineOptions(ocr_options=rapid_ocr)

    # Build a converter whose PDF handling uses the custom OCR models
    pdf_format = PdfFormatOption(pipeline_options=pdf_pipeline)
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})

    # Run the conversion and print the Markdown export
    outcome: ConversionResult = converter.convert(source=source)
    print(outcome.document.export_to_markdown())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
@ -77,6 +77,7 @@ nav:
|
||||
- "Multimodal export": examples/export_multimodal.py
|
||||
- "Force full page OCR": examples/full_page_ocr.py
|
||||
- "Automatic OCR language detection with tesseract": examples/tesseract_lang_detection.py
|
||||
- "RapidOCR with custom OCR models": examples/rapidocr_with_custom_models.py
|
||||
- "Accelerator options": examples/run_with_accelerator.py
|
||||
- "Simple translation": examples/translate.py
|
||||
- examples/backend_xml_rag.ipynb
|
||||
|
@ -1,6 +1,6 @@
|
||||
[tool.poetry]
|
||||
name = "docling"
|
||||
version = "2.17.0" # DO NOT EDIT, updated automatically
|
||||
version = "2.18.0" # DO NOT EDIT, updated automatically
|
||||
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
||||
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
||||
license = "MIT"
|
||||
|
@ -0,0 +1,6 @@
|
||||
| Character | Name in German | Name in French | Name in Italian |
|
||||
|----------------|------------------|------------------|-------------------|
|
||||
| Scrooge McDuck | Dagobert Duck | Balthazar Picsou | Paperone |
|
||||
| Huey | Tick | Riri | Qui |
|
||||
| Dewey | Trick | Fifi | Quo |
|
||||
| Louie | Track | Loulou | Qua |
|
6
tests/data/md/ending_with_table.md
Normal file
6
tests/data/md/ending_with_table.md
Normal file
@ -0,0 +1,6 @@
|
||||
| Character | Name in German | Name in French | Name in Italian |
|
||||
|---|---|---|---|
|
||||
| Scrooge McDuck | Dagobert Duck | Balthazar Picsou | Paperone |
|
||||
| Huey | Tick | Riri | Qui |
|
||||
| Dewey | Trick | Fifi | Quo |
|
||||
| Louie | Track | Loulou | Qua |
|
Loading…
Reference in New Issue
Block a user