Merge branch 'main' into qdrant

This commit is contained in:
Anush 2024-11-12 18:24:31 +05:30 committed by GitHub
commit 6b47080083
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
15 changed files with 137 additions and 67 deletions

View File

@ -1,3 +1,19 @@
## [v2.5.0](https://github.com/DS4SD/docling/releases/tag/v2.5.0) - 2024-11-12
### Feature
* **OCR:** Introduce the OcrOptions.force_full_page_ocr parameter, which forces full-page OCR scanning ([#290](https://github.com/DS4SD/docling/issues/290)) ([`c6b3763`](https://github.com/DS4SD/docling/commit/c6b3763ecb6ef862840a30978ee177b907f86505))
### Fix
* Configure env prefix for docling settings ([#315](https://github.com/DS4SD/docling/issues/315)) ([`5d4a10b`](https://github.com/DS4SD/docling/commit/5d4a10b121317fa481208dacbee47032b08ff928))
* Added handling of grouped elements in pptx backend ([#307](https://github.com/DS4SD/docling/issues/307)) ([`81c8243`](https://github.com/DS4SD/docling/commit/81c8243a8bf177feed8f87ea283b5bb6836350cb))
* Allow mps usage for easyocr ([#286](https://github.com/DS4SD/docling/issues/286)) ([`97f214e`](https://github.com/DS4SD/docling/commit/97f214efddcf66f0734a95c17c08936f6111d113))
### Documentation
* Add navigation indices ([#305](https://github.com/DS4SD/docling/issues/305)) ([`1239ade`](https://github.com/DS4SD/docling/commit/1239ade2750349d13d4e865d88449b232bbad944))
## [v2.4.2](https://github.com/DS4SD/docling/releases/tag/v2.4.2) - 2024-11-08 ## [v2.4.2](https://github.com/DS4SD/docling/releases/tag/v2.4.2) - 2024-11-08
### Fix ### Fix

View File

@ -153,6 +153,13 @@ def convert(
..., help="If enabled, the bitmap content will be processed using OCR." ..., help="If enabled, the bitmap content will be processed using OCR."
), ),
] = True, ] = True,
force_ocr: Annotated[
bool,
typer.Option(
...,
help="Replace any existing text with OCR generated text over the full content.",
),
] = False,
ocr_engine: Annotated[ ocr_engine: Annotated[
OcrEngine, typer.Option(..., help="The OCR engine to use.") OcrEngine, typer.Option(..., help="The OCR engine to use.")
] = OcrEngine.EASYOCR, ] = OcrEngine.EASYOCR,
@ -219,11 +226,11 @@ def convert(
match ocr_engine: match ocr_engine:
case OcrEngine.EASYOCR: case OcrEngine.EASYOCR:
ocr_options: OcrOptions = EasyOcrOptions() ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
case OcrEngine.TESSERACT_CLI: case OcrEngine.TESSERACT_CLI:
ocr_options = TesseractCliOcrOptions() ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
case OcrEngine.TESSERACT: case OcrEngine.TESSERACT:
ocr_options = TesseractOcrOptions() ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
case _: case _:
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}") raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")

View File

@ -22,6 +22,7 @@ class TableStructureOptions(BaseModel):
class OcrOptions(BaseModel): class OcrOptions(BaseModel):
kind: str kind: str
force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
bitmap_area_threshold: float = ( bitmap_area_threshold: float = (
0.05 # percentage of the area for a bitmap to processed with OCR 0.05 # percentage of the area for a bitmap to processed with OCR
) )

View File

@ -2,7 +2,7 @@ import sys
from pathlib import Path from pathlib import Path
from pydantic import BaseModel from pydantic import BaseModel
from pydantic_settings import BaseSettings from pydantic_settings import BaseSettings, SettingsConfigDict
class DocumentLimits(BaseModel): class DocumentLimits(BaseModel):
@ -40,6 +40,8 @@ class DebugSettings(BaseModel):
class AppSettings(BaseSettings): class AppSettings(BaseSettings):
model_config = SettingsConfigDict(env_prefix="DOCLING_", env_nested_delimiter="_")
perf: BatchConcurrencySettings perf: BatchConcurrencySettings
debug: DebugSettings debug: DebugSettings

View File

@ -10,7 +10,7 @@ from PIL import Image, ImageDraw
from rtree import index from rtree import index
from scipy.ndimage import find_objects, label from scipy.ndimage import find_objects, label
from docling.datamodel.base_models import OcrCell, Page from docling.datamodel.base_models import Cell, OcrCell, Page
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import OcrOptions from docling.datamodel.pipeline_options import OcrOptions
from docling.datamodel.settings import settings from docling.datamodel.settings import settings
@ -73,7 +73,9 @@ class BaseOcrModel(BasePageModel):
coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects) coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
# return full-page rectangle if sufficiently covered with bitmaps # return full-page rectangle if sufficiently covered with bitmaps
if coverage > max(BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold): if self.options.force_full_page_ocr or coverage > max(
BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold
):
return [ return [
BoundingBox( BoundingBox(
l=0, l=0,
@ -96,7 +98,7 @@ class BaseOcrModel(BasePageModel):
return ocr_rects return ocr_rects
# Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell. # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
def filter_ocr_cells(self, ocr_cells, programmatic_cells): def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
# Create R-tree index for programmatic cells # Create R-tree index for programmatic cells
p = index.Property() p = index.Property()
p.dimension = 2 p.dimension = 2
@ -117,6 +119,23 @@ class BaseOcrModel(BasePageModel):
] ]
return filtered_ocr_cells return filtered_ocr_cells
def post_process_cells(self, ocr_cells, programmatic_cells):
    r"""
    Combine the OCR cells and the programmatic cells into the final list of cells.

    When ``self.options.force_full_page_ocr`` is set, the programmatic cells
    are discarded and a new list is returned that holds one ``Cell`` per OCR
    cell (copying its ``id``, ``text`` and ``bbox``).

    Otherwise the OCR cells are passed through ``self._filter_ocr_cells``
    together with the programmatic cells, the surviving OCR cells are appended
    to ``programmatic_cells`` in place, and that same list object is returned.
    """
    if not self.options.force_full_page_ocr:
        # Keep the programmatic cells and add only the OCR cells that
        # survive the overlap filter.
        surviving_ocr_cells = self._filter_ocr_cells(ocr_cells, programmatic_cells)
        programmatic_cells.extend(surviving_ocr_cells)
        return programmatic_cells

    # A full page OCR is forced: the result is built from the OCR cells only.
    return [
        Cell(id=ocr_cell.id, text=ocr_cell.text, bbox=ocr_cell.bbox)
        for ocr_cell in ocr_cells
    ]
def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False): def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
image = copy.deepcopy(page.image) image = copy.deepcopy(page.image)
draw = ImageDraw.Draw(image, "RGBA") draw = ImageDraw.Draw(image, "RGBA")

View File

@ -5,7 +5,7 @@ import numpy
import torch import torch
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import OcrCell, Page from docling.datamodel.base_models import Cell, OcrCell, Page
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import EasyOcrOptions from docling.datamodel.pipeline_options import EasyOcrOptions
from docling.datamodel.settings import settings from docling.datamodel.settings import settings
@ -88,12 +88,8 @@ class EasyOcrModel(BaseOcrModel):
] ]
all_ocr_cells.extend(cells) all_ocr_cells.extend(cells)
## Remove OCR cells which overlap with programmatic cells. # Post-process the cells
filtered_ocr_cells = self.filter_ocr_cells( page.cells = self.post_process_cells(all_ocr_cells, page.cells)
all_ocr_cells, page.cells
)
page.cells.extend(filtered_ocr_cells)
# DEBUG code: # DEBUG code:
if settings.debug.visualize_ocr: if settings.debug.visualize_ocr:

View File

@ -7,7 +7,7 @@ from typing import Iterable, Optional, Tuple
import pandas as pd import pandas as pd
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import OcrCell, Page from docling.datamodel.base_models import Cell, OcrCell, Page
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import TesseractCliOcrOptions from docling.datamodel.pipeline_options import TesseractCliOcrOptions
from docling.datamodel.settings import settings from docling.datamodel.settings import settings
@ -170,12 +170,8 @@ class TesseractOcrCliModel(BaseOcrModel):
) )
all_ocr_cells.append(cell) all_ocr_cells.append(cell)
## Remove OCR cells which overlap with programmatic cells. # Post-process the cells
filtered_ocr_cells = self.filter_ocr_cells( page.cells = self.post_process_cells(all_ocr_cells, page.cells)
all_ocr_cells, page.cells
)
page.cells.extend(filtered_ocr_cells)
# DEBUG code: # DEBUG code:
if settings.debug.visualize_ocr: if settings.debug.visualize_ocr:

View File

@ -3,7 +3,7 @@ from typing import Iterable
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import OcrCell, Page from docling.datamodel.base_models import Cell, OcrCell, Page
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import TesseractOcrOptions from docling.datamodel.pipeline_options import TesseractOcrOptions
from docling.datamodel.settings import settings from docling.datamodel.settings import settings
@ -140,12 +140,8 @@ class TesseractOcrModel(BaseOcrModel):
# del high_res_image # del high_res_image
all_ocr_cells.extend(cells) all_ocr_cells.extend(cells)
## Remove OCR cells which overlap with programmatic cells. # Post-process the cells
filtered_ocr_cells = self.filter_ocr_cells( page.cells = self.post_process_cells(all_ocr_cells, page.cells)
all_ocr_cells, page.cells
)
page.cells.extend(filtered_ocr_cells)
# DEBUG code: # DEBUG code:
if settings.debug.visualize_ocr: if settings.debug.visualize_ocr:

View File

@ -0,0 +1,42 @@
from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
EasyOcrOptions,
PdfPipelineOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
def main():
    """Convert a sample PDF with full-page OCR forced on and print it as Markdown."""
    source_pdf = Path("./tests/data/2206.01062.pdf")

    pdf_options = PdfPipelineOptions()
    pdf_options.do_ocr = True
    pdf_options.do_table_structure = True
    pdf_options.table_structure_options.do_cell_matching = True

    # Any of the OCR options can be used: EasyOcrOptions, TesseractOcrOptions,
    # TesseractCliOcrOptions. Swap the active line for one of the alternatives:
    # forced_ocr = EasyOcrOptions(force_full_page_ocr=True)
    # forced_ocr = TesseractOcrOptions(force_full_page_ocr=True)
    forced_ocr = TesseractCliOcrOptions(force_full_page_ocr=True)
    pdf_options.ocr_options = forced_ocr

    pdf_format = PdfFormatOption(pipeline_options=pdf_options)
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})

    # Run the conversion and dump the Markdown export to stdout.
    markdown = converter.convert(source_pdf).document.export_to_markdown()
    print(markdown)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,13 @@
## Get started
Docling is used by the [Data Prep Kit \[↗\]](https://ibm.github.io/data-prep-kit/) open-source toolkit for preparing unstructured data for LLM application development ranging from laptop scale to datacenter scale.
Below you will find the Data Prep Kit modules powered by Docling.
## PDF ingestion to Parquet
- 💻 [GitHub \[↗\]](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/pdf2parquet)
- 📖 [API docs \[↗\]](https://ibm.github.io/data-prep-kit/transforms/language/pdf2parquet/python/)
## Document chunking
- 💻 [GitHub \[↗\]](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/doc_chunk)
- 📖 [API docs \[↗\]](https://ibm.github.io/data-prep-kit/transforms/language/doc_chunk/python/)

View File

@ -1,6 +1,6 @@
## Get started ## Get started
Docling is available as an official LlamaIndex extension! Docling is available as an official [LlamaIndex \[↗\]](https://docs.llamaindex.ai/) extension.
To get started, check out the [step-by-step guide in LlamaIndex \[↗\]](https://docs.llamaindex.ai/en/stable/examples/data_connectors/DoclingReaderDemo/)<!--{target="_blank"}-->. To get started, check out the [step-by-step guide in LlamaIndex \[↗\]](https://docs.llamaindex.ai/en/stable/examples/data_connectors/DoclingReaderDemo/)<!--{target="_blank"}-->.

View File

@ -71,6 +71,7 @@ nav:
- "Figure enrichment": examples/develop_picture_enrichment.py - "Figure enrichment": examples/develop_picture_enrichment.py
- "Table export": examples/export_tables.py - "Table export": examples/export_tables.py
- "Multimodal export": examples/export_multimodal.py - "Multimodal export": examples/export_multimodal.py
- "Force full page OCR": examples/full_page_ocr.py
- RAG / QA: - RAG / QA:
- "RAG with LlamaIndex 🦙": examples/rag_llamaindex.ipynb - "RAG with LlamaIndex 🦙": examples/rag_llamaindex.ipynb
- "RAG with LangChain 🦜🔗": examples/rag_langchain.ipynb - "RAG with LangChain 🦜🔗": examples/rag_langchain.ipynb
@ -81,8 +82,9 @@ nav:
# - CLI: examples/cli.md # - CLI: examples/cli.md
- Integrations: - Integrations:
- Integrations: integrations/index.md - Integrations: integrations/index.md
- "LlamaIndex 🦙 extension": integrations/llamaindex.md - "Data Prep Kit": integrations/data_prep_kit.md
# - "LangChain 🦜🔗 extension": integrations/langchain.md - "LlamaIndex 🦙": integrations/llamaindex.md
# - "LangChain 🦜🔗": integrations/langchain.md
# - API reference: # - API reference:
# - API reference: api_reference/index.md # - API reference: api_reference/index.md

View File

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "docling" name = "docling"
version = "2.4.2" # DO NOT EDIT, updated automatically version = "2.5.0" # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications." description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"] authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
license = "MIT" license = "MIT"

View File

@ -15,34 +15,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2 from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
GENERATE = False GENERATE_V1 = False
GENERATE_V2 = False
# Debug
def save_output(pdf_path: Path, doc_result: ConversionResult, engine: str):
    r"""
    Debug helper: dump a conversion result to files next to the input PDF.

    Four files are written into ``pdf_path.parent``. Each is named
    ``<pdf stem>[.<engine>]`` followed by one of these suffixes:

    * ``.json``        - ``doc_result.legacy_document.export_to_dict()`` as JSON
    * ``.pages.json``  - ``model_dump()`` of every page in ``doc_result.pages``
    * ``.doctags.txt`` - ``doc_result.legacy_document.export_to_doctags()``
    * ``.md``          - ``doc_result.legacy_document.export_to_markdown()``

    Args:
        pdf_path: Path of the converted PDF; fixes the output directory and stem.
        doc_result: Conversion result to serialise.
        engine: Label inserted into the file names; ``None`` omits it.
    """
    import json

    engine_part = "" if engine is None else f".{engine}"
    out_dir = pdf_path.parent
    # Build names by formatting rather than with_suffix(): the stem itself may
    # contain dots, which with_suffix() would treat as an extension.
    stem = f"{pdf_path.stem}{engine_part}"

    # Explicit UTF-8 so output with non-ASCII text does not depend on the
    # platform's locale encoding.
    with open(out_dir / f"{stem}.json", "w", encoding="utf-8") as fd:
        json.dump(doc_result.legacy_document.export_to_dict(), fd)

    pages = [p.model_dump() for p in doc_result.pages]
    with open(out_dir / f"{stem}.pages.json", "w", encoding="utf-8") as fd:
        json.dump(pages, fd)

    with open(out_dir / f"{stem}.doctags.txt", "w", encoding="utf-8") as fd:
        fd.write(doc_result.legacy_document.export_to_doctags())

    with open(out_dir / f"{stem}.md", "w", encoding="utf-8") as fd:
        fd.write(doc_result.legacy_document.export_to_markdown())
def get_pdf_paths(): def get_pdf_paths():
@ -74,13 +48,15 @@ def get_converter(ocr_options: OcrOptions):
def test_e2e_conversions(): def test_e2e_conversions():
pdf_paths = get_pdf_paths() pdf_paths = get_pdf_paths()
engines: List[OcrOptions] = [ engines: List[OcrOptions] = [
EasyOcrOptions(), EasyOcrOptions(),
TesseractOcrOptions(), TesseractOcrOptions(),
TesseractCliOcrOptions(), TesseractCliOcrOptions(),
EasyOcrOptions(force_full_page_ocr=True),
TesseractOcrOptions(force_full_page_ocr=True),
TesseractCliOcrOptions(force_full_page_ocr=True),
] ]
for ocr_options in engines: for ocr_options in engines:
@ -91,20 +67,16 @@ def test_e2e_conversions():
doc_result: ConversionResult = converter.convert(pdf_path) doc_result: ConversionResult = converter.convert(pdf_path)
# Save conversions
# save_output(pdf_path, doc_result, None)
# Debug
verify_conversion_result_v1( verify_conversion_result_v1(
input_path=pdf_path, input_path=pdf_path,
doc_result=doc_result, doc_result=doc_result,
generate=GENERATE, generate=GENERATE_V1,
fuzzy=True, fuzzy=True,
) )
verify_conversion_result_v2( verify_conversion_result_v2(
input_path=pdf_path, input_path=pdf_path,
doc_result=doc_result, doc_result=doc_result,
generate=GENERATE, generate=GENERATE_V2,
fuzzy=True, fuzzy=True,
) )

View File

@ -256,15 +256,19 @@ def verify_conversion_result_v1(
dt_path = gt_subpath.with_suffix(f"{engine_suffix}.doctags.txt") dt_path = gt_subpath.with_suffix(f"{engine_suffix}.doctags.txt")
if generate: # only used when re-generating truth if generate: # only used when re-generating truth
pages_path.parent.mkdir(parents=True, exist_ok=True)
with open(pages_path, "w") as fw: with open(pages_path, "w") as fw:
fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder)) fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))
json_path.parent.mkdir(parents=True, exist_ok=True)
with open(json_path, "w") as fw: with open(json_path, "w") as fw:
fw.write(json.dumps(doc_pred, default=pydantic_encoder)) fw.write(json.dumps(doc_pred, default=pydantic_encoder))
md_path.parent.mkdir(parents=True, exist_ok=True)
with open(md_path, "w") as fw: with open(md_path, "w") as fw:
fw.write(doc_pred_md) fw.write(doc_pred_md)
dt_path.parent.mkdir(parents=True, exist_ok=True)
with open(dt_path, "w") as fw: with open(dt_path, "w") as fw:
fw.write(doc_pred_dt) fw.write(doc_pred_dt)
else: # default branch in test else: # default branch in test
@ -328,15 +332,19 @@ def verify_conversion_result_v2(
dt_path = gt_subpath.with_suffix(f"{engine_suffix}.doctags.txt") dt_path = gt_subpath.with_suffix(f"{engine_suffix}.doctags.txt")
if generate: # only used when re-generating truth if generate: # only used when re-generating truth
pages_path.parent.mkdir(parents=True, exist_ok=True)
with open(pages_path, "w") as fw: with open(pages_path, "w") as fw:
fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder)) fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))
json_path.parent.mkdir(parents=True, exist_ok=True)
with open(json_path, "w") as fw: with open(json_path, "w") as fw:
fw.write(json.dumps(doc_pred, default=pydantic_encoder)) fw.write(json.dumps(doc_pred, default=pydantic_encoder))
md_path.parent.mkdir(parents=True, exist_ok=True)
with open(md_path, "w") as fw: with open(md_path, "w") as fw:
fw.write(doc_pred_md) fw.write(doc_pred_md)
dt_path.parent.mkdir(parents=True, exist_ok=True)
with open(dt_path, "w") as fw: with open(dt_path, "w") as fw:
fw.write(doc_pred_dt) fw.write(doc_pred_dt)
else: # default branch in test else: # default branch in test