Enable mypy and fix many reported errors

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-10-15 14:58:00 +02:00
parent d687f93d52
commit 27f4ed3620
26 changed files with 205 additions and 920 deletions

View File

@ -20,12 +20,12 @@ repos:
# pass_filenames: false # pass_filenames: false
# language: system # language: system
# files: '\.py$' # files: '\.py$'
# - id: mypy - id: mypy
# name: MyPy name: MyPy
# entry: poetry run mypy docling entry: poetry run mypy docling
# pass_filenames: false pass_filenames: false
# language: system language: system
# files: '\.py$' files: '\.py$'
- id: nbqa_black - id: nbqa_black
name: nbQA Black name: nbQA Black
entry: poetry run nbqa black examples entry: poetry run nbqa black examples

View File

@ -2,7 +2,7 @@ import logging
import random import random
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Iterable, List, Optional, Union from typing import TYPE_CHECKING, Iterable, List, Optional, Union
import pypdfium2 as pdfium import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
@ -13,6 +13,9 @@ from pypdfium2 import PdfPage
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Cell, Size from docling.datamodel.base_models import Cell, Size
if TYPE_CHECKING:
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -30,10 +30,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
# Initialise the parents for the hierarchy # Initialise the parents for the hierarchy
self.max_levels = 10 self.max_levels = 10
self.level = 0 self.level = 0
self.parents = {} self.parents = {} # type: ignore
for i in range(0, self.max_levels): for i in range(0, self.max_levels):
self.parents[i] = None self.parents[i] = None
self.labels = {} self.labels = {} # type: ignore
try: try:
if isinstance(self.path_or_stream, BytesIO): if isinstance(self.path_or_stream, BytesIO):
@ -49,8 +49,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
) from e ) from e
def is_valid(self) -> bool: def is_valid(self) -> bool:
return True return self.soup is not None
@classmethod
def supports_pagination(cls) -> bool: def supports_pagination(cls) -> bool:
return False return False
@ -68,11 +69,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
# access self.path_or_stream to load stuff # access self.path_or_stream to load stuff
doc = DoclingDocument(description=DescriptionItem(), name="dummy") doc = DoclingDocument(description=DescriptionItem(), name="dummy")
_log.debug("Trying to convert HTML...") _log.debug("Trying to convert HTML...")
# Replace <br> tags with newline characters
for br in self.soup.body.find_all("br"):
br.replace_with("\n")
doc = self.walk(self.soup.body, doc)
if self.is_valid():
assert self.soup is not None
# Replace <br> tags with newline characters
for br in self.soup.body.find_all("br"):
br.replace_with("\n")
doc = self.walk(self.soup.body, doc)
else:
raise RuntimeError(
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
)
return doc return doc
def walk(self, element, doc): def walk(self, element, doc):

View File

@ -42,7 +42,11 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
self.pptx_obj = None self.pptx_obj = None
self.valid = False self.valid = False
try: try:
self.pptx_obj = Presentation(self.path_or_stream) if isinstance(self.path_or_stream, BytesIO):
self.pptx_obj = Presentation(self.path_or_stream)
elif isinstance(self.path_or_stream, Path):
self.pptx_obj = Presentation(str(self.path_or_stream))
self.valid = True self.valid = True
except Exception as e: except Exception as e:
raise RuntimeError( raise RuntimeError(
@ -53,6 +57,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
def page_count(self) -> int: def page_count(self) -> int:
if self.is_valid(): if self.is_valid():
assert self.pptx_obj is not None
return len(self.pptx_obj.slides) return len(self.pptx_obj.slides)
else: else:
return 0 return 0
@ -60,6 +65,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
def is_valid(self) -> bool: def is_valid(self) -> bool:
return self.valid return self.valid
@classmethod
def supports_pagination(cls) -> bool: def supports_pagination(cls) -> bool:
return True # True? if so, how to handle pages... return True # True? if so, how to handle pages...
@ -311,10 +317,10 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
slide_width = pptx_obj.slide_width slide_width = pptx_obj.slide_width
slide_height = pptx_obj.slide_height slide_height = pptx_obj.slide_height
text_content = [] text_content = [] # type: ignore
max_levels = 10 max_levels = 10
parents = {} parents = {} # type: ignore
for i in range(0, max_levels): for i in range(0, max_levels):
parents[i] = None parents[i] = None

View File

@ -39,7 +39,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# Initialise the parents for the hierarchy # Initialise the parents for the hierarchy
self.max_levels = 10 self.max_levels = 10
self.level_at_new_list = None self.level_at_new_list = None
self.parents = {} self.parents = {} # type: ignore
for i in range(-1, self.max_levels): for i in range(-1, self.max_levels):
self.parents[i] = None self.parents[i] = None
@ -54,16 +54,21 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.docx_obj = None self.docx_obj = None
try: try:
self.docx_obj = docx.Document(self.path_or_stream) if isinstance(self.path_or_stream, BytesIO):
self.docx_obj = docx.Document(self.path_or_stream)
elif isinstance(self.path_or_stream, Path):
self.docx_obj = docx.Document(str(self.path_or_stream))
self.valid = True self.valid = True
except Exception as e: except Exception as e:
raise RuntimeError( raise RuntimeError(
f"MsPowerpointDocumentBackend could not load document with hash {document_hash}" f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
) from e ) from e
def is_valid(self) -> bool: def is_valid(self) -> bool:
return True return self.valid
@classmethod
def supports_pagination(cls) -> bool: def supports_pagination(cls) -> bool:
return False return False
@ -80,10 +85,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
def convert(self) -> DoclingDocument: def convert(self) -> DoclingDocument:
# Parses the DOCX into a structured document model. # Parses the DOCX into a structured document model.
doc = DoclingDocument(description=DescriptionItem(), name="dummy") doc = DoclingDocument(description=DescriptionItem(), name="dummy")
if self.is_valid():
# self.initialise() assert self.docx_obj is not None
doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc) doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
return doc return doc
else:
raise RuntimeError(
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
)
def update_history(self, name, level, numid, ilevel): def update_history(self, name, level, numid, ilevel):
self.history["names"].append(name) self.history["names"].append(name)
@ -307,7 +316,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
): ):
level = self.get_level() level = self.get_level()
if self.prev_numid() is None: # Open new list if self.prev_numid() is None: # Open new list
self.level_at_new_list = level self.level_at_new_list = level # type: ignore
self.parents[level] = doc.add_group( self.parents[level] = doc.add_group(
label=GroupLabel.LIST, name="list", parent=self.parents[level - 1] label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]

View File

@ -1,9 +1,9 @@
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from io import BytesIO from io import BytesIO
from pathlib import Path
from typing import Iterable, Optional, Set, Union from typing import Iterable, Optional, Set, Union
from docling_core.types.doc import BoundingBox, Size from docling_core.types.doc import BoundingBox, Size
from docling_core.types.legacy_doc.doc_ocr import Path
from PIL import Image from PIL import Image
from docling.backend.abstract_backend import PaginatedDocumentBackend from docling.backend.abstract_backend import PaginatedDocumentBackend

View File

@ -2,7 +2,7 @@ import logging
import random import random
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Iterable, List, Optional, Union from typing import TYPE_CHECKING, Iterable, List, Optional, Union
import pypdfium2 as pdfium import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c import pypdfium2.raw as pdfium_c
@ -14,6 +14,9 @@ from pypdfium2._helpers.misc import PdfiumError
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Cell from docling.datamodel.base_models import Cell
if TYPE_CHECKING:
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -21,6 +21,7 @@ from docling.datamodel.base_models import (
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import ( from docling.datamodel.pipeline_options import (
EasyOcrOptions, EasyOcrOptions,
OcrOptions,
PdfPipelineOptions, PdfPipelineOptions,
TesseractCliOcrOptions, TesseractCliOcrOptions,
TesseractOcrOptions, TesseractOcrOptions,
@ -179,7 +180,7 @@ def convert(
raise typer.Abort() raise typer.Abort()
elif source.is_dir(): elif source.is_dir():
for fmt in from_formats: for fmt in from_formats:
for ext in FormatToExtensions.get(fmt): for ext in FormatToExtensions[fmt]:
input_doc_paths.extend(list(source.glob(f"**/*.{ext}"))) input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}"))) input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
else: else:
@ -195,7 +196,7 @@ def convert(
match ocr_engine: match ocr_engine:
case OcrEngine.EASYOCR: case OcrEngine.EASYOCR:
ocr_options = EasyOcrOptions() ocr_options: OcrOptions = EasyOcrOptions()
case OcrEngine.TESSERACT_CLI: case OcrEngine.TESSERACT_CLI:
ocr_options = TesseractCliOcrOptions() ocr_options = TesseractCliOcrOptions()
case OcrEngine.TESSERACT: case OcrEngine.TESSERACT:

View File

@ -126,7 +126,8 @@ class TableStructurePrediction(BaseModel):
table_map: Dict[int, Table] = {} table_map: Dict[int, Table] = {}
class TextElement(BasePageElement): ... class TextElement(BasePageElement):
text: str
class FigureElement(BasePageElement): class FigureElement(BasePageElement):

View File

@ -3,7 +3,7 @@ import re
from enum import Enum from enum import Enum
from io import BytesIO from io import BytesIO
from pathlib import Path, PurePath from pathlib import Path, PurePath
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union
import filetype import filetype
from docling_core.types import BaseText from docling_core.types import BaseText
@ -29,7 +29,10 @@ from docling_core.utils.file import resolve_file_source
from pydantic import BaseModel from pydantic import BaseModel
from typing_extensions import deprecated from typing_extensions import deprecated
from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.abstract_backend import (
AbstractDocumentBackend,
PaginatedDocumentBackend,
)
from docling.datamodel.base_models import ( from docling.datamodel.base_models import (
AssembledUnit, AssembledUnit,
ConversionStatus, ConversionStatus,
@ -70,41 +73,34 @@ layout_label_to_ds_type = {
DocItemLabel.PARAGRAPH: "paragraph", DocItemLabel.PARAGRAPH: "paragraph",
} }
_EMPTY_LEGACY_DOC = DsDocument(
_name="",
description=DsDocumentDescription(logs=[]),
file_info=DsFileInfoObject(
filename="",
document_hash="",
),
)
_EMPTY_DOCLING_DOC = DoclingDocument( _EMPTY_DOCLING_DOC = DoclingDocument(
description=DescriptionItem(), name="dummy" description=DescriptionItem(), name="dummy"
) # TODO: Stub ) # TODO: Stub
class InputDocument(BaseModel): class InputDocument(BaseModel):
file: PurePath = None file: PurePath
document_hash: Optional[str] = None document_hash: str # = None
valid: bool = True valid: bool = True
limits: DocumentLimits = DocumentLimits() limits: DocumentLimits = DocumentLimits()
format: Optional[InputFormat] = None format: InputFormat # = None
filesize: Optional[int] = None filesize: Optional[int] = None
page_count: int = 0 page_count: int = 0
_backend: AbstractDocumentBackend = None # Internal PDF backend used _backend: AbstractDocumentBackend # Internal PDF backend used
def __init__( def __init__(
self, self,
path_or_stream: Union[BytesIO, Path], path_or_stream: Union[BytesIO, Path],
format: InputFormat, format: InputFormat,
backend: AbstractDocumentBackend, backend: Type[AbstractDocumentBackend],
filename: Optional[str] = None, filename: Optional[str] = None,
limits: Optional[DocumentLimits] = None, limits: Optional[DocumentLimits] = None,
): ):
super().__init__() super().__init__(
file="", document_hash="", format=InputFormat.PDF
) # initialize with dummy values
self.limits = limits or DocumentLimits() self.limits = limits or DocumentLimits()
self.format = format self.format = format
@ -120,6 +116,9 @@ class InputDocument(BaseModel):
self._init_doc(backend, path_or_stream) self._init_doc(backend, path_or_stream)
elif isinstance(path_or_stream, BytesIO): elif isinstance(path_or_stream, BytesIO):
assert (
filename is not None
), "Can't construct InputDocument from stream without providing filename arg."
self.file = PurePath(filename) self.file = PurePath(filename)
self.filesize = path_or_stream.getbuffer().nbytes self.filesize = path_or_stream.getbuffer().nbytes
@ -128,10 +127,16 @@ class InputDocument(BaseModel):
else: else:
self.document_hash = create_file_hash(path_or_stream) self.document_hash = create_file_hash(path_or_stream)
self._init_doc(backend, path_or_stream) self._init_doc(backend, path_or_stream)
else:
raise RuntimeError(
f"Unexpected type path_or_stream: {type(path_or_stream)}"
)
# For paginated backends, check if the maximum page count is exceeded. # For paginated backends, check if the maximum page count is exceeded.
if self.valid and self._backend.is_valid(): if self.valid and self._backend.is_valid():
if self._backend.supports_pagination(): if self._backend.supports_pagination() and isinstance(
self._backend, PaginatedDocumentBackend
):
self.page_count = self._backend.page_count() self.page_count = self._backend.page_count()
if not self.page_count <= self.limits.max_num_pages: if not self.page_count <= self.limits.max_num_pages:
self.valid = False self.valid = False
@ -150,7 +155,7 @@ class InputDocument(BaseModel):
def _init_doc( def _init_doc(
self, self,
backend: AbstractDocumentBackend, backend: Type[AbstractDocumentBackend],
path_or_stream: Union[BytesIO, Path], path_or_stream: Union[BytesIO, Path],
) -> None: ) -> None:
if backend is None: if backend is None:
@ -436,18 +441,23 @@ class ConversionResult(BaseModel):
return ds_doc return ds_doc
def render_element_images( def render_element_images(
self, element_types: Tuple[PageElement] = (FigureElement,) self, element_types: Tuple[Type[PageElement]] = (FigureElement,)
): ):
for element in self.assembled.elements: for element in self.assembled.elements:
if isinstance(element, element_types): if isinstance(element, element_types):
page_ix = element.page_no page_ix = element.page_no
scale = self.pages[page_ix]._default_image_scale page = self.pages[page_ix]
crop_bbox = element.cluster.bbox.scaled(scale=scale).to_top_left_origin(
page_height=self.pages[page_ix].size.height * scale
)
cropped_im = self.pages[page_ix].image.crop(crop_bbox.as_tuple()) assert page.size is not None
yield element, cropped_im
scale = page._default_image_scale
crop_bbox = element.cluster.bbox.scaled(scale=scale).to_top_left_origin(
page_height=page.size.height * scale
)
page_img = page.image
if page_img is not None:
cropped_im = page_img.crop(crop_bbox.as_tuple())
yield element, cropped_im
class _DocumentConversionInput(BaseModel): class _DocumentConversionInput(BaseModel):
@ -467,7 +477,7 @@ class _DocumentConversionInput(BaseModel):
) )
continue continue
else: else:
backend = format_options.get(format).backend backend = format_options[format].backend
if isinstance(obj, Path): if isinstance(obj, Path):
yield InputDocument( yield InputDocument(

View File

@ -161,6 +161,8 @@ class DocumentConverter:
def _convert( def _convert(
self, conv_input: _DocumentConversionInput, raises_on_error: bool self, conv_input: _DocumentConversionInput, raises_on_error: bool
) -> Iterator[ConversionResult]: ) -> Iterator[ConversionResult]:
assert self.format_to_options is not None
for input_batch in chunkify( for input_batch in chunkify(
conv_input.docs(self.format_to_options), conv_input.docs(self.format_to_options),
settings.perf.doc_batch_size, # pass format_options settings.perf.doc_batch_size, # pass format_options
@ -181,6 +183,8 @@ class DocumentConverter:
yield item yield item
def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]: def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]:
assert self.format_to_options is not None
fopt = self.format_to_options.get(doc.format) fopt = self.format_to_options.get(doc.format)
if fopt is None: if fopt is None:
@ -189,6 +193,7 @@ class DocumentConverter:
pipeline_class = fopt.pipeline_cls pipeline_class = fopt.pipeline_cls
pipeline_options = fopt.pipeline_options pipeline_options = fopt.pipeline_options
assert pipeline_options is not None
# TODO this will ignore if different options have been defined for the same pipeline class. # TODO this will ignore if different options have been defined for the same pipeline class.
if ( if (
pipeline_class not in self.initialized_pipelines pipeline_class not in self.initialized_pipelines
@ -202,7 +207,9 @@ class DocumentConverter:
def process_document( def process_document(
self, in_doc: InputDocument, raises_on_error: bool self, in_doc: InputDocument, raises_on_error: bool
) -> ConversionResult: ) -> Optional[ConversionResult]:
assert self.allowed_formats is not None
if in_doc.format not in self.allowed_formats: if in_doc.format not in self.allowed_formats:
return None return None
else: else:
@ -217,7 +224,7 @@ class DocumentConverter:
def _execute_pipeline( def _execute_pipeline(
self, in_doc: InputDocument, raises_on_error: bool self, in_doc: InputDocument, raises_on_error: bool
) -> Optional[ConversionResult]: ) -> ConversionResult:
if in_doc.valid: if in_doc.valid:
pipeline = self._get_pipeline(in_doc) pipeline = self._get_pipeline(in_doc)
if pipeline is None: # Can't find a default pipeline. Should this raise? if pipeline is None: # Can't find a default pipeline. Should this raise?

View File

@ -21,8 +21,9 @@ class BaseOcrModel:
self.options = options self.options = options
# Computes the optimum amount and coordinates of rectangles to OCR on a given page # Computes the optimum amount and coordinates of rectangles to OCR on a given page
def get_ocr_rects(self, page: Page) -> Tuple[bool, List[BoundingBox]]: def get_ocr_rects(self, page: Page) -> List[BoundingBox]:
BITMAP_COVERAGE_TRESHOLD = 0.75 BITMAP_COVERAGE_TRESHOLD = 0.75
assert page.size is not None
def find_ocr_rects(size, bitmap_rects): def find_ocr_rects(size, bitmap_rects):
image = Image.new( image = Image.new(
@ -61,7 +62,10 @@ class BaseOcrModel:
return (area_frac, bounding_boxes) # fraction covered # boxes return (area_frac, bounding_boxes) # fraction covered # boxes
bitmap_rects = page._backend.get_bitmap_rects() if page._backend is not None:
bitmap_rects = page._backend.get_bitmap_rects()
else:
bitmap_rects = []
coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects) coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
# return full-page rectangle if sufficiently covered with bitmaps # return full-page rectangle if sufficiently covered with bitmaps
@ -76,7 +80,7 @@ class BaseOcrModel:
) )
] ]
# return individual rectangles if the bitmap coverage is smaller # return individual rectangles if the bitmap coverage is smaller
elif coverage < BITMAP_COVERAGE_TRESHOLD: else: # coverage <= BITMAP_COVERAGE_TRESHOLD:
return ocr_rects return ocr_rects
# Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell. # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.

View File

@ -1,6 +1,6 @@
import copy import copy
import random import random
from typing import Tuple from typing import List, Tuple, Union
from deepsearch_glm.nlp_utils import init_nlp_model from deepsearch_glm.nlp_utils import init_nlp_model
from deepsearch_glm.utils.doc_utils import ( from deepsearch_glm.utils.doc_utils import (
@ -42,7 +42,7 @@ class GlmModel:
def _to_legacy_document(self, conv_res) -> DsDocument: def _to_legacy_document(self, conv_res) -> DsDocument:
title = "" title = ""
desc = DsDocumentDescription(logs=[]) desc: DsDocumentDescription = DsDocumentDescription(logs=[])
page_hashes = [ page_hashes = [
PageReference( PageReference(
@ -60,9 +60,9 @@ class GlmModel:
page_hashes=page_hashes, page_hashes=page_hashes,
) )
main_text = [] main_text: List[Union[Ref, BaseText]] = []
tables = [] tables: List[DsSchemaTable] = []
figures = [] figures: List[Figure] = []
page_no_to_page = {p.page_no: p for p in conv_res.pages} page_no_to_page = {p.page_no: p for p in conv_res.pages}
@ -146,11 +146,16 @@ class GlmModel:
yield [rspan, cspan] yield [rspan, cspan]
spans = list(make_spans(cell)) spans = list(make_spans(cell))
if cell.bbox is not None:
bbox = cell.bbox.to_bottom_left_origin(
page_no_to_page[element.page_no].size.height
).as_tuple()
else:
bbox = None
table_data[i][j] = TableCell( table_data[i][j] = TableCell(
text=cell.text, text=cell.text,
bbox=cell.bbox.to_bottom_left_origin( bbox=bbox,
page_no_to_page[element.page_no].size.height
).as_tuple(),
# col=j, # col=j,
# row=i, # row=i,
spans=spans, spans=spans,
@ -204,7 +209,7 @@ class GlmModel:
for p in conv_res.pages for p in conv_res.pages
] ]
ds_doc = DsDocument( ds_doc: DsDocument = DsDocument(
name=title, name=title,
description=desc, description=desc,
file_info=file_info, file_info=file_info,
@ -216,9 +221,7 @@ class GlmModel:
return ds_doc return ds_doc
def __call__( def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
self, conv_res: ConversionResult
) -> Tuple[DsLegacyDocument, DoclingDocument]:
ds_doc = self._to_legacy_document(conv_res) ds_doc = self._to_legacy_document(conv_res)
ds_doc_dict = ds_doc.model_dump(by_alias=True) ds_doc_dict = ds_doc.model_dump(by_alias=True)

View File

@ -40,6 +40,8 @@ class EasyOcrModel(BaseOcrModel):
return return
for page in page_batch: for page in page_batch:
assert page._backend is not None
ocr_rects = self.get_ocr_rects(page) ocr_rects = self.get_ocr_rects(page)
all_ocr_cells = [] all_ocr_cells = []

View File

@ -47,7 +47,7 @@ class LayoutModel(BasePageModel):
def __init__(self, artifacts_path: Path): def __init__(self, artifacts_path: Path):
self.layout_predictor = LayoutPredictor(artifacts_path) # TODO temporary self.layout_predictor = LayoutPredictor(artifacts_path) # TODO temporary
def postprocess(self, clusters: List[Cluster], cells: List[Cell], page_height): def postprocess(self, clusters_in: List[Cluster], cells: List[Cell], page_height):
MIN_INTERSECTION = 0.2 MIN_INTERSECTION = 0.2
CLASS_THRESHOLDS = { CLASS_THRESHOLDS = {
DocItemLabel.CAPTION: 0.35, DocItemLabel.CAPTION: 0.35,
@ -78,9 +78,9 @@ class LayoutModel(BasePageModel):
start_time = time.time() start_time = time.time()
# Apply Confidence Threshold to cluster predictions # Apply Confidence Threshold to cluster predictions
# confidence = self.conf_threshold # confidence = self.conf_threshold
clusters_out = [] clusters_mod = []
for cluster in clusters: for cluster in clusters_in:
confidence = CLASS_THRESHOLDS[cluster.label] confidence = CLASS_THRESHOLDS[cluster.label]
if cluster.confidence >= confidence: if cluster.confidence >= confidence:
# annotation["created_by"] = "high_conf_pred" # annotation["created_by"] = "high_conf_pred"
@ -88,10 +88,10 @@ class LayoutModel(BasePageModel):
# Remap class labels where needed. # Remap class labels where needed.
if cluster.label in CLASS_REMAPPINGS.keys(): if cluster.label in CLASS_REMAPPINGS.keys():
cluster.label = CLASS_REMAPPINGS[cluster.label] cluster.label = CLASS_REMAPPINGS[cluster.label]
clusters_out.append(cluster) clusters_mod.append(cluster)
# map to dictionary clusters and cells, with bottom left origin # map to dictionary clusters and cells, with bottom left origin
clusters = [ clusters_orig = [
{ {
"id": c.id, "id": c.id,
"bbox": list( "bbox": list(
@ -101,7 +101,7 @@ class LayoutModel(BasePageModel):
"cell_ids": [], "cell_ids": [],
"type": c.label, "type": c.label,
} }
for c in clusters for c in clusters_in
] ]
clusters_out = [ clusters_out = [
@ -115,9 +115,11 @@ class LayoutModel(BasePageModel):
"cell_ids": [], "cell_ids": [],
"type": c.label, "type": c.label,
} }
for c in clusters_out for c in clusters_mod
] ]
del clusters_mod
raw_cells = [ raw_cells = [
{ {
"id": c.id, "id": c.id,
@ -151,7 +153,7 @@ class LayoutModel(BasePageModel):
# Assign orphan cells with lower confidence predictions # Assign orphan cells with lower confidence predictions
clusters_out, orphan_cell_indices = lu.assign_orphans_with_low_conf_pred( clusters_out, orphan_cell_indices = lu.assign_orphans_with_low_conf_pred(
clusters_out, clusters, raw_cells, orphan_cell_indices clusters_out, clusters_orig, raw_cells, orphan_cell_indices
) )
# Refresh the cell_ids assignment, after creating new clusters using low conf predictions # Refresh the cell_ids assignment, after creating new clusters using low conf predictions
@ -180,7 +182,7 @@ class LayoutModel(BasePageModel):
) = lu.cell_id_state_map(clusters_out, cell_count) ) = lu.cell_id_state_map(clusters_out, cell_count)
clusters_out, orphan_cell_indices = lu.set_orphan_as_text( clusters_out, orphan_cell_indices = lu.set_orphan_as_text(
clusters_out, clusters, raw_cells, orphan_cell_indices clusters_out, clusters_orig, raw_cells, orphan_cell_indices
) )
_log.debug("---- 5. Merge Cells & and adapt the bounding boxes") _log.debug("---- 5. Merge Cells & and adapt the bounding boxes")
@ -239,34 +241,41 @@ class LayoutModel(BasePageModel):
end_time = time.time() - start_time end_time = time.time() - start_time
_log.debug(f"Finished post processing in seconds={end_time:.3f}") _log.debug(f"Finished post processing in seconds={end_time:.3f}")
cells_out = [ cells_out_new = [
Cell( Cell(
id=c["id"], id=c["id"], # type: ignore
bbox=BoundingBox.from_tuple( bbox=BoundingBox.from_tuple(
coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT # type: ignore
).to_top_left_origin(page_height), ).to_top_left_origin(page_height),
text=c["text"], text=c["text"], # type: ignore
) )
for c in cells_out for c in cells_out
] ]
del cells_out
clusters_out_new = [] clusters_out_new = []
for c in clusters_out: for c in clusters_out:
cluster_cells = [ccell for ccell in cells_out if ccell.id in c["cell_ids"]] cluster_cells = [
ccell for ccell in cells_out_new if ccell.id in c["cell_ids"] # type: ignore
]
c_new = Cluster( c_new = Cluster(
id=c["id"], id=c["id"], # type: ignore
bbox=BoundingBox.from_tuple( bbox=BoundingBox.from_tuple(
coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT # type: ignore
).to_top_left_origin(page_height), ).to_top_left_origin(page_height),
confidence=c["confidence"], confidence=c["confidence"], # type: ignore
label=DocItemLabel(c["type"]), label=DocItemLabel(c["type"]),
cells=cluster_cells, cells=cluster_cells,
) )
clusters_out_new.append(c_new) clusters_out_new.append(c_new)
return clusters_out_new, cells_out return clusters_out_new, cells_out_new
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
for page in page_batch: for page in page_batch:
assert page.size is not None
clusters = [] clusters = []
for ix, pred_item in enumerate( for ix, pred_item in enumerate(
self.layout_predictor.predict(page.get_image(scale=1.0)) self.layout_predictor.predict(page.get_image(scale=1.0))

View File

@ -53,6 +53,8 @@ class PageAssembleModel(BasePageModel):
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
for page in page_batch: for page in page_batch:
assert page._backend is not None
assert page.predictions.layout is not None
# assembles some JSON output page by page. # assembles some JSON output page by page.
elements: List[PageElement] = [] elements: List[PageElement] = []

View File

@ -40,7 +40,9 @@ class PagePreprocessingModel(BasePageModel):
# Extract and populate the page cells and store it in the page object # Extract and populate the page cells and store it in the page object
def _parse_page_cells(self, page: Page) -> Page: def _parse_page_cells(self, page: Page) -> Page:
page.cells = page._backend.get_text_cells() assert page._backend is not None
page.cells = list(page._backend.get_text_cells())
# DEBUG code: # DEBUG code:
def draw_text_boxes(image, cells): def draw_text_boxes(image, cells):

View File

@ -24,8 +24,6 @@ class TableStructureModel(BasePageModel):
self.enabled = enabled self.enabled = enabled
if self.enabled: if self.enabled:
artifacts_path: Path = artifacts_path
if self.mode == TableFormerMode.ACCURATE: if self.mode == TableFormerMode.ACCURATE:
artifacts_path = artifacts_path / "fat" artifacts_path = artifacts_path / "fat"
@ -40,6 +38,8 @@ class TableStructureModel(BasePageModel):
self.scale = 2.0 # Scale up table input images to 144 dpi self.scale = 2.0 # Scale up table input images to 144 dpi
def draw_table_and_cells(self, page: Page, tbl_list: List[Table]): def draw_table_and_cells(self, page: Page, tbl_list: List[Table]):
assert page._backend is not None
image = ( image = (
page._backend.get_page_image() page._backend.get_page_image()
) # make new image to avoid drawing on the saved ones ) # make new image to avoid drawing on the saved ones
@ -50,17 +50,18 @@ class TableStructureModel(BasePageModel):
draw.rectangle([(x0, y0), (x1, y1)], outline="red") draw.rectangle([(x0, y0), (x1, y1)], outline="red")
for tc in table_element.table_cells: for tc in table_element.table_cells:
x0, y0, x1, y1 = tc.bbox.as_tuple() if tc.bbox is not None:
if tc.column_header: x0, y0, x1, y1 = tc.bbox.as_tuple()
width = 3 if tc.column_header:
else: width = 3
width = 1 else:
draw.rectangle([(x0, y0), (x1, y1)], outline="blue", width=width) width = 1
draw.text( draw.rectangle([(x0, y0), (x1, y1)], outline="blue", width=width)
(x0 + 3, y0 + 3), draw.text(
text=f"{tc.start_row_offset_idx}, {tc.start_col_offset_idx}", (x0 + 3, y0 + 3),
fill="black", text=f"{tc.start_row_offset_idx}, {tc.start_col_offset_idx}",
) fill="black",
)
image.show() image.show()
@ -71,6 +72,9 @@ class TableStructureModel(BasePageModel):
return return
for page in page_batch: for page in page_batch:
assert page._backend is not None
assert page.predictions.layout is not None
assert page.size is not None
page.predictions.tablestructure = TableStructurePrediction() # dummy page.predictions.tablestructure = TableStructurePrediction() # dummy
@ -132,7 +136,7 @@ class TableStructureModel(BasePageModel):
element["bbox"]["token"] = text_piece element["bbox"]["token"] = text_piece
tc = TableCell.model_validate(element) tc = TableCell.model_validate(element)
if self.do_cell_matching: if self.do_cell_matching and tc.bbox is not None:
tc.bbox = tc.bbox.scaled(1 / self.scale) tc.bbox = tc.bbox.scaled(1 / self.scale)
table_cells.append(tc) table_cells.append(tc)

View File

@ -2,7 +2,7 @@ import io
import logging import logging
import tempfile import tempfile
from subprocess import DEVNULL, PIPE, Popen from subprocess import DEVNULL, PIPE, Popen
from typing import Iterable, Tuple from typing import Iterable, Optional, Tuple
import pandas as pd import pandas as pd
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
@ -22,8 +22,8 @@ class TesseractOcrCliModel(BaseOcrModel):
self.scale = 3 # multiplier for 72 dpi == 216 dpi. self.scale = 3 # multiplier for 72 dpi == 216 dpi.
self._name = None self._name: Optional[str] = None
self._version = None self._version: Optional[str] = None
if self.enabled: if self.enabled:
try: try:
@ -40,7 +40,7 @@ class TesseractOcrCliModel(BaseOcrModel):
def _get_name_and_version(self) -> Tuple[str, str]: def _get_name_and_version(self) -> Tuple[str, str]:
if self._name != None and self._version != None: if self._name != None and self._version != None:
return self._name, self._version return self._name, self._version # type: ignore
cmd = [self.options.tesseract_cmd, "--version"] cmd = [self.options.tesseract_cmd, "--version"]
@ -109,6 +109,8 @@ class TesseractOcrCliModel(BaseOcrModel):
return return
for page in page_batch: for page in page_batch:
assert page._backend is not None
ocr_rects = self.get_ocr_rects(page) ocr_rects = self.get_ocr_rects(page)
all_ocr_cells = [] all_ocr_cells = []

View File

@ -69,6 +69,9 @@ class TesseractOcrModel(BaseOcrModel):
return return
for page in page_batch: for page in page_batch:
assert page._backend is not None
assert self.reader is not None
ocr_rects = self.get_ocr_rects(page) ocr_rects = self.get_ocr_rects(page)
all_ocr_cells = [] all_ocr_cells = []

View File

@ -178,7 +178,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
) -> ConversionStatus: ) -> ConversionStatus:
status = ConversionStatus.SUCCESS status = ConversionStatus.SUCCESS
for page in conv_res.pages: for page in conv_res.pages:
if not page._backend.is_valid(): if page._backend is None or not page._backend.is_valid():
conv_res.errors.append( conv_res.errors.append(
ErrorItem( ErrorItem(
component_type=DoclingComponentType.DOCUMENT_BACKEND, component_type=DoclingComponentType.DOCUMENT_BACKEND,

View File

@ -123,8 +123,9 @@ class StandardPdfPipeline(PaginatedPipeline):
return None return None
def initialize_page(self, doc: InputDocument, page: Page) -> Page: def initialize_page(self, doc: InputDocument, page: Page) -> Page:
page._backend = doc._backend.load_page(page.page_no) page._backend = doc._backend.load_page(page.page_no) # type: ignore
page.size = page._backend.get_size() if page._backend is not None and page._backend.is_valid():
page.size = page._backend.get_size()
return page return page
@ -136,7 +137,7 @@ class StandardPdfPipeline(PaginatedPipeline):
all_body = [] all_body = []
for p in conv_res.pages: for p in conv_res.pages:
assert p.assembled is not None
for el in p.assembled.body: for el in p.assembled.body:
all_body.append(el) all_body.append(el)
for el in p.assembled.headers: for el in p.assembled.headers:

View File

@ -1,369 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# RAG with Docling and 🦜🔗 LangChain"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"# requirements for this example:\n",
"%pip install -qq docling docling-core python-dotenv langchain-text-splitters langchain-huggingface langchain-milvus"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import os\n",
"\n",
"from dotenv import load_dotenv\n",
"\n",
"load_dotenv()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import warnings\n",
"\n",
"warnings.filterwarnings(action=\"ignore\", category=UserWarning, module=\"pydantic|torch\")\n",
"warnings.filterwarnings(action=\"ignore\", category=FutureWarning, module=\"easyocr\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Loader and splitter"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below we set up:\n",
"- a `Loader` which will be used to create LangChain documents, and\n",
"- a splitter, which will be used to split these documents"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"from enum import Enum\n",
"from typing import Iterator\n",
"\n",
"from langchain_core.document_loaders import BaseLoader\n",
"from langchain_core.documents import Document as LCDocument\n",
"from pydantic import BaseModel\n",
"\n",
"from docling.document_converter import DocumentConverter\n",
"\n",
"\n",
"class DocumentMetadata(BaseModel):\n",
" dl_doc_hash: str\n",
" # source: str\n",
"\n",
"\n",
"class DoclingPDFLoader(BaseLoader):\n",
" class ParseType(str, Enum):\n",
" MARKDOWN = \"markdown\"\n",
" # JSON = \"json\"\n",
"\n",
" def __init__(self, file_path: str | list[str], parse_type: ParseType) -> None:\n",
" self._file_paths = file_path if isinstance(file_path, list) else [file_path]\n",
" self._parse_type = parse_type\n",
" self._converter = DocumentConverter()\n",
"\n",
" def lazy_load(self) -> Iterator[LCDocument]:\n",
" for source in self._file_paths:\n",
" dl_doc = self._converter.convert_single(source).output\n",
" match self._parse_type:\n",
" case self.ParseType.MARKDOWN:\n",
" text = dl_doc.export_to_markdown()\n",
" # case self.ParseType.JSON:\n",
" # text = dl_doc.model_dump_json()\n",
" case _:\n",
" raise RuntimeError(\n",
" f\"Unexpected parse type encountered: {self._parse_type}\"\n",
" )\n",
" lc_doc = LCDocument(\n",
" page_content=text,\n",
" metadata=DocumentMetadata(\n",
" dl_doc_hash=dl_doc.file_info.document_hash,\n",
" ).model_dump(),\n",
" )\n",
" yield lc_doc"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"FILE_PATH = \"https://arxiv.org/pdf/2206.01062\" # DocLayNet paper"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "1b38d07d5fed4618a44ecf261e1e5c44",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
"\n",
"loader = DoclingPDFLoader(\n",
" file_path=FILE_PATH,\n",
" parse_type=DoclingPDFLoader.ParseType.MARKDOWN,\n",
")\n",
"text_splitter = RecursiveCharacterTextSplitter(\n",
" chunk_size=1000,\n",
" chunk_overlap=200,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We now used the above-defined objects to get the document splits:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"docs = loader.load()\n",
"splits = text_splitter.split_documents(docs)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Embeddings"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"from langchain_huggingface.embeddings import HuggingFaceEmbeddings\n",
"\n",
"HF_EMBED_MODEL_ID = \"BAAI/bge-small-en-v1.5\"\n",
"embeddings = HuggingFaceEmbeddings(model_name=HF_EMBED_MODEL_ID)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Vector store"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"from tempfile import TemporaryDirectory\n",
"\n",
"from langchain_milvus import Milvus\n",
"\n",
"MILVUS_URI = os.environ.get(\n",
" \"MILVUS_URL\", f\"{(tmp_dir := TemporaryDirectory()).name}/milvus_demo.db\"\n",
")\n",
"\n",
"vectorstore = Milvus.from_documents(\n",
" splits,\n",
" embeddings,\n",
" connection_args={\"uri\": MILVUS_URI},\n",
" drop_old=True,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### LLM"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\n",
"Token is valid (permission: write).\n",
"Your token has been saved to /Users/pva/.cache/huggingface/token\n",
"Login successful\n"
]
}
],
"source": [
"from langchain_huggingface import HuggingFaceEndpoint\n",
"\n",
"HF_API_KEY = os.environ.get(\"HF_API_KEY\")\n",
"HF_LLM_MODEL_ID = \"mistralai/Mistral-7B-Instruct-v0.3\"\n",
"\n",
"llm = HuggingFaceEndpoint(\n",
" repo_id=HF_LLM_MODEL_ID,\n",
" huggingfacehub_api_token=HF_API_KEY,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## RAG"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"from typing import Iterable\n",
"\n",
"from langchain_core.documents import Document as LCDocument\n",
"from langchain_core.output_parsers import StrOutputParser\n",
"from langchain_core.prompts import PromptTemplate\n",
"from langchain_core.runnables import RunnablePassthrough\n",
"\n",
"\n",
"def format_docs(docs: Iterable[LCDocument]):\n",
" return \"\\n\\n\".join(doc.page_content for doc in docs)\n",
"\n",
"\n",
"retriever = vectorstore.as_retriever()\n",
"\n",
"prompt = PromptTemplate.from_template(\n",
" \"Context information is below.\\n---------------------\\n{context}\\n---------------------\\nGiven the context information and not prior knowledge, answer the query.\\nQuery: {question}\\nAnswer:\\n\"\n",
")\n",
"\n",
"rag_chain = (\n",
" {\"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n",
" | prompt\n",
" | llm\n",
" | StrOutputParser()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'The human annotation of DocLayNet was performed on 80863 pages.\\n\\nExplanation:\\nThe information is found in the paragraph \"DocLayNet contains 80863 PDF pages\" in the context.'"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rag_chain.invoke(\"How many pages were human annotated for DocLayNet?\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -1,434 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a href=\"https://colab.research.google.com/github/DS4SD/docling/blob/main/examples/rag_llamaindex.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# RAG with Docling and 🦙 LlamaIndex"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Overview"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"LlamaIndex extensions `DoclingReader` and `DoclingNodeParser` presented in this notebook seamlessly integrate Docling into LlamaIndex, enabling you to:\n",
"- use PDF documents in your LLM applications with ease and speed, and\n",
"- leverage Docling's rich format for advanced, document-native grounding."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- 👉 For best conversion speed, use GPU acceleration whenever available; e.g. if running on Colab, use GPU-enabled runtime.\n",
"- Notebook uses HuggingFace's Inference API; for increased LLM quota, token can be provided via env var `HF_TOKEN`.\n",
"- Requirements can be installed as shown below (`--no-warn-conflicts` meant for Colab's pre-populated Python env; feel free to remove for stricter usage):"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"%pip install -q --progress-bar off --no-warn-conflicts llama-index-core llama-index-readers-docling llama-index-node-parser-docling llama-index-embeddings-huggingface llama-index-llms-huggingface-api llama-index-readers-file python-dotenv"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from pathlib import Path\n",
"from tempfile import mkdtemp\n",
"from warnings import filterwarnings\n",
"\n",
"from dotenv import load_dotenv\n",
"\n",
"\n",
"def _get_env_from_colab_or_os(key):\n",
" try:\n",
" from google.colab import userdata\n",
"\n",
" try:\n",
" return userdata.get(key)\n",
" except userdata.SecretNotFoundError:\n",
" pass\n",
" except ImportError:\n",
" pass\n",
" return os.getenv(key)\n",
"\n",
"\n",
"load_dotenv()\n",
"\n",
"filterwarnings(action=\"ignore\", category=UserWarning, module=\"pydantic\")\n",
"filterwarnings(action=\"ignore\", category=FutureWarning, module=\"easyocr\")\n",
"# https://github.com/huggingface/transformers/issues/5486:\n",
"os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can now define the main parameters:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n",
"from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n",
"\n",
"EMBED_MODEL = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")\n",
"MILVUS_URI = str(Path(mkdtemp()) / \"docling.db\")\n",
"GEN_MODEL = HuggingFaceInferenceAPI(\n",
" token=_get_env_from_colab_or_os(\"HF_TOKEN\"),\n",
" model_name=\"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n",
")\n",
"SOURCE = \"https://arxiv.org/pdf/2408.09869\" # Docling Technical Report\n",
"QUERY = \"Which are the main AI models in Docling?\"\n",
"\n",
"embed_dim = len(EMBED_MODEL.get_text_embedding(\"hi\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Using Markdown export"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To create a simple RAG pipeline, we can:\n",
"- define a `DoclingReader`, which by default exports to Markdown, and\n",
"- use a standard node parser for these Markdown-based docs, e.g. a `MarkdownNodeParser`"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Q: Which are the main AI models in Docling?\n",
"A: 1. A layout analysis model, an accurate object-detector for page elements. 2. TableFormer, a state-of-the-art table structure recognition model.\n",
"\n",
"Sources:\n"
]
},
{
"data": {
"text/plain": [
"[('3.2 AI models\\n\\nAs part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on huggingface) and a separate package for the inference code as docling-ibm-models . Both models are also powering the open-access deepsearch-experience, our cloud-native service for knowledge exploration tasks.',\n",
" {'dl_doc_hash': '556ad9e23b6d2245e36b3208758cf0c8a709382bb4c859eacfe8e73b14e635aa',\n",
" 'Header_2': '3.2 AI models'}),\n",
" (\"5 Applications\\n\\nThanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets.\",\n",
" {'dl_doc_hash': '556ad9e23b6d2245e36b3208758cf0c8a709382bb4c859eacfe8e73b14e635aa',\n",
" 'Header_2': '5 Applications'})]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from llama_index.core import StorageContext, VectorStoreIndex\n",
"from llama_index.core.node_parser import MarkdownNodeParser\n",
"from llama_index.readers.docling import DoclingReader\n",
"from llama_index.vector_stores.milvus import MilvusVectorStore\n",
"\n",
"reader = DoclingReader()\n",
"node_parser = MarkdownNodeParser()\n",
"\n",
"vector_store = MilvusVectorStore(\n",
" uri=str(Path(mkdtemp()) / \"docling.db\"), # or set as needed\n",
" dim=embed_dim,\n",
" overwrite=True,\n",
")\n",
"index = VectorStoreIndex.from_documents(\n",
" documents=reader.load_data(SOURCE),\n",
" transformations=[node_parser],\n",
" storage_context=StorageContext.from_defaults(vector_store=vector_store),\n",
" embed_model=EMBED_MODEL,\n",
")\n",
"result = index.as_query_engine(llm=GEN_MODEL).query(QUERY)\n",
"print(f\"Q: {QUERY}\\nA: {result.response.strip()}\\n\\nSources:\")\n",
"display([(n.text, n.metadata) for n in result.source_nodes])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Using Docling format"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To leverage Docling's rich native format, we:\n",
"- create a `DoclingReader` with JSON export type, and\n",
"- employ a `DoclingNodeParser` in order to appropriately parse that Docling format.\n",
"\n",
"Notice how the sources now also contain document-level grounding (e.g. page number or bounding box information):"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Q: Which are the main AI models in Docling?\n",
"A: The main AI models in Docling are a layout analysis model and TableFormer. The layout analysis model is an accurate object-detector for page elements, and TableFormer is a state-of-the-art table structure recognition model.\n",
"\n",
"Sources:\n"
]
},
{
"data": {
"text/plain": [
"[('As part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on huggingface) and a separate package for the inference code as docling-ibm-models . Both models are also powering the open-access deepsearch-experience, our cloud-native service for knowledge exploration tasks.',\n",
" {'dl_doc_hash': '556ad9e23b6d2245e36b3208758cf0c8a709382bb4c859eacfe8e73b14e635aa',\n",
" 'path': '#/main-text/37',\n",
" 'heading': '3.2 AI models',\n",
" 'page': 3,\n",
" 'bbox': [107.36903381347656,\n",
" 330.07513427734375,\n",
" 506.29705810546875,\n",
" 407.3725280761719]}),\n",
" ('With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. Its code architecture allows for easy extensibility and addition of new features and models.',\n",
" {'dl_doc_hash': '556ad9e23b6d2245e36b3208758cf0c8a709382bb4c859eacfe8e73b14e635aa',\n",
" 'path': '#/main-text/10',\n",
" 'heading': '1 Introduction',\n",
" 'page': 1,\n",
" 'bbox': [107.33261108398438,\n",
" 83.3067626953125,\n",
" 504.0033874511719,\n",
" 136.45367431640625]})]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from llama_index.node_parser.docling import DoclingNodeParser\n",
"\n",
"reader = DoclingReader(export_type=DoclingReader.ExportType.JSON)\n",
"node_parser = DoclingNodeParser()\n",
"\n",
"vector_store = MilvusVectorStore(\n",
" uri=str(Path(mkdtemp()) / \"docling.db\"), # or set as needed\n",
" dim=embed_dim,\n",
" overwrite=True,\n",
")\n",
"index = VectorStoreIndex.from_documents(\n",
" documents=reader.load_data(SOURCE),\n",
" transformations=[node_parser],\n",
" storage_context=StorageContext.from_defaults(vector_store=vector_store),\n",
" embed_model=EMBED_MODEL,\n",
")\n",
"result = index.as_query_engine(llm=GEN_MODEL).query(QUERY)\n",
"print(f\"Q: {QUERY}\\nA: {result.response.strip()}\\n\\nSources:\")\n",
"display([(n.text, n.metadata) for n in result.source_nodes])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## With Simple Directory Reader"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To demonstrate this usage pattern, we first set up a test document directory."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"from tempfile import mkdtemp\n",
"\n",
"import requests\n",
"\n",
"tmp_dir_path = Path(mkdtemp())\n",
"r = requests.get(SOURCE)\n",
"with open(tmp_dir_path / f\"{Path(SOURCE).name}.pdf\", \"wb\") as out_file:\n",
" out_file.write(r.content)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Using the `reader` and `node_parser` definitions from any of the above variants, usage with `SimpleDirectoryReader` then looks as follows:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Loading files: 100%|██████████| 1/1 [00:11<00:00, 11.15s/file]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Q: Which are the main AI models in Docling?\n",
"A: The main AI models in Docling are a layout analysis model and TableFormer. The layout analysis model is an accurate object-detector for page elements, and TableFormer is a state-of-the-art table structure recognition model.\n",
"\n",
"Sources:\n"
]
},
{
"data": {
"text/plain": [
"[('As part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on huggingface) and a separate package for the inference code as docling-ibm-models . Both models are also powering the open-access deepsearch-experience, our cloud-native service for knowledge exploration tasks.',\n",
" {'file_path': '/var/folders/76/4wwfs06x6835kcwj4186c0nc0000gn/T/tmp4vsev3_r/2408.09869.pdf',\n",
" 'file_name': '2408.09869.pdf',\n",
" 'file_type': 'application/pdf',\n",
" 'file_size': 5566574,\n",
" 'creation_date': '2024-10-09',\n",
" 'last_modified_date': '2024-10-09',\n",
" 'dl_doc_hash': '556ad9e23b6d2245e36b3208758cf0c8a709382bb4c859eacfe8e73b14e635aa',\n",
" 'path': '#/main-text/37',\n",
" 'heading': '3.2 AI models',\n",
" 'page': 3,\n",
" 'bbox': [107.36903381347656,\n",
" 330.07513427734375,\n",
" 506.29705810546875,\n",
" 407.3725280761719]}),\n",
" ('With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. Its code architecture allows for easy extensibility and addition of new features and models.',\n",
" {'file_path': '/var/folders/76/4wwfs06x6835kcwj4186c0nc0000gn/T/tmp4vsev3_r/2408.09869.pdf',\n",
" 'file_name': '2408.09869.pdf',\n",
" 'file_type': 'application/pdf',\n",
" 'file_size': 5566574,\n",
" 'creation_date': '2024-10-09',\n",
" 'last_modified_date': '2024-10-09',\n",
" 'dl_doc_hash': '556ad9e23b6d2245e36b3208758cf0c8a709382bb4c859eacfe8e73b14e635aa',\n",
" 'path': '#/main-text/10',\n",
" 'heading': '1 Introduction',\n",
" 'page': 1,\n",
" 'bbox': [107.33261108398438,\n",
" 83.3067626953125,\n",
" 504.0033874511719,\n",
" 136.45367431640625]})]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from llama_index.core import SimpleDirectoryReader\n",
"\n",
"dir_reader = SimpleDirectoryReader(\n",
" input_dir=tmp_dir_path,\n",
" file_extractor={\".pdf\": reader},\n",
")\n",
"\n",
"vector_store = MilvusVectorStore(\n",
" uri=str(Path(mkdtemp()) / \"docling.db\"), # or set as needed\n",
" dim=embed_dim,\n",
" overwrite=True,\n",
")\n",
"index = VectorStoreIndex.from_documents(\n",
" documents=dir_reader.load_data(SOURCE),\n",
" transformations=[node_parser],\n",
" storage_context=StorageContext.from_defaults(vector_store=vector_store),\n",
" embed_model=EMBED_MODEL,\n",
")\n",
"result = index.as_query_engine(llm=GEN_MODEL).query(QUERY)\n",
"print(f\"Q: {QUERY}\\nA: {result.response.strip()}\\n\\nSources:\")\n",
"display([(n.text, n.metadata) for n in result.source_nodes])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -114,6 +114,7 @@ py_version=311
pretty = true pretty = true
# strict = true # strict = true
no_implicit_optional = true no_implicit_optional = true
plugins = "pydantic.mypy"
python_version = "3.10" python_version = "3.10"
[[tool.mypy.overrides]] [[tool.mypy.overrides]]
@ -121,6 +122,15 @@ module = [
"docling_parse.*", "docling_parse.*",
"pypdfium2.*", "pypdfium2.*",
"networkx.*", "networkx.*",
"scipy.*",
"filetype.*",
"tesserocr.*",
"docling_ibm_models.*",
"easyocr.*",
"deepsearch_glm.*",
"lxml.*",
"bs4.*",
"huggingface_hub.*"
] ]
ignore_missing_imports = true ignore_missing_imports = true

View File

@ -26,7 +26,6 @@ def _get_backend(pdf_doc):
return doc_backend return doc_backend
@pytest.mark.skip
def test_text_cell_counts(): def test_text_cell_counts():
pdf_doc = Path("./tests/data/redp5695.pdf") pdf_doc = Path("./tests/data/redp5695.pdf")