From 27f4ed36207fcd64ccd01a31af43df86ccac3a2a Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Tue, 15 Oct 2024 14:58:00 +0200 Subject: [PATCH] Enable mypy and fix many reported errors Signed-off-by: Christoph Auer --- .pre-commit-config.yaml | 12 +- docling/backend/docling_parse_v2_backend.py | 5 +- docling/backend/html_backend.py | 21 +- docling/backend/mspowerpoint_backend.py | 12 +- docling/backend/msword_backend.py | 27 +- docling/backend/pdf_backend.py | 2 +- docling/backend/pypdfium2_backend.py | 5 +- docling/cli/main.py | 5 +- docling/datamodel/base_models.py | 3 +- docling/datamodel/document.py | 64 +-- docling/document_converter.py | 11 +- docling/models/base_ocr_model.py | 10 +- docling/models/ds_glm_model.py | 27 +- docling/models/easyocr_model.py | 2 + docling/models/layout_model.py | 45 +- docling/models/page_assemble_model.py | 2 + docling/models/page_preprocessing_model.py | 4 +- docling/models/table_structure_model.py | 32 +- docling/models/tesseract_ocr_cli_model.py | 10 +- docling/models/tesseract_ocr_model.py | 3 + docling/pipeline/base_pipeline.py | 2 +- docling/pipeline/standard_pdf_pipeline.py | 7 +- examples/rag_langchain.ipynb | 369 ----------------- examples/rag_llamaindex.ipynb | 434 -------------------- pyproject.toml | 10 + tests/test_backend_docling_parse_v2.py | 1 - 26 files changed, 205 insertions(+), 920 deletions(-) delete mode 100644 examples/rag_langchain.ipynb delete mode 100644 examples/rag_llamaindex.ipynb diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c4baa518..6ab5c1aa 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -20,12 +20,12 @@ repos: # pass_filenames: false # language: system # files: '\.py$' -# - id: mypy -# name: MyPy -# entry: poetry run mypy docling -# pass_filenames: false -# language: system -# files: '\.py$' + - id: mypy + name: MyPy + entry: poetry run mypy docling + pass_filenames: false + language: system + files: '\.py$' - id: nbqa_black name: nbQA Black entry: 
poetry run nbqa black examples diff --git a/docling/backend/docling_parse_v2_backend.py b/docling/backend/docling_parse_v2_backend.py index 239ea9af..bfc68f9b 100644 --- a/docling/backend/docling_parse_v2_backend.py +++ b/docling/backend/docling_parse_v2_backend.py @@ -2,7 +2,7 @@ import logging import random from io import BytesIO from pathlib import Path -from typing import Iterable, List, Optional, Union +from typing import TYPE_CHECKING, Iterable, List, Optional, Union import pypdfium2 as pdfium from docling_core.types.doc import BoundingBox, CoordOrigin @@ -13,6 +13,9 @@ from pypdfium2 import PdfPage from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend from docling.datamodel.base_models import Cell, Size +if TYPE_CHECKING: + from docling.datamodel.document import InputDocument + _log = logging.getLogger(__name__) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index b9f7e630..7878d64f 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -30,10 +30,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): # Initialise the parents for the hierarchy self.max_levels = 10 self.level = 0 - self.parents = {} + self.parents = {} # type: ignore for i in range(0, self.max_levels): self.parents[i] = None - self.labels = {} + self.labels = {} # type: ignore try: if isinstance(self.path_or_stream, BytesIO): @@ -49,8 +49,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): ) from e def is_valid(self) -> bool: - return True + return self.soup is not None + @classmethod def supports_pagination(cls) -> bool: return False @@ -68,11 +69,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): # access self.path_or_stream to load stuff doc = DoclingDocument(description=DescriptionItem(), name="dummy") _log.debug("Trying to convert HTML...") - # Replace
<br> tags with newline characters - for br in self.soup.body.find_all("br"): - br.replace_with("\n") - doc = self.walk(self.soup.body, doc) + if self.is_valid(): + assert self.soup is not None + # Replace <br>
tags with newline characters + for br in self.soup.body.find_all("br"): + br.replace_with("\n") + doc = self.walk(self.soup.body, doc) + else: + raise RuntimeError( + f"Cannot convert doc with {self.document_hash} because the backend failed to init." + ) return doc def walk(self, element, doc): diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py index bdf73c99..2455389a 100644 --- a/docling/backend/mspowerpoint_backend.py +++ b/docling/backend/mspowerpoint_backend.py @@ -42,7 +42,11 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB self.pptx_obj = None self.valid = False try: - self.pptx_obj = Presentation(self.path_or_stream) + if isinstance(self.path_or_stream, BytesIO): + self.pptx_obj = Presentation(self.path_or_stream) + elif isinstance(self.path_or_stream, Path): + self.pptx_obj = Presentation(str(self.path_or_stream)) + self.valid = True except Exception as e: raise RuntimeError( @@ -53,6 +57,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB def page_count(self) -> int: if self.is_valid(): + assert self.pptx_obj is not None return len(self.pptx_obj.slides) else: return 0 @@ -60,6 +65,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB def is_valid(self) -> bool: return self.valid + @classmethod def supports_pagination(cls) -> bool: return True # True? if so, how to handle pages... 
@@ -311,10 +317,10 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB slide_width = pptx_obj.slide_width slide_height = pptx_obj.slide_height - text_content = [] + text_content = [] # type: ignore max_levels = 10 - parents = {} + parents = {} # type: ignore for i in range(0, max_levels): parents[i] = None diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index a098db51..7bf0946b 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -39,7 +39,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): # Initialise the parents for the hierarchy self.max_levels = 10 self.level_at_new_list = None - self.parents = {} + self.parents = {} # type: ignore for i in range(-1, self.max_levels): self.parents[i] = None @@ -54,16 +54,21 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.docx_obj = None try: - self.docx_obj = docx.Document(self.path_or_stream) + if isinstance(self.path_or_stream, BytesIO): + self.docx_obj = docx.Document(self.path_or_stream) + elif isinstance(self.path_or_stream, Path): + self.docx_obj = docx.Document(str(self.path_or_stream)) + self.valid = True except Exception as e: raise RuntimeError( - f"MsPowerpointDocumentBackend could not load document with hash {document_hash}" + f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}" ) from e def is_valid(self) -> bool: - return True + return self.valid + @classmethod def supports_pagination(cls) -> bool: return False @@ -80,10 +85,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): def convert(self) -> DoclingDocument: # Parses the DOCX into a structured document model. 
doc = DoclingDocument(description=DescriptionItem(), name="dummy") - - # self.initialise() - doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc) - return doc + if self.is_valid(): + assert self.docx_obj is not None + doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc) + return doc + else: + raise RuntimeError( + f"Cannot convert doc with {self.document_hash} because the backend failed to init." + ) def update_history(self, name, level, numid, ilevel): self.history["names"].append(name) @@ -307,7 +316,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): ): level = self.get_level() if self.prev_numid() is None: # Open new list - self.level_at_new_list = level + self.level_at_new_list = level # type: ignore self.parents[level] = doc.add_group( label=GroupLabel.LIST, name="list", parent=self.parents[level - 1] diff --git a/docling/backend/pdf_backend.py b/docling/backend/pdf_backend.py index 0bac725b..cd7a0815 100644 --- a/docling/backend/pdf_backend.py +++ b/docling/backend/pdf_backend.py @@ -1,9 +1,9 @@ from abc import ABC, abstractmethod from io import BytesIO +from pathlib import Path from typing import Iterable, Optional, Set, Union from docling_core.types.doc import BoundingBox, Size -from docling_core.types.legacy_doc.doc_ocr import Path from PIL import Image from docling.backend.abstract_backend import PaginatedDocumentBackend diff --git a/docling/backend/pypdfium2_backend.py b/docling/backend/pypdfium2_backend.py index 5631a50d..4ab100a3 100644 --- a/docling/backend/pypdfium2_backend.py +++ b/docling/backend/pypdfium2_backend.py @@ -2,7 +2,7 @@ import logging import random from io import BytesIO from pathlib import Path -from typing import Iterable, List, Optional, Union +from typing import TYPE_CHECKING, Iterable, List, Optional, Union import pypdfium2 as pdfium import pypdfium2.raw as pdfium_c @@ -14,6 +14,9 @@ from pypdfium2._helpers.misc import PdfiumError from docling.backend.pdf_backend import 
PdfDocumentBackend, PdfPageBackend from docling.datamodel.base_models import Cell +if TYPE_CHECKING: + from docling.datamodel.document import InputDocument + _log = logging.getLogger(__name__) diff --git a/docling/cli/main.py b/docling/cli/main.py index a1f25bb0..6610cef2 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -21,6 +21,7 @@ from docling.datamodel.base_models import ( from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( EasyOcrOptions, + OcrOptions, PdfPipelineOptions, TesseractCliOcrOptions, TesseractOcrOptions, @@ -179,7 +180,7 @@ def convert( raise typer.Abort() elif source.is_dir(): for fmt in from_formats: - for ext in FormatToExtensions.get(fmt): + for ext in FormatToExtensions[fmt]: input_doc_paths.extend(list(source.glob(f"**/*.{ext}"))) input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}"))) else: @@ -195,7 +196,7 @@ def convert( match ocr_engine: case OcrEngine.EASYOCR: - ocr_options = EasyOcrOptions() + ocr_options: OcrOptions = EasyOcrOptions() case OcrEngine.TESSERACT_CLI: ocr_options = TesseractCliOcrOptions() case OcrEngine.TESSERACT: diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index c915e419..3a893fa1 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -126,7 +126,8 @@ class TableStructurePrediction(BaseModel): table_map: Dict[int, Table] = {} -class TextElement(BasePageElement): ... 
+class TextElement(BasePageElement): + text: str class FigureElement(BasePageElement): diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index b8bab5fa..bcc0254e 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -3,7 +3,7 @@ import re from enum import Enum from io import BytesIO from pathlib import Path, PurePath -from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union import filetype from docling_core.types import BaseText @@ -29,7 +29,10 @@ from docling_core.utils.file import resolve_file_source from pydantic import BaseModel from typing_extensions import deprecated -from docling.backend.abstract_backend import AbstractDocumentBackend +from docling.backend.abstract_backend import ( + AbstractDocumentBackend, + PaginatedDocumentBackend, +) from docling.datamodel.base_models import ( AssembledUnit, ConversionStatus, @@ -70,41 +73,34 @@ layout_label_to_ds_type = { DocItemLabel.PARAGRAPH: "paragraph", } -_EMPTY_LEGACY_DOC = DsDocument( - _name="", - description=DsDocumentDescription(logs=[]), - file_info=DsFileInfoObject( - filename="", - document_hash="", - ), -) - _EMPTY_DOCLING_DOC = DoclingDocument( description=DescriptionItem(), name="dummy" ) # TODO: Stub class InputDocument(BaseModel): - file: PurePath = None - document_hash: Optional[str] = None + file: PurePath + document_hash: str # = None valid: bool = True limits: DocumentLimits = DocumentLimits() - format: Optional[InputFormat] = None + format: InputFormat # = None filesize: Optional[int] = None page_count: int = 0 - _backend: AbstractDocumentBackend = None # Internal PDF backend used + _backend: AbstractDocumentBackend # Internal PDF backend used def __init__( self, path_or_stream: Union[BytesIO, Path], format: InputFormat, - backend: AbstractDocumentBackend, + backend: Type[AbstractDocumentBackend], filename: Optional[str] = None, limits: 
Optional[DocumentLimits] = None, ): - super().__init__() + super().__init__( + file="", document_hash="", format=InputFormat.PDF + ) # initialize with dummy values self.limits = limits or DocumentLimits() self.format = format @@ -120,6 +116,9 @@ class InputDocument(BaseModel): self._init_doc(backend, path_or_stream) elif isinstance(path_or_stream, BytesIO): + assert ( + filename is not None + ), "Can't construct InputDocument from stream without providing filename arg." self.file = PurePath(filename) self.filesize = path_or_stream.getbuffer().nbytes @@ -128,10 +127,16 @@ class InputDocument(BaseModel): else: self.document_hash = create_file_hash(path_or_stream) self._init_doc(backend, path_or_stream) + else: + raise RuntimeError( + f"Unexpected type path_or_stream: {type(path_or_stream)}" + ) # For paginated backends, check if the maximum page count is exceeded. if self.valid and self._backend.is_valid(): - if self._backend.supports_pagination(): + if self._backend.supports_pagination() and isinstance( + self._backend, PaginatedDocumentBackend + ): self.page_count = self._backend.page_count() if not self.page_count <= self.limits.max_num_pages: self.valid = False @@ -150,7 +155,7 @@ class InputDocument(BaseModel): def _init_doc( self, - backend: AbstractDocumentBackend, + backend: Type[AbstractDocumentBackend], path_or_stream: Union[BytesIO, Path], ) -> None: if backend is None: @@ -436,18 +441,23 @@ class ConversionResult(BaseModel): return ds_doc def render_element_images( - self, element_types: Tuple[PageElement] = (FigureElement,) + self, element_types: Tuple[Type[PageElement]] = (FigureElement,) ): for element in self.assembled.elements: if isinstance(element, element_types): page_ix = element.page_no - scale = self.pages[page_ix]._default_image_scale - crop_bbox = element.cluster.bbox.scaled(scale=scale).to_top_left_origin( - page_height=self.pages[page_ix].size.height * scale - ) + page = self.pages[page_ix] - cropped_im = 
self.pages[page_ix].image.crop(crop_bbox.as_tuple()) - yield element, cropped_im + assert page.size is not None + + scale = page._default_image_scale + crop_bbox = element.cluster.bbox.scaled(scale=scale).to_top_left_origin( + page_height=page.size.height * scale + ) + page_img = page.image + if page_img is not None: + cropped_im = page_img.crop(crop_bbox.as_tuple()) + yield element, cropped_im class _DocumentConversionInput(BaseModel): @@ -467,7 +477,7 @@ class _DocumentConversionInput(BaseModel): ) continue else: - backend = format_options.get(format).backend + backend = format_options[format].backend if isinstance(obj, Path): yield InputDocument( diff --git a/docling/document_converter.py b/docling/document_converter.py index c3bc2dac..017a2096 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -161,6 +161,8 @@ class DocumentConverter: def _convert( self, conv_input: _DocumentConversionInput, raises_on_error: bool ) -> Iterator[ConversionResult]: + assert self.format_to_options is not None + for input_batch in chunkify( conv_input.docs(self.format_to_options), settings.perf.doc_batch_size, # pass format_options @@ -181,6 +183,8 @@ class DocumentConverter: yield item def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]: + assert self.format_to_options is not None + fopt = self.format_to_options.get(doc.format) if fopt is None: @@ -189,6 +193,7 @@ class DocumentConverter: pipeline_class = fopt.pipeline_cls pipeline_options = fopt.pipeline_options + assert pipeline_options is not None # TODO this will ignore if different options have been defined for the same pipeline class. 
if ( pipeline_class not in self.initialized_pipelines @@ -202,7 +207,9 @@ class DocumentConverter: def process_document( self, in_doc: InputDocument, raises_on_error: bool - ) -> ConversionResult: + ) -> Optional[ConversionResult]: + assert self.allowed_formats is not None + if in_doc.format not in self.allowed_formats: return None else: @@ -217,7 +224,7 @@ class DocumentConverter: def _execute_pipeline( self, in_doc: InputDocument, raises_on_error: bool - ) -> Optional[ConversionResult]: + ) -> ConversionResult: if in_doc.valid: pipeline = self._get_pipeline(in_doc) if pipeline is None: # Can't find a default pipeline. Should this raise? diff --git a/docling/models/base_ocr_model.py b/docling/models/base_ocr_model.py index bd293dcd..c1ca8ef7 100644 --- a/docling/models/base_ocr_model.py +++ b/docling/models/base_ocr_model.py @@ -21,8 +21,9 @@ class BaseOcrModel: self.options = options # Computes the optimum amount and coordinates of rectangles to OCR on a given page - def get_ocr_rects(self, page: Page) -> Tuple[bool, List[BoundingBox]]: + def get_ocr_rects(self, page: Page) -> List[BoundingBox]: BITMAP_COVERAGE_TRESHOLD = 0.75 + assert page.size is not None def find_ocr_rects(size, bitmap_rects): image = Image.new( @@ -61,7 +62,10 @@ class BaseOcrModel: return (area_frac, bounding_boxes) # fraction covered # boxes - bitmap_rects = page._backend.get_bitmap_rects() + if page._backend is not None: + bitmap_rects = page._backend.get_bitmap_rects() + else: + bitmap_rects = [] coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects) # return full-page rectangle if sufficiently covered with bitmaps @@ -76,7 +80,7 @@ class BaseOcrModel: ) ] # return individual rectangles if the bitmap coverage is smaller - elif coverage < BITMAP_COVERAGE_TRESHOLD: + else: # coverage <= BITMAP_COVERAGE_TRESHOLD: return ocr_rects # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell. 
diff --git a/docling/models/ds_glm_model.py b/docling/models/ds_glm_model.py index 80ee87c8..635651ad 100644 --- a/docling/models/ds_glm_model.py +++ b/docling/models/ds_glm_model.py @@ -1,6 +1,6 @@ import copy import random -from typing import Tuple +from typing import List, Tuple, Union from deepsearch_glm.nlp_utils import init_nlp_model from deepsearch_glm.utils.doc_utils import ( @@ -42,7 +42,7 @@ class GlmModel: def _to_legacy_document(self, conv_res) -> DsDocument: title = "" - desc = DsDocumentDescription(logs=[]) + desc: DsDocumentDescription = DsDocumentDescription(logs=[]) page_hashes = [ PageReference( @@ -60,9 +60,9 @@ class GlmModel: page_hashes=page_hashes, ) - main_text = [] - tables = [] - figures = [] + main_text: List[Union[Ref, BaseText]] = [] + tables: List[DsSchemaTable] = [] + figures: List[Figure] = [] page_no_to_page = {p.page_no: p for p in conv_res.pages} @@ -146,11 +146,16 @@ class GlmModel: yield [rspan, cspan] spans = list(make_spans(cell)) + if cell.bbox is not None: + bbox = cell.bbox.to_bottom_left_origin( + page_no_to_page[element.page_no].size.height + ).as_tuple() + else: + bbox = None + table_data[i][j] = TableCell( text=cell.text, - bbox=cell.bbox.to_bottom_left_origin( - page_no_to_page[element.page_no].size.height - ).as_tuple(), + bbox=bbox, # col=j, # row=i, spans=spans, @@ -204,7 +209,7 @@ class GlmModel: for p in conv_res.pages ] - ds_doc = DsDocument( + ds_doc: DsDocument = DsDocument( name=title, description=desc, file_info=file_info, @@ -216,9 +221,7 @@ class GlmModel: return ds_doc - def __call__( - self, conv_res: ConversionResult - ) -> Tuple[DsLegacyDocument, DoclingDocument]: + def __call__(self, conv_res: ConversionResult) -> DoclingDocument: ds_doc = self._to_legacy_document(conv_res) ds_doc_dict = ds_doc.model_dump(by_alias=True) diff --git a/docling/models/easyocr_model.py b/docling/models/easyocr_model.py index 9e71fffc..0e02f11e 100644 --- a/docling/models/easyocr_model.py +++ 
b/docling/models/easyocr_model.py @@ -40,6 +40,8 @@ class EasyOcrModel(BaseOcrModel): return for page in page_batch: + assert page._backend is not None + ocr_rects = self.get_ocr_rects(page) all_ocr_cells = [] diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py index 004be330..4572777d 100644 --- a/docling/models/layout_model.py +++ b/docling/models/layout_model.py @@ -47,7 +47,7 @@ class LayoutModel(BasePageModel): def __init__(self, artifacts_path: Path): self.layout_predictor = LayoutPredictor(artifacts_path) # TODO temporary - def postprocess(self, clusters: List[Cluster], cells: List[Cell], page_height): + def postprocess(self, clusters_in: List[Cluster], cells: List[Cell], page_height): MIN_INTERSECTION = 0.2 CLASS_THRESHOLDS = { DocItemLabel.CAPTION: 0.35, @@ -78,9 +78,9 @@ class LayoutModel(BasePageModel): start_time = time.time() # Apply Confidence Threshold to cluster predictions # confidence = self.conf_threshold - clusters_out = [] + clusters_mod = [] - for cluster in clusters: + for cluster in clusters_in: confidence = CLASS_THRESHOLDS[cluster.label] if cluster.confidence >= confidence: # annotation["created_by"] = "high_conf_pred" @@ -88,10 +88,10 @@ class LayoutModel(BasePageModel): # Remap class labels where needed. 
if cluster.label in CLASS_REMAPPINGS.keys(): cluster.label = CLASS_REMAPPINGS[cluster.label] - clusters_out.append(cluster) + clusters_mod.append(cluster) # map to dictionary clusters and cells, with bottom left origin - clusters = [ + clusters_orig = [ { "id": c.id, "bbox": list( @@ -101,7 +101,7 @@ class LayoutModel(BasePageModel): "cell_ids": [], "type": c.label, } - for c in clusters + for c in clusters_in ] clusters_out = [ @@ -115,9 +115,11 @@ class LayoutModel(BasePageModel): "cell_ids": [], "type": c.label, } - for c in clusters_out + for c in clusters_mod ] + del clusters_mod + raw_cells = [ { "id": c.id, @@ -151,7 +153,7 @@ class LayoutModel(BasePageModel): # Assign orphan cells with lower confidence predictions clusters_out, orphan_cell_indices = lu.assign_orphans_with_low_conf_pred( - clusters_out, clusters, raw_cells, orphan_cell_indices + clusters_out, clusters_orig, raw_cells, orphan_cell_indices ) # Refresh the cell_ids assignment, after creating new clusters using low conf predictions @@ -180,7 +182,7 @@ class LayoutModel(BasePageModel): ) = lu.cell_id_state_map(clusters_out, cell_count) clusters_out, orphan_cell_indices = lu.set_orphan_as_text( - clusters_out, clusters, raw_cells, orphan_cell_indices + clusters_out, clusters_orig, raw_cells, orphan_cell_indices ) _log.debug("---- 5. 
Merge Cells & and adapt the bounding boxes") @@ -239,34 +241,41 @@ class LayoutModel(BasePageModel): end_time = time.time() - start_time _log.debug(f"Finished post processing in seconds={end_time:.3f}") - cells_out = [ + cells_out_new = [ Cell( - id=c["id"], + id=c["id"], # type: ignore bbox=BoundingBox.from_tuple( - coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT + coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT # type: ignore ).to_top_left_origin(page_height), - text=c["text"], + text=c["text"], # type: ignore ) for c in cells_out ] + + del cells_out + clusters_out_new = [] for c in clusters_out: - cluster_cells = [ccell for ccell in cells_out if ccell.id in c["cell_ids"]] + cluster_cells = [ + ccell for ccell in cells_out_new if ccell.id in c["cell_ids"] # type: ignore + ] c_new = Cluster( - id=c["id"], + id=c["id"], # type: ignore bbox=BoundingBox.from_tuple( - coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT + coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT # type: ignore ).to_top_left_origin(page_height), - confidence=c["confidence"], + confidence=c["confidence"], # type: ignore label=DocItemLabel(c["type"]), cells=cluster_cells, ) clusters_out_new.append(c_new) - return clusters_out_new, cells_out + return clusters_out_new, cells_out_new def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: for page in page_batch: + assert page.size is not None + clusters = [] for ix, pred_item in enumerate( self.layout_predictor.predict(page.get_image(scale=1.0)) diff --git a/docling/models/page_assemble_model.py b/docling/models/page_assemble_model.py index 780cc89c..6bd55bf0 100644 --- a/docling/models/page_assemble_model.py +++ b/docling/models/page_assemble_model.py @@ -53,6 +53,8 @@ class PageAssembleModel(BasePageModel): def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: for page in page_batch: + assert page._backend is not None + assert page.predictions.layout is not None # assembles some JSON output page by page. 
elements: List[PageElement] = [] diff --git a/docling/models/page_preprocessing_model.py b/docling/models/page_preprocessing_model.py index fe079c45..7c71fd50 100644 --- a/docling/models/page_preprocessing_model.py +++ b/docling/models/page_preprocessing_model.py @@ -40,7 +40,9 @@ class PagePreprocessingModel(BasePageModel): # Extract and populate the page cells and store it in the page object def _parse_page_cells(self, page: Page) -> Page: - page.cells = page._backend.get_text_cells() + assert page._backend is not None + + page.cells = list(page._backend.get_text_cells()) # DEBUG code: def draw_text_boxes(image, cells): diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py index 5445a0b9..857dfc1e 100644 --- a/docling/models/table_structure_model.py +++ b/docling/models/table_structure_model.py @@ -24,8 +24,6 @@ class TableStructureModel(BasePageModel): self.enabled = enabled if self.enabled: - artifacts_path: Path = artifacts_path - if self.mode == TableFormerMode.ACCURATE: artifacts_path = artifacts_path / "fat" @@ -40,6 +38,8 @@ class TableStructureModel(BasePageModel): self.scale = 2.0 # Scale up table input images to 144 dpi def draw_table_and_cells(self, page: Page, tbl_list: List[Table]): + assert page._backend is not None + image = ( page._backend.get_page_image() ) # make new image to avoid drawing on the saved ones @@ -50,17 +50,18 @@ class TableStructureModel(BasePageModel): draw.rectangle([(x0, y0), (x1, y1)], outline="red") for tc in table_element.table_cells: - x0, y0, x1, y1 = tc.bbox.as_tuple() - if tc.column_header: - width = 3 - else: - width = 1 - draw.rectangle([(x0, y0), (x1, y1)], outline="blue", width=width) - draw.text( - (x0 + 3, y0 + 3), - text=f"{tc.start_row_offset_idx}, {tc.start_col_offset_idx}", - fill="black", - ) + if tc.bbox is not None: + x0, y0, x1, y1 = tc.bbox.as_tuple() + if tc.column_header: + width = 3 + else: + width = 1 + draw.rectangle([(x0, y0), (x1, y1)], outline="blue", 
width=width) + draw.text( + (x0 + 3, y0 + 3), + text=f"{tc.start_row_offset_idx}, {tc.start_col_offset_idx}", + fill="black", + ) image.show() @@ -71,6 +72,9 @@ class TableStructureModel(BasePageModel): return for page in page_batch: + assert page._backend is not None + assert page.predictions.layout is not None + assert page.size is not None page.predictions.tablestructure = TableStructurePrediction() # dummy @@ -132,7 +136,7 @@ class TableStructureModel(BasePageModel): element["bbox"]["token"] = text_piece tc = TableCell.model_validate(element) - if self.do_cell_matching: + if self.do_cell_matching and tc.bbox is not None: tc.bbox = tc.bbox.scaled(1 / self.scale) table_cells.append(tc) diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py index 6ff0c35c..2c416d97 100644 --- a/docling/models/tesseract_ocr_cli_model.py +++ b/docling/models/tesseract_ocr_cli_model.py @@ -2,7 +2,7 @@ import io import logging import tempfile from subprocess import DEVNULL, PIPE, Popen -from typing import Iterable, Tuple +from typing import Iterable, Optional, Tuple import pandas as pd from docling_core.types.doc import BoundingBox, CoordOrigin @@ -22,8 +22,8 @@ class TesseractOcrCliModel(BaseOcrModel): self.scale = 3 # multiplier for 72 dpi == 216 dpi. 
- self._name = None - self._version = None + self._name: Optional[str] = None + self._version: Optional[str] = None if self.enabled: try: @@ -40,7 +40,7 @@ class TesseractOcrCliModel(BaseOcrModel): def _get_name_and_version(self) -> Tuple[str, str]: if self._name != None and self._version != None: - return self._name, self._version + return self._name, self._version # type: ignore cmd = [self.options.tesseract_cmd, "--version"] @@ -109,6 +109,8 @@ class TesseractOcrCliModel(BaseOcrModel): return for page in page_batch: + assert page._backend is not None + ocr_rects = self.get_ocr_rects(page) all_ocr_cells = [] diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py index c31981be..ea74b6ad 100644 --- a/docling/models/tesseract_ocr_model.py +++ b/docling/models/tesseract_ocr_model.py @@ -69,6 +69,9 @@ class TesseractOcrModel(BaseOcrModel): return for page in page_batch: + assert page._backend is not None + assert self.reader is not None + ocr_rects = self.get_ocr_rects(page) all_ocr_cells = [] diff --git a/docling/pipeline/base_pipeline.py b/docling/pipeline/base_pipeline.py index c903d569..5e26fe0c 100644 --- a/docling/pipeline/base_pipeline.py +++ b/docling/pipeline/base_pipeline.py @@ -178,7 +178,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name. 
) -> ConversionStatus: status = ConversionStatus.SUCCESS for page in conv_res.pages: - if not page._backend.is_valid(): + if page._backend is None or not page._backend.is_valid(): conv_res.errors.append( ErrorItem( component_type=DoclingComponentType.DOCUMENT_BACKEND, diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index 6d72884d..3f008762 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -123,8 +123,9 @@ class StandardPdfPipeline(PaginatedPipeline): return None def initialize_page(self, doc: InputDocument, page: Page) -> Page: - page._backend = doc._backend.load_page(page.page_no) - page.size = page._backend.get_size() + page._backend = doc._backend.load_page(page.page_no) # type: ignore + if page._backend is not None and page._backend.is_valid(): + page.size = page._backend.get_size() return page @@ -136,7 +137,7 @@ class StandardPdfPipeline(PaginatedPipeline): all_body = [] for p in conv_res.pages: - + assert p.assembled is not None for el in p.assembled.body: all_body.append(el) for el in p.assembled.headers: diff --git a/examples/rag_langchain.ipynb b/examples/rag_langchain.ipynb deleted file mode 100644 index 30e38329..00000000 --- a/examples/rag_langchain.ipynb +++ /dev/null @@ -1,369 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# RAG with Docling and 🦜🔗 LangChain" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], - "source": [ - "# requirements for this example:\n", - "%pip install -qq docling docling-core python-dotenv langchain-text-splitters langchain-huggingface langchain-milvus" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] 
- }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import os\n", - "\n", - "from dotenv import load_dotenv\n", - "\n", - "load_dotenv()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "\n", - "warnings.filterwarnings(action=\"ignore\", category=UserWarning, module=\"pydantic|torch\")\n", - "warnings.filterwarnings(action=\"ignore\", category=FutureWarning, module=\"easyocr\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setup" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Loader and splitter" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Below we set up:\n", - "- a `Loader` which will be used to create LangChain documents, and\n", - "- a splitter, which will be used to split these documents" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "from enum import Enum\n", - "from typing import Iterator\n", - "\n", - "from langchain_core.document_loaders import BaseLoader\n", - "from langchain_core.documents import Document as LCDocument\n", - "from pydantic import BaseModel\n", - "\n", - "from docling.document_converter import DocumentConverter\n", - "\n", - "\n", - "class DocumentMetadata(BaseModel):\n", - " dl_doc_hash: str\n", - " # source: str\n", - "\n", - "\n", - "class DoclingPDFLoader(BaseLoader):\n", - " class ParseType(str, Enum):\n", - " MARKDOWN = \"markdown\"\n", - " # JSON = \"json\"\n", - "\n", - " def __init__(self, file_path: str | list[str], parse_type: ParseType) -> None:\n", - " self._file_paths = file_path if isinstance(file_path, list) else [file_path]\n", - " self._parse_type = parse_type\n", - " self._converter = DocumentConverter()\n", - "\n", - " def lazy_load(self) -> Iterator[LCDocument]:\n", - " for source in self._file_paths:\n", - " dl_doc = 
self._converter.convert_single(source).output\n", - " match self._parse_type:\n", - " case self.ParseType.MARKDOWN:\n", - " text = dl_doc.export_to_markdown()\n", - " # case self.ParseType.JSON:\n", - " # text = dl_doc.model_dump_json()\n", - " case _:\n", - " raise RuntimeError(\n", - " f\"Unexpected parse type encountered: {self._parse_type}\"\n", - " )\n", - " lc_doc = LCDocument(\n", - " page_content=text,\n", - " metadata=DocumentMetadata(\n", - " dl_doc_hash=dl_doc.file_info.document_hash,\n", - " ).model_dump(),\n", - " )\n", - " yield lc_doc" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "FILE_PATH = \"https://arxiv.org/pdf/2206.01062\" # DocLayNet paper" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "1b38d07d5fed4618a44ecf261e1e5c44", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Fetching 7 files: 0%| | 0/7 [00:00\"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# RAG with Docling and 🦙 LlamaIndex" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Overview" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "LlamaIndex extensions `DoclingReader` and `DoclingNodeParser` presented in this notebook seamlessly integrate Docling into LlamaIndex, enabling you to:\n", - "- use PDF documents in your LLM applications with ease and speed, and\n", - "- leverage Docling's rich format for advanced, document-native grounding." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setup" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- 👉 For best conversion speed, use GPU acceleration whenever available; e.g. 
if running on Colab, use GPU-enabled runtime.\n", - "- Notebook uses HuggingFace's Inference API; for increased LLM quota, token can be provided via env var `HF_TOKEN`.\n", - "- Requirements can be installed as shown below (`--no-warn-conflicts` meant for Colab's pre-populated Python env; feel free to remove for stricter usage):" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], - "source": [ - "%pip install -q --progress-bar off --no-warn-conflicts llama-index-core llama-index-readers-docling llama-index-node-parser-docling llama-index-embeddings-huggingface llama-index-llms-huggingface-api llama-index-readers-file python-dotenv" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "from pathlib import Path\n", - "from tempfile import mkdtemp\n", - "from warnings import filterwarnings\n", - "\n", - "from dotenv import load_dotenv\n", - "\n", - "\n", - "def _get_env_from_colab_or_os(key):\n", - " try:\n", - " from google.colab import userdata\n", - "\n", - " try:\n", - " return userdata.get(key)\n", - " except userdata.SecretNotFoundError:\n", - " pass\n", - " except ImportError:\n", - " pass\n", - " return os.getenv(key)\n", - "\n", - "\n", - "load_dotenv()\n", - "\n", - "filterwarnings(action=\"ignore\", category=UserWarning, module=\"pydantic\")\n", - "filterwarnings(action=\"ignore\", category=FutureWarning, module=\"easyocr\")\n", - "# https://github.com/huggingface/transformers/issues/5486:\n", - "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can now define the main parameters:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from 
llama_index.embeddings.huggingface import HuggingFaceEmbedding\n", - "from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n", - "\n", - "EMBED_MODEL = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")\n", - "MILVUS_URI = str(Path(mkdtemp()) / \"docling.db\")\n", - "GEN_MODEL = HuggingFaceInferenceAPI(\n", - " token=_get_env_from_colab_or_os(\"HF_TOKEN\"),\n", - " model_name=\"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n", - ")\n", - "SOURCE = \"https://arxiv.org/pdf/2408.09869\" # Docling Technical Report\n", - "QUERY = \"Which are the main AI models in Docling?\"\n", - "\n", - "embed_dim = len(EMBED_MODEL.get_text_embedding(\"hi\"))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Using Markdown export" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To create a simple RAG pipeline, we can:\n", - "- define a `DoclingReader`, which by default exports to Markdown, and\n", - "- use a standard node parser for these Markdown-based docs, e.g. a `MarkdownNodeParser`" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Q: Which are the main AI models in Docling?\n", - "A: 1. A layout analysis model, an accurate object-detector for page elements. 2. TableFormer, a state-of-the-art table structure recognition model.\n", - "\n", - "Sources:\n" - ] - }, - { - "data": { - "text/plain": [ - "[('3.2 AI models\\n\\nAs part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on huggingface) and a separate package for the inference code as docling-ibm-models . 
Both models are also powering the open-access deepsearch-experience, our cloud-native service for knowledge exploration tasks.',\n", - " {'dl_doc_hash': '556ad9e23b6d2245e36b3208758cf0c8a709382bb4c859eacfe8e73b14e635aa',\n", - " 'Header_2': '3.2 AI models'}),\n", - " (\"5 Applications\\n\\nThanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. 
Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets.\",\n", - " {'dl_doc_hash': '556ad9e23b6d2245e36b3208758cf0c8a709382bb4c859eacfe8e73b14e635aa',\n", - " 'Header_2': '5 Applications'})]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from llama_index.core import StorageContext, VectorStoreIndex\n", - "from llama_index.core.node_parser import MarkdownNodeParser\n", - "from llama_index.readers.docling import DoclingReader\n", - "from llama_index.vector_stores.milvus import MilvusVectorStore\n", - "\n", - "reader = DoclingReader()\n", - "node_parser = MarkdownNodeParser()\n", - "\n", - "vector_store = MilvusVectorStore(\n", - " uri=str(Path(mkdtemp()) / \"docling.db\"), # or set as needed\n", - " dim=embed_dim,\n", - " overwrite=True,\n", - ")\n", - "index = VectorStoreIndex.from_documents(\n", - " documents=reader.load_data(SOURCE),\n", - " transformations=[node_parser],\n", - " storage_context=StorageContext.from_defaults(vector_store=vector_store),\n", - " embed_model=EMBED_MODEL,\n", - ")\n", - "result = index.as_query_engine(llm=GEN_MODEL).query(QUERY)\n", - "print(f\"Q: {QUERY}\\nA: {result.response.strip()}\\n\\nSources:\")\n", - "display([(n.text, n.metadata) for n in result.source_nodes])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Using Docling format" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To leverage Docling's rich native format, we:\n", - "- create a `DoclingReader` with JSON export type, and\n", - "- employ a `DoclingNodeParser` in order to appropriately parse that Docling format.\n", - "\n", - "Notice how the sources now also contain document-level grounding (e.g. 
page number or bounding box information):" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Q: Which are the main AI models in Docling?\n", - "A: The main AI models in Docling are a layout analysis model and TableFormer. The layout analysis model is an accurate object-detector for page elements, and TableFormer is a state-of-the-art table structure recognition model.\n", - "\n", - "Sources:\n" - ] - }, - { - "data": { - "text/plain": [ - "[('As part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on huggingface) and a separate package for the inference code as docling-ibm-models . Both models are also powering the open-access deepsearch-experience, our cloud-native service for knowledge exploration tasks.',\n", - " {'dl_doc_hash': '556ad9e23b6d2245e36b3208758cf0c8a709382bb4c859eacfe8e73b14e635aa',\n", - " 'path': '#/main-text/37',\n", - " 'heading': '3.2 AI models',\n", - " 'page': 3,\n", - " 'bbox': [107.36903381347656,\n", - " 330.07513427734375,\n", - " 506.29705810546875,\n", - " 407.3725280761719]}),\n", - " ('With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. 
Its code architecture allows for easy extensibility and addition of new features and models.',\n", - " {'dl_doc_hash': '556ad9e23b6d2245e36b3208758cf0c8a709382bb4c859eacfe8e73b14e635aa',\n", - " 'path': '#/main-text/10',\n", - " 'heading': '1 Introduction',\n", - " 'page': 1,\n", - " 'bbox': [107.33261108398438,\n", - " 83.3067626953125,\n", - " 504.0033874511719,\n", - " 136.45367431640625]})]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from llama_index.node_parser.docling import DoclingNodeParser\n", - "\n", - "reader = DoclingReader(export_type=DoclingReader.ExportType.JSON)\n", - "node_parser = DoclingNodeParser()\n", - "\n", - "vector_store = MilvusVectorStore(\n", - " uri=str(Path(mkdtemp()) / \"docling.db\"), # or set as needed\n", - " dim=embed_dim,\n", - " overwrite=True,\n", - ")\n", - "index = VectorStoreIndex.from_documents(\n", - " documents=reader.load_data(SOURCE),\n", - " transformations=[node_parser],\n", - " storage_context=StorageContext.from_defaults(vector_store=vector_store),\n", - " embed_model=EMBED_MODEL,\n", - ")\n", - "result = index.as_query_engine(llm=GEN_MODEL).query(QUERY)\n", - "print(f\"Q: {QUERY}\\nA: {result.response.strip()}\\n\\nSources:\")\n", - "display([(n.text, n.metadata) for n in result.source_nodes])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## With Simple Directory Reader" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To demonstrate this usage pattern, we first set up a test document directory." 
- ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "from pathlib import Path\n", - "from tempfile import mkdtemp\n", - "\n", - "import requests\n", - "\n", - "tmp_dir_path = Path(mkdtemp())\n", - "r = requests.get(SOURCE)\n", - "with open(tmp_dir_path / f\"{Path(SOURCE).name}.pdf\", \"wb\") as out_file:\n", - " out_file.write(r.content)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Using the `reader` and `node_parser` definitions from any of the above variants, usage with `SimpleDirectoryReader` then looks as follows:" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Loading files: 100%|██████████| 1/1 [00:11<00:00, 11.15s/file]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Q: Which are the main AI models in Docling?\n", - "A: The main AI models in Docling are a layout analysis model and TableFormer. The layout analysis model is an accurate object-detector for page elements, and TableFormer is a state-of-the-art table structure recognition model.\n", - "\n", - "Sources:\n" - ] - }, - { - "data": { - "text/plain": [ - "[('As part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on huggingface) and a separate package for the inference code as docling-ibm-models . 
Both models are also powering the open-access deepsearch-experience, our cloud-native service for knowledge exploration tasks.',\n", - " {'file_path': '/var/folders/76/4wwfs06x6835kcwj4186c0nc0000gn/T/tmp4vsev3_r/2408.09869.pdf',\n", - " 'file_name': '2408.09869.pdf',\n", - " 'file_type': 'application/pdf',\n", - " 'file_size': 5566574,\n", - " 'creation_date': '2024-10-09',\n", - " 'last_modified_date': '2024-10-09',\n", - " 'dl_doc_hash': '556ad9e23b6d2245e36b3208758cf0c8a709382bb4c859eacfe8e73b14e635aa',\n", - " 'path': '#/main-text/37',\n", - " 'heading': '3.2 AI models',\n", - " 'page': 3,\n", - " 'bbox': [107.36903381347656,\n", - " 330.07513427734375,\n", - " 506.29705810546875,\n", - " 407.3725280761719]}),\n", - " ('With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. 
Its code architecture allows for easy extensibility and addition of new features and models.',\n", - " {'file_path': '/var/folders/76/4wwfs06x6835kcwj4186c0nc0000gn/T/tmp4vsev3_r/2408.09869.pdf',\n", - " 'file_name': '2408.09869.pdf',\n", - " 'file_type': 'application/pdf',\n", - " 'file_size': 5566574,\n", - " 'creation_date': '2024-10-09',\n", - " 'last_modified_date': '2024-10-09',\n", - " 'dl_doc_hash': '556ad9e23b6d2245e36b3208758cf0c8a709382bb4c859eacfe8e73b14e635aa',\n", - " 'path': '#/main-text/10',\n", - " 'heading': '1 Introduction',\n", - " 'page': 1,\n", - " 'bbox': [107.33261108398438,\n", - " 83.3067626953125,\n", - " 504.0033874511719,\n", - " 136.45367431640625]})]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from llama_index.core import SimpleDirectoryReader\n", - "\n", - "dir_reader = SimpleDirectoryReader(\n", - " input_dir=tmp_dir_path,\n", - " file_extractor={\".pdf\": reader},\n", - ")\n", - "\n", - "vector_store = MilvusVectorStore(\n", - " uri=str(Path(mkdtemp()) / \"docling.db\"), # or set as needed\n", - " dim=embed_dim,\n", - " overwrite=True,\n", - ")\n", - "index = VectorStoreIndex.from_documents(\n", - " documents=dir_reader.load_data(SOURCE),\n", - " transformations=[node_parser],\n", - " storage_context=StorageContext.from_defaults(vector_store=vector_store),\n", - " embed_model=EMBED_MODEL,\n", - ")\n", - "result = index.as_query_engine(llm=GEN_MODEL).query(QUERY)\n", - "print(f\"Q: {QUERY}\\nA: {result.response.strip()}\\n\\nSources:\")\n", - "display([(n.text, n.metadata) for n in result.source_nodes])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", 
- "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/pyproject.toml b/pyproject.toml index 92225813..12e485d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -114,6 +114,7 @@ py_version=311 pretty = true # strict = true no_implicit_optional = true +plugins = "pydantic.mypy" python_version = "3.10" [[tool.mypy.overrides]] @@ -121,6 +122,15 @@ module = [ "docling_parse.*", "pypdfium2.*", "networkx.*", + "scipy.*", + "filetype.*", + "tesserocr.*", + "docling_ibm_models.*", + "easyocr.*", + "deepsearch_glm.*", + "lxml.*", + "bs4.*", + "huggingface_hub.*" ] ignore_missing_imports = true diff --git a/tests/test_backend_docling_parse_v2.py b/tests/test_backend_docling_parse_v2.py index 8c4252cb..f27ecb43 100644 --- a/tests/test_backend_docling_parse_v2.py +++ b/tests/test_backend_docling_parse_v2.py @@ -26,7 +26,6 @@ def _get_backend(pdf_doc): return doc_backend -@pytest.mark.skip def test_text_cell_counts(): pdf_doc = Path("./tests/data/redp5695.pdf")