Merge branch 'main' into rtdl/export_latex_docx

Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
Rafael Teixeira de Lima, 2025-01-27 12:31:59 +01:00, committed by GitHub
commit 30d0afe137
65 changed files with 3379 additions and 300 deletions

@@ -1,3 +1,25 @@
+## [v2.16.0](https://github.com/DS4SD/docling/releases/tag/v2.16.0) - 2025-01-24
+
+### Feature
+
+* New document picture classifier ([#805](https://github.com/DS4SD/docling/issues/805)) ([`16a218d`](https://github.com/DS4SD/docling/commit/16a218d871c48fd9cc636b77f7b597dc40cbeeec))
+* Add Docling JSON ingestion ([#783](https://github.com/DS4SD/docling/issues/783)) ([`88a0e66`](https://github.com/DS4SD/docling/commit/88a0e66adc19238f57a942b0504926cdaeacd8cc))
+* Code and equation model for PDF and code blocks in markdown ([#752](https://github.com/DS4SD/docling/issues/752)) ([`3213b24`](https://github.com/DS4SD/docling/commit/3213b247ad6870ff984271f09f7720be68d9479b))
+* Add "auto" language for TesseractOcr ([#759](https://github.com/DS4SD/docling/issues/759)) ([`8543c22`](https://github.com/DS4SD/docling/commit/8543c22687fee40459d393bf4adcfc059712de02))
+
+### Fix
+
+* Added extraction of byte-images in excel ([#804](https://github.com/DS4SD/docling/issues/804)) ([`a458e29`](https://github.com/DS4SD/docling/commit/a458e298ca64da2c6df29d953e95645525817bed))
+* Update docling-parse-v2 backend version with new parsing fixes ([#769](https://github.com/DS4SD/docling/issues/769)) ([`670a08b`](https://github.com/DS4SD/docling/commit/670a08bdedda847ff3b6942bcaa1a2adef79afe2))
+
+### Documentation
+
+* Fix minor typos ([#801](https://github.com/DS4SD/docling/issues/801)) ([`c58f75d`](https://github.com/DS4SD/docling/commit/c58f75d0f75040e32820cc2915ec00755211c02f))
+* Add Azure RAG example ([#675](https://github.com/DS4SD/docling/issues/675)) ([`9020a93`](https://github.com/DS4SD/docling/commit/9020a934be35b0798c972eb77a22fb62ce654ca5))
+* Fix links between docs pages ([#697](https://github.com/DS4SD/docling/issues/697)) ([`c49b352`](https://github.com/DS4SD/docling/commit/c49b3526fb7b72e8007f785b1fcfdf58c2457756))
+* Fix correct Accelerator pipeline options in docs/examples/custom_convert.py ([#733](https://github.com/DS4SD/docling/issues/733)) ([`7686083`](https://github.com/DS4SD/docling/commit/768608351d40376c3504546f52e967195536b3d5))
+* Example to translate documents ([#739](https://github.com/DS4SD/docling/issues/739)) ([`f7e1cbf`](https://github.com/DS4SD/docling/commit/f7e1cbf629ae5f3e279296e72f656b7a453ab7a3))
+
 ## [v2.15.1](https://github.com/DS4SD/docling/releases/tag/v2.15.1) - 2025-01-10
 ### Fix

@@ -27,7 +27,6 @@ class AbstractDocumentBackend(ABC):
     def supports_pagination(cls) -> bool:
         pass

-    @abstractmethod
     def unload(self):
         if isinstance(self.path_or_stream, BytesIO):
             self.path_or_stream.close()

@@ -24,7 +24,6 @@ _log = logging.getLogger(__name__)
 class AsciiDocBackend(DeclarativeDocumentBackend):
     def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)

@@ -215,7 +215,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             label = DocItemLabel.CODE
             if len(text) == 0:
                 return
-            doc.add_text(parent=self.parents[self.level], label=label, text=text)
+            doc.add_code(parent=self.parents[self.level], label=label, text=text)

     def handle_paragraph(self, element, idx, doc):
         """Handles paragraph tags (p)."""


@@ -0,0 +1,58 @@
+from io import BytesIO
+from pathlib import Path
+from typing import Union
+
+from docling_core.types.doc import DoclingDocument
+from typing_extensions import override
+
+from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+
+
+class DoclingJSONBackend(DeclarativeDocumentBackend):
+    @override
+    def __init__(
+        self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]
+    ) -> None:
+        super().__init__(in_doc, path_or_stream)
+
+        # given we need to store any actual conversion exception for raising it from
+        # convert(), this captures the successful result or the actual error in a
+        # mutually exclusive way:
+        self._doc_or_err = self._get_doc_or_err()
+
+    @override
+    def is_valid(self) -> bool:
+        return isinstance(self._doc_or_err, DoclingDocument)
+
+    @classmethod
+    @override
+    def supports_pagination(cls) -> bool:
+        return False
+
+    @classmethod
+    @override
+    def supported_formats(cls) -> set[InputFormat]:
+        return {InputFormat.JSON_DOCLING}
+
+    def _get_doc_or_err(self) -> Union[DoclingDocument, Exception]:
+        try:
+            json_data: Union[str, bytes]
+            if isinstance(self.path_or_stream, Path):
+                with open(self.path_or_stream, encoding="utf-8") as f:
+                    json_data = f.read()
+            elif isinstance(self.path_or_stream, BytesIO):
+                json_data = self.path_or_stream.getvalue()
+            else:
+                raise RuntimeError(f"Unexpected: {type(self.path_or_stream)=}")
+            return DoclingDocument.model_validate_json(json_data=json_data)
+        except Exception as e:
+            return e
+
+    @override
+    def convert(self) -> DoclingDocument:
+        if isinstance(self._doc_or_err, DoclingDocument):
+            return self._doc_or_err
+        else:
+            raise self._doc_or_err
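
For orientation, a minimal sketch of how this backend is exercised end to end; the input file name is hypothetical and stands for any file previously written with DoclingDocument's JSON export:

    from docling.document_converter import DocumentConverter

    # A .json input is routed to DoclingJSONBackend via InputFormat.JSON_DOCLING
    # (see the format registrations further down in this diff).
    converter = DocumentConverter()
    result = converter.convert("report.json")  # hypothetical Docling JSON export
    print(result.document.export_to_markdown())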

@@ -3,19 +3,22 @@ import re
 import warnings
 from io import BytesIO
 from pathlib import Path
-from typing import Set, Union
+from typing import List, Optional, Set, Union

 import marko
 import marko.ext
 import marko.ext.gfm
 import marko.inline
 from docling_core.types.doc import (
+    DocItem,
     DocItemLabel,
     DoclingDocument,
     DocumentOrigin,
     GroupLabel,
+    NodeItem,
     TableCell,
     TableData,
+    TextItem,
 )
 from marko import Markdown
@@ -27,8 +30,7 @@ _log = logging.getLogger(__name__)
 class MarkdownDocumentBackend(DeclarativeDocumentBackend):
-    def shorten_underscore_sequences(self, markdown_text, max_length=10):
+    def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
         # This regex will match any sequence of underscores
         pattern = r"_+"
@@ -90,13 +92,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             ) from e
         return

-    def close_table(self, doc=None):
+    def close_table(self, doc: DoclingDocument):
         if self.in_table:
             _log.debug("=== TABLE START ===")
             for md_table_row in self.md_table_buffer:
                 _log.debug(md_table_row)
             _log.debug("=== TABLE END ===")
-            tcells = []
+            tcells: List[TableCell] = []
             result_table = []
             for n, md_table_row in enumerate(self.md_table_buffer):
                 data = []
@@ -137,15 +139,19 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             self.in_table = False
             self.md_table_buffer = []  # clean table markdown buffer

             # Initialize Docling TableData
-            data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=tcells)
+            table_data = TableData(
+                num_rows=num_rows, num_cols=num_cols, table_cells=tcells
+            )
             # Populate
             for tcell in tcells:
-                data.table_cells.append(tcell)
+                table_data.table_cells.append(tcell)
             if len(tcells) > 0:
-                doc.add_table(data=data)
+                doc.add_table(data=table_data)
         return

-    def process_inline_text(self, parent_element, doc=None):
+    def process_inline_text(
+        self, parent_element: Optional[NodeItem], doc: DoclingDocument
+    ):
         # self.inline_text_buffer += str(text_in)
         txt = self.inline_text_buffer.strip()
         if len(txt) > 0:
@@ -156,14 +162,20 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             )
         self.inline_text_buffer = ""

-    def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
+    def iterate_elements(
+        self,
+        element: marko.block.Element,
+        depth: int,
+        doc: DoclingDocument,
+        parent_element: Optional[NodeItem] = None,
+    ):
         # Iterates over all elements in the AST
         # Check for different element types and process relevant details
         if isinstance(element, marko.block.Heading):
             self.close_table(doc)
             self.process_inline_text(parent_element, doc)
             _log.debug(
-                f" - Heading level {element.level}, content: {element.children[0].children}"
+                f" - Heading level {element.level}, content: {element.children[0].children}"  # type: ignore
             )
             if element.level == 1:
                 doc_label = DocItemLabel.TITLE
@@ -172,10 +184,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             # Header could have arbitrary inclusion of bold, italic or emphasis,
             # hence we need to traverse the tree to get full text of a header
-            strings = []
+            strings: List[str] = []

             # Define a recursive function to traverse the tree
-            def traverse(node):
+            def traverse(node: marko.block.BlockElement):
                 # Check if the node has a "children" attribute
                 if hasattr(node, "children"):
                     # If "children" is a list, continue traversal
@@ -209,9 +221,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             self.process_inline_text(parent_element, doc)
             _log.debug(" - List item")

-            snippet_text = str(element.children[0].children[0].children)
+            snippet_text = str(element.children[0].children[0].children)  # type: ignore
             is_numbered = False
-            if parent_element.label == GroupLabel.ORDERED_LIST:
+            if (
+                parent_element is not None
+                and isinstance(parent_element, DocItem)
+                and parent_element.label == GroupLabel.ORDERED_LIST
+            ):
                 is_numbered = True
             doc.add_list_item(
                 enumerated=is_numbered, parent=parent_element, text=snippet_text
@@ -221,7 +237,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             self.close_table(doc)
             self.process_inline_text(parent_element, doc)
             _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
-            doc.add_picture(parent=parent_element, caption=element.title)
+
+            fig_caption: Optional[TextItem] = None
+            if element.title is not None and element.title != "":
+                fig_caption = doc.add_text(
+                    label=DocItemLabel.CAPTION, text=element.title
+                )
+
+            doc.add_picture(parent=parent_element, caption=fig_caption)

         elif isinstance(element, marko.block.Paragraph):
             self.process_inline_text(parent_element, doc)
@@ -252,27 +275,21 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             self.process_inline_text(parent_element, doc)
             _log.debug(f" - Code Span: {element.children}")
             snippet_text = str(element.children).strip()
-            doc.add_text(
-                label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
-            )
+            doc.add_code(parent=parent_element, text=snippet_text)

         elif isinstance(element, marko.block.CodeBlock):
             self.close_table(doc)
             self.process_inline_text(parent_element, doc)
             _log.debug(f" - Code Block: {element.children}")
-            snippet_text = str(element.children[0].children).strip()
-            doc.add_text(
-                label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
-            )
+            snippet_text = str(element.children[0].children).strip()  # type: ignore
+            doc.add_code(parent=parent_element, text=snippet_text)

         elif isinstance(element, marko.block.FencedCode):
             self.close_table(doc)
             self.process_inline_text(parent_element, doc)
             _log.debug(f" - Code Block: {element.children}")
-            snippet_text = str(element.children[0].children).strip()
-            doc.add_text(
-                label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
-            )
+            snippet_text = str(element.children[0].children).strip()  # type: ignore
+            doc.add_code(parent=parent_element, text=snippet_text)

         elif isinstance(element, marko.inline.LineBreak):
             self.process_inline_text(parent_element, doc)

@@ -26,6 +26,7 @@ _log = logging.getLogger(__name__)
 from typing import Any, List

+from PIL import Image as PILImage
 from pydantic import BaseModel
@@ -44,7 +45,6 @@ class ExcelTable(BaseModel):
 class MsExcelDocumentBackend(DeclarativeDocumentBackend):
     def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)
@@ -326,24 +326,23 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
         self, doc: DoclingDocument, sheet: Worksheet
     ) -> DoclingDocument:

-        # FIXME: mypy does not agree with _images ...
-        """
-        # Iterate over images in the sheet
-        for idx, image in enumerate(sheet._images):  # Access embedded images
-            image_bytes = BytesIO(image.ref.blob)
-            pil_image = Image.open(image_bytes)
-            doc.add_picture(
-                parent=self.parents[0],
-                image=ImageRef.from_pil(image=pil_image, dpi=72),
-                caption=None,
-            )
-        """
+        # Iterate over byte images in the sheet
+        for idx, image in enumerate(sheet._images):  # type: ignore
+            try:
+                pil_image = PILImage.open(image.ref)
+                doc.add_picture(
+                    parent=self.parents[0],
+                    image=ImageRef.from_pil(image=pil_image, dpi=72),
+                    caption=None,
+                )
+            except:
+                _log.error("could not extract the image from excel sheets")

-        # FIXME: mypy does not agree with _charts ...
         """
-        for idx, chart in enumerate(sheet._charts):  # Access embedded charts
+        for idx, chart in enumerate(sheet._charts):  # type: ignore
+            try:
                 chart_path = f"chart_{idx + 1}.png"
                 _log.info(
                     f"Chart found, but dynamic rendering is required for: {chart_path}"
@@ -352,23 +351,36 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
                 _log.info(f"Chart {idx + 1}:")

                 # Chart type
-            _log.info(f"Type: {type(chart).__name__}")
+                # _log.info(f"Type: {type(chart).__name__}")
+                print(f"Type: {type(chart).__name__}")

-            # Title
-            if chart.title:
-                _log.info(f"Title: {chart.title}")
-            else:
-                _log.info("No title")
-
-            # Data series
-            for series in chart.series:
-                _log.info(" => series ...")
-                _log.info(f"Data Series: {series.title}")
-                _log.info(f"Values: {series.values}")
-                _log.info(f"Categories: {series.categories}")
-
-            # Position
-            # _log.info(f"Anchor Cell: {chart.anchor}")
+                # Extract series data
+                for series_idx, series in enumerate(chart.series):
+                    # _log.info(f"Series {series_idx + 1}:")
+                    print(f"Series {series_idx + 1} type: {type(series).__name__}")
+                    # print(f"x-values: {series.xVal}")
+                    # print(f"y-values: {series.yVal}")
+                    print(f"xval type: {type(series.xVal).__name__}")
+
+                    xvals = []
+                    for _ in series.xVal.numLit.pt:
+                        print(f"xval type: {type(_).__name__}")
+                        if hasattr(_, 'v'):
+                            xvals.append(_.v)
+                    print(f"x-values: {xvals}")
+
+                    yvals = []
+                    for _ in series.yVal:
+                        if hasattr(_, 'v'):
+                            yvals.append(_.v)
+                    print(f"y-values: {yvals}")
+            except Exception as exc:
+                print(exc)
+                continue
         """
         return doc

@@ -27,7 +27,6 @@ _log = logging.getLogger(__name__)
 class MsWordDocumentBackend(DeclarativeDocumentBackend):
     def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)
         self.XML_KEY = (

@@ -12,7 +12,6 @@ from docling.datamodel.document import InputDocument
 class PdfPageBackend(ABC):
     @abstractmethod
     def get_text_in_rect(self, bbox: BoundingBox) -> str:
         pass
@@ -45,7 +44,6 @@ class PdfPageBackend(ABC):
 class PdfDocumentBackend(PaginatedDocumentBackend):
     def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)

@@ -389,7 +389,7 @@ class PatentUsptoIce(PatentUspto):
             if name == self.Element.TITLE.value:
                 if text:
                     self.parents[self.level + 1] = self.doc.add_title(
-                        parent=self.parents[self.level],  # type: ignore[arg-type]
+                        parent=self.parents[self.level],
                         text=text,
                     )
                     self.level += 1
@@ -406,7 +406,7 @@ class PatentUsptoIce(PatentUspto):
                 abstract_item = self.doc.add_heading(
                     heading_text,
                     level=heading_level,
-                    parent=self.parents[heading_level],  # type: ignore[arg-type]
+                    parent=self.parents[heading_level],
                 )
                 self.doc.add_text(
                     label=DocItemLabel.PARAGRAPH,
@@ -434,7 +434,7 @@ class PatentUsptoIce(PatentUspto):
                 claims_item = self.doc.add_heading(
                     heading_text,
                     level=heading_level,
-                    parent=self.parents[heading_level],  # type: ignore[arg-type]
+                    parent=self.parents[heading_level],
                 )
                 for text in self.claims:
                     self.doc.add_text(
@@ -452,7 +452,7 @@ class PatentUsptoIce(PatentUspto):
                 self.doc.add_text(
                     label=DocItemLabel.PARAGRAPH,
                     text=text,
-                    parent=self.parents[self.level],  # type: ignore[arg-type]
+                    parent=self.parents[self.level],
                 )
                 self.text = ""
@@ -460,7 +460,7 @@ class PatentUsptoIce(PatentUspto):
                 self.parents[self.level + 1] = self.doc.add_heading(
                     text=text,
                     level=self.level,
-                    parent=self.parents[self.level],  # type: ignore[arg-type]
+                    parent=self.parents[self.level],
                 )
                 self.level += 1
                 self.text = ""
@@ -470,7 +470,7 @@ class PatentUsptoIce(PatentUspto):
             empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
             self.doc.add_table(
                 data=empty_table,
-                parent=self.parents[self.level],  # type: ignore[arg-type]
+                parent=self.parents[self.level],
             )

     def _apply_style(self, text: str, style_tag: str) -> str:
@@ -721,7 +721,7 @@ class PatentUsptoGrantV2(PatentUspto):
             if self.Element.TITLE.value in self.property and text.strip():
                 title = text.strip()
                 self.parents[self.level + 1] = self.doc.add_title(
-                    parent=self.parents[self.level],  # type: ignore[arg-type]
+                    parent=self.parents[self.level],
                     text=title,
                 )
                 self.level += 1
@@ -749,7 +749,7 @@ class PatentUsptoGrantV2(PatentUspto):
                     self.parents[self.level + 1] = self.doc.add_heading(
                         text=text.strip(),
                         level=self.level,
-                        parent=self.parents[self.level],  # type: ignore[arg-type]
+                        parent=self.parents[self.level],
                     )
                     self.level += 1
@@ -769,7 +769,7 @@ class PatentUsptoGrantV2(PatentUspto):
                 claims_item = self.doc.add_heading(
                     heading_text,
                     level=heading_level,
-                    parent=self.parents[heading_level],  # type: ignore[arg-type]
+                    parent=self.parents[heading_level],
                 )
                 for text in self.claims:
                     self.doc.add_text(
@@ -787,7 +787,7 @@ class PatentUsptoGrantV2(PatentUspto):
                 abstract_item = self.doc.add_heading(
                     heading_text,
                     level=heading_level,
-                    parent=self.parents[heading_level],  # type: ignore[arg-type]
+                    parent=self.parents[heading_level],
                 )
                 self.doc.add_text(
                     label=DocItemLabel.PARAGRAPH, text=abstract, parent=abstract_item
@@ -799,7 +799,7 @@ class PatentUsptoGrantV2(PatentUspto):
                     self.doc.add_text(
                         label=DocItemLabel.PARAGRAPH,
                         text=paragraph,
-                        parent=self.parents[self.level],  # type: ignore[arg-type]
+                        parent=self.parents[self.level],
                     )
             elif self.Element.CLAIM.value in self.property:
                 # we may need a space after a paragraph in claim text
@@ -811,7 +811,7 @@ class PatentUsptoGrantV2(PatentUspto):
             empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
             self.doc.add_table(
                 data=empty_table,
-                parent=self.parents[self.level],  # type: ignore[arg-type]
+                parent=self.parents[self.level],
             )

     def _apply_style(self, text: str, style_tag: str) -> str:
@@ -938,7 +938,7 @@ class PatentUsptoGrantAps(PatentUspto):
             self.parents[self.level + 1] = self.doc.add_heading(
                 heading.value,
                 level=self.level,
-                parent=self.parents[self.level],  # type: ignore[arg-type]
+                parent=self.parents[self.level],
             )
             self.level += 1
@@ -959,7 +959,7 @@ class PatentUsptoGrantAps(PatentUspto):
         if field == self.Field.TITLE.value:
             self.parents[self.level + 1] = self.doc.add_title(
-                parent=self.parents[self.level], text=value  # type: ignore[arg-type]
+                parent=self.parents[self.level], text=value
             )
             self.level += 1
@@ -971,14 +971,14 @@ class PatentUsptoGrantAps(PatentUspto):
             self.doc.add_text(
                 label=DocItemLabel.PARAGRAPH,
                 text=value,
-                parent=self.parents[self.level],  # type: ignore[arg-type]
+                parent=self.parents[self.level],
             )
         elif field == self.Field.NUMBER.value and section == self.Section.CLAIMS.value:
             self.doc.add_text(
                 label=DocItemLabel.PARAGRAPH,
                 text="",
-                parent=self.parents[self.level],  # type: ignore[arg-type]
+                parent=self.parents[self.level],
             )

         elif (
@@ -996,7 +996,7 @@ class PatentUsptoGrantAps(PatentUspto):
                 last_claim = self.doc.add_text(
                     label=DocItemLabel.PARAGRAPH,
                     text="",
-                    parent=self.parents[self.level],  # type: ignore[arg-type]
+                    parent=self.parents[self.level],
                 )
             last_claim.text += f" {value}" if last_claim.text else value
@@ -1012,7 +1012,7 @@ class PatentUsptoGrantAps(PatentUspto):
             self.parents[self.level + 1] = self.doc.add_heading(
                 value,
                 level=self.level,
-                parent=self.parents[self.level],  # type: ignore[arg-type]
+                parent=self.parents[self.level],
             )
             self.level += 1
@@ -1029,7 +1029,7 @@ class PatentUsptoGrantAps(PatentUspto):
             self.doc.add_text(
                 label=DocItemLabel.PARAGRAPH,
                 text=value,
-                parent=self.parents[self.level],  # type: ignore[arg-type]
+                parent=self.parents[self.level],
             )

     def parse(self, patent_content: str) -> Optional[DoclingDocument]:
@@ -1283,7 +1283,7 @@ class PatentUsptoAppV1(PatentUspto):
             title = text.strip()
             if title:
                 self.parents[self.level + 1] = self.doc.add_text(
-                    parent=self.parents[self.level],  # type: ignore[arg-type]
+                    parent=self.parents[self.level],
                     label=DocItemLabel.TITLE,
                     text=title,
                 )
@@ -1301,7 +1301,7 @@ class PatentUsptoAppV1(PatentUspto):
                 abstract_item = self.doc.add_heading(
                     heading_text,
                     level=heading_level,
-                    parent=self.parents[heading_level],  # type: ignore[arg-type]
+                    parent=self.parents[heading_level],
                 )
                 self.doc.add_text(
                     label=DocItemLabel.PARAGRAPH,
@@ -1331,7 +1331,7 @@ class PatentUsptoAppV1(PatentUspto):
                 claims_item = self.doc.add_heading(
                     heading_text,
                     level=heading_level,
-                    parent=self.parents[heading_level],  # type: ignore[arg-type]
+                    parent=self.parents[heading_level],
                 )
                 for text in self.claims:
                     self.doc.add_text(
@@ -1350,14 +1350,14 @@ class PatentUsptoAppV1(PatentUspto):
                 self.parents[self.level + 1] = self.doc.add_heading(
                     text=text,
                     level=self.level,
-                    parent=self.parents[self.level],  # type: ignore[arg-type]
+                    parent=self.parents[self.level],
                 )
                 self.level += 1
             else:
                 self.doc.add_text(
                     label=DocItemLabel.PARAGRAPH,
                     text=text,
-                    parent=self.parents[self.level],  # type: ignore[arg-type]
+                    parent=self.parents[self.level],
                 )
                 self.text = ""
@@ -1366,7 +1366,7 @@ class PatentUsptoAppV1(PatentUspto):
             empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
             self.doc.add_table(
                 data=empty_table,
-                parent=self.parents[self.level],  # type: ignore[arg-type]
+                parent=self.parents[self.level],
             )

     def _apply_style(self, text: str, style_tag: str) -> str:

@@ -41,6 +41,7 @@ class InputFormat(str, Enum):
     MD = "md"
     XLSX = "xlsx"
     XML_USPTO = "xml_uspto"
+    JSON_DOCLING = "json_docling"

 class OutputFormat(str, Enum):
@@ -62,6 +63,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
     InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
     InputFormat.XLSX: ["xlsx"],
     InputFormat.XML_USPTO: ["xml", "txt"],
+    InputFormat.JSON_DOCLING: ["json"],
 }

 FormatToMimeType: Dict[InputFormat, List[str]] = {
@@ -90,6 +92,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
         "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
     ],
     InputFormat.XML_USPTO: ["application/xml", "text/plain"],
+    InputFormat.JSON_DOCLING: ["application/json"],
 }

 MimeTypeToFormat: dict[str, list[InputFormat]] = {

@@ -350,6 +350,8 @@ class _DocumentConversionInput(BaseModel):
             mime = FormatToMimeType[InputFormat.HTML][0]
         elif ext in FormatToExtensions[InputFormat.MD]:
             mime = FormatToMimeType[InputFormat.MD][0]
+        elif ext in FormatToExtensions[InputFormat.JSON_DOCLING]:
+            mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]

         return mime

     @staticmethod

@@ -1,17 +1,11 @@
 import logging
 import os
-import warnings
 from enum import Enum
 from pathlib import Path
-from typing import Annotated, Any, Dict, List, Literal, Optional, Tuple, Type, Union
+from typing import Any, List, Literal, Optional, Union

-from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
-from pydantic_settings import (
-    BaseSettings,
-    PydanticBaseSettingsSource,
-    SettingsConfigDict,
-)
-from typing_extensions import deprecated
+from pydantic import BaseModel, ConfigDict, Field, model_validator
+from pydantic_settings import BaseSettings, SettingsConfigDict

 _log = logging.getLogger(__name__)
@@ -225,6 +219,9 @@ class PdfPipelineOptions(PipelineOptions):
     artifacts_path: Optional[Union[Path, str]] = None
     do_table_structure: bool = True  # True: perform table structure extraction
     do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
+    do_code_enrichment: bool = False  # True: perform code OCR
+    do_formula_enrichment: bool = False  # True: perform formula OCR, return Latex code
+    do_picture_classification: bool = False  # True: classify pictures in documents

     table_structure_options: TableStructureOptions = TableStructureOptions()
     ocr_options: Union[
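
The new enrichment steps are opt-in, since each pulls in an extra model. A minimal sketch of enabling them (the PDF path is hypothetical):

    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import PdfPipelineOptions
    from docling.document_converter import DocumentConverter, PdfFormatOption

    # All three flags default to False; turn on only what you need.
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_code_enrichment = True
    pipeline_options.do_formula_enrichment = True
    pipeline_options.do_picture_classification = True

    converter = DocumentConverter(
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
    )
    result = converter.convert("paper.pdf")  # hypothetical input file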

@@ -11,6 +11,7 @@ from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.asciidoc_backend import AsciiDocBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
 from docling.backend.html_backend import HTMLDocumentBackend
+from docling.backend.json.docling_json_backend import DoclingJSONBackend
 from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
@@ -136,6 +137,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
         InputFormat.PDF: FormatOption(
             pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
         ),
+        InputFormat.JSON_DOCLING: FormatOption(
+            pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
+        ),
     }
     if (options := format_to_default_options.get(format)) is not None:
         return options

@@ -1,7 +1,7 @@
 from abc import ABC, abstractmethod
 from typing import Any, Generic, Iterable, Optional

-from docling_core.types.doc import DoclingDocument, NodeItem, TextItem
+from docling_core.types.doc import BoundingBox, DoclingDocument, NodeItem, TextItem
 from typing_extensions import TypeVar

 from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
@@ -53,6 +53,7 @@ class BaseItemAndImageEnrichmentModel(
 ):
     images_scale: float
+    expansion_factor: float = 0.0

     def prepare_element(
         self, conv_res: ConversionResult, element: NodeItem
@@ -62,8 +63,22 @@ class BaseItemAndImageEnrichmentModel(
         assert isinstance(element, TextItem)
         element_prov = element.prov[0]

+        bbox = element_prov.bbox
+        width = bbox.r - bbox.l
+        height = bbox.t - bbox.b
+
+        # TODO: move to a utility in the BoundingBox class
+        expanded_bbox = BoundingBox(
+            l=bbox.l - width * self.expansion_factor,
+            t=bbox.t + height * self.expansion_factor,
+            r=bbox.r + width * self.expansion_factor,
+            b=bbox.b - height * self.expansion_factor,
+            coord_origin=bbox.coord_origin,
+        )
+
         page_ix = element_prov.page_no - 1
         cropped_image = conv_res.pages[page_ix].get_image(
-            scale=self.images_scale, cropbox=element_prov.bbox
+            scale=self.images_scale, cropbox=expanded_bbox
         )
         return ItemAndImageEnrichmentElement(item=element, image=cropped_image)
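
For intuition on `expansion_factor`, a worked example of the padding arithmetic with illustrative numbers (BOTTOMLEFT coordinates, where `t > b`):

    from docling_core.types.doc import BoundingBox, CoordOrigin

    # With expansion_factor = 0.03 (as used by the code/formula model below),
    # a 200-wide, 50-tall box grows by 3% of each dimension per side:
    bbox = BoundingBox(l=100, b=500, r=300, t=550, coord_origin=CoordOrigin.BOTTOMLEFT)
    f = 0.03
    width, height = bbox.r - bbox.l, bbox.t - bbox.b  # 200, 50
    expanded = BoundingBox(
        l=bbox.l - width * f,   # 94.0
        t=bbox.t + height * f,  # 551.5
        r=bbox.r + width * f,   # 306.0
        b=bbox.b - height * f,  # 498.5
        coord_origin=bbox.coord_origin,
    )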

@@ -0,0 +1,245 @@
+import re
+from pathlib import Path
+from typing import Iterable, List, Literal, Optional, Tuple, Union
+
+from docling_core.types.doc import (
+    CodeItem,
+    DocItemLabel,
+    DoclingDocument,
+    NodeItem,
+    TextItem,
+)
+from docling_core.types.doc.labels import CodeLanguageLabel
+from PIL import Image
+from pydantic import BaseModel
+
+from docling.datamodel.base_models import ItemAndImageEnrichmentElement
+from docling.datamodel.pipeline_options import AcceleratorOptions
+from docling.models.base_model import BaseItemAndImageEnrichmentModel
+from docling.utils.accelerator_utils import decide_device
+
+
+class CodeFormulaModelOptions(BaseModel):
+    """
+    Configuration options for the CodeFormulaModel.
+
+    Attributes
+    ----------
+    kind : str
+        Type of the model. Fixed value "code_formula".
+    do_code_enrichment : bool
+        True if code enrichment is enabled, False otherwise.
+    do_formula_enrichment : bool
+        True if formula enrichment is enabled, False otherwise.
+    """
+
+    kind: Literal["code_formula"] = "code_formula"
+    do_code_enrichment: bool = True
+    do_formula_enrichment: bool = True
+
+
+class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
+    """
+    Model for processing and enriching documents with code and formula predictions.
+
+    Attributes
+    ----------
+    enabled : bool
+        True if the model is enabled, False otherwise.
+    options : CodeFormulaModelOptions
+        Configuration options for the CodeFormulaModel.
+    code_formula_model : CodeFormulaPredictor
+        The predictor model for code and formula processing.
+
+    Methods
+    -------
+    __init__(self, enabled, artifacts_path, accelerator_options, code_formula_options)
+        Initializes the CodeFormulaModel with the given configuration options.
+    is_processable(self, doc, element)
+        Determines if a given element in a document can be processed by the model.
+    __call__(self, doc, element_batch)
+        Processes the given batch of elements and enriches them with predictions.
+    """
+
+    images_scale = 1.66  # = 120 dpi, aligned with training data resolution
+    expansion_factor = 0.03
+
+    def __init__(
+        self,
+        enabled: bool,
+        artifacts_path: Optional[Union[Path, str]],
+        options: CodeFormulaModelOptions,
+        accelerator_options: AcceleratorOptions,
+    ):
+        """
+        Initializes the CodeFormulaModel with the given configuration.
+
+        Parameters
+        ----------
+        enabled : bool
+            True if the model is enabled, False otherwise.
+        artifacts_path : Path
+            Path to the directory containing the model artifacts.
+        options : CodeFormulaModelOptions
+            Configuration options for the model.
+        accelerator_options : AcceleratorOptions
+            Options specifying the device and number of threads for acceleration.
+        """
+        self.enabled = enabled
+        self.options = options
+
+        if self.enabled:
+            device = decide_device(accelerator_options.device)
+
+            from docling_ibm_models.code_formula_model.code_formula_predictor import (
+                CodeFormulaPredictor,
+            )
+
+            if artifacts_path is None:
+                artifacts_path = self.download_models_hf()
+            else:
+                artifacts_path = Path(artifacts_path)
+
+            self.code_formula_model = CodeFormulaPredictor(
+                artifacts_path=artifacts_path,
+                device=device,
+                num_threads=accelerator_options.num_threads,
+            )
+
+    @staticmethod
+    def download_models_hf(
+        local_dir: Optional[Path] = None, force: bool = False
+    ) -> Path:
+        from huggingface_hub import snapshot_download
+        from huggingface_hub.utils import disable_progress_bars
+
+        disable_progress_bars()
+        download_path = snapshot_download(
+            repo_id="ds4sd/CodeFormula",
+            force_download=force,
+            local_dir=local_dir,
+            revision="v1.0.0",
+        )
+
+        return Path(download_path)
+
+    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
+        """
+        Determines if a given element in a document can be processed by the model.
+
+        Parameters
+        ----------
+        doc : DoclingDocument
+            The document being processed.
+        element : NodeItem
+            The element within the document to check.
+
+        Returns
+        -------
+        bool
+            True if the element can be processed, False otherwise.
+        """
+        return self.enabled and (
+            (isinstance(element, CodeItem) and self.options.do_code_enrichment)
+            or (
+                isinstance(element, TextItem)
+                and element.label == DocItemLabel.FORMULA
+                and self.options.do_formula_enrichment
+            )
+        )
+
+    def _extract_code_language(self, input_string: str) -> Tuple[str, Optional[str]]:
+        """Extracts a programming language from the beginning of a string.
+
+        This function checks if the input string starts with a pattern of the form
+        ``<_some_language_>``. If it does, it extracts the language string and returns
+        a tuple of (remainder, language). Otherwise, it returns the original string
+        and `None`.
+
+        Args:
+            input_string (str): The input string, which may start with ``<_language_>``.
+
+        Returns:
+            Tuple[str, Optional[str]]:
+                A tuple where:
+                - The first element is either:
+                    - The remainder of the string (everything after ``<_language_>``),
+                      if a match is found; or
+                    - The original string, if no match is found.
+                - The second element is the extracted language if a match is found;
+                  otherwise, `None`.
+        """
+        pattern = r"^<_([^>]+)_>\s*(.*)"
+        match = re.match(pattern, input_string, flags=re.DOTALL)
+        if match:
+            language = str(match.group(1))  # the captured programming language
+            remainder = str(match.group(2))  # everything after the <_language_>
+            return remainder, language
+        else:
+            return input_string, None
+
+    def _get_code_language_enum(self, value: Optional[str]) -> CodeLanguageLabel:
+        """
+        Converts a string to a corresponding `CodeLanguageLabel` enum member.
+
+        If the provided string does not match any value in `CodeLanguageLabel`,
+        it defaults to `CodeLanguageLabel.UNKNOWN`.
+
+        Args:
+            value (Optional[str]): The string representation of the code language or None.
+
+        Returns:
+            CodeLanguageLabel: The corresponding enum member if the value is valid,
+            otherwise `CodeLanguageLabel.UNKNOWN`.
+        """
+        if not isinstance(value, str):
+            return CodeLanguageLabel.UNKNOWN
+        try:
+            return CodeLanguageLabel(value)
+        except ValueError:
+            return CodeLanguageLabel.UNKNOWN
+
+    def __call__(
+        self,
+        doc: DoclingDocument,
+        element_batch: Iterable[ItemAndImageEnrichmentElement],
+    ) -> Iterable[NodeItem]:
+        """
+        Processes the given batch of elements and enriches them with predictions.
+
+        Parameters
+        ----------
+        doc : DoclingDocument
+            The document being processed.
+        element_batch : Iterable[ItemAndImageEnrichmentElement]
+            A batch of elements to be processed.
+
+        Returns
+        -------
+        Iterable[Any]
+            An iterable of enriched elements.
+        """
+        if not self.enabled:
+            for element in element_batch:
+                yield element.item
+            return
+
+        labels: List[str] = []
+        images: List[Image.Image] = []
+        elements: List[TextItem] = []
+        for el in element_batch:
+            assert isinstance(el.item, TextItem)
+            elements.append(el.item)
+            labels.append(el.item.label)
+            images.append(el.image)
+
+        outputs = self.code_formula_model.predict(images, labels)
+
+        for item, output in zip(elements, outputs):
+            if isinstance(item, CodeItem):
+                output, code_language = self._extract_code_language(output)
+                item.code_language = self._get_code_language_enum(code_language)
+            item.text = output
+
+            yield item
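
To illustrate the `<_language_>` convention that `_extract_code_language` parses (the sample strings below are illustrative, not actual predictor output):

    import re

    pattern = r"^<_([^>]+)_>\s*(.*)"

    # The predictor prefixes recognized code with its language, e.g.:
    match = re.match(pattern, "<_Python_>\nprint('hello')", flags=re.DOTALL)
    assert match and match.group(1) == "Python"
    assert match.group(2) == "print('hello')"

    # Without the prefix, there is no match and the input is kept as-is
    # with language None.
    assert re.match(pattern, "plain text", flags=re.DOTALL) is None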

@@ -0,0 +1,187 @@
+from pathlib import Path
+from typing import Iterable, List, Literal, Optional, Tuple, Union
+
+from docling_core.types.doc import (
+    DoclingDocument,
+    NodeItem,
+    PictureClassificationClass,
+    PictureClassificationData,
+    PictureItem,
+)
+from PIL import Image
+from pydantic import BaseModel
+
+from docling.datamodel.pipeline_options import AcceleratorOptions
+from docling.models.base_model import BaseEnrichmentModel
+from docling.utils.accelerator_utils import decide_device
+
+
+class DocumentPictureClassifierOptions(BaseModel):
+    """
+    Options for configuring the DocumentPictureClassifier.
+
+    Attributes
+    ----------
+    kind : Literal["document_picture_classifier"]
+        Identifier for the type of classifier.
+    """
+
+    kind: Literal["document_picture_classifier"] = "document_picture_classifier"
+
+
+class DocumentPictureClassifier(BaseEnrichmentModel):
+    """
+    A model for classifying pictures in documents.
+
+    This class enriches document pictures with predicted classifications
+    based on a predefined set of classes.
+
+    Attributes
+    ----------
+    enabled : bool
+        Whether the classifier is enabled for use.
+    options : DocumentPictureClassifierOptions
+        Configuration options for the classifier.
+    document_picture_classifier : DocumentPictureClassifierPredictor
+        The underlying prediction model, loaded if the classifier is enabled.
+
+    Methods
+    -------
+    __init__(enabled, artifacts_path, options, accelerator_options)
+        Initializes the classifier with specified configurations.
+    is_processable(doc, element)
+        Checks if the given element can be processed by the classifier.
+    __call__(doc, element_batch)
+        Processes a batch of elements and adds classification annotations.
+    """
+
+    images_scale = 2
+
+    def __init__(
+        self,
+        enabled: bool,
+        artifacts_path: Optional[Union[Path, str]],
+        options: DocumentPictureClassifierOptions,
+        accelerator_options: AcceleratorOptions,
+    ):
+        """
+        Initializes the DocumentPictureClassifier.
+
+        Parameters
+        ----------
+        enabled : bool
+            Indicates whether the classifier is enabled.
+        artifacts_path : Optional[Union[Path, str]],
+            Path to the directory containing model artifacts.
+        options : DocumentPictureClassifierOptions
+            Configuration options for the classifier.
+        accelerator_options : AcceleratorOptions
+            Options for configuring the device and parallelism.
+        """
+        self.enabled = enabled
+        self.options = options
+
+        if self.enabled:
+            device = decide_device(accelerator_options.device)
+
+            from docling_ibm_models.document_figure_classifier_model.document_figure_classifier_predictor import (
+                DocumentFigureClassifierPredictor,
+            )
+
+            if artifacts_path is None:
+                artifacts_path = self.download_models_hf()
+            else:
+                artifacts_path = Path(artifacts_path)
+
+            self.document_picture_classifier = DocumentFigureClassifierPredictor(
+                artifacts_path=artifacts_path,
+                device=device,
+                num_threads=accelerator_options.num_threads,
+            )
+
+    @staticmethod
+    def download_models_hf(
+        local_dir: Optional[Path] = None, force: bool = False
+    ) -> Path:
+        from huggingface_hub import snapshot_download
+        from huggingface_hub.utils import disable_progress_bars
+
+        disable_progress_bars()
+        download_path = snapshot_download(
+            repo_id="ds4sd/DocumentFigureClassifier",
+            force_download=force,
+            local_dir=local_dir,
+            revision="v1.0.0",
+        )
+
+        return Path(download_path)
+
+    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
+        """
+        Determines if the given element can be processed by the classifier.
+
+        Parameters
+        ----------
+        doc : DoclingDocument
+            The document containing the element.
+        element : NodeItem
+            The element to be checked.
+
+        Returns
+        -------
+        bool
+            True if the element is a PictureItem and processing is enabled; False otherwise.
+        """
+        return self.enabled and isinstance(element, PictureItem)
+
+    def __call__(
+        self,
+        doc: DoclingDocument,
+        element_batch: Iterable[NodeItem],
+    ) -> Iterable[NodeItem]:
+        """
+        Processes a batch of elements and enriches them with classification predictions.
+
+        Parameters
+        ----------
+        doc : DoclingDocument
+            The document containing the elements to be processed.
+        element_batch : Iterable[NodeItem]
+            A batch of pictures to classify.
+
+        Returns
+        -------
+        Iterable[NodeItem]
+            An iterable of NodeItem objects after processing. The field
+            'data.classification' is added containing the classification for each picture.
+        """
+        if not self.enabled:
+            for element in element_batch:
+                yield element
+            return
+
+        images: List[Image.Image] = []
+        elements: List[PictureItem] = []
+        for el in element_batch:
+            assert isinstance(el, PictureItem)
+            elements.append(el)
+            img = el.get_image(doc)
+            assert img is not None
+            images.append(img)
+
+        outputs = self.document_picture_classifier.predict(images)
+
+        for element, output in zip(elements, outputs):
+            element.annotations.append(
+                PictureClassificationData(
+                    provenance="DocumentPictureClassifier",
+                    predicted_classes=[
+                        PictureClassificationClass(
+                            class_name=pred[0],
+                            confidence=pred[1],
+                        )
+                        for pred in output
+                    ],
+                )
+            )
+
+            yield element
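
Downstream, the annotations added here can be read back off the converted document. A sketch, assuming `result` is a ConversionResult produced with `do_picture_classification=True`:

    from docling_core.types.doc import PictureClassificationData

    for picture in result.document.pictures:
        for annotation in picture.annotations:
            if isinstance(annotation, PictureClassificationData):
                top = annotation.predicted_classes[0]
                print(picture.self_ref, top.class_name, f"{top.confidence:.2f}")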

@@ -1,28 +1,21 @@
 import copy
 import logging
-import random
-import time
 from pathlib import Path
-from typing import Iterable, List
+from typing import Iterable

-from docling_core.types.doc import CoordOrigin, DocItemLabel
+from docling_core.types.doc import DocItemLabel
 from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
-from PIL import Image, ImageDraw, ImageFont
+from PIL import Image

-from docling.datamodel.base_models import (
-    BoundingBox,
-    Cell,
-    Cluster,
-    LayoutPrediction,
-    Page,
-)
+from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
+from docling.datamodel.pipeline_options import AcceleratorOptions
 from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.layout_postprocessor import LayoutPostprocessor
 from docling.utils.profiling import TimeRecorder
+from docling.utils.visualization import draw_clusters

 _log = logging.getLogger(__name__)
@@ -40,7 +33,7 @@ class LayoutModel(BasePageModel):
         DocItemLabel.PAGE_FOOTER,
         DocItemLabel.CODE,
         DocItemLabel.LIST_ITEM,
-        # "Formula",
+        DocItemLabel.FORMULA,
     ]
     PAGE_HEADER_LABELS = [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]
@@ -82,78 +75,9 @@ class LayoutModel(BasePageModel):
         left_image = copy.deepcopy(page.image)
         right_image = copy.deepcopy(page.image)

-        # Function to draw clusters on an image
-        def draw_clusters(image, clusters):
-            draw = ImageDraw.Draw(image, "RGBA")
-            # Create a smaller font for the labels
-            try:
-                font = ImageFont.truetype("arial.ttf", 12)
-            except OSError:
-                # Fallback to default font if arial is not available
-                font = ImageFont.load_default()
-            for c_tl in clusters:
-                all_clusters = [c_tl, *c_tl.children]
-                for c in all_clusters:
-                    # Draw cells first (underneath)
-                    cell_color = (0, 0, 0, 40)  # Transparent black for cells
-                    for tc in c.cells:
-                        cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
-                        cx0 *= scale_x
-                        cx1 *= scale_x
-                        cy0 *= scale_x
-                        cy1 *= scale_y
-                        draw.rectangle(
-                            [(cx0, cy0), (cx1, cy1)],
-                            outline=None,
-                            fill=cell_color,
-                        )
-                    # Draw cluster rectangle
-                    x0, y0, x1, y1 = c.bbox.as_tuple()
-                    x0 *= scale_x
-                    x1 *= scale_x
-                    y0 *= scale_x
-                    y1 *= scale_y
-                    cluster_fill_color = (*list(DocItemLabel.get_color(c.label)), 70)
-                    cluster_outline_color = (
-                        *list(DocItemLabel.get_color(c.label)),
-                        255,
-                    )
-                    draw.rectangle(
-                        [(x0, y0), (x1, y1)],
-                        outline=cluster_outline_color,
-                        fill=cluster_fill_color,
-                    )
-                    # Add label name and confidence
-                    label_text = f"{c.label.name} ({c.confidence:.2f})"
-                    # Create semi-transparent background for text
-                    text_bbox = draw.textbbox((x0, y0), label_text, font=font)
-                    text_bg_padding = 2
-                    draw.rectangle(
-                        [
-                            (
-                                text_bbox[0] - text_bg_padding,
-                                text_bbox[1] - text_bg_padding,
-                            ),
-                            (
-                                text_bbox[2] + text_bg_padding,
-                                text_bbox[3] + text_bg_padding,
-                            ),
-                        ],
-                        fill=(255, 255, 255, 180),  # Semi-transparent white
-                    )
-                    # Draw text
-                    draw.text(
-                        (x0, y0),
-                        label_text,
-                        fill=(0, 0, 0, 255),  # Solid black
-                        font=font,
-                    )
-
         # Draw clusters on both images
-        draw_clusters(left_image, left_clusters)
-        draw_clusters(right_image, right_clusters)
+        draw_clusters(left_image, left_clusters, scale_x, scale_y)
+        draw_clusters(right_image, right_clusters, scale_x, scale_y)

         # Combine the images side by side
         combined_width = left_image.width * 2
         combined_height = left_image.height

@@ -135,31 +135,6 @@ class PageAssembleModel(BasePageModel):
                         )
                         elements.append(fig)
                         body.append(fig)
-                    elif cluster.label == LayoutModel.FORMULA_LABEL:
-                        equation = None
-                        if page.predictions.equations_prediction:
-                            equation = page.predictions.equations_prediction.equation_map.get(
-                                cluster.id, None
-                            )
-                        if (
-                            not equation
-                        ):  # fallback: add empty formula, if it isn't present
-                            text = self.sanitize_text(
-                                [
-                                    cell.text.replace("\x02", "-").strip()
-                                    for cell in cluster.cells
-                                    if len(cell.text.strip()) > 0
-                                ]
-                            )
-                            equation = TextElement(
-                                label=cluster.label,
-                                id=cluster.id,
-                                cluster=cluster,
-                                page_no=page.page_no,
-                                text=text,
-                            )
-                        elements.append(equation)
-                        body.append(equation)
                     elif cluster.label in LayoutModel.CONTAINER_LABELS:
                         container_el = ContainerElement(
                             label=cluster.label,

@@ -4,7 +4,7 @@ import logging
 import os
 import tempfile
 from subprocess import DEVNULL, PIPE, Popen
-from typing import Iterable, Optional, Tuple
+from typing import Iterable, List, Optional, Tuple

 import pandas as pd
 from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -14,13 +14,13 @@ from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TesseractCliOcrOptions
 from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
+from docling.utils.ocr_utils import map_tesseract_script
 from docling.utils.profiling import TimeRecorder

 _log = logging.getLogger(__name__)

 class TesseractOcrCliModel(BaseOcrModel):
     def __init__(self, enabled: bool, options: TesseractCliOcrOptions):
         super().__init__(enabled=enabled, options=options)
         self.options: TesseractCliOcrOptions
@@ -29,10 +29,13 @@ class TesseractOcrCliModel(BaseOcrModel):
         self._name: Optional[str] = None
         self._version: Optional[str] = None
+        self._tesseract_languages: Optional[List[str]] = None
+        self._script_prefix: Optional[str] = None

         if self.enabled:
             try:
                 self._get_name_and_version()
+                self._set_languages_and_prefix()

             except Exception as exc:
                 raise RuntimeError(
@@ -74,12 +77,20 @@ class TesseractOcrCliModel(BaseOcrModel):
         return name, version

     def _run_tesseract(self, ifilename: str):
+        r"""
+        Run tesseract CLI
+        """
         cmd = [self.options.tesseract_cmd]

-        if self.options.lang is not None and len(self.options.lang) > 0:
+        if "auto" in self.options.lang:
+            lang = self._detect_language(ifilename)
+            if lang is not None:
+                cmd.append("-l")
+                cmd.append(lang)
+        elif self.options.lang is not None and len(self.options.lang) > 0:
             cmd.append("-l")
             cmd.append("+".join(self.options.lang))
+
         if self.options.path is not None:
             cmd.append("--tessdata-dir")
             cmd.append(self.options.path)
@@ -107,6 +118,63 @@ class TesseractOcrCliModel(BaseOcrModel):

         return df_filtered

+    def _detect_language(self, ifilename: str):
+        r"""
+        Run tesseract in PSM 0 mode to detect the language
+        """
+        assert self._tesseract_languages is not None
+
+        cmd = [self.options.tesseract_cmd]
+        cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
+        _log.info("command: {}".format(" ".join(cmd)))
+        proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
+        output, _ = proc.communicate()
+        decoded_data = output.decode("utf-8")
+        df = pd.read_csv(
+            io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
+        )
+        scripts = df.loc[df["key"] == "Script"].value.tolist()
+        if len(scripts) == 0:
+            _log.warning("Tesseract cannot detect the script of the page")
+            return None
+
+        script = map_tesseract_script(scripts[0].strip())
+        lang = f"{self._script_prefix}{script}"
+
+        # Check if the detected language has been installed
+        if lang not in self._tesseract_languages:
+            msg = f"Tesseract detected the script '{script}' and language '{lang}'."
+            msg += " However this language is not installed in your system and will be ignored."
+            _log.warning(msg)
+            return None
+
+        _log.debug(
+            f"Using tesseract model for the detected script '{script}' and language '{lang}'"
+        )
+        return lang
+
+    def _set_languages_and_prefix(self):
+        r"""
+        Read and set the languages installed in tesseract and decide the script prefix
+        """
+        # Get all languages
+        cmd = [self.options.tesseract_cmd]
+        cmd.append("--list-langs")
+        _log.info("command: {}".format(" ".join(cmd)))
+        proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
+        output, _ = proc.communicate()
+        decoded_data = output.decode("utf-8")
+        df = pd.read_csv(io.StringIO(decoded_data), header=None)
+        self._tesseract_languages = df[0].tolist()[1:]
+
+        # Decide the script prefix
+        if any([l.startswith("script/") for l in self._tesseract_languages]):
+            script_prefix = "script/"
+        else:
+            script_prefix = ""
+
+        self._script_prefix = script_prefix
+
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
@@ -121,7 +189,6 @@ class TesseractOcrCliModel(BaseOcrModel):
                 yield page
             else:
                 with TimeRecorder(conv_res, "ocr"):
-
                     ocr_rects = self.get_ocr_rects(page)

                     all_ocr_cells = []
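
Wiring the new `auto` mode through the pipeline looks roughly like this (the input path is hypothetical): the model first runs tesseract with `--psm 0 -l osd` to detect the page script, then OCRs with the matching installed language pack.

    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions
    from docling.document_converter import DocumentConverter, PdfFormatOption

    ocr_options = TesseractCliOcrOptions(lang=["auto"])
    pipeline_options = PdfPipelineOptions(do_ocr=True, ocr_options=ocr_options)
    converter = DocumentConverter(
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
    )
    result = converter.convert("scan.pdf")  # hypothetical scanned input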

View File

@@ -8,6 +8,7 @@ from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TesseractOcrOptions
 from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
+from docling.utils.ocr_utils import map_tesseract_script
 from docling.utils.profiling import TimeRecorder

 _log = logging.getLogger(__name__)
@@ -20,6 +21,7 @@ class TesseractOcrModel(BaseOcrModel):
         self.scale = 3  # multiplier for 72 dpi == 216 dpi.
         self.reader = None
+        self.osd_reader = None

         if self.enabled:
             install_errmsg = (

@@ -47,8 +49,8 @@ class TesseractOcrModel(BaseOcrModel):
             except:
                 raise ImportError(install_errmsg)

-            _, tesserocr_languages = tesserocr.get_languages()
-            if not tesserocr_languages:
+            _, self._tesserocr_languages = tesserocr.get_languages()
+            if not self._tesserocr_languages:
                 raise ImportError(missing_langs_errmsg)

             # Initialize the tesseractAPI

@@ -57,7 +59,7 @@ class TesseractOcrModel(BaseOcrModel):
             self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}

-            if any([l.startswith("script/") for l in tesserocr_languages]):
+            if any([l.startswith("script/") for l in self._tesserocr_languages]):
                 self.script_prefix = "script/"
             else:
                 self.script_prefix = ""

@@ -72,14 +74,14 @@ class TesseractOcrModel(BaseOcrModel):
                 tesserocr_kwargs["path"] = self.options.path

             if lang == "auto":
-                self.reader = tesserocr.PyTessBaseAPI(
+                self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs)
+                self.osd_reader = tesserocr.PyTessBaseAPI(
                     **{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
                 )
             else:
                 self.reader = tesserocr.PyTessBaseAPI(
                     **{"lang": lang} | tesserocr_kwargs,
                 )
             self.reader_RIL = tesserocr.RIL

     def __del__(self):

@@ -96,8 +98,6 @@ class TesseractOcrModel(BaseOcrModel):
             yield from page_batch
             return

-        import tesserocr
-
         for page in page_batch:
             assert page._backend is not None
             if not page._backend.is_valid():

@@ -105,6 +105,7 @@ class TesseractOcrModel(BaseOcrModel):
             else:
                 with TimeRecorder(conv_res, "ocr"):
                     assert self.reader is not None
+                    assert self._tesserocr_languages is not None

                     ocr_rects = self.get_ocr_rects(page)
@@ -117,43 +118,42 @@ class TesseractOcrModel(BaseOcrModel):
                             scale=self.scale, cropbox=ocr_rect
                         )

-                        # Retrieve text snippets with their bounding boxes
-                        self.reader.SetImage(high_res_image)
-
-                        if self.options.lang == ["auto"]:
-                            osd = self.reader.DetectOrientationScript()
-
-                            # No text, probably
-                            if osd is None:
-                                continue
-
-                            script = osd["script_name"]
-
-                            if script == "Katakana" or script == "Hiragana":
-                                script = "Japanese"
-                            elif script == "Han":
-                                script = "HanS"
-                            elif script == "Korean":
-                                script = "Hangul"
-
-                            _log.debug(
-                                f'Using model for the detected script "{script}"'
-                            )
-
-                            if script not in self.script_readers:
-                                self.script_readers[script] = tesserocr.PyTessBaseAPI(
-                                    path=self.reader.GetDatapath(),
-                                    lang=f"{self.script_prefix}{script}",
-                                    psm=tesserocr.PSM.AUTO,
-                                    init=True,
-                                    oem=tesserocr.OEM.DEFAULT,
-                                )
-
-                            local_reader = self.script_readers[script]
-                            local_reader.SetImage(high_res_image)
-                        else:
-                            local_reader = self.reader
-                            local_reader.SetImage(high_res_image)
+                        local_reader = self.reader
+                        if "auto" in self.options.lang:
+                            assert self.osd_reader is not None
+
+                            self.osd_reader.SetImage(high_res_image)
+                            osd = self.osd_reader.DetectOrientationScript()
+
+                            # No text, probably
+                            if osd is None:
+                                continue
+
+                            script = osd["script_name"]
+                            script = map_tesseract_script(script)
+                            lang = f"{self.script_prefix}{script}"
+
+                            # Check if the detected language is present in the system
+                            if lang not in self._tesserocr_languages:
+                                msg = f"Tesseract detected the script '{script}' and language '{lang}'."
+                                msg += " However this language is not installed in your system and will be ignored."
+                                _log.warning(msg)
+                            else:
+                                if script not in self.script_readers:
+                                    import tesserocr
+
+                                    self.script_readers[script] = (
+                                        tesserocr.PyTessBaseAPI(
+                                            path=self.reader.GetDatapath(),
+                                            lang=lang,
+                                            psm=tesserocr.PSM.AUTO,
+                                            init=True,
+                                            oem=tesserocr.OEM.DEFAULT,
+                                        )
+                                    )
+                                local_reader = self.script_readers[script]
+
+                        local_reader.SetImage(high_res_image)
                         boxes = local_reader.GetComponentImages(
                             self.reader_RIL.TEXTLINE, True
                         )

View File

@@ -3,7 +3,7 @@ import logging
 import time
 import traceback
 from abc import ABC, abstractmethod
-from typing import Callable, Iterable, List
+from typing import Any, Callable, Iterable, List

 from docling_core.types.doc import DoclingDocument, NodeItem

@@ -18,7 +18,7 @@ from docling.datamodel.base_models import (
 from docling.datamodel.document import ConversionResult, InputDocument
 from docling.datamodel.pipeline_options import PipelineOptions
 from docling.datamodel.settings import settings
-from docling.models.base_model import BaseEnrichmentModel
+from docling.models.base_model import GenericEnrichmentModel
 from docling.utils.profiling import ProfilingScope, TimeRecorder
 from docling.utils.utils import chunkify

@@ -30,7 +30,7 @@ class BasePipeline(ABC):
         self.pipeline_options = pipeline_options
         self.keep_images = False
         self.build_pipe: List[Callable] = []
-        self.enrichment_pipe: List[BaseEnrichmentModel] = []
+        self.enrichment_pipe: List[GenericEnrichmentModel[Any]] = []

     def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionResult:
         conv_res = ConversionResult(input=in_doc)

@@ -66,7 +66,7 @@ class BasePipeline(ABC):
     def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:

         def _prepare_elements(
-            conv_res: ConversionResult, model: BaseEnrichmentModel
+            conv_res: ConversionResult, model: GenericEnrichmentModel[Any]
         ) -> Iterable[NodeItem]:
             for doc_element, _level in conv_res.document.iterate_items():
                 prepared_element = model.prepare_element(

View File

@@ -1,7 +1,7 @@
 import logging
 import sys
 from pathlib import Path
-from typing import Iterable, Optional
+from typing import Optional

 from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem

@@ -17,8 +17,12 @@ from docling.datamodel.pipeline_options import (
     TesseractCliOcrOptions,
     TesseractOcrOptions,
 )
-from docling.models.base_model import BasePageModel
 from docling.models.base_ocr_model import BaseOcrModel
+from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
+from docling.models.document_picture_classifier import (
+    DocumentPictureClassifier,
+    DocumentPictureClassifierOptions,
+)
 from docling.models.ds_glm_model import GlmModel, GlmOptions
 from docling.models.easyocr_model import EasyOcrModel
 from docling.models.layout_model import LayoutModel

@@ -93,8 +97,32 @@ class StandardPdfPipeline(PaginatedPipeline):

         self.enrichment_pipe = [
             # Other models working on `NodeItem` elements in the DoclingDocument
+            # Code Formula Enrichment Model
+            CodeFormulaModel(
+                enabled=pipeline_options.do_code_enrichment
+                or pipeline_options.do_formula_enrichment,
+                artifacts_path=pipeline_options.artifacts_path,
+                options=CodeFormulaModelOptions(
+                    do_code_enrichment=pipeline_options.do_code_enrichment,
+                    do_formula_enrichment=pipeline_options.do_formula_enrichment,
+                ),
+                accelerator_options=pipeline_options.accelerator_options,
+            ),
+            # Document Picture Classifier
+            DocumentPictureClassifier(
+                enabled=pipeline_options.do_picture_classification,
+                artifacts_path=pipeline_options.artifacts_path,
+                options=DocumentPictureClassifierOptions(),
+                accelerator_options=pipeline_options.accelerator_options,
+            ),
         ]

+        if (
+            self.pipeline_options.do_formula_enrichment
+            or self.pipeline_options.do_code_enrichment
+        ):
+            self.keep_backend = True
+
     @staticmethod
     def download_models_hf(
         local_dir: Optional[Path] = None, force: bool = False
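
For reference, a minimal sketch of switching these new enrichment models on from user code (the three do_* flags appear in the diff above; the converter wiring is the usual docling pattern and is assumed here, as is the input file name):

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions(
    do_code_enrichment=True,         # enables CodeFormulaModel for code blocks
    do_formula_enrichment=True,      # enables CodeFormulaModel for formulas (also sets keep_backend)
    do_picture_classification=True,  # enables DocumentPictureClassifier
)
converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
result = converter.convert("paper.pdf")  # placeholder input file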

View File

@@ -270,7 +270,6 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
             container_el = doc.add_group(label=group_label)

             _add_child_elements(container_el, doc, obj, pelem)
-
         elif "text" in obj:
             text = obj["text"][span_i:span_j]

@@ -304,6 +303,10 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
                 current_list = None

                 doc.add_heading(text=text, prov=prov)
+            elif label == DocItemLabel.CODE:
+                current_list = None
+
+                doc.add_code(text=text, prov=prov)
             else:
                 current_list = None
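
With this branch in place, CODE elements coming out of the GLM stage land in the document as dedicated code items. A small illustration of the resulting API surface (the call names are taken from the diff; how the markdown serializer renders code items is an assumption):

from docling_core.types.doc import DoclingDocument

doc = DoclingDocument(name="sample")
doc.add_code(text="print('hello')")  # same call as in the branch above, prov omitted here
print(doc.export_to_markdown())      # code items are expected to render as fenced blocks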

View File

@@ -0,0 +1,9 @@
def map_tesseract_script(script: str) -> str:
    r"""Map a script name detected by Tesseract OSD to the name of its traineddata model."""
    if script == "Katakana" or script == "Hiragana":
        script = "Japanese"
    elif script == "Han":
        script = "HanS"
    elif script == "Korean":
        script = "Hangul"
    return script
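
The mapping is a straight lookup with pass-through for unknown scripts; for instance:

from docling.utils.ocr_utils import map_tesseract_script

assert map_tesseract_script("Hiragana") == "Japanese"  # Japanese OSD scripts share one model
assert map_tesseract_script("Han") == "HanS"           # simplified-Chinese model name
assert map_tesseract_script("Latin") == "Latin"        # unknown scripts pass through unchanged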

View File

@@ -0,0 +1,80 @@
from docling_core.types.doc import DocItemLabel
from PIL import Image, ImageDraw, ImageFont
from PIL.ImageFont import FreeTypeFont

from docling.datamodel.base_models import Cluster


def draw_clusters(
    image: Image.Image, clusters: list[Cluster], scale_x: float, scale_y: float
) -> None:
    """
    Draw clusters on an image
    """
    draw = ImageDraw.Draw(image, "RGBA")
    # Create a smaller font for the labels
    font: ImageFont.ImageFont | FreeTypeFont
    try:
        font = ImageFont.truetype("arial.ttf", 12)
    except OSError:
        # Fallback to default font if arial is not available
        font = ImageFont.load_default()
    for c_tl in clusters:
        all_clusters = [c_tl, *c_tl.children]
        for c in all_clusters:
            # Draw cells first (underneath)
            cell_color = (0, 0, 0, 40)  # Transparent black for cells
            for tc in c.cells:
                cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
                cx0 *= scale_x
                cx1 *= scale_x
                cy0 *= scale_y
                cy1 *= scale_y

                draw.rectangle(
                    [(cx0, cy0), (cx1, cy1)],
                    outline=None,
                    fill=cell_color,
                )
            # Draw cluster rectangle
            x0, y0, x1, y1 = c.bbox.as_tuple()
            x0 *= scale_x
            x1 *= scale_x
            y0 *= scale_y
            y1 *= scale_y

            cluster_fill_color = (*list(DocItemLabel.get_color(c.label)), 70)
            cluster_outline_color = (
                *list(DocItemLabel.get_color(c.label)),
                255,
            )
            draw.rectangle(
                [(x0, y0), (x1, y1)],
                outline=cluster_outline_color,
                fill=cluster_fill_color,
            )
            # Add label name and confidence
            label_text = f"{c.label.name} ({c.confidence:.2f})"
            # Create semi-transparent background for text
            text_bbox = draw.textbbox((x0, y0), label_text, font=font)
            text_bg_padding = 2
            draw.rectangle(
                [
                    (
                        text_bbox[0] - text_bg_padding,
                        text_bbox[1] - text_bg_padding,
                    ),
                    (
                        text_bbox[2] + text_bg_padding,
                        text_bbox[3] + text_bg_padding,
                    ),
                ],
                fill=(255, 255, 255, 180),  # Semi-transparent white
            )
            # Draw text
            draw.text(
                (x0, y0),
                label_text,
                fill=(0, 0, 0, 255),  # Solid black
                font=font,
            )
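
A short sketch of how this helper can be driven for debug renderings (the module path for the new file and the source of the page image and clusters are assumptions about the surrounding pipeline, not part of this diff):

from PIL import Image

from docling.utils.visualization import draw_clusters  # assumed module path for the new file


def save_cluster_overlay(page_image: Image.Image, clusters, out_path: str) -> None:
    image = page_image.copy()  # draw on a copy so the original page image stays intact
    # scale_x/scale_y map cluster coordinates to image pixels; 1.0 if they already agree
    draw_clusters(image, clusters, scale_x=1.0, scale_y=1.0)
    image.save(out_path)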

File diff suppressed because it is too large

View File

@@ -22,7 +22,6 @@ class ExamplePictureClassifierPipelineOptions(PdfPipelineOptions):

 class ExamplePictureClassifierEnrichmentModel(BaseEnrichmentModel):
     def __init__(self, enabled: bool):
         self.enabled = enabled

@@ -54,7 +53,6 @@ class ExamplePictureClassifierEnrichmentModel(BaseEnrichmentModel):

 class ExamplePictureClassifierPipeline(StandardPdfPipeline):
     def __init__(self, pipeline_options: ExamplePictureClassifierPipelineOptions):
         super().__init__(pipeline_options)
         self.pipeline_options: ExamplePictureClassifierPipelineOptions

View File

@@ -0,0 +1,894 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "Ag9kcX2B_atc"
},
"source": [
"<a href=\"https://colab.research.google.com/github/DS4SD/docling/blob/main/docs/examples/rag_azuresearch.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# RAG with Azure AI Search"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"| Step | Tech | Execution |\n",
"| ------------------ | ------------------ | --------- |\n",
"| Embedding | Azure OpenAI | 🌐 Remote |\n",
"| Vector Store | Azure AI Search | 🌐 Remote |\n",
"| Gen AI | Azure OpenAI | 🌐 Remote |"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"## A recipe 🧑‍🍳 🐥 💚\n",
"\n",
"This notebook demonstrates how to build a Retrieval-Augmented Generation (RAG) system using:\n",
"- [Docling](https://ds4sd.github.io/docling/) for document parsing and chunking\n",
"- [Azure AI Search](https://azure.microsoft.com/products/ai-services/ai-search/?msockid=0109678bea39665431e37323ebff6723) for vector indexing and retrieval\n",
"- [Azure OpenAI](https://azure.microsoft.com/products/ai-services/openai-service?msockid=0109678bea39665431e37323ebff6723) for embeddings and chat completion\n",
"\n",
"This sample demonstrates how to:\n",
"1. Parse a PDF with Docling.\n",
"2. Chunk the parsed text.\n",
"3. Use Azure OpenAI for embeddings.\n",
"4. Index and search in Azure AI Search.\n",
"5. Run a retrieval-augmented generation (RAG) query with Azure OpenAI GPT-4o.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# If running in a fresh environment (like Google Colab), uncomment and run this single command:\n",
"%pip install \"docling~=2.12\" azure-search-documents==11.5.2 azure-identity openai rich torch python-dotenv"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Part 0: Prerequisites\n",
" - **Azure AI Search** resource\n",
" - **Azure OpenAI** resource with a deployed embedding and chat completion model (e.g. `text-embedding-3-small` and `gpt-4o`) \n",
" - **Docling 2.12+** (installs `docling_core` automatically) Docling installed (Python 3.8+ environment)\n",
"\n",
"- A **GPU-enabled environment** is preferred for faster parsing. Docling 2.12 automatically detects GPU if present.\n",
" - If you only have CPU, parsing large PDFs can be slower. "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"from dotenv import load_dotenv\n",
"\n",
"load_dotenv()\n",
"\n",
"\n",
"def _get_env(key, default=None):\n",
" try:\n",
" from google.colab import userdata\n",
"\n",
" try:\n",
" return userdata.get(key)\n",
" except userdata.SecretNotFoundError:\n",
" pass\n",
" except ImportError:\n",
" pass\n",
" return os.getenv(key, default)\n",
"\n",
"\n",
"AZURE_SEARCH_ENDPOINT = _get_env(\"AZURE_SEARCH_ENDPOINT\")\n",
"AZURE_SEARCH_KEY = _get_env(\"AZURE_SEARCH_KEY\") # Ensure this is your Admin Key\n",
"AZURE_SEARCH_INDEX_NAME = _get_env(\"AZURE_SEARCH_INDEX_NAME\", \"docling-rag-sample\")\n",
"AZURE_OPENAI_ENDPOINT = _get_env(\"AZURE_OPENAI_ENDPOINT\")\n",
"AZURE_OPENAI_API_KEY = _get_env(\"AZURE_OPENAI_API_KEY\")\n",
"AZURE_OPENAI_API_VERSION = _get_env(\"AZURE_OPENAI_API_VERSION\", \"2024-10-21\")\n",
"AZURE_OPENAI_CHAT_MODEL = _get_env(\n",
" \"AZURE_OPENAI_CHAT_MODEL\"\n",
") # Using a deployed model named \"gpt-4o\"\n",
"AZURE_OPENAI_EMBEDDINGS = _get_env(\n",
" \"AZURE_OPENAI_EMBEDDINGS\", \"text-embedding-3-small\"\n",
") # Using a deployed model named \"text-embeddings-3-small\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Part 1: Parse the PDF with Docling\n",
"\n",
"Well parse the **Microsoft GraphRAG Research Paper** (~15 pages). Parsing should be relatively quick, even on CPU, but it will be faster on a GPU or MPS device if available.\n",
"\n",
"*(If you prefer a different document, simply provide a different URL or local file path.)*"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">Parsing a ~</span><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">15</span><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">-page PDF. The process should be relatively quick, even on CPU...</span>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1;33mParsing a ~\u001b[0m\u001b[1;33m15\u001b[0m\u001b[1;33m-page PDF. The process should be relatively quick, even on CPU\u001b[0m\u001b[1;33m...\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">╭─────────────────────────────────────────── Docling Markdown Preview ────────────────────────────────────────────╮\n",
"│ ## From Local to Global: A Graph RAG Approach to Query-Focused Summarization │\n",
"│ │\n",
"│ Darren Edge 1† │\n",
"│ │\n",
"│ Ha Trinh 1† │\n",
"│ │\n",
"│ Newman Cheng 2 │\n",
"│ │\n",
"│ Joshua Bradley 2 │\n",
"│ │\n",
"│ Alex Chao 3 │\n",
"│ │\n",
"│ Apurva Mody 3 │\n",
"│ │\n",
"│ Steven Truitt 2 │\n",
"│ │\n",
"│ ## Jonathan Larson 1 │\n",
"│ │\n",
"│ 1 Microsoft Research 2 Microsoft Strategic Missions and Technologies 3 Microsoft Office of the CTO │\n",
"│ │\n",
"│ { daedge,trinhha,newmancheng,joshbradley,achao,moapurva,steventruitt,jolarso } @microsoft.com │\n",
"│ │\n",
"│ † These authors contributed equally to this work │\n",
"│ │\n",
"│ ## Abstract │\n",
"│ │\n",
"│ The use of retrieval-augmented gen... │\n",
"╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
"</pre>\n"
],
"text/plain": [
"╭─────────────────────────────────────────── Docling Markdown Preview ────────────────────────────────────────────╮\n",
"│ ## From Local to Global: A Graph RAG Approach to Query-Focused Summarization │\n",
"│ │\n",
"│ Darren Edge 1† │\n",
"│ │\n",
"│ Ha Trinh 1† │\n",
"│ │\n",
"│ Newman Cheng 2 │\n",
"│ │\n",
"│ Joshua Bradley 2 │\n",
"│ │\n",
"│ Alex Chao 3 │\n",
"│ │\n",
"│ Apurva Mody 3 │\n",
"│ │\n",
"│ Steven Truitt 2 │\n",
"│ │\n",
"│ ## Jonathan Larson 1 │\n",
"│ │\n",
"│ 1 Microsoft Research 2 Microsoft Strategic Missions and Technologies 3 Microsoft Office of the CTO │\n",
"│ │\n",
"│ { daedge,trinhha,newmancheng,joshbradley,achao,moapurva,steventruitt,jolarso } @microsoft.com │\n",
"│ │\n",
"│ † These authors contributed equally to this work │\n",
"│ │\n",
"│ ## Abstract │\n",
"│ │\n",
"│ The use of retrieval-augmented gen... │\n",
"╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from rich.console import Console\n",
"from rich.panel import Panel\n",
"\n",
"from docling.document_converter import DocumentConverter\n",
"\n",
"console = Console()\n",
"\n",
"# This URL points to the Microsoft GraphRAG Research Paper (arXiv: 2404.16130), ~15 pages\n",
"source_url = \"https://arxiv.org/pdf/2404.16130\"\n",
"\n",
"console.print(\n",
" \"[bold yellow]Parsing a ~15-page PDF. The process should be relatively quick, even on CPU...[/bold yellow]\"\n",
")\n",
"converter = DocumentConverter()\n",
"result = converter.convert(source_url)\n",
"\n",
"# Optional: preview the parsed Markdown\n",
"md_preview = result.document.export_to_markdown()\n",
"console.print(Panel(md_preview[:500] + \"...\", title=\"Docling Markdown Preview\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Part 2: Hierarchical Chunking\n",
"We convert the `Document` into smaller chunks for embedding and indexing. The built-in `HierarchicalChunker` preserves structure. "
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Total chunks from PDF: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">106</span>\n",
"</pre>\n"
],
"text/plain": [
"Total chunks from PDF: \u001b[1;36m106\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from docling.chunking import HierarchicalChunker\n",
"\n",
"chunker = HierarchicalChunker()\n",
"doc_chunks = list(chunker.chunk(result.document))\n",
"\n",
"all_chunks = []\n",
"for idx, c in enumerate(doc_chunks):\n",
" chunk_text = c.text\n",
" all_chunks.append((f\"chunk_{idx}\", chunk_text))\n",
"\n",
"console.print(f\"Total chunks from PDF: {len(all_chunks)}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Part 3: Create Azure AI Search Index and Push Chunk Embeddings\n",
"Well define a vector index in Azure AI Search, then embed each chunk using Azure OpenAI and upload in batches."
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Index <span style=\"color: #008000; text-decoration-color: #008000\">'docling-rag-sample-2'</span> created.\n",
"</pre>\n"
],
"text/plain": [
"Index \u001b[32m'docling-rag-sample-2'\u001b[0m created.\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from azure.core.credentials import AzureKeyCredential\n",
"from azure.search.documents.indexes import SearchIndexClient\n",
"from azure.search.documents.indexes.models import (\n",
" AzureOpenAIVectorizer,\n",
" AzureOpenAIVectorizerParameters,\n",
" HnswAlgorithmConfiguration,\n",
" SearchableField,\n",
" SearchField,\n",
" SearchFieldDataType,\n",
" SearchIndex,\n",
" SimpleField,\n",
" VectorSearch,\n",
" VectorSearchProfile,\n",
")\n",
"from rich.console import Console\n",
"\n",
"console = Console()\n",
"\n",
"VECTOR_DIM = 1536 # Adjust based on your chosen embeddings model\n",
"\n",
"index_client = SearchIndexClient(\n",
" AZURE_SEARCH_ENDPOINT, AzureKeyCredential(AZURE_SEARCH_KEY)\n",
")\n",
"\n",
"\n",
"def create_search_index(index_name: str):\n",
" # Define fields\n",
" fields = [\n",
" SimpleField(name=\"chunk_id\", type=SearchFieldDataType.String, key=True),\n",
" SearchableField(name=\"content\", type=SearchFieldDataType.String),\n",
" SearchField(\n",
" name=\"content_vector\",\n",
" type=SearchFieldDataType.Collection(SearchFieldDataType.Single),\n",
" searchable=True,\n",
" filterable=False,\n",
" sortable=False,\n",
" facetable=False,\n",
" vector_search_dimensions=VECTOR_DIM,\n",
" vector_search_profile_name=\"default\",\n",
" ),\n",
" ]\n",
" # Vector search config with an AzureOpenAIVectorizer\n",
" vector_search = VectorSearch(\n",
" algorithms=[HnswAlgorithmConfiguration(name=\"default\")],\n",
" profiles=[\n",
" VectorSearchProfile(\n",
" name=\"default\",\n",
" algorithm_configuration_name=\"default\",\n",
" vectorizer_name=\"default\",\n",
" )\n",
" ],\n",
" vectorizers=[\n",
" AzureOpenAIVectorizer(\n",
" vectorizer_name=\"default\",\n",
" parameters=AzureOpenAIVectorizerParameters(\n",
" resource_url=AZURE_OPENAI_ENDPOINT,\n",
" deployment_name=AZURE_OPENAI_EMBEDDINGS,\n",
" model_name=\"text-embedding-3-small\",\n",
" api_key=AZURE_OPENAI_API_KEY,\n",
" ),\n",
" )\n",
" ],\n",
" )\n",
"\n",
" # Create or update the index\n",
" new_index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)\n",
" try:\n",
" index_client.delete_index(index_name)\n",
" except:\n",
" pass\n",
"\n",
" index_client.create_or_update_index(new_index)\n",
" console.print(f\"Index '{index_name}' created.\")\n",
"\n",
"\n",
"create_search_index(AZURE_SEARCH_INDEX_NAME)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Generate Embeddings and Upload to Azure AI Search\n"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Uploaded batch <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0</span> -&gt; <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">50</span>; all_succeeded: <span style=\"color: #00ff00; text-decoration-color: #00ff00; font-style: italic\">True</span>, first_doc_status_code: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">201</span>\n",
"</pre>\n"
],
"text/plain": [
"Uploaded batch \u001b[1;36m0\u001b[0m -> \u001b[1;36m50\u001b[0m; all_succeeded: \u001b[3;92mTrue\u001b[0m, first_doc_status_code: \u001b[1;36m201\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Uploaded batch <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">50</span> -&gt; <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">100</span>; all_succeeded: <span style=\"color: #00ff00; text-decoration-color: #00ff00; font-style: italic\">True</span>, first_doc_status_code: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">201</span>\n",
"</pre>\n"
],
"text/plain": [
"Uploaded batch \u001b[1;36m50\u001b[0m -> \u001b[1;36m100\u001b[0m; all_succeeded: \u001b[3;92mTrue\u001b[0m, first_doc_status_code: \u001b[1;36m201\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">Uploaded batch <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">100</span> -&gt; <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">106</span>; all_succeeded: <span style=\"color: #00ff00; text-decoration-color: #00ff00; font-style: italic\">True</span>, first_doc_status_code: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">201</span>\n",
"</pre>\n"
],
"text/plain": [
"Uploaded batch \u001b[1;36m100\u001b[0m -> \u001b[1;36m106\u001b[0m; all_succeeded: \u001b[3;92mTrue\u001b[0m, first_doc_status_code: \u001b[1;36m201\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">All chunks uploaded to Azure Search.\n",
"</pre>\n"
],
"text/plain": [
"All chunks uploaded to Azure Search.\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from azure.search.documents import SearchClient\n",
"from openai import AzureOpenAI\n",
"\n",
"search_client = SearchClient(\n",
" AZURE_SEARCH_ENDPOINT, AZURE_SEARCH_INDEX_NAME, AzureKeyCredential(AZURE_SEARCH_KEY)\n",
")\n",
"openai_client = AzureOpenAI(\n",
" api_key=AZURE_OPENAI_API_KEY,\n",
" api_version=AZURE_OPENAI_API_VERSION,\n",
" azure_endpoint=AZURE_OPENAI_ENDPOINT,\n",
")\n",
"\n",
"\n",
"def embed_text(text: str):\n",
" \"\"\"\n",
" Helper to generate embeddings with Azure OpenAI.\n",
" \"\"\"\n",
" response = openai_client.embeddings.create(\n",
" input=text, model=AZURE_OPENAI_EMBEDDINGS\n",
" )\n",
" return response.data[0].embedding\n",
"\n",
"\n",
"upload_docs = []\n",
"for chunk_id, chunk_text in all_chunks:\n",
" embedding_vector = embed_text(chunk_text)\n",
" upload_docs.append(\n",
" {\n",
" \"chunk_id\": chunk_id,\n",
" \"content\": chunk_text,\n",
" \"content_vector\": embedding_vector,\n",
" }\n",
" )\n",
"\n",
"\n",
"BATCH_SIZE = 50\n",
"for i in range(0, len(upload_docs), BATCH_SIZE):\n",
" subset = upload_docs[i : i + BATCH_SIZE]\n",
" resp = search_client.upload_documents(documents=subset)\n",
"\n",
" all_succeeded = all(r.succeeded for r in resp)\n",
" console.print(\n",
" f\"Uploaded batch {i} -> {i+len(subset)}; all_succeeded: {all_succeeded}, \"\n",
" f\"first_doc_status_code: {resp[0].status_code}\"\n",
" )\n",
"\n",
"console.print(\"All chunks uploaded to Azure Search.\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Part 4: Perform RAG over PDF\n",
"Combine retrieval from Azure AI Search with Azure OpenAI Chat Completions (aka. grounding your LLM)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">╭──────────────────────────────────────────────────</span> RAG Prompt <span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">───────────────────────────────────────────────────╮</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ You are an AI assistant helping answering questions about Microsoft GraphRAG. │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Use ONLY the text below to answer the user's question. │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ If the answer isn't in the text, say you don't know. │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Context: │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Community summaries vs. source texts. When comparing community summaries to source texts using Graph RAG, │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ community summaries generally provided a small but consistent improvement in answer comprehensiveness and │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ diversity, except for root-level summaries. Intermediate-level summaries in the Podcast dataset and low-level │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ community summaries in the News dataset achieved comprehensiveness win rates of 57% and 64%, respectively. │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Diversity win rates were 57% for Podcast intermediate-level summaries and 60% for News low-level community │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ summaries. Table 3 also illustrates the scalability advantages of Graph RAG compared to source text │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ summarization: for low-level community summaries ( C3 ), Graph RAG required 26-33% fewer context tokens, while │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ for root-level community summaries ( C0 ), it required over 97% fewer tokens. For a modest drop in performance │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ compared with other global methods, root-level Graph RAG offers a highly efficient method for the iterative │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ question answering that characterizes sensemaking activity, while retaining advantages in comprehensiveness │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ (72% win rate) and diversity (62% win rate) over na¨ıve RAG. │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ --- │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ We have presented a global approach to Graph RAG, combining knowledge graph generation, retrieval-augmented │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ generation (RAG), and query-focused summarization (QFS) to support human sensemaking over entire text corpora. │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Initial evaluations show substantial improvements over a na¨ıve RAG baseline for both the comprehensiveness and │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ diversity of answers, as well as favorable comparisons to a global but graph-free approach using map-reduce │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ source text summarization. For situations requiring many global queries over the same dataset, summaries of │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ root-level communities in the entity-based graph index provide a data index that is both superior to na¨ıve RAG │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ and achieves competitive performance to other global methods at a fraction of the token cost. │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ --- │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Trade-offs of building a graph index . We consistently observed Graph RAG achieve the best headto-head results │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ against other methods, but in many cases the graph-free approach to global summarization of source texts │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ performed competitively. The real-world decision about whether to invest in building a graph index depends on │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ multiple factors, including the compute budget, expected number of lifetime queries per dataset, and value │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ obtained from other aspects of the graph index (including the generic community summaries and the use of other │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ graph-related RAG approaches). │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ --- │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Future work . The graph index, rich text annotations, and hierarchical community structure supporting the │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ current Graph RAG approach offer many possibilities for refinement and adaptation. This includes RAG approaches │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ that operate in a more local manner, via embedding-based matching of user queries and graph annotations, as │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ well as the possibility of hybrid RAG schemes that combine embedding-based matching against community reports │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ before employing our map-reduce summarization mechanisms. This 'roll-up' operation could also be extended │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ across more levels of the community hierarchy, as well as implemented as a more exploratory 'drill down' │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ mechanism that follows the information scent contained in higher-level community summaries. │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ --- │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Advanced RAG systems include pre-retrieval, retrieval, post-retrieval strategies designed to overcome the │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ drawbacks of Na¨ıve RAG, while Modular RAG systems include patterns for iterative and dynamic cycles of │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ interleaved retrieval and generation (Gao et al., 2023). Our implementation of Graph RAG incorporates multiple │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ concepts related to other systems. For example, our community summaries are a kind of self-memory (Selfmem, │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Cheng et al., 2024) for generation-augmented retrieval (GAR, Mao et al., 2020) that facilitates future │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ generation cycles, while our parallel generation of community answers from these summaries is a kind of │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ iterative (Iter-RetGen, Shao et al., 2023) or federated (FeB4RAG, Wang et al., 2024) retrieval-generation │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ strategy. Other systems have also combined these concepts for multi-document summarization (CAiRE-COVID, Su et │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ al., 2020) and multi-hop question answering (ITRG, Feng et al., 2023; IR-CoT, Trivedi et al., 2022; DSP, │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Khattab et al., 2022). Our use of a hierarchical index and summarization also bears resemblance to further │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ approaches, such as generating a hierarchical index of text chunks by clustering the vectors of text embeddings │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ (RAPTOR, Sarthi et al., 2024) or generating a 'tree of clarifications' to answer multiple interpretations of │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ ambiguous questions (Kim et al., 2023). However, none of these iterative or hierarchical approaches use the │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ kind of self-generated graph index that enables Graph RAG. │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ --- │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ The use of retrieval-augmented generation (RAG) to retrieve relevant information from an external knowledge │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ source enables large language models (LLMs) to answer questions over private and/or previously unseen document │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ collections. However, RAG fails on global questions directed at an entire text corpus, such as 'What are the │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ main themes in the dataset?', since this is inherently a queryfocused summarization (QFS) task, rather than an │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ explicit retrieval task. Prior QFS methods, meanwhile, fail to scale to the quantities of text indexed by │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ typical RAGsystems. To combine the strengths of these contrasting methods, we propose a Graph RAG approach to │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ question answering over private text corpora that scales with both the generality of user questions and the │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ quantity of source text to be indexed. Our approach uses an LLM to build a graph-based text index in two │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ stages: first to derive an entity knowledge graph from the source documents, then to pregenerate community │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ summaries for all groups of closely-related entities. Given a question, each community summary is used to │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ generate a partial response, before all partial responses are again summarized in a final response to the user. │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ For a class of global sensemaking questions over datasets in the 1 million token range, we show that Graph RAG │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ leads to substantial improvements over a na¨ıve RAG baseline for both the comprehensiveness and diversity of │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ generated answers. An open-source, Python-based implementation of both global and local Graph RAG approaches is │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ forthcoming at https://aka . ms/graphrag . │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ --- │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Given the multi-stage nature of our Graph RAG mechanism, the multiple conditions we wanted to compare, and the │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ lack of gold standard answers to our activity-based sensemaking questions, we decided to adopt a head-to-head │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ comparison approach using an LLM evaluator. We selected three target metrics capturing qualities that are │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ desirable for sensemaking activities, as well as a control metric (directness) used as a indicator of validity. │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Since directness is effectively in opposition to comprehensiveness and diversity, we would not expect any │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ method to win across all four metrics. │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ --- │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Figure 1: Graph RAG pipeline using an LLM-derived graph index of source document text. This index spans nodes │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ (e.g., entities), edges (e.g., relationships), and covariates (e.g., claims) that have been detected, │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ extracted, and summarized by LLM prompts tailored to the domain of the dataset. Community detection (e.g., │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Leiden, Traag et al., 2019) is used to partition the graph index into groups of elements (nodes, edges, │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ covariates) that the LLM can summarize in parallel at both indexing time and query time. The 'global answer' to │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ a given query is produced using a final round of query-focused summarization over all community summaries │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ reporting relevance to that query. │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ --- │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Retrieval-augmented generation (RAG, Lewis et al., 2020) is an established approach to answering user questions │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ over entire datasets, but it is designed for situations where these answers are contained locally within │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ regions of text whose retrieval provides sufficient grounding for the generation task. Instead, a more │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ appropriate task framing is query-focused summarization (QFS, Dang, 2006), and in particular, query-focused │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ abstractive summarization that generates natural language summaries and not just concatenated excerpts (Baumel │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ et al., 2018; Laskar et al., 2020; Yao et al., 2017) . In recent years, however, such distinctions between │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ summarization tasks that are abstractive versus extractive, generic versus query-focused, and single-document │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ versus multi-document, have become less relevant. While early applications of the transformer architecture │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ showed substantial improvements on the state-of-the-art for all such summarization tasks (Goodwin et al., 2020; │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Laskar et al., 2022; Liu and Lapata, 2019), these tasks are now trivialized by modern LLMs, including the GPT │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ (Achiam et al., 2023; Brown et al., 2020), Llama (Touvron et al., 2023), and Gemini (Anil et al., 2023) series, │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ all of which can use in-context learning to summarize any content provided in their context window. │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ --- │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ community descriptions provide complete coverage of the underlying graph index and the input documents it │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ represents. Query-focused summarization of an entire corpus is then made possible using a map-reduce approach: │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ first using each community summary to answer the query independently and in parallel, then summarizing all │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ relevant partial answers into a final global answer. │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Question: What are the main advantages of using the Graph RAG approach for query-focused summarization compared │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ to traditional RAG methods? │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ Answer: │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│ │</span>\n",
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1;31m╭─\u001b[0m\u001b[1;31m─────────────────────────────────────────────────\u001b[0m RAG Prompt \u001b[1;31m──────────────────────────────────────────────────\u001b[0m\u001b[1;31m─╮\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mYou are an AI assistant helping answering questions about Microsoft GraphRAG.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mUse ONLY the text below to answer the user's question.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mIf the answer isn't in the text, say you don't know.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mContext:\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mCommunity summaries vs. source texts. When comparing community summaries to source texts using Graph RAG, \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mcommunity summaries generally provided a small but consistent improvement in answer comprehensiveness and \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mdiversity, except for root-level summaries. Intermediate-level summaries in the Podcast dataset and low-level \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mcommunity summaries in the News dataset achieved comprehensiveness win rates of 57% and 64%, respectively. \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mDiversity win rates were 57% for Podcast intermediate-level summaries and 60% for News low-level community \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31msummaries. Table 3 also illustrates the scalability advantages of Graph RAG compared to source text \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31msummarization: for low-level community summaries ( C3 ), Graph RAG required 26-33% fewer context tokens, while \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mfor root-level community summaries ( C0 ), it required over 97% fewer tokens. For a modest drop in performance \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mcompared with other global methods, root-level Graph RAG offers a highly efficient method for the iterative \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mquestion answering that characterizes sensemaking activity, while retaining advantages in comprehensiveness \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m(72% win rate) and diversity (62% win rate) over na¨ıve RAG.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m---\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mWe have presented a global approach to Graph RAG, combining knowledge graph generation, retrieval-augmented \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mgeneration (RAG), and query-focused summarization (QFS) to support human sensemaking over entire text corpora. \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mInitial evaluations show substantial improvements over a na¨ıve RAG baseline for both the comprehensiveness and\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mdiversity of answers, as well as favorable comparisons to a global but graph-free approach using map-reduce \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31msource text summarization. For situations requiring many global queries over the same dataset, summaries of \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mroot-level communities in the entity-based graph index provide a data index that is both superior to na¨ıve RAG\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mand achieves competitive performance to other global methods at a fraction of the token cost.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m---\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mTrade-offs of building a graph index . We consistently observed Graph RAG achieve the best headto-head results \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31magainst other methods, but in many cases the graph-free approach to global summarization of source texts \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mperformed competitively. The real-world decision about whether to invest in building a graph index depends on \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mmultiple factors, including the compute budget, expected number of lifetime queries per dataset, and value \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mobtained from other aspects of the graph index (including the generic community summaries and the use of other \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mgraph-related RAG approaches).\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m---\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mFuture work . The graph index, rich text annotations, and hierarchical community structure supporting the \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mcurrent Graph RAG approach offer many possibilities for refinement and adaptation. This includes RAG approaches\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mthat operate in a more local manner, via embedding-based matching of user queries and graph annotations, as \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mwell as the possibility of hybrid RAG schemes that combine embedding-based matching against community reports \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mbefore employing our map-reduce summarization mechanisms. This 'roll-up' operation could also be extended \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31macross more levels of the community hierarchy, as well as implemented as a more exploratory 'drill down' \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mmechanism that follows the information scent contained in higher-level community summaries.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m---\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mAdvanced RAG systems include pre-retrieval, retrieval, post-retrieval strategies designed to overcome the \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mdrawbacks of Na¨ıve RAG, while Modular RAG systems include patterns for iterative and dynamic cycles of \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31minterleaved retrieval and generation (Gao et al., 2023). Our implementation of Graph RAG incorporates multiple \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mconcepts related to other systems. For example, our community summaries are a kind of self-memory (Selfmem, \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mCheng et al., 2024) for generation-augmented retrieval (GAR, Mao et al., 2020) that facilitates future \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mgeneration cycles, while our parallel generation of community answers from these summaries is a kind of \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31miterative (Iter-RetGen, Shao et al., 2023) or federated (FeB4RAG, Wang et al., 2024) retrieval-generation \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mstrategy. Other systems have also combined these concepts for multi-document summarization (CAiRE-COVID, Su et \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mal., 2020) and multi-hop question answering (ITRG, Feng et al., 2023; IR-CoT, Trivedi et al., 2022; DSP, \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mKhattab et al., 2022). Our use of a hierarchical index and summarization also bears resemblance to further \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mapproaches, such as generating a hierarchical index of text chunks by clustering the vectors of text embeddings\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m(RAPTOR, Sarthi et al., 2024) or generating a 'tree of clarifications' to answer multiple interpretations of \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mambiguous questions (Kim et al., 2023). However, none of these iterative or hierarchical approaches use the \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mkind of self-generated graph index that enables Graph RAG.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m---\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mThe use of retrieval-augmented generation (RAG) to retrieve relevant information from an external knowledge \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31msource enables large language models (LLMs) to answer questions over private and/or previously unseen document \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mcollections. However, RAG fails on global questions directed at an entire text corpus, such as 'What are the \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mmain themes in the dataset?', since this is inherently a queryfocused summarization (QFS) task, rather than an \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mexplicit retrieval task. Prior QFS methods, meanwhile, fail to scale to the quantities of text indexed by \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mtypical RAGsystems. To combine the strengths of these contrasting methods, we propose a Graph RAG approach to \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mquestion answering over private text corpora that scales with both the generality of user questions and the \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mquantity of source text to be indexed. Our approach uses an LLM to build a graph-based text index in two \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mstages: first to derive an entity knowledge graph from the source documents, then to pregenerate community \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31msummaries for all groups of closely-related entities. Given a question, each community summary is used to \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mgenerate a partial response, before all partial responses are again summarized in a final response to the user.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mFor a class of global sensemaking questions over datasets in the 1 million token range, we show that Graph RAG \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mleads to substantial improvements over a na¨ıve RAG baseline for both the comprehensiveness and diversity of \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mgenerated answers. An open-source, Python-based implementation of both global and local Graph RAG approaches is\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mforthcoming at https://aka . ms/graphrag .\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m---\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mGiven the multi-stage nature of our Graph RAG mechanism, the multiple conditions we wanted to compare, and the \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mlack of gold standard answers to our activity-based sensemaking questions, we decided to adopt a head-to-head \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mcomparison approach using an LLM evaluator. We selected three target metrics capturing qualities that are \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mdesirable for sensemaking activities, as well as a control metric (directness) used as a indicator of validity.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mSince directness is effectively in opposition to comprehensiveness and diversity, we would not expect any \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mmethod to win across all four metrics.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m---\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mFigure 1: Graph RAG pipeline using an LLM-derived graph index of source document text. This index spans nodes \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m(e.g., entities), edges (e.g., relationships), and covariates (e.g., claims) that have been detected, \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mextracted, and summarized by LLM prompts tailored to the domain of the dataset. Community detection (e.g., \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mLeiden, Traag et al., 2019) is used to partition the graph index into groups of elements (nodes, edges, \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mcovariates) that the LLM can summarize in parallel at both indexing time and query time. The 'global answer' to\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31ma given query is produced using a final round of query-focused summarization over all community summaries \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mreporting relevance to that query.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m---\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mRetrieval-augmented generation (RAG, Lewis et al., 2020) is an established approach to answering user questions\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mover entire datasets, but it is designed for situations where these answers are contained locally within \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mregions of text whose retrieval provides sufficient grounding for the generation task. Instead, a more \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mappropriate task framing is query-focused summarization (QFS, Dang, 2006), and in particular, query-focused \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mabstractive summarization that generates natural language summaries and not just concatenated excerpts (Baumel \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31met al., 2018; Laskar et al., 2020; Yao et al., 2017) . In recent years, however, such distinctions between \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31msummarization tasks that are abstractive versus extractive, generic versus query-focused, and single-document \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mversus multi-document, have become less relevant. While early applications of the transformer architecture \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mshowed substantial improvements on the state-of-the-art for all such summarization tasks (Goodwin et al., 2020;\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mLaskar et al., 2022; Liu and Lapata, 2019), these tasks are now trivialized by modern LLMs, including the GPT \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m(Achiam et al., 2023; Brown et al., 2020), Llama (Touvron et al., 2023), and Gemini (Anil et al., 2023) series,\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mall of which can use in-context learning to summarize any content provided in their context window.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m---\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mcommunity descriptions provide complete coverage of the underlying graph index and the input documents it \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mrepresents. Query-focused summarization of an entire corpus is then made possible using a map-reduce approach: \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mfirst using each community summary to answer the query independently and in parallel, then summarizing all \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mrelevant partial answers into a final global answer.\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mQuestion: What are the main advantages of using the Graph RAG approach for query-focused summarization compared\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mto traditional RAG methods?\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31mAnswer:\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m│\u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m│\u001b[0m\n",
"\u001b[1;31m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">╭─────────────────────────────────────────────────</span> RAG Response <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">──────────────────────────────────────────────────╮</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ The main advantages of using the Graph RAG approach for query-focused summarization compared to traditional RAG │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ methods include: │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ 1. **Improved Comprehensiveness and Diversity**: Graph RAG shows substantial improvements over a naïve RAG │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ baseline in terms of the comprehensiveness and diversity of answers. This is particularly beneficial for global │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ sensemaking questions over large datasets. │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ 2. **Scalability**: Graph RAG provides scalability advantages, achieving efficient summarization with │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ significantly fewer context tokens required. For instance, it requires 26-33% fewer tokens for low-level │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ community summaries and over 97% fewer tokens for root-level summaries compared to source text summarization. │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ 3. **Efficiency in Iterative Question Answering**: Root-level Graph RAG offers a highly efficient method for │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ iterative question answering, which is crucial for sensemaking activities, with only a modest drop in │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ performance compared to other global methods. │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ 4. **Global Query Handling**: It supports handling global queries effectively, as it combines knowledge graph │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ generation, retrieval-augmented generation, and query-focused summarization, making it suitable for sensemaking │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ over entire text corpora. │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ 5. **Hierarchical Indexing and Summarization**: The use of a hierarchical index and summarization allows for │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ efficient processing and summarizing of community summaries into a final global answer, facilitating a │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ comprehensive coverage of the underlying graph index and input documents. │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ 6. **Reduced Token Cost**: For situations requiring many global queries over the same dataset, Graph RAG │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│ achieves competitive performance to other global methods at a fraction of the token cost. │</span>\n",
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1;32m╭─\u001b[0m\u001b[1;32m────────────────────────────────────────────────\u001b[0m RAG Response \u001b[1;32m─────────────────────────────────────────────────\u001b[0m\u001b[1;32m─╮\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32mThe main advantages of using the Graph RAG approach for query-focused summarization compared to traditional RAG\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32mmethods include:\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m1. **Improved Comprehensiveness and Diversity**: Graph RAG shows substantial improvements over a naïve RAG \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32mbaseline in terms of the comprehensiveness and diversity of answers. This is particularly beneficial for global\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32msensemaking questions over large datasets.\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m2. **Scalability**: Graph RAG provides scalability advantages, achieving efficient summarization with \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32msignificantly fewer context tokens required. For instance, it requires 26-33% fewer tokens for low-level \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32mcommunity summaries and over 97% fewer tokens for root-level summaries compared to source text summarization.\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m3. **Efficiency in Iterative Question Answering**: Root-level Graph RAG offers a highly efficient method for \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32miterative question answering, which is crucial for sensemaking activities, with only a modest drop in \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32mperformance compared to other global methods.\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m4. **Global Query Handling**: It supports handling global queries effectively, as it combines knowledge graph \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32mgeneration, retrieval-augmented generation, and query-focused summarization, making it suitable for sensemaking\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32mover entire text corpora.\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m5. **Hierarchical Indexing and Summarization**: The use of a hierarchical index and summarization allows for \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32mefficient processing and summarizing of community summaries into a final global answer, facilitating a \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32mcomprehensive coverage of the underlying graph index and input documents.\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m6. **Reduced Token Cost**: For situations requiring many global queries over the same dataset, Graph RAG \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32machieves competitive performance to other global methods at a fraction of the token cost.\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from azure.search.documents.models import VectorizableTextQuery\n",
"\n",
"\n",
"def generate_chat_response(prompt: str, system_message: str = None):\n",
" \"\"\"\n",
" Generates a single-turn chat response using Azure OpenAI Chat.\n",
" If you need multi-turn conversation or follow-up queries, you'll have to\n",
" maintain the messages list externally.\n",
" \"\"\"\n",
" messages = []\n",
" if system_message:\n",
" messages.append({\"role\": \"system\", \"content\": system_message})\n",
" messages.append({\"role\": \"user\", \"content\": prompt})\n",
"\n",
" completion = openai_client.chat.completions.create(\n",
" model=AZURE_OPENAI_CHAT_MODEL, messages=messages, temperature=0.7\n",
" )\n",
" return completion.choices[0].message.content\n",
"\n",
"\n",
"user_query = \"What are the main advantages of using the Graph RAG approach for query-focused summarization compared to traditional RAG methods?\"\n",
"user_embed = embed_text(user_query)\n",
"\n",
"vector_query = VectorizableTextQuery(\n",
" text=user_query, # passing in text for a hybrid search\n",
" k_nearest_neighbors=5,\n",
" fields=\"content_vector\",\n",
")\n",
"\n",
"search_results = search_client.search(\n",
" search_text=user_query, vector_queries=[vector_query], select=[\"content\"], top=10\n",
")\n",
"\n",
"retrieved_chunks = []\n",
"for result in search_results:\n",
" snippet = result[\"content\"]\n",
" retrieved_chunks.append(snippet)\n",
"\n",
"context_str = \"\\n---\\n\".join(retrieved_chunks)\n",
"rag_prompt = f\"\"\"\n",
"You are an AI assistant helping answering questions about Microsoft GraphRAG.\n",
"Use ONLY the text below to answer the user's question.\n",
"If the answer isn't in the text, say you don't know.\n",
"\n",
"Context:\n",
"{context_str}\n",
"\n",
"Question: {user_query}\n",
"Answer:\n",
"\"\"\"\n",
"\n",
"final_answer = generate_chat_response(rag_prompt)\n",
"\n",
"console.print(Panel(rag_prompt, title=\"RAG Prompt\", style=\"bold red\"))\n",
"console.print(Panel(final_answer, title=\"RAG Response\", style=\"bold green\"))"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

View File

@ -0,0 +1,37 @@
from pathlib import Path

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    TesseractCliOcrOptions,
    TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption


def main():
    input_doc = Path("./tests/data/2206.01062.pdf")

    # Set lang=["auto"] with a tesseract OCR engine: TesseractOcrOptions, TesseractCliOcrOptions
    # ocr_options = TesseractOcrOptions(lang=["auto"])
    ocr_options = TesseractCliOcrOptions(lang=["auto"])

    pipeline_options = PdfPipelineOptions(
        do_ocr=True, force_full_page_ocr=True, ocr_options=ocr_options
    )

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
            )
        }
    )

    doc = converter.convert(input_doc).document
    md = doc.export_to_markdown()
    print(md)


if __name__ == "__main__":
    main()

View File

@ -24,6 +24,20 @@ docling https://arxiv.org/pdf/2206.01062
To see all available options (export formats etc.) run `docling --help`. More details in the [CLI reference page](./reference/cli.md).
### Supported formats
The document conversion in Docling supports several popular formats, including:
- **PDF** (Portable Document Format): the format developed by Adobe to present documents compatible across application software, hardware, and operating systems.
- **.docx**, **.xlsx**, **.pptx** (Word, Excel, and PowerPoint): the Open XML formats supported by Microsoft Office.
- **Markdown**: a lightweight markup language to add formatting elements to plain text documents.
- **AsciiDoc**: a plain text markup language for writing technical content.
- **HTML** (Hypertext Markup Language): the standard markup language for creating web pages.
- **XHTML** (Extensible Hypertext Markup Language): the XML-based version of HTML.
- **XML** (Extensible Markup Language): a markup format for storing and transmitting data. Due to its flexibility, Docling requires custom implementations to identify the
semantics of the data. Currently, Docling supports the parsing of [USPTO](https://www.uspto.gov/patents) patents and [PubMed Central® (PMC)](https://pmc.ncbi.nlm.nih.gov/) articles.
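All of these formats flow through the same `DocumentConverter` entry point. As a minimal sketch (the filename `report.docx` is only illustrative; any of the formats above works the same way):

```python
from docling.document_converter import DocumentConverter

# Docling auto-detects the input format from the file content,
# so the same call works for PDF, DOCX, HTML, Markdown, etc.
converter = DocumentConverter()
result = converter.convert("report.docx")
print(result.document.export_to_markdown())
```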
### Advanced options
#### Adjust pipeline features
@ -126,6 +140,32 @@ result = converter.convert(source)
You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS`. By default, Docling uses 4 CPU threads.
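A minimal sketch (note that the variable generally needs to be set before Docling and the numerical libraries it relies on are imported, since the OpenMP runtime reads it once at startup):

```python
import os

# Limit Docling to 2 CPU threads; set before the first docling import.
os.environ["OMP_NUM_THREADS"] = "2"

from docling.document_converter import DocumentConverter

converter = DocumentConverter()
```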
#### Use specific backend converters
By default, Docling will try to identify the document format to apply the appropriate conversion backend (see the list of [supported formats](#supported-formats)).
You can restrict the `DocumentConverter` to a set of allowed document formats, as shown in the [Multi-format conversion](./examples/run_with_formats.py) example.
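As a minimal sketch (the chosen set of formats is only illustrative):

```python
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter

# Inputs outside this set are rejected instead of auto-detected.
converter = DocumentConverter(
    allowed_formats=[InputFormat.PDF, InputFormat.MD],
)
```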
Alternatively, you can use the specific backend that matches your document content. For instance, you can use `HTMLDocumentBackend` for HTML pages:
```python
import urllib.request
from io import BytesIO

from docling.backend.html_backend import HTMLDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument

url = "https://en.wikipedia.org/wiki/Duck"
text = urllib.request.urlopen(url).read()

in_doc = InputDocument(
    path_or_stream=BytesIO(text),
    format=InputFormat.HTML,
    backend=HTMLDocumentBackend,
    filename="duck.html",
)
backend = HTMLDocumentBackend(in_doc=in_doc, path_or_stream=BytesIO(text))
result = backend.convert()
print(result.export_to_markdown())
```
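Note that instantiating a backend directly bypasses the `DocumentConverter` machinery (format detection, pipeline options), so this path is best suited to cases where the input format is known in advance.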
## Chunking
You can chunk a Docling document using a [chunker](concepts/chunking.md), such as a

View File

@ -75,16 +75,20 @@ nav:
- "Table export": examples/export_tables.py - "Table export": examples/export_tables.py
- "Multimodal export": examples/export_multimodal.py - "Multimodal export": examples/export_multimodal.py
- "Force full page OCR": examples/full_page_ocr.py - "Force full page OCR": examples/full_page_ocr.py
- "Automatic OCR language detection with tesseract": examples/tesseract_lang_detection.py
- "Accelerator options": examples/run_with_accelerator.py - "Accelerator options": examples/run_with_accelerator.py
- "Simple translation": examples/translate.py - "Simple translation": examples/translate.py
- examples/backend_xml_rag.ipynb
- ✂️ Chunking: - ✂️ Chunking:
- "Hybrid chunking": examples/hybrid_chunking.ipynb - examples/hybrid_chunking.ipynb
- 💬 RAG / QA: - 🤖 RAG with AI dev frameworks:
- examples/rag_haystack.ipynb - examples/rag_haystack.ipynb
- examples/rag_llamaindex.ipynb
- examples/rag_langchain.ipynb - examples/rag_langchain.ipynb
- examples/rag_llamaindex.ipynb
- 🗂️ More examples:
- examples/rag_weaviate.ipynb - examples/rag_weaviate.ipynb
- RAG with Granite [↗]: https://github.com/ibm-granite-community/granite-snack-cookbook/blob/main/recipes/RAG/Granite_Docling_RAG.ipynb - RAG with Granite [↗]: https://github.com/ibm-granite-community/granite-snack-cookbook/blob/main/recipes/RAG/Granite_Docling_RAG.ipynb
- examples/rag_azuresearch.ipynb
- examples/retrieval_qdrant.ipynb - examples/retrieval_qdrant.ipynb
- Integrations: - Integrations:
- Integrations: integrations/index.md - Integrations: integrations/index.md

poetry.lock generated
View File

@ -3823,10 +3823,10 @@ files = [
numpy = [
    {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
    {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
    {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
    {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
    {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
    {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
]

[[package]]
@ -3849,10 +3849,10 @@ files = [
numpy = [
    {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
    {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
    {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
    {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
    {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
    {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
]

[[package]]
@ -4037,8 +4037,8 @@ files = [
[package.dependencies]
numpy = [
    {version = ">=1.22.4", markers = "python_version < \"3.11\""},
    {version = ">=1.23.2", markers = "python_version == \"3.11\""},
    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
    {version = ">=1.23.2", markers = "python_version == \"3.11\""},
]
python-dateutil = ">=2.8.2"
pytz = ">=2020.1"

View File

@ -1,6 +1,6 @@
[tool.poetry]
name = "docling"
version = "2.15.1" # DO NOT EDIT, updated automatically
version = "2.16.0" # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
license = "MIT"
@ -25,9 +25,9 @@ packages = [{include = "docling"}]
# actual dependencies:
######################
python = "^3.9"
docling-core = { version = "^2.13.1", extras = ["chunking"] }
pydantic = "^2.0.0"
docling-ibm-models = "^3.1.0"
docling-core = { version = "^2.15.1", extras = ["chunking"] }
docling-ibm-models = "^3.3.0"
deepsearch-glm = "^1.0.0"
docling-parse = "^3.1.0"
filetype = "^1.2.0"
@ -57,6 +57,7 @@ onnxruntime = [
{ version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" }
]
pylatexenc = "^2.10"
pillow = "^10.0.0"

[tool.poetry.group.dev.dependencies]
black = {extras = ["jupyter"], version = "^24.4.2"}

Binary file not shown.

View File

@ -0,0 +1,13 @@
<document>
<subtitle-level-1><location><page_1><loc_22><loc_83><loc_45><loc_84></location>Java Code Example</subtitle-level-1>
<paragraph><location><page_1><loc_22><loc_63><loc_78><loc_81></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</paragraph>
<paragraph><location><page_1><loc_39><loc_61><loc_61><loc_62></location>Listing 1: Simple Java Program</paragraph>
<paragraph><location><page_1><loc_22><loc_56><loc_55><loc_60></location>public static void print() { System.out.println( "Java Code" ); }</paragraph>
<paragraph><location><page_1><loc_22><loc_37><loc_78><loc_55></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</paragraph>
<subtitle-level-1><location><page_2><loc_22><loc_84><loc_32><loc_85></location>Formula</subtitle-level-1>
<paragraph><location><page_2><loc_22><loc_65><loc_80><loc_82></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</paragraph>
<paragraph><location><page_2><loc_22><loc_58><loc_80><loc_65></location>Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt.</paragraph>
<paragraph><location><page_2><loc_22><loc_38><loc_80><loc_55></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</paragraph>
<paragraph><location><page_2><loc_22><loc_29><loc_80><loc_38></location>Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet dolore magna aliquam erat volutpat.</paragraph>
<paragraph><location><page_2><loc_22><loc_21><loc_80><loc_29></location>Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet dolore magna aliquam erat volutpat.</paragraph>
</document>

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,19 @@
## Java Code Example
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
Listing 1: Simple Java Program
public static void print() { System.out.println( "Java Code" ); }
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
## Formula
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt.
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet dolore magna aliquam erat volutpat.

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,17 @@
<document>
<subtitle-level-1><location><page_1><loc_22><loc_83><loc_41><loc_84></location>Figures Example</subtitle-level-1>
<paragraph><location><page_1><loc_22><loc_63><loc_78><loc_81></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</paragraph>
<caption><location><page_1><loc_37><loc_32><loc_63><loc_33></location>Figure 1: This is an example image.</caption>
<figure>
<location><page_1><loc_22><loc_36><loc_78><loc_62></location>
<caption>Figure 1: This is an example image.</caption>
</figure>
<paragraph><location><page_1><loc_22><loc_15><loc_78><loc_30></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua.</paragraph>
<paragraph><location><page_2><loc_22><loc_66><loc_78><loc_84></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</paragraph>
<caption><location><page_2><loc_37><loc_33><loc_63><loc_34></location>Figure 2: This is an example image.</caption>
<figure>
<location><page_2><loc_36><loc_36><loc_64><loc_65></location>
<caption>Figure 2: This is an example image.</caption>
</figure>
<paragraph><location><page_2><loc_22><loc_15><loc_78><loc_31></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum.</paragraph>
</document>

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,15 @@
## Figures Example
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
Figure 1: This is an example image.
<!-- image -->
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua.
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
Figure 2: This is an example image.
<!-- image -->
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum.

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,14 @@
<document>
<section_header_level_1><location><page_1><loc_22><loc_83><loc_45><loc_84></location>Java Code Example</section_header_level_1>
<text><location><page_1><loc_22><loc_63><loc_78><loc_81></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</text>
<paragraph><location><page_1><loc_39><loc_61><loc_61><loc_62></location>Listing 1: Simple Java Program</paragraph>
<code><location><page_1><loc_22><loc_56><loc_55><loc_60></location>public static void print() { System.out.println( "Java Code" ); }</code>
<text><location><page_1><loc_22><loc_37><loc_78><loc_55></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</text>
<section_header_level_1><location><page_2><loc_22><loc_84><loc_32><loc_85></location>Formula</section_header_level_1>
<text><location><page_2><loc_22><loc_65><loc_80><loc_82></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</text>
<text><location><page_2><loc_22><loc_58><loc_80><loc_65></location>Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt.</text>
<formula><location><page_2><loc_47><loc_56><loc_56><loc_57></location>a 2 + 8 = 12</formula>
<text><location><page_2><loc_22><loc_38><loc_80><loc_55></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</text>
<text><location><page_2><loc_22><loc_29><loc_80><loc_38></location>Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet dolore magna aliquam erat volutpat.</text>
<text><location><page_2><loc_22><loc_21><loc_80><loc_29></location>Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet dolore magna aliquam erat volutpat.</text>
</document>

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,25 @@
## Java Code Example
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
Listing 1: Simple Java Program
```
public static void print() { System.out.println( "Java Code" ); }
```
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
## Formula
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt.
a 2 + 8 = 12
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet dolore magna aliquam erat volutpat.
Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet dolore magna aliquam erat volutpat.

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,15 @@
<document>
<section_header_level_1><location><page_1><loc_22><loc_83><loc_41><loc_84></location>Figures Example</section_header_level_1>
<text><location><page_1><loc_22><loc_63><loc_78><loc_81></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</text>
<figure>
<location><page_1><loc_22><loc_36><loc_78><loc_62></location>
<caption>Figure 1: This is an example image.</caption>
</figure>
<text><location><page_1><loc_22><loc_15><loc_78><loc_30></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua.</text>
<text><location><page_2><loc_22><loc_66><loc_78><loc_84></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</text>
<figure>
<location><page_2><loc_36><loc_36><loc_64><loc_65></location>
<caption>Figure 2: This is an example image.</caption>
</figure>
<text><location><page_2><loc_22><loc_15><loc_78><loc_31></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum.</text>
</document>

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,17 @@
## Figures Example
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
Figure 1: This is an example image.
<!-- image -->
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua.
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
Figure 2: This is an example image.
<!-- image -->
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum.

File diff suppressed because one or more lines are too long

View File

@@ -8,3 +8,4 @@ item-0 at level 0: unspecified: group _root_
 item-7 at level 1: section: group sheet: Sheet3
 item-8 at level 2: table with [7x3]
 item-9 at level 2: table with [7x3]
+item-10 at level 2: picture

File diff suppressed because one or more lines are too long

View File

@@ -49,3 +49,5 @@
 | 3 | 6 | 7 |
 | 8 | 9 | 9 |
 | 10 | 9 | 9 |
+
+<!-- image -->
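The two ground-truth updates above reflect the fix that extracts byte-images embedded in Excel sheets: the converted workbook now yields a picture item after its tables. A minimal sketch of observing this through the standard converter; the workbook path below is a placeholder, not a file from this change:

from docling.document_converter import DocumentConverter

# Convert a workbook with an embedded image; extracted byte-images
# surface as picture items on the resulting document.
doc = DocumentConverter().convert("tests/data/xlsx/with_image.xlsx").document
print(len(doc.pictures))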

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,58 @@
"""Test methods in module docling.backend.json.docling_json_backend.py."""

from io import BytesIO
from pathlib import Path

import pytest
from pydantic import ValidationError

from docling.backend.json.docling_json_backend import DoclingJSONBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import DoclingDocument, InputDocument

GT_PATH: Path = Path("./tests/data/groundtruth/docling_v2/2206.01062.json")


def test_convert_valid_docling_json():
    """Test ingestion of valid Docling JSON."""
    cls = DoclingJSONBackend
    path_or_stream = GT_PATH
    in_doc = InputDocument(
        path_or_stream=path_or_stream,
        format=InputFormat.JSON_DOCLING,
        backend=cls,
    )
    backend = cls(
        in_doc=in_doc,
        path_or_stream=path_or_stream,
    )
    assert backend.is_valid()

    act_doc = backend.convert()
    act_data = act_doc.export_to_dict()

    exp_doc = DoclingDocument.load_from_json(GT_PATH)
    exp_data = exp_doc.export_to_dict()

    assert act_data == exp_data


def test_invalid_docling_json():
    """Test ingestion of invalid Docling JSON."""
    cls = DoclingJSONBackend
    path_or_stream = BytesIO(b"{}")
    in_doc = InputDocument(
        path_or_stream=path_or_stream,
        format=InputFormat.JSON_DOCLING,
        backend=cls,
        filename="foo",
    )
    backend = cls(
        in_doc=in_doc,
        path_or_stream=path_or_stream,
    )
    assert not backend.is_valid()

    with pytest.raises(ValidationError):
        backend.convert()
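For completeness, a minimal usage sketch of the new Docling JSON ingestion through the regular converter interface, assuming JSON_DOCLING is enabled in the default format options; it reuses the ground-truth file referenced by the test above:

from docling.document_converter import DocumentConverter

# Re-ingest a previously exported Docling JSON document and round-trip it.
converter = DocumentConverter()
result = converter.convert("tests/data/groundtruth/docling_v2/2206.01062.json")
print(result.document.export_to_markdown())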

View File

@@ -2,13 +2,8 @@ import json
 import os
 from pathlib import Path

-from docling.backend.msword_backend import MsWordDocumentBackend
 from docling.datamodel.base_models import InputFormat
-from docling.datamodel.document import (
-    ConversionResult,
-    InputDocument,
-    SectionHeaderItem,
-)
+from docling.datamodel.document import ConversionResult
 from docling.document_converter import DocumentConverter

 GENERATE = False

@@ -53,7 +48,7 @@ def test_e2e_xlsx_conversions():
     converter = get_converter()

     for xlsx_path in xlsx_paths:
-        # print(f"converting {xlsx_path}")
+        print(f"converting {xlsx_path}")
         gt_path = (
             xlsx_path.parent.parent / "groundtruth" / "docling_v2" / xlsx_path.name

View File

@@ -3,23 +3,16 @@
 import json
 import logging
 import os
-import unittest
 from pathlib import Path
 from tempfile import NamedTemporaryFile

 import pytest
-import yaml
 from docling_core.types import DoclingDocument
 from docling_core.types.doc import DocItemLabel, TableData, TextItem

 from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend, XmlTable
 from docling.datamodel.base_models import InputFormat
-from docling.datamodel.document import (
-    ConversionResult,
-    InputDocument,
-    SectionHeaderItem,
-)
-from docling.document_converter import DocumentConverter
+from docling.datamodel.document import InputDocument

 GENERATE: bool = True
 DATA_PATH: Path = Path("./tests/data/uspto/")

View File

@@ -1,5 +1,4 @@
 import json
-import logging
 import os
 from io import BytesIO
 from pathlib import Path

View File

@@ -0,0 +1,66 @@
from pathlib import Path

from docling_core.types.doc import CodeItem, TextItem
from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline


def get_converter():
    pipeline_options = PdfPipelineOptions()
    pipeline_options.generate_page_images = True
    pipeline_options.do_ocr = False
    pipeline_options.do_table_structure = False
    pipeline_options.do_code_enrichment = True
    pipeline_options.do_formula_enrichment = True

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                backend=DoclingParseV2DocumentBackend,
                pipeline_cls=StandardPdfPipeline,
                pipeline_options=pipeline_options,
            )
        }
    )

    return converter


def test_code_and_formula_conversion():
    pdf_path = Path("tests/data/code_and_formula.pdf")
    converter = get_converter()

    print(f"converting {pdf_path}")

    doc_result: ConversionResult = converter.convert(pdf_path)

    results = doc_result.document.texts

    code_blocks = [el for el in results if isinstance(el, CodeItem)]
    assert len(code_blocks) == 1

    gt = 'public static void print() {\n System.out.println("Java Code");\n}'

    predicted = code_blocks[0].text.strip()
    assert predicted == gt, f"mismatch in text {predicted=}, {gt=}"
    assert code_blocks[0].code_language == CodeLanguageLabel.JAVA

    formula_blocks = [
        el
        for el in results
        if isinstance(el, TextItem) and el.label == DocItemLabel.FORMULA
    ]
    assert len(formula_blocks) == 1

    gt = "a ^ { 2 } + 8 = 1 2"
    predicted = formula_blocks[0].text
    assert predicted == gt, f"mismatch in text {predicted=}, {gt=}"
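The converter setup in the test doubles as a recipe. A shorter sketch of enabling the new code and formula enrichment on the default backend (no explicit pipeline class), which should be equivalent for this purpose:

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# Enable the code/equation enrichment models on top of the standard PDF pipeline.
pipeline_options = PdfPipelineOptions()
pipeline_options.do_code_enrichment = True
pipeline_options.do_formula_enrichment = True

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
doc = converter.convert("tests/data/code_and_formula.pdf").document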

View File

@@ -0,0 +1,81 @@
from pathlib import Path

from docling_core.types.doc import PictureClassificationData

from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline


def get_converter():
    pipeline_options = PdfPipelineOptions()
    pipeline_options.generate_page_images = True
    pipeline_options.do_ocr = False
    pipeline_options.do_table_structure = False
    pipeline_options.do_code_enrichment = False
    pipeline_options.do_formula_enrichment = False
    pipeline_options.do_picture_classification = True
    pipeline_options.generate_picture_images = True
    pipeline_options.images_scale = 2

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                backend=DoclingParseV2DocumentBackend,
                pipeline_cls=StandardPdfPipeline,
                pipeline_options=pipeline_options,
            )
        }
    )

    return converter


def test_picture_classifier():
    pdf_path = Path("tests/data/picture_classification.pdf")
    converter = get_converter()

    print(f"converting {pdf_path}")

    doc_result: ConversionResult = converter.convert(pdf_path)

    results = doc_result.document.pictures
    assert len(results) == 2

    res = results[0]
    assert len(res.annotations) == 1
    assert type(res.annotations[0]) == PictureClassificationData
    classification_data = res.annotations[0]
    assert classification_data.provenance == "DocumentPictureClassifier"
    assert (
        len(classification_data.predicted_classes) == 16
    ), "Number of predicted classes is not equal to 16"
    confidences = [pred.confidence for pred in classification_data.predicted_classes]
    assert confidences == sorted(
        confidences, reverse=True
    ), "Predictions are not sorted in descending order of confidence"
    assert (
        classification_data.predicted_classes[0].class_name == "bar_chart"
    ), "The prediction is wrong for the bar chart image."

    res = results[1]
    assert len(res.annotations) == 1
    assert type(res.annotations[0]) == PictureClassificationData
    classification_data = res.annotations[0]
    assert classification_data.provenance == "DocumentPictureClassifier"
    assert (
        len(classification_data.predicted_classes) == 16
    ), "Number of predicted classes is not equal to 16"
    confidences = [pred.confidence for pred in classification_data.predicted_classes]
    assert confidences == sorted(
        confidences, reverse=True
    ), "Predictions are not sorted in descending order of confidence"
    assert (
        classification_data.predicted_classes[0].class_name == "map"
    ), "The prediction is wrong for the map image."

View File

@@ -62,6 +62,7 @@ def test_e2e_conversions():
         TesseractOcrOptions(force_full_page_ocr=True),
         TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]),
         TesseractCliOcrOptions(force_full_page_ocr=True),
+        TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]),
         RapidOcrOptions(force_full_page_ocr=True),
     ]
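The new test case mirrors the existing TesseractOcrOptions entry. A minimal sketch of requesting automatic language detection via the CLI-based Tesseract engine in a regular conversion:

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# lang=["auto"] lets Tesseract detect the script/language instead of assuming one.
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"])
pipeline_options = PdfPipelineOptions(do_ocr=True, ocr_options=ocr_options)
converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)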

View File

@@ -124,6 +124,25 @@ def test_guess_format(tmp_path):
     doc_path.write_text("xyz", encoding="utf-8")
     assert dci._guess_format(doc_path) == None

+    # Valid Docling JSON
+    test_str = '{"name": ""}'
+    stream = DocumentStream(name="test.json", stream=BytesIO(f"{test_str}".encode()))
+    assert dci._guess_format(stream) == InputFormat.JSON_DOCLING
+    doc_path = temp_dir / "test.json"
+    doc_path.write_text(test_str, encoding="utf-8")
+    assert dci._guess_format(doc_path) == InputFormat.JSON_DOCLING
+
+    # Non-Docling JSON
+    # TODO: Docling JSON is currently the single supported JSON flavor and the pipeline
+    # will try to validate *any* JSON (based on suffix/MIME) as Docling JSON; proper
+    # disambiguation is foreseen as part of https://github.com/DS4SD/docling/issues/802
+    test_str = "{}"
+    stream = DocumentStream(name="test.json", stream=BytesIO(f"{test_str}".encode()))
+    assert dci._guess_format(stream) == InputFormat.JSON_DOCLING
+    doc_path = temp_dir / "test.json"
+    doc_path.write_text(test_str, encoding="utf-8")
+    assert dci._guess_format(doc_path) == InputFormat.JSON_DOCLING
+

 def _make_input_doc(path):
     in_doc = InputDocument(