mirror of https://github.com/DS4SD/docling.git (synced 2025-07-27 04:24:45 +00:00)

Merge remaining changes from main

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

This commit is contained in:
parent dac82ca7f2
commit fa5d972291
@ -1,3 +1,9 @@
+## [v1.20.0](https://github.com/DS4SD/docling/releases/tag/v1.20.0) - 2024-10-11
+
+### Feature
+
+* New experimental docling-parse v2 backend ([#131](https://github.com/DS4SD/docling/issues/131)) ([`5e4944f`](https://github.com/DS4SD/docling/commit/5e4944f15f0ac1faf3e6a532c3e3ab4da56517a3))
+
 ## [v1.19.1](https://github.com/DS4SD/docling/releases/tag/v1.19.1) - 2024-10-11

 ### Fix
@ -1,6 +1,6 @@
 <p align="center">
   <a href="https://github.com/ds4sd/docling">
-    <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/logo.png" width="150" />
+    <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/docs/assets/logo.png" width="150" />
   </a>
 </p>
@ -201,8 +201,8 @@ To see all available options (export formats etc.) run `docling --help`.

 ### RAG
 Check out the following examples showcasing RAG using Docling with standard LLM application frameworks:
-- [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb)
-- [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb)
+- [Basic RAG pipeline with LlamaIndex 🦙](https://github.com/DS4SD/docling/tree/main/docs/examples/rag_llamaindex.ipynb)
+- [Basic RAG pipeline with LangChain 🦜🔗](https://github.com/DS4SD/docling/tree/main/docs/examples/rag_langchain.ipynb)

 ## Advanced features
docling/backend/docling_parse_v2_backend.py (new file, 237 lines)
@ -0,0 +1,237 @@
import logging
import random
from io import BytesIO
from pathlib import Path
from typing import Iterable, List, Optional, Union

import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_parse.docling_parse import pdf_parser_v2
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage

from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Cell, Size

_log = logging.getLogger(__name__)


class DoclingParseV2PageBackend(PdfPageBackend):
    def __init__(
        self, parser: pdf_parser_v2, document_hash: str, page_no: int, page_obj: PdfPage
    ):
        self._ppage = page_obj
        parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)

        self.valid = "pages" in parsed_page
        if self.valid:
            self._dpage = parsed_page["pages"][page_no]
        else:
            _log.info(
                f"An error occurred when loading page {page_no} of document {document_hash}."
            )

    def is_valid(self) -> bool:
        return self.valid

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        if not self.valid:
            return ""
        # Find intersecting cells on the page
        text_piece = ""
        page_size = self.get_size()

        parser_width = self._dpage["sanitized"]["dimension"]["width"]
        parser_height = self._dpage["sanitized"]["dimension"]["height"]

        scale = (
            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
        )

        cells_data = self._dpage["sanitized"]["cells"]["data"]
        cells_header = self._dpage["sanitized"]["cells"]["header"]

        for i, cell_data in enumerate(cells_data):
            x0 = cell_data[cells_header.index("x0")]
            y0 = cell_data[cells_header.index("y0")]
            x1 = cell_data[cells_header.index("x1")]
            y1 = cell_data[cells_header.index("y1")]

            # Rescale from parser coordinates to page coordinates and flip from a
            # bottom-left to a top-left origin before comparing with the query bbox.
            cell_bbox = BoundingBox(
                l=x0 * scale * page_size.width / parser_width,
                b=y0 * scale * page_size.height / parser_height,
                r=x1 * scale * page_size.width / parser_width,
                t=y1 * scale * page_size.height / parser_height,
                coord_origin=CoordOrigin.BOTTOMLEFT,
            ).to_top_left_origin(page_height=page_size.height * scale)

            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()

            # Keep a cell if more than half of its area lies inside the query bbox.
            if overlap_frac > 0.5:
                if len(text_piece) > 0:
                    text_piece += " "
                text_piece += cell_data[cells_header.index("text")]

        return text_piece

    def get_text_cells(self) -> Iterable[Cell]:
        cells: List[Cell] = []
        cell_counter = 0

        if not self.valid:
            return cells

        page_size = self.get_size()

        parser_width = self._dpage["sanitized"]["dimension"]["width"]
        parser_height = self._dpage["sanitized"]["dimension"]["height"]

        cells_data = self._dpage["sanitized"]["cells"]["data"]
        cells_header = self._dpage["sanitized"]["cells"]["header"]

        for i, cell_data in enumerate(cells_data):
            x0 = cell_data[cells_header.index("x0")]
            y0 = cell_data[cells_header.index("y0")]
            x1 = cell_data[cells_header.index("x1")]
            y1 = cell_data[cells_header.index("y1")]

            # Normalize inverted extents before constructing the bounding box.
            if x1 < x0:
                x0, x1 = x1, x0
            if y1 < y0:
                y0, y1 = y1, y0

            text_piece = cell_data[cells_header.index("text")]
            cells.append(
                Cell(
                    id=cell_counter,
                    text=text_piece,
                    bbox=BoundingBox(
                        # l=x0, b=y0, r=x1, t=y1,
                        l=x0 * page_size.width / parser_width,
                        b=y0 * page_size.height / parser_height,
                        r=x1 * page_size.width / parser_width,
                        t=y1 * page_size.height / parser_height,
                        coord_origin=CoordOrigin.BOTTOMLEFT,
                    ).to_top_left_origin(page_size.height),
                )
            )
            cell_counter += 1

        def draw_clusters_and_cells():
            image = (
                self.get_page_image()
            )  # make new image to avoid drawing on the saved ones
            draw = ImageDraw.Draw(image)
            for c in cells:
                x0, y0, x1, y1 = c.bbox.as_tuple()
                cell_color = (
                    random.randint(30, 140),
                    random.randint(30, 140),
                    random.randint(30, 140),
                )
                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
            image.show()

        # draw_clusters_and_cells()

        return cells

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        AREA_THRESHOLD = 32 * 32

        images = self._dpage["sanitized"]["images"]["data"]
        images_header = self._dpage["sanitized"]["images"]["header"]

        for row in images:
            x0 = row[images_header.index("x0")]
            y0 = row[images_header.index("y0")]
            x1 = row[images_header.index("x1")]
            y1 = row[images_header.index("y1")]

            cropbox = BoundingBox.from_tuple(
                (x0, y0, x1, y1), origin=CoordOrigin.BOTTOMLEFT
            ).to_top_left_origin(self.get_size().height)

            # Only yield bitmap regions larger than the area threshold.
            if cropbox.area() > AREA_THRESHOLD:
                cropbox = cropbox.scaled(scale=scale)

                yield cropbox

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:

        page_size = self.get_size()

        if not cropbox:
            cropbox = BoundingBox(
                l=0,
                r=page_size.width,
                t=0,
                b=page_size.height,
                coord_origin=CoordOrigin.TOPLEFT,
            )
            padbox = BoundingBox(
                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
            )
        else:
            padbox = cropbox.to_bottom_left_origin(page_size.height)
            padbox.r = page_size.width - padbox.r
            padbox.t = page_size.height - padbox.t

        image = (
            self._ppage.render(
                scale=scale * 1.5,
                rotation=0,  # no additional rotation
                crop=padbox.as_tuple(),
            )
            .to_pil()
            .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
        )  # We resize the image from 1.5x the given scale to make it sharper.

        return image

    def get_size(self) -> Size:
        return Size(width=self._ppage.get_width(), height=self._ppage.get_height())

    def unload(self):
        self._ppage = None
        self._dpage = None


class DoclingParseV2DocumentBackend(PdfDocumentBackend):
    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        super().__init__(in_doc, path_or_stream)

        self._pdoc = pdfium.PdfDocument(self.path_or_stream)
        self.parser = pdf_parser_v2("fatal")

        success = False
        if isinstance(path_or_stream, BytesIO):
            success = self.parser.load_document_from_bytesio(
                self.document_hash, path_or_stream
            )
        elif isinstance(path_or_stream, Path):
            success = self.parser.load_document(self.document_hash, str(path_or_stream))

        if not success:
            raise RuntimeError(
                f"docling-parse v2 could not load document {self.document_hash}."
            )

    def page_count(self) -> int:
        return len(self._pdoc)  # To be replaced with docling-parse API

    def load_page(self, page_no: int) -> DoclingParseV2PageBackend:
        return DoclingParseV2PageBackend(
            self.parser, self.document_hash, page_no, self._pdoc[page_no]
        )

    def is_valid(self) -> bool:
        return self.page_count() > 0

    def unload(self):
        super().unload()
        self.parser.unload_document(self.document_hash)
        self._pdoc.close()
        self._pdoc = None
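For orientation, here is a minimal sketch of how the new backend could be exercised end to end. The `convert_single(...).output.export_to_markdown()` flow appears in the notebooks added by this commit; the `pdf_backend` keyword mirrors how the v1 docling-parse backend is selected in this release and should be treated as an assumption rather than a confirmed API:

```python
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.document_converter import DocumentConverter

# Assumption: DocumentConverter takes the backend class via `pdf_backend`,
# as it does for the v1 docling-parse backend in this release.
converter = DocumentConverter(pdf_backend=DoclingParseV2DocumentBackend)

# Convert a PDF and export it, as done in the RAG notebooks below.
result = converter.convert_single("https://arxiv.org/pdf/2206.01062")
print(result.output.export_to_markdown())
```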
@ -1,19 +1,20 @@
 import logging
 from typing import Iterable

 import numpy
 from docling_core.types.doc import BoundingBox, CoordOrigin

 from docling.datamodel.base_models import OcrCell, Page
-from docling.datamodel.pipeline_options import TesseractCliOcrOptions
+from docling.datamodel.pipeline_options import TesseractOcrOptions
 from docling.models.base_ocr_model import BaseOcrModel

 _log = logging.getLogger(__name__)


 class TesseractOcrModel(BaseOcrModel):
-    def __init__(self, enabled: bool, options: TesseractCliOcrOptions):
+    def __init__(self, enabled: bool, options: TesseractOcrOptions):
         super().__init__(enabled=enabled, options=options)
-        self.options: TesseractCliOcrOptions
+        self.options: TesseractOcrOptions

         self.scale = 3  # multiplier for 72 dpi == 216 dpi.
         self.reader = None
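The model now types its options as `TesseractOcrOptions` instead of the CLI variant. A hedged sketch of how an OCR engine could be selected through the pipeline options follows; the `do_ocr` and `ocr_options` fields are assumptions about this release's `PipelineOptions` API, not confirmed by the hunk above:

```python
from docling.datamodel.pipeline_options import PipelineOptions, TesseractOcrOptions
from docling.document_converter import DocumentConverter

# Assumption: PipelineOptions exposes `do_ocr` and `ocr_options` fields here.
pipeline_options = PipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.ocr_options = TesseractOcrOptions()  # tesserocr bindings; TesseractCliOcrOptions shells out to the CLI instead

converter = DocumentConverter(pipeline_options=pipeline_options)
```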
docs/assets/logo.png (new binary file, 258 KiB; binary content not shown)
docs/assets/logo.svg (new file, 116 lines, 18 KiB)
@ -0,0 +1,116 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<svg width="100%" height="100%" viewBox="0 0 1024 1024" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:space="preserve" xmlns:serif="http://www.serif.com/" style="fill-rule:evenodd;clip-rule:evenodd;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:1.5;">
<g id="Docling" transform="matrix(1.07666,0,0,1.07666,-35.9018,-84.1562)">
<g id="Outline" transform="matrix(1,0,0,1,-0.429741,55.0879)">
<path d="M394.709,69.09C417.34,35.077 467.97,30.178 478.031,55.609C486.35,55.043 494.726,54.701 503.158,54.589C533.157,45.238 560.496,47.419 584.65,60.732C800.941,96.66 966.069,284.814 966.069,511.232C966.069,763.284 761.435,967.918 509.383,967.918C433.692,967.918 362.277,949.464 299.385,916.808L242.3,931.993C203.092,943.242 187.715,928.369 208.575,891.871C208.935,891.24 216.518,879.37 223.997,867.677C119.604,783.975 52.698,655.355 52.698,511.232C52.698,298.778 198.086,120.013 394.709,69.09Z" style="fill:white;"/>
</g>
<g id="Color" transform="matrix(1.02317,0,0,1.02317,-11.55,-17.8333)">
<path d="M284.8,894.232L179.735,783.955L130.222,645.203L125.538,504.726L185.211,385.816C209.006,322.738 249.951,278.973 302.281,248.028L406.684,203.333L413.483,175.767L436.637,152.428L451.408,153.312L457.726,183.183L485.164,165.379L526.92,159.699L557.014,177.545L612.652,211.018C679.009,226.066 740.505,264.146 797.138,325.26L862.813,423.477L891.583,560.826L883.273,683.32L814.268,809.924L734.431,894.384L644.495,926.906L497.146,954.121L361.064,940.647L284.8,894.232Z" style="fill:url(#_Linear1);"/>
<path d="M699.932,887.255L634.427,825.291L597.884,782.352L594.906,738.956L610.14,709.396L643.207,699.954L685,710.111L730.425,736.425L765.204,778.79L775.166,849.531L719.381,894.082L699.932,887.255Z" style="fill:url(#_Linear2);"/>
<g transform="matrix(-0.765945,0,0,1,839.727,5.47434)">
<clipPath id="_clip3">
<path d="M699.932,887.255L634.427,825.291L597.884,782.352L594.906,738.956L610.14,709.396L643.207,699.954L685,710.111L730.425,736.425L765.204,778.79L775.166,849.531L719.381,894.082L699.932,887.255Z"/>
</clipPath>
<g clip-path="url(#_clip3)">
<g transform="matrix(-1.18516,0,0,0.907769,1039.04,88.3496)">
<use xlink:href="#_Image4" x="223.969" y="674.21" width="152.098px" height="213.852px" transform="matrix(0.994105,0,0,0.999308,0,0)"/>
</g>
</g>
</g>
<path d="M311.699,713.521C189.178,639.091 164.299,526.77 191.824,394.113L135.136,476.434L122.004,547.53C143.022,614.014 174.522,676.199 225.005,730.598C210.601,754.156 201.894,776.601 197.955,798.114L245.803,841.67C247.274,812.1 254.934,783.047 270.614,754.664L311.699,713.521Z" style="fill-opacity:0.22;"/>
<g transform="matrix(-1,0,0,1,1022.04,2.74442)">
<path d="M311.699,713.521C189.178,639.091 164.299,526.77 191.824,394.113L135.136,476.434L122.004,547.53C143.022,614.014 174.522,676.199 225.005,730.598C210.601,754.156 201.894,776.601 197.955,798.114L245.803,841.67C247.274,812.1 254.934,783.047 270.614,754.664L311.699,713.521Z" style="fill-opacity:0.22;"/>
</g>
<path d="M354.92,650.818L420.009,663.185L493.368,666.379L554.826,665.251L620.19,658.511L658.169,651.428L671.428,644.802L673.265,627.093L659.898,611.845L625.422,609.244L599.275,591.212L568.632,556.79L542.9,534.336L515.052,528.253L480.412,532.71L455.2,552.337L428.514,578.155L405.312,599.359L374.228,612.097L355.342,614.456L340.75,630.308L341.568,645.341L354.92,650.818Z" style="fill:url(#_Linear5);"/>
<path d="M257.168,949.32L317.434,876.747L364.928,810.6L384.1,743.934L378.759,714.719L376.844,685.849L374.836,659.954L448.734,664.2L511.462,667.602L571.339,665.091L632.796,658.836L648.232,656.882L649.937,697.808L608.105,717.702L598.45,738.594L592.286,761.642L604.743,796.309L639.595,825.803L649.872,840.757L558.219,895.152L502.124,907.569L425.781,923.496L333.29,931.298L286.269,936.907L257.168,949.32Z" style="fill:url(#_Linear6);"/>
<g transform="matrix(1,0,0,1.30081,-1.77636e-15,-196.488)">
<path d="M374.165,685.268C463.946,706.599 553.728,707.491 643.51,688.593L641.903,653.199C549.263,671.731 459.645,672.22 373.059,654.611L374.165,685.268Z" style="fill-opacity:0.18;"/>
</g>
<path d="M459.633,571.457C476.7,536.091 530.064,535.913 553.1,568.767C520.703,551.407 489.553,552.374 459.633,571.457Z" style="fill:white;"/>
<g transform="matrix(1,0,0,1,0.223468,-2.61949)">
<path d="M355.3,267.232C500.64,173.156 720.699,241.362 793.691,423.582C766.716,384.84 735.725,357.078 697.53,349.014L717.306,335.248C698.537,321.49 675.794,320.957 651.039,327.119C652.235,315.768 658.995,306.991 674.188,302.115C641.864,287.427 617.356,289.473 596.258,298.818C597.049,286.116 605.827,278.087 620.068,273.254C589.192,267.477 564.13,270.926 544.651,283.232C545.822,271.831 550.709,260.943 560.913,250.79C517.498,257.095 492.995,267.925 482.892,282.202C477.311,269.499 477.274,257.221 487.625,245.739C439.161,252.932 421.555,265.094 410.355,278.286C407.697,269.01 407.705,260.632 410.853,253.316C389.633,254.773 372.178,260.663 355.3,267.232Z" style="fill:rgb(255,213,95);"/>
</g>
<path d="M475.656,209.175C479.639,175.037 503.437,173.299 532.412,180.026C507.242,183.404 486.969,195.251 473.705,219.215L475.656,209.175Z" style="fill:rgb(255,215,101);"/>
<g transform="matrix(0.114323,-0.655229,0.82741,0.144365,224.632,497.317)">
<path d="M475.656,209.175C479.639,175.037 503.437,173.299 532.412,180.026C507.242,183.404 486.969,195.251 473.705,219.215L475.656,209.175Z" style="fill:rgb(255,215,101);"/>
</g>
<g transform="matrix(1.6739,1.15217e-16,-1.15217e-16,-0.733075,-341.46,1039.77)">
<path d="M447.449,560.911C468.179,536.963 546.237,539.305 565.638,560.831C533.166,555.541 477.296,553.494 447.449,560.911Z" style="fill:white;"/>
</g>
<path d="M348.201,622.341C395.549,653.534 622.351,660.854 661.936,616.729L677.568,633.834L667.044,650.308L557.802,667.518L498.074,670.562L446.718,666.416L391.404,658.406L348.154,652.501L340.161,637.119L348.201,622.341Z" style="fill:rgb(199,68,6);"/>
</g>
<g id="Black-outline" serif:id="Black outline" transform="matrix(1.02317,0,0,1.02317,-11.55,-17.8333)">
<path d="M373.389,657.919C376.285,676.334 377.04,695.016 375.326,714.008" style="fill:none;stroke:black;stroke-width:15.73px;"/>
<path d="M645.931,654.961C646.158,669.958 647.22,684.853 648.975,699.661" style="fill:none;stroke:black;stroke-width:15.73px;"/>
<path d="M290.084,534.662C276.554,533.535 264.892,530.024 254.279,525.175C276.732,555.341 305.316,569.76 338.631,572.029L290.084,534.662Z"/>
<g transform="matrix(0.94177,0,0,0.94909,28.8868,3.79501)">
<ellipse cx="338.022" cy="510.34" rx="88.911" ry="89.412"/>
</g>
<g transform="matrix(0.112099,0.0552506,-0.0673118,0.136571,455.367,509.409)">
<ellipse cx="338.022" cy="510.34" rx="88.911" ry="89.412"/>
</g>
<g transform="matrix(-0.112099,0.0552506,0.0673118,0.136571,560.529,509.492)">
<ellipse cx="338.022" cy="510.34" rx="88.911" ry="89.412"/>
</g>
<g transform="matrix(-1,0,0,1,1013.33,-1.15187)">
<path d="M290.084,534.662C276.554,533.535 264.892,530.024 254.279,525.175C276.732,555.341 305.316,569.76 338.631,572.029L290.084,534.662Z"/>
</g>
<g transform="matrix(-0.94177,0,0,0.94909,984.44,2.64314)">
<ellipse cx="338.022" cy="510.34" rx="88.911" ry="89.412"/>
</g>
<g transform="matrix(1,0,0,1,1.9047,-5.57346)">
<path d="M277.021,489.604C279.828,554.545 355.855,583.508 405.306,537.851C354.458,599.537 263.881,560.914 277.021,489.604Z" style="fill:white;"/>
</g>
<g transform="matrix(-1,0,0,1,1011.43,-5.7284)">
<path d="M277.021,489.604C279.828,554.545 355.855,583.508 405.306,537.851C354.458,599.537 263.881,560.914 277.021,489.604Z" style="fill:white;"/>
</g>
<g transform="matrix(0.973815,0,0,1.00246,4.71761,-0.508759)">
<path d="M407.22,206.891C107.655,339.384 134.447,630.03 314.615,708.305" style="fill:none;stroke:black;stroke-width:29.39px;"/>
</g>
<g transform="matrix(-0.973815,0,0,1.00246,1006.67,-1.31695)">
<path d="M461.559,196.756C119.768,256.762 111.059,642.544 320.305,711.486" style="fill:none;stroke:black;stroke-width:29.39px;"/>
</g>
<g id="vector-duck" serif:id="vector duck">
<path d="M240.912,850.71C248.043,740.231 325.609,685.992 371.268,715.193C386.487,724.926 392.506,757.72 358.575,816.753C327.005,871.68 300.465,894.596 288.329,903.447" style="fill:none;stroke:black;stroke-width:21.79px;"/>
<path d="M638.382,843.426C427.991,964.695 389.022,902.942 251.512,947.641L307.759,889.573" style="fill:none;stroke:black;stroke-width:15.73px;"/>
<path d="M770.991,853.754C779.364,764.998 730.67,727.923 666.385,704.966C629.568,691.819 580.483,723.886 595.974,772.596C606.285,805.016 650.54,839.029 707.786,886.778" style="fill:none;stroke:black;stroke-width:21.79px;"/>
<g transform="matrix(1,0,0,1,-1.87208,0.908099)">
<path d="M603.287,772.415C614.237,757.963 627.553,750.285 642.878,748.352C628.356,760.968 617.23,775.676 620.632,799.336C635.815,785.15 650.367,779.457 664.396,780.801C651.715,790.7 639.329,803.279 641.039,818.089C641.247,819.891 647.043,823.996 647.595,825.837C659.897,816.37 672.867,811.065 689.234,809.472C676.577,822.659 668.021,834.011 674.478,848.729L664.333,847.825L625.643,812.604L603.629,786.218L603.287,772.415Z"/>
</g>
<g transform="matrix(-0.969851,0.2437,0.2437,0.969851,773.329,-138.212)">
<path d="M603.287,772.415C614.237,757.963 627.553,750.285 642.878,748.352C628.356,760.968 617.23,775.676 620.632,799.336C635.815,785.15 650.367,779.457 664.396,780.801C651.715,790.7 639.329,803.279 641.039,818.089C641.247,819.891 647.043,823.996 647.595,825.837C659.897,816.37 672.867,811.065 689.234,809.472C676.577,822.659 668.021,834.011 674.478,848.729L664.333,847.825L625.643,812.604L603.629,786.218L603.287,772.415Z"/>
</g>
<path d="M511.787,670.044C461.061,671.835 411.878,662.84 361.322,653.92C329.071,648.229 335.56,616.432 361.693,615.181C391.498,613.754 411.83,601.737 437.593,569.084C459.063,541.872 482.443,528.143 506.834,529.767" style="fill:none;stroke:black;stroke-width:15.73px;"/>
<g transform="matrix(-1,0,0,1,1014.44,-0.213451)">
<path d="M511.787,670.044C461.061,671.835 411.878,662.84 361.322,653.92C329.071,648.229 335.56,616.432 361.693,615.181C391.498,613.754 411.83,601.737 437.593,569.084C459.063,541.872 482.443,528.143 506.834,529.767" style="fill:none;stroke:black;stroke-width:15.73px;"/>
</g>
</g>
<g transform="matrix(2.4586,0,0,2.5497,-444.527,-690.434)">
<ellipse cx="312.566" cy="450.751" rx="10.63" ry="10.48" style="fill:white;"/>
</g>
<g transform="matrix(2.4586,0,0,2.5497,-127.75,-690.991)">
<ellipse cx="312.566" cy="450.751" rx="10.63" ry="10.48" style="fill:white;"/>
</g>
<path d="M505.738,698.061L578.879,713.989" style="fill:none;stroke:black;stroke-width:12.1px;"/>
<path d="M422.781,709.6L568.438,743.041" style="fill:none;stroke:black;stroke-width:12.1px;"/>
<path d="M419.941,738.409L565.688,772.989" style="fill:none;stroke:black;stroke-width:12.1px;"/>
<path d="M408.6,787.08L510.634,810.689" style="fill:none;stroke:black;stroke-width:12.1px;"/>
<path d="M397.571,815.956L500.93,840.219" style="fill:none;stroke:black;stroke-width:12.1px;"/>
<path d="M386.763,844.926L454.065,861.974" style="fill:none;stroke:black;stroke-width:12.1px;"/>
<path d="M459.169,919.169C512.194,898.262 539.171,867.298 535.241,824.402C568.052,818.31 598.499,817.058 625.84,822.165" style="fill:none;stroke:black;stroke-width:16.95px;"/>
<path d="M366.219,241.106C389.605,229.261 413.371,220.601 438.247,217.5C416.795,202.419 418.72,174.582 444.22,162.47C442.086,178.175 447.633,193.354 464.772,207.738C468.721,167.57 530.015,162.087 545.674,184.112C526.45,189.314 513.082,197.344 504.566,207.717C522.403,208.119 540.706,207.86 556.2,210.609L566.935,168.471C536.388,146.208 495.718,142.166 464.65,166.705C467.703,133.264 419.536,128.364 404.624,178.47L366.219,241.106Z"/>
<path d="M392.617,924.576C428.953,936.938 467.84,943.636 508.258,943.636C708.944,943.636 871.876,778.49 871.876,575.076C871.876,382.463 725.788,224.162 539.898,207.895L554.137,173.696L554.485,168.187C757.218,191.602 914.895,366.003 914.895,577.383C914.895,804.698 732.549,989.249 507.949,989.249C435.381,989.249 367.223,969.983 308.199,936.232L392.617,924.576ZM279.206,917.988C171.663,843.819 101.002,718.887 101.002,577.383C101.002,383.006 234.333,219.898 413.398,176.712L424.375,216.389C264.082,254.803 144.64,400.913 144.64,575.076C144.64,703.735 209.822,817.086 308.514,883.023L279.206,917.988Z"/>
<path d="M714.938,895.223L647.287,836.693L616.06,855.308L549.158,889.412L459.845,919.216L390.213,928.828L429.291,950.712L535.832,960.1L586.137,952.591L662.254,931.896L714.938,895.223Z"/>
<path d="M423.538,929.39C509.164,917.593 580.815,890.465 640.827,850.566C635.677,886.828 622.639,918.218 594.006,939.977C530.254,930.953 474.955,928.632 423.538,929.39Z" style="fill:url(#_Linear7);"/>
</g>
</g>
<defs>
<linearGradient id="_Linear1" x1="0" y1="0" x2="1" y2="0" gradientUnits="userSpaceOnUse" gradientTransform="matrix(-52.3962,375.121,-375.121,-52.3962,471.134,384.463)"><stop offset="0" style="stop-color:rgb(255,176,44);stop-opacity:1"/><stop offset="1" style="stop-color:rgb(255,73,2);stop-opacity:1"/></linearGradient>
<linearGradient id="_Linear2" x1="0" y1="0" x2="1" y2="0" gradientUnits="userSpaceOnUse" gradientTransform="matrix(28.6198,-84.8913,84.8913,28.6198,647.831,831.55)"><stop offset="0" style="stop-color:rgb(255,73,2);stop-opacity:1"/><stop offset="1" style="stop-color:rgb(255,176,44);stop-opacity:1"/></linearGradient>
<image id="_Image4" width="153px" height="214px" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEAYABgAAD/2wBDAAUDBAQEAwUEBAQFBQUGBwwIBwcHBw8LCwkMEQ8SEhEPERETFhwXExQaFRERGCEYGh0dHx8fExciJCIeJBweHx7/2wBDAQUFBQcGBw4ICA4eFBEUHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh4eHh7/wAARCADWAJkDAREAAhEBAxEB/8QAGQABAQEBAQEAAAAAAAAAAAAAAwACBwYF/8QAGBABAQEBAQAAAAAAAAAAAAAAAgABEhH/xAAbAQADAQADAQAAAAAAAAAAAAAAAQMCBAUGB//EABYRAQEBAAAAAAAAAAAAAAAAAAABEf/aAAwDAQACEQMRAD8A63fAX1BQFAUBQFAUBQFAUBQFAZShqQSUNyBSmpIJK0pIJqakgUptyCampIJK0pIJqakgWpqSCSm3IJKakjHU2sewuM86oCgKAoCgKAoCgMJQ1IJqakgWpqSCSmpIJqbcgUpqSCSmpIJqbcgmpqSCSmpIx1PGse1uK8yoCgKAoCgKAoA2obkGlNuQLU1JBJTUkElPG5AtTUkElNSQSU1JBNTbkClNSQSU1JGOptY93cR5VQFAUBQFAUAbUNyCam3IJKakgmpqSBampIJKbcgmpqSBampIJKakgmptyBampINKakjHU8ax0C4byKgKAoCgLd8gDShuQTU25AtTVkElNuQTU1JBNTUkElNuQLU1JBNTUkElNSQLU25BJWlJBJTUkY6hrHRrhPGqAoCgLd8gDahuQSU1JAtTUkE1NuQSU1JBNTUkClNSQSU25BNTUkC1NSQSVpSQSUNyCatKSBSmpIx1DWOmXBeJUBQFu+QBtQ3IFqakgkpqSCam3IFqakgkpqSCampIJqbcgUpqSCampIJq0pIJKakgWptyCampIJKakjHU2sdRuveFUBbvkASUNyCSmpIJqakgkpqSBam3IJqakgkpqSCam3IFqakgmpqSCampIFq0pIJKbcgkpqSBSmpIJKakjHUNY6vde8Ct3yAJKG5BNTUkE1NSQLU1JBJTUkE1NuQLU1JBJWlJBJQpIJq03IFKakgkp4pIJqakgmptyBSmpIJqakgkpqSMdQeOt7vl1z5/INKG5BNTbkClPFJBJTUkE1NSQKU1JBJTbkE1NSQLU1JBtTbkC1aUkE1NSQSU1JAtTUkElNuQSU1JBJTUkC1NSRjqbWOupXWPnsgmpqSBSmpIJqbcgkpqSBampIJK0pIJKbcgWoUkE1aUkElNSQTU25ApTUkElNSQSU25AtTUkElNSQTU1JApTUkZ6g8dcautfPpBJTUkE1NuQSU8UkClNSQTVpuQSU1JBJTUkC1NSQTU1JBJTbkE1NSQKU1JBJTUkElNuQLU1JBJTUkHu+zbUBQHU2rrnhJBJTUkE1NuQSU8UkClNSQTVpuQSU1JBJTUkC1NSQTU1JBJTbkE1NSQKU1JBJTUkElNuQLU1JBJTUkHu+zbUBQFAdIauC8ZIJKeNyCampIFKakgmp4pIJKbcgWpqSCSnikgmpqSCSm3IFKakgkpqSCampIFKakjG77NpQFAUBQFAdCauE8fIJKakgkpqSCampIFKakgkptyCSmpIFqakg0ptyBampIJqakgWpqSCSmpIxNpQFAUBQFAUB71q4bycgkpqSBampIJKakgmpqSCSm3IFqakgkpqSCSmpIJqbcgWrSkgkoUkYm0oCgKAoCgKAoD3CVxHl5AtTUkElNSQTU1JApTbkElNSQSU1JApTUkElNuQTU1JBJWlJGIaUBQFAUBQFAUBQHsmrivNyBSmpIJKakgkptyCatKyCSm3IFqFJBNWlJBJTUkElNuRiGlAUBQFAUBQFAUBQHrErjPPyCampIJKakgmpqSBatNyCShSQTU1JAtWlJBJQ3IzNpQFAUBQFAUBQFAUBQHp2rjujkElaUkClNSQTU25BJTUkElCkgWrSkgkpqSMwagKAoCgKAoCgKAoCgKA9ElQdPIFq0pIJKakgmobkC1aUkElNSQSU1JGYNQFAUBQFAUBQFAUBQFAUB9xqk6uQTU1JApTxSQTUNyBatKSDSmpIzBqAoCgKAoCgKAoCgKAoCgKA+u1TdfIFKcUkE1NuQTU1JBLZqSMwagKAoCgKAoCgKAoCgKAoCgKA/9k="/>
<linearGradient id="_Linear5" x1="0" y1="0" x2="1" y2="0" gradientUnits="userSpaceOnUse" gradientTransform="matrix(-39.3403,137.423,-137.423,-39.3403,545.523,573.246)"><stop offset="0" style="stop-color:rgb(255,200,41);stop-opacity:1"/><stop offset="1" style="stop-color:rgb(255,73,2);stop-opacity:1"/></linearGradient>
<linearGradient id="_Linear6" x1="0" y1="0" x2="1" y2="0" gradientUnits="userSpaceOnUse" gradientTransform="matrix(1.01113,-68.2054,68.2054,1.01113,482.996,741.463)"><stop offset="0" style="stop-color:white;stop-opacity:1"/><stop offset="1" style="stop-color:rgb(179,179,179);stop-opacity:1"/></linearGradient>
<linearGradient id="_Linear7" x1="0" y1="0" x2="1" y2="0" gradientUnits="userSpaceOnUse" gradientTransform="matrix(-7.13599,-34.117,34.117,-7.13599,578.793,922.144)"><stop offset="0" style="stop-color:rgb(164,164,164);stop-opacity:1"/><stop offset="1" style="stop-color:rgb(106,106,106);stop-opacity:1"/></linearGradient>
</defs>
</svg>
@ -122,7 +122,7 @@ def main():
         raises_on_error=False,  # to let conversion run through all and examine results at the end
     )
     success_count, partial_success_count, failure_count = export_documents(
-        conv_results, output_dir=Path("./scratch")
+        conv_results, output_dir=Path("../../examples/scratch")
     )

     end_time = time.time() - start_time
@ -113,7 +113,7 @@ def main():
     _log.info(f"Document converted in {end_time:.2f} seconds.")

     ## Export results
-    output_dir = Path("./scratch")
+    output_dir = Path("../../examples/scratch")
     output_dir.mkdir(parents=True, exist_ok=True)
     doc_filename = conv_result.input.file.stem

@ -15,7 +15,7 @@ def main():
     logging.basicConfig(level=logging.INFO)

     input_doc_path = Path("./tests/data/2206.01062.pdf")
-    output_dir = Path("./scratch")
+    output_dir = Path("../../examples/scratch")

     # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
     # will destroy them for cleaning up memory.
@ -20,7 +20,7 @@ def main():
     logging.basicConfig(level=logging.INFO)

     input_doc_path = Path("./tests/data/2206.01062.pdf")
-    output_dir = Path("./scratch")
+    output_dir = Path("../../examples/scratch")

     # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
     # will destroy them for cleaning up memory.
@ -13,7 +13,7 @@ def main():
     logging.basicConfig(level=logging.INFO)

     input_doc_path = Path("./tests/data/2206.01062.pdf")
-    output_dir = Path("./scratch")
+    output_dir = Path("../../examples/scratch")

     doc_converter = DocumentConverter()

docs/examples/rag_langchain.ipynb (new file, 369 lines)
@ -0,0 +1,369 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# RAG with LangChain 🦜🔗"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Note: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "# requirements for this example:\n",
    "%pip install -qq docling docling-core python-dotenv langchain-text-splitters langchain-huggingface langchain-milvus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "load_dotenv()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "\n",
    "warnings.filterwarnings(action=\"ignore\", category=UserWarning, module=\"pydantic|torch\")\n",
    "warnings.filterwarnings(action=\"ignore\", category=FutureWarning, module=\"easyocr\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Loader and splitter"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Below we set up:\n",
    "- a `Loader`, which will be used to create LangChain documents, and\n",
    "- a splitter, which will be used to split these documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from enum import Enum\n",
    "from typing import Iterator\n",
    "\n",
    "from langchain_core.document_loaders import BaseLoader\n",
    "from langchain_core.documents import Document as LCDocument\n",
    "from pydantic import BaseModel\n",
    "\n",
    "from docling.document_converter import DocumentConverter\n",
    "\n",
    "\n",
    "class DocumentMetadata(BaseModel):\n",
    "    dl_doc_hash: str\n",
    "    # source: str\n",
    "\n",
    "\n",
    "class DoclingPDFLoader(BaseLoader):\n",
    "    class ParseType(str, Enum):\n",
    "        MARKDOWN = \"markdown\"\n",
    "        # JSON = \"json\"\n",
    "\n",
    "    def __init__(self, file_path: str | list[str], parse_type: ParseType) -> None:\n",
    "        self._file_paths = file_path if isinstance(file_path, list) else [file_path]\n",
    "        self._parse_type = parse_type\n",
    "        self._converter = DocumentConverter()\n",
    "\n",
    "    def lazy_load(self) -> Iterator[LCDocument]:\n",
    "        for source in self._file_paths:\n",
    "            dl_doc = self._converter.convert_single(source).output\n",
    "            match self._parse_type:\n",
    "                case self.ParseType.MARKDOWN:\n",
    "                    text = dl_doc.export_to_markdown()\n",
    "                # case self.ParseType.JSON:\n",
    "                #     text = dl_doc.model_dump_json()\n",
    "                case _:\n",
    "                    raise RuntimeError(\n",
    "                        f\"Unexpected parse type encountered: {self._parse_type}\"\n",
    "                    )\n",
    "            lc_doc = LCDocument(\n",
    "                page_content=text,\n",
    "                metadata=DocumentMetadata(\n",
    "                    dl_doc_hash=dl_doc.file_info.document_hash,\n",
    "                ).model_dump(),\n",
    "            )\n",
    "            yield lc_doc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "FILE_PATH = \"https://arxiv.org/pdf/2206.01062\"  # DocLayNet paper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1b38d07d5fed4618a44ecf261e1e5c44",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
    "\n",
    "loader = DoclingPDFLoader(\n",
    "    file_path=FILE_PATH,\n",
    "    parse_type=DoclingPDFLoader.ParseType.MARKDOWN,\n",
    ")\n",
    "text_splitter = RecursiveCharacterTextSplitter(\n",
    "    chunk_size=1000,\n",
    "    chunk_overlap=200,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We now use the above-defined objects to get the document splits:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "docs = loader.load()\n",
    "splits = text_splitter.split_documents(docs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_huggingface.embeddings import HuggingFaceEmbeddings\n",
    "\n",
    "HF_EMBED_MODEL_ID = \"BAAI/bge-small-en-v1.5\"\n",
    "embeddings = HuggingFaceEmbeddings(model_name=HF_EMBED_MODEL_ID)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Vector store"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tempfile import TemporaryDirectory\n",
    "\n",
    "from langchain_milvus import Milvus\n",
    "\n",
    "MILVUS_URI = os.environ.get(\n",
    "    \"MILVUS_URL\", f\"{(tmp_dir := TemporaryDirectory()).name}/milvus_demo.db\"\n",
    ")\n",
    "\n",
    "vectorstore = Milvus.from_documents(\n",
    "    splits,\n",
    "    embeddings,\n",
    "    connection_args={\"uri\": MILVUS_URI},\n",
    "    drop_old=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### LLM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\n",
      "Token is valid (permission: write).\n",
      "Your token has been saved to /Users/pva/.cache/huggingface/token\n",
      "Login successful\n"
     ]
    }
   ],
   "source": [
    "from langchain_huggingface import HuggingFaceEndpoint\n",
    "\n",
    "HF_API_KEY = os.environ.get(\"HF_API_KEY\")\n",
    "HF_LLM_MODEL_ID = \"mistralai/Mistral-7B-Instruct-v0.3\"\n",
    "\n",
    "llm = HuggingFaceEndpoint(\n",
    "    repo_id=HF_LLM_MODEL_ID,\n",
    "    huggingfacehub_api_token=HF_API_KEY,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## RAG"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import Iterable\n",
    "\n",
    "from langchain_core.documents import Document as LCDocument\n",
    "from langchain_core.output_parsers import StrOutputParser\n",
    "from langchain_core.prompts import PromptTemplate\n",
    "from langchain_core.runnables import RunnablePassthrough\n",
    "\n",
    "\n",
    "def format_docs(docs: Iterable[LCDocument]):\n",
    "    return \"\\n\\n\".join(doc.page_content for doc in docs)\n",
    "\n",
    "\n",
    "retriever = vectorstore.as_retriever()\n",
    "\n",
    "prompt = PromptTemplate.from_template(\n",
    "    \"Context information is below.\\n---------------------\\n{context}\\n---------------------\\nGiven the context information and not prior knowledge, answer the query.\\nQuery: {question}\\nAnswer:\\n\"\n",
    ")\n",
    "\n",
    "rag_chain = (\n",
    "    {\"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n",
    "    | prompt\n",
    "    | llm\n",
    "    | StrOutputParser()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'The human annotation of DocLayNet was performed on 80863 pages.\\n\\nExplanation:\\nThe information is found in the paragraph \"DocLayNet contains 80863 PDF pages\" in the context.'"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rag_chain.invoke(\"How many pages were human annotated for DocLayNet?\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
docs/examples/rag_llamaindex.ipynb (new file, 436 lines)
@ -0,0 +1,436 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<a href=\"https://colab.research.google.com/github/DS4SD/docling/blob/main/examples/rag_llamaindex.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# RAG with LlamaIndex 🦙"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Overview"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This example leverages the official [LlamaIndex Docling extension](../../integrations/llamaindex/).\n",
    "\n",
    "The presented extensions `DoclingReader` and `DoclingNodeParser` enable you to:\n",
    "- use PDF documents in your LLM applications with ease and speed, and\n",
    "- harness Docling's rich format for advanced, document-native grounding."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- 👉 For best conversion speed, use GPU acceleration whenever available; e.g. if running on Colab, use a GPU-enabled runtime.\n",
    "- The notebook uses HuggingFace's Inference API; for an increased LLM quota, a token can be provided via the env var `HF_TOKEN`.\n",
    "- Requirements can be installed as shown below (`--no-warn-conflicts` is meant for Colab's pre-populated Python env; feel free to remove it for stricter usage):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Note: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "%pip install -q --progress-bar off --no-warn-conflicts llama-index-core llama-index-readers-docling llama-index-node-parser-docling llama-index-embeddings-huggingface llama-index-llms-huggingface-api llama-index-readers-file python-dotenv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from pathlib import Path\n",
    "from tempfile import mkdtemp\n",
    "from warnings import filterwarnings\n",
    "\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "\n",
    "def _get_env_from_colab_or_os(key):\n",
    "    try:\n",
    "        from google.colab import userdata\n",
    "\n",
    "        try:\n",
    "            return userdata.get(key)\n",
    "        except userdata.SecretNotFoundError:\n",
    "            pass\n",
    "    except ImportError:\n",
    "        pass\n",
    "    return os.getenv(key)\n",
    "\n",
    "\n",
    "load_dotenv()\n",
    "\n",
    "filterwarnings(action=\"ignore\", category=UserWarning, module=\"pydantic\")\n",
    "filterwarnings(action=\"ignore\", category=FutureWarning, module=\"easyocr\")\n",
    "# https://github.com/huggingface/transformers/issues/5486:\n",
    "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can now define the main parameters:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n",
    "from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n",
    "\n",
    "EMBED_MODEL = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")\n",
    "MILVUS_URI = str(Path(mkdtemp()) / \"docling.db\")\n",
    "GEN_MODEL = HuggingFaceInferenceAPI(\n",
    "    token=_get_env_from_colab_or_os(\"HF_TOKEN\"),\n",
    "    model_name=\"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n",
    ")\n",
    "SOURCE = \"https://arxiv.org/pdf/2408.09869\"  # Docling Technical Report\n",
    "QUERY = \"Which are the main AI models in Docling?\"\n",
    "\n",
    "embed_dim = len(EMBED_MODEL.get_text_embedding(\"hi\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using Markdown export"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To create a simple RAG pipeline, we can:\n",
    "- define a `DoclingReader`, which by default exports to Markdown, and\n",
    "- use a standard node parser for these Markdown-based docs, e.g. a `MarkdownNodeParser`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Q: Which are the main AI models in Docling?\n",
      "A: 1. A layout analysis model, an accurate object-detector for page elements. 2. TableFormer, a state-of-the-art table structure recognition model.\n",
      "\n",
      "Sources:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[('3.2 AI models\\n\\nAs part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on huggingface) and a separate package for the inference code as docling-ibm-models . Both models are also powering the open-access deepsearch-experience, our cloud-native service for knowledge exploration tasks.',\n",
       "  {'dl_doc_hash': '556ad9e23b6d2245e36b3208758cf0c8a709382bb4c859eacfe8e73b14e635aa',\n",
       "   'Header_2': '3.2 AI models'}),\n",
       " (\"5 Applications\\n\\nThanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets.\",\n",
       "  {'dl_doc_hash': '556ad9e23b6d2245e36b3208758cf0c8a709382bb4c859eacfe8e73b14e635aa',\n",
       "   'Header_2': '5 Applications'})]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from llama_index.core import StorageContext, VectorStoreIndex\n",
    "from llama_index.core.node_parser import MarkdownNodeParser\n",
    "from llama_index.readers.docling import DoclingReader\n",
    "from llama_index.vector_stores.milvus import MilvusVectorStore\n",
    "\n",
    "reader = DoclingReader()\n",
    "node_parser = MarkdownNodeParser()\n",
    "\n",
    "vector_store = MilvusVectorStore(\n",
    "    uri=str(Path(mkdtemp()) / \"docling.db\"),  # or set as needed\n",
    "    dim=embed_dim,\n",
    "    overwrite=True,\n",
    ")\n",
    "index = VectorStoreIndex.from_documents(\n",
    "    documents=reader.load_data(SOURCE),\n",
    "    transformations=[node_parser],\n",
    "    storage_context=StorageContext.from_defaults(vector_store=vector_store),\n",
    "    embed_model=EMBED_MODEL,\n",
    ")\n",
    "result = index.as_query_engine(llm=GEN_MODEL).query(QUERY)\n",
    "print(f\"Q: {QUERY}\\nA: {result.response.strip()}\\n\\nSources:\")\n",
    "display([(n.text, n.metadata) for n in result.source_nodes])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using Docling format"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To leverage Docling's rich native format, we:\n",
    "- create a `DoclingReader` with JSON export type, and\n",
    "- employ a `DoclingNodeParser` in order to appropriately parse that Docling format.\n",
    "\n",
    "Notice how the sources now also contain document-level grounding (e.g. page number or bounding box information):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Q: Which are the main AI models in Docling?\n",
      "A: The main AI models in Docling are a layout analysis model and TableFormer. The layout analysis model is an accurate object-detector for page elements, and TableFormer is a state-of-the-art table structure recognition model.\n",
      "\n",
      "Sources:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[('As part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on huggingface) and a separate package for the inference code as docling-ibm-models . Both models are also powering the open-access deepsearch-experience, our cloud-native service for knowledge exploration tasks.',\n",
       "  {'dl_doc_hash': '556ad9e23b6d2245e36b3208758cf0c8a709382bb4c859eacfe8e73b14e635aa',\n",
       "   'path': '#/main-text/37',\n",
       "   'heading': '3.2 AI models',\n",
       "   'page': 3,\n",
       "   'bbox': [107.36903381347656,\n",
       "    330.07513427734375,\n",
       "    506.29705810546875,\n",
       "    407.3725280761719]}),\n",
       " ('With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. Its code architecture allows for easy extensibility and addition of new features and models.',\n",
       "  {'dl_doc_hash': '556ad9e23b6d2245e36b3208758cf0c8a709382bb4c859eacfe8e73b14e635aa',\n",
       "   'path': '#/main-text/10',\n",
       "   'heading': '1 Introduction',\n",
       "   'page': 1,\n",
       "   'bbox': [107.33261108398438,\n",
       "    83.3067626953125,\n",
       "    504.0033874511719,\n",
       "    136.45367431640625]})]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from llama_index.node_parser.docling import DoclingNodeParser\n",
    "\n",
    "reader = DoclingReader(export_type=DoclingReader.ExportType.JSON)\n",
    "node_parser = DoclingNodeParser()\n",
    "\n",
    "vector_store = MilvusVectorStore(\n",
    "    uri=str(Path(mkdtemp()) / \"docling.db\"),  # or set as needed\n",
    "    dim=embed_dim,\n",
    "    overwrite=True,\n",
    ")\n",
    "index = VectorStoreIndex.from_documents(\n",
    "    documents=reader.load_data(SOURCE),\n",
    "    transformations=[node_parser],\n",
    "    storage_context=StorageContext.from_defaults(vector_store=vector_store),\n",
    "    embed_model=EMBED_MODEL,\n",
    ")\n",
    "result = index.as_query_engine(llm=GEN_MODEL).query(QUERY)\n",
    "print(f\"Q: {QUERY}\\nA: {result.response.strip()}\\n\\nSources:\")\n",
    "display([(n.text, n.metadata) for n in result.source_nodes])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## With Simple Directory Reader"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To demonstrate this usage pattern, we first set up a test document directory."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "from tempfile import mkdtemp\n",
    "\n",
    "import requests\n",
    "\n",
    "tmp_dir_path = Path(mkdtemp())\n",
    "r = requests.get(SOURCE)\n",
    "with open(tmp_dir_path / f\"{Path(SOURCE).name}.pdf\", \"wb\") as out_file:\n",
    "    out_file.write(r.content)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Using the `reader` and `node_parser` definitions from any of the above variants, usage with `SimpleDirectoryReader` then looks as follows:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading files: 100%|██████████| 1/1 [00:11<00:00, 11.15s/file]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Q: Which are the main AI models in Docling?\n",
      "A: The main AI models in Docling are a layout analysis model and TableFormer. The layout analysis model is an accurate object-detector for page elements, and TableFormer is a state-of-the-art table structure recognition model.\n",
      "\n",
      "Sources:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[('As part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on huggingface) and a separate package for the inference code as docling-ibm-models . Both models are also powering the open-access deepsearch-experience, our cloud-native service for knowledge exploration tasks.',\n",
       "  {'file_path': '/var/folders/76/4wwfs06x6835kcwj4186c0nc0000gn/T/tmp4vsev3_r/2408.09869.pdf',\n",
       "   'file_name': '2408.09869.pdf',\n",
       "   'file_type': 'application/pdf',\n",
       "   'file_size': 5566574,\n",
       "   'creation_date': '2024-10-09',\n",
       "   'last_modified_date': '2024-10-09',\n",
       "   'dl_doc_hash': '556ad9e23b6d2245e36b3208758cf0c8a709382bb4c859eacfe8e73b14e635aa',\n",
       "   'path': '#/main-text/37',\n",
       "   'heading': '3.2 AI models',\n",
       "   'page': 3,\n",
       "   'bbox': [107.36903381347656,\n",
       "    330.07513427734375,\n",
       "    506.29705810546875,\n",
       "    407.3725280761719]}),\n",
       " ('With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. Its code architecture allows for easy extensibility and addition of new features and models.',\n",
       "  {'file_path': '/var/folders/76/4wwfs06x6835kcwj4186c0nc0000gn/T/tmp4vsev3_r/2408.09869.pdf',\n",
       "   'file_name': '2408.09869.pdf',\n",
       "   'file_type': 'application/pdf',\n",
       "   'file_size': 5566574,\n",
       "   'creation_date': '2024-10-09',\n",
       "   'last_modified_date': '2024-10-09',\n",
       "   'dl_doc_hash': '556ad9e23b6d2245e36b3208758cf0c8a709382bb4c859eacfe8e73b14e635aa',\n",
       "   'path': '#/main-text/10',\n",
       "   'heading': '1 Introduction',\n",
       "   'page': 1,\n",
       "   'bbox': [107.33261108398438,\n",
       "    83.3067626953125,\n",
       "    504.0033874511719,\n",
       "    136.45367431640625]})]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from llama_index.core import SimpleDirectoryReader\n",
    "\n",
    "dir_reader = SimpleDirectoryReader(\n",
    "    input_dir=tmp_dir_path,\n",
    "    file_extractor={\".pdf\": reader},\n",
    ")\n",
    "\n",
    "vector_store = MilvusVectorStore(\n",
    "    uri=str(Path(mkdtemp()) / \"docling.db\"),  # or set as needed\n",
    "    dim=embed_dim,\n",
    "    overwrite=True,\n",
    ")\n",
    "index = VectorStoreIndex.from_documents(\n",
    "    documents=dir_reader.load_data(SOURCE),\n",
    "    transformations=[node_parser],\n",
    "    storage_context=StorageContext.from_defaults(vector_store=vector_store),\n",
    "    embed_model=EMBED_MODEL,\n",
    ")\n",
    "result = index.as_query_engine(llm=GEN_MODEL).query(QUERY)\n",
    "print(f\"Q: {QUERY}\\nA: {result.response.strip()}\\n\\nSources:\")\n",
    "display([(n.text, n.metadata) for n in result.source_nodes])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
@ -53,7 +53,7 @@ doc_converter = (
 conv_results = doc_converter.convert_all(input_paths)

 for res in conv_results:
-    out_path = Path("./scratch")
+    out_path = Path("../../examples/scratch")
     print(
         f"Document {res.input.file.name} converted."
         f"\nSaved markdown output to: {str(out_path)}"
29
docs/index.md
Normal file
@ -0,0 +1,29 @@
# Docling

<p align="center">
  <a href="https://ds4sd.github.io/docling/">
    <img loading="lazy" alt="Docling" src="assets/logo.png" width="150" />
  </a>
</p>

[arXiv](https://arxiv.org/abs/2408.09869)
[PyPI version](https://pypi.org/project/docling/)
[Poetry](https://python-poetry.org/)
[Code style: black](https://github.com/psf/black)
[Imports: isort](https://pycqa.github.io/isort/)
[Pydantic v2](https://pydantic.dev)
[pre-commit](https://github.com/pre-commit/pre-commit)
[License MIT](https://opensource.org/licenses/MIT)

Docling bundles PDF document conversion to JSON and Markdown in an easy, self-contained package.

## Features

* ⚡ Converts any PDF document to JSON or Markdown format, stable and lightning fast
* 📑 Understands detailed page layout, reading order and recovers table structures
* 📝 Extracts metadata from the document, such as title, authors, references and language
* 🔍 Includes OCR support for scanned PDFs
* 🤖 Integrates easily with LLM app / RAG frameworks like LlamaIndex 🦙 & LangChain 🦜🔗
* 💻 Provides a simple and convenient CLI
100
docs/installation.md
Normal file
@ -0,0 +1,100 @@
To use Docling, simply install `docling` from your Python package manager, e.g. pip:

```bash
pip install docling
```

Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 architectures.

??? "Alternative PyTorch distributions"

    The Docling models depend on the [PyTorch](https://pytorch.org/) library.
    Depending on your architecture, you might want to use a different distribution of `torch`,
    e.g. with support for a different accelerator or a CPU-only version.
    All the ways of installing `torch` are listed on the PyTorch website <https://pytorch.org/>.

    One common case is installation on Linux systems with CPU-only support.
    In this case, we suggest installing Docling with the following options:

    ```bash
    # Example for installing on the Linux cpu-only version
    pip install docling --extra-index-url https://download.pytorch.org/whl/cpu
    ```

??? "Alternative OCR engines"

    Docling supports multiple OCR engines for processing scanned documents. The current version provides
    the following engines:

    | Engine | Installation | Usage |
    | ------ | ------------ | ----- |
    | [EasyOCR](https://github.com/JaidedAI/EasyOCR) | Default in Docling or via `pip install easyocr`. | `EasyOcrOptions` |
    | Tesseract | System dependency. See description for Tesseract and Tesserocr below. | `TesseractOcrOptions` |
    | Tesseract CLI | System dependency. See description below. | `TesseractCliOcrOptions` |

    The Docling `DocumentConverter` allows choosing the OCR engine with the `ocr_options` settings. For example:

    ```python
    from docling.datamodel.base_models import ConversionStatus
    from docling.datamodel.pipeline_options import EasyOcrOptions, PipelineOptions, TesseractOcrOptions
    from docling.document_converter import DocumentConverter

    pipeline_options = PipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.ocr_options = TesseractOcrOptions()  # Use Tesseract

    doc_converter = DocumentConverter(
        pipeline_options=pipeline_options,
    )
    ```

    <h3>Tesseract installation</h3>

    [Tesseract](https://github.com/tesseract-ocr/tesseract) is a popular OCR engine which is available
    on most operating systems. To use this engine with Docling, Tesseract must be installed on your
    system, using the packaging tool of your choice. Below we provide example commands.
    After installing Tesseract, you are expected to provide the path to its language files using the
    `TESSDATA_PREFIX` environment variable (note that it must terminate with a slash `/`).

    === "macOS (via [Homebrew](https://brew.sh/))"

        ```console
        brew install tesseract leptonica pkg-config
        TESSDATA_PREFIX=/opt/homebrew/share/tessdata/
        echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
        ```

    === "Debian-based"

        ```console
        apt-get install tesseract-ocr tesseract-ocr-eng libtesseract-dev libleptonica-dev pkg-config
        TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)
        echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
        ```

    === "RHEL"

        ```console
        dnf install tesseract tesseract-devel tesseract-langpack-eng leptonica-devel
        TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
        echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
        ```

    <h3>Linking to Tesseract</h3>

    The most efficient usage of the Tesseract library is via linking. Docling uses
    the [Tesserocr](https://github.com/sirfz/tesserocr) package for this.

    If you run into installation issues with Tesserocr, we suggest using the following
    installation options:

    ```console
    pip uninstall tesserocr
    pip install --no-binary :all: tesserocr
    ```

## Development setup

To develop Docling features, bugfixes etc., install as follows from your local clone's root dir:

```bash
poetry install --all-extras
```
25
docs/integrations/llamaindex.md
Normal file
@ -0,0 +1,25 @@
## Get started

Docling is available as an official LlamaIndex extension!

To get started, check out the [step-by-step guide in LlamaIndex \[↗\]](https://docs.llamaindex.ai/en/stable/examples/data_connectors/DoclingReaderDemo/)<!--{target="_blank"}-->.

## Components

### Docling Reader

Reads document files and uses Docling to populate LlamaIndex `Document` objects — either serializing Docling's data model (losslessly, e.g. as JSON) or exporting to a simplified format (lossily, e.g. as Markdown).

- 💻 [GitHub \[↗\]](https://github.com/run-llama/llama_index/tree/main/llama-index-integrations/readers/llama-index-readers-docling)<!--{target="_blank"}-->
- 📖 [API docs \[↗\]](https://docs.llamaindex.ai/en/stable/api_reference/readers/docling/)<!--{target="_blank"}-->
- 📦 [PyPI \[↗\]](https://pypi.org/project/llama-index-readers-docling/)<!--{target="_blank"}-->
- 🦙 [LlamaHub \[↗\]](https://llamahub.ai/l/readers/llama-index-readers-docling)<!--{target="_blank"}-->

### Docling Node Parser

Reads LlamaIndex `Document` objects populated in Docling's format by Docling Reader and, using its knowledge of the Docling format, parses them to LlamaIndex `Node` objects for downstream usage in LlamaIndex applications, e.g. as chunks for embedding; a combined usage sketch follows below.

- 💻 [GitHub \[↗\]](https://github.com/run-llama/llama_index/tree/main/llama-index-integrations/node_parser/llama-index-node-parser-docling)<!--{target="_blank"}-->
- 📖 [API docs \[↗\]](https://docs.llamaindex.ai/en/stable/api_reference/node_parser/docling/)<!--{target="_blank"}-->
- 📦 [PyPI \[↗\]](https://pypi.org/project/llama-index-node-parser-docling/)<!--{target="_blank"}-->
- 🦙 [LlamaHub \[↗\]](https://llamahub.ai/l/node_parser/llama-index-node-parser-docling)<!--{target="_blank"}-->
7
docs/overrides/main.html
Normal file
@ -0,0 +1,7 @@
{% extends "base.html" %}

{#
{% block announce %}
<p>🎉 Docling is now officially supported in LlamaIndex! <a href="{{ 'integrations/llamaindex/' | url }}">Check it out</a>!</p>
{% endblock %}
#}
3
docs/stylesheets/extra.css
Normal file
@ -0,0 +1,3 @@
[data-md-color-scheme="default"] .md-banner a {
  color: #5e8bde;
}
|
97
mkdocs.yml
Normal file
@ -0,0 +1,97 @@
site_name: Docling
site_url: https://ds4sd.github.io/docling/
repo_name: DS4SD/docling
repo_url: https://github.com/DS4SD/docling

theme:
  name: material
  custom_dir: docs/overrides
  palette:
    # Palette toggle for automatic mode
    - media: "(prefers-color-scheme)"
      scheme: default
      primary: black
      toggle:
        icon: material/brightness-auto
        name: Switch to light mode

    # Palette toggle for light mode
    - media: "(prefers-color-scheme: light)"
      scheme: default
      primary: black
      toggle:
        icon: material/brightness-7
        name: Switch to dark mode

    # Palette toggle for dark mode
    - media: "(prefers-color-scheme: dark)"
      scheme: slate
      primary: black
      toggle:
        icon: material/brightness-4
        name: Switch to system preference

  logo: assets/logo.png
  favicon: assets/logo.png
  features:
    - content.tabs.link
    - content.code.annotate
    - content.code.copy
    - announce.dismiss
    - navigation.tabs
    # - navigation.indexes # <= if set, each "section" can have its own page, if index.md is used
    - navigation.instant
    - navigation.instant.prefetch
    # - navigation.instant.preview
    - navigation.instant.progress
    - navigation.path
    - navigation.sections # <=
    - navigation.top
    - navigation.tracking
    - search.suggest
    - toc.follow
nav:
  - Get started:
    - Home: index.md
    - Installation: installation.md
  # - Docling v2: v2.md
  # - Concepts:
  #   - Docling Document: concepts/document.md
  #   - Chunking: concepts/chunking.md
  - Examples:
    - Conversion:
      - "Simple conversion": examples/minimal.py
      - "Custom conversion": examples/custom_convert.py
      - "Batch conversion": examples/batch_convert.py
      - "Figure export": examples/export_figures.py
      - "Table export": examples/export_tables.py
      - "Multimodal export": examples/export_multimodal.py
    - RAG / QA:
      - "RAG with LlamaIndex 🦙": examples/rag_llamaindex.ipynb
      - "RAG with LangChain 🦜🔗": examples/rag_langchain.ipynb
  # - Chunking:
  #   - Chunking: examples/chunking.md
  # - CLI:
  #   - CLI: examples/cli.md
  - Integrations:
    - "LlamaIndex 🦙 extension": integrations/llamaindex.md
  # - "LangChain 🦜🔗 extension": integrations/langchain.md
  # - API reference:
  #   - API reference: api_reference/index.md

markdown_extensions:
  - pymdownx.superfences
  - pymdownx.tabbed:
      alternate_style: true
      slugify: !!python/object/apply:pymdownx.slugs.slugify
        kwds:
          case: lower
  - admonition
  - pymdownx.details
  - attr_list
plugins:
  - search
  - mkdocs-jupyter

extra_css:
  - stylesheets/extra.css
77
tests/test_backend_docling_parse_v2.py
Normal file
@ -0,0 +1,77 @@
from pathlib import Path

import pytest

from docling.backend.docling_parse_v2_backend import (
    DoclingParseV2DocumentBackend,
    DoclingParseV2PageBackend,
)
from docling.datamodel.base_models import BoundingBox, InputFormat
from docling.datamodel.document import InputDocument


@pytest.fixture
def test_doc_path():
    return Path("./tests/data/2206.01062.pdf")


def _get_backend(pdf_doc):
    in_doc = InputDocument(
        path_or_stream=pdf_doc,
        format=InputFormat.PDF,
        backend=DoclingParseV2DocumentBackend,
    )

    doc_backend = in_doc._backend
    return doc_backend


@pytest.mark.skip
def test_text_cell_counts():
    pdf_doc = Path("./tests/data/redp5695.pdf")

    doc_backend = _get_backend(pdf_doc)

    for page_index in range(0, doc_backend.page_count()):
        last_cell_count = None
        for i in range(10):
            page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0)
            cells = list(page_backend.get_text_cells())

            if last_cell_count is None:
                last_cell_count = len(cells)

            if len(cells) != last_cell_count:
                assert (
                    False
                ), "Loading page multiple times yielded non-identical text cell counts"
            last_cell_count = len(cells)


def test_get_text_from_rect(test_doc_path):
    doc_backend = _get_backend(test_doc_path)
    page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0)

    # Get the title text of the DocLayNet paper
    textpiece = page_backend.get_text_in_rect(
        bbox=BoundingBox(l=102, t=77, r=511, b=124)
    )
    ref = "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis"

    assert textpiece.strip() == ref


def test_crop_page_image(test_doc_path):
    doc_backend = _get_backend(test_doc_path)
    page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0)

    # Crop out "Figure 1" from the DocLayNet paper
    im = page_backend.get_page_image(
        scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
    )
    # im.show()


def test_num_pages(test_doc_path):
    doc_backend = _get_backend(test_doc_path)
    assert doc_backend.page_count() == 9