Merge branch 'main' of github.com:DS4SD/docling into cau/picture-content-example
This commit is contained in: commit 15989718b7
.github/workflows/docs.yml (vendored): 5 changes

@@ -14,7 +14,10 @@ jobs:
       - uses: ./.github/actions/setup-poetry
       - name: Build docs
         run: poetry run mkdocs build --verbose --clean
+      - name: Make docs LLM ready
+        if: inputs.deploy
+        uses: demodrive-ai/llms-txt-action@ad720693843126e6a73910a667d0eba37c1dea4b
       - name: Build and push docs
         if: inputs.deploy
-        run: poetry run mkdocs gh-deploy --force
+        run: poetry run mkdocs gh-deploy --force --dirty

CHANGELOG.md: 33 changes

@@ -1,3 +1,36 @@
+## [v2.15.1](https://github.com/DS4SD/docling/releases/tag/v2.15.1) - 2025-01-10
+
+### Fix
+
+* Improve OCR results, stricten criteria before dropping bitmap areas ([#719](https://github.com/DS4SD/docling/issues/719)) ([`5a060f2`](https://github.com/DS4SD/docling/commit/5a060f237d1decd0ff9db9e73478978419315778))
+* Allow earlier requests versions ([#716](https://github.com/DS4SD/docling/issues/716)) ([`e64b5a2`](https://github.com/DS4SD/docling/commit/e64b5a2f628acc340a6d94ee6f1ada2aa267cecc))
+
+### Documentation
+
+* Add pointers to LangChain-side docs ([#718](https://github.com/DS4SD/docling/issues/718)) ([`9a6b5c8`](https://github.com/DS4SD/docling/commit/9a6b5c8c8debc81e0ddcbe91df6afbbeb29e97e6))
+* Add LangChain docs ([#717](https://github.com/DS4SD/docling/issues/717)) ([`4fa8028`](https://github.com/DS4SD/docling/commit/4fa8028bd8120d7557e1d45ba31e200e130af698))
+
+## [v2.15.0](https://github.com/DS4SD/docling/releases/tag/v2.15.0) - 2025-01-08
+
+### Feature
+
+* Added http header support for document converter and cli ([#642](https://github.com/DS4SD/docling/issues/642)) ([`0ee849e`](https://github.com/DS4SD/docling/commit/0ee849e8bc8cf24d1c5597af3fe20a7fa19a29e0))
+
+### Fix
+
+* Correct scaling of debug visualizations, tune OCR ([#700](https://github.com/DS4SD/docling/issues/700)) ([`5cb4cf6`](https://github.com/DS4SD/docling/commit/5cb4cf6f19f91e6c87141e93400c4b54b93aa5d7))
+* Let BeautifulSoup detect the HTML encoding ([#695](https://github.com/DS4SD/docling/issues/695)) ([`42856fd`](https://github.com/DS4SD/docling/commit/42856fdf79559188ec4617bc5d3a007286f114d2))
+* **mspowerpoint:** Handle invalid images in PowerPoint slides ([#650](https://github.com/DS4SD/docling/issues/650)) ([`d49650c`](https://github.com/DS4SD/docling/commit/d49650c54ffa60bc6d6106970e104071689bc7b0))
+
+### Documentation
+
+* Specify docstring types ([#702](https://github.com/DS4SD/docling/issues/702)) ([`ead396a`](https://github.com/DS4SD/docling/commit/ead396ab407f6bbd43176abd6ed2bed7ed8c7c43))
+* Add link to rag with granite ([#698](https://github.com/DS4SD/docling/issues/698)) ([`6701f34`](https://github.com/DS4SD/docling/commit/6701f34c855992c52918b210c65a2edb1c827c01))
+* Add integrations, revamp docs ([#693](https://github.com/DS4SD/docling/issues/693)) ([`2d24fae`](https://github.com/DS4SD/docling/commit/2d24faecd96bfa656b2b8c80f25cdf251a50526a))
+* Add OpenContracts as an integration ([#679](https://github.com/DS4SD/docling/issues/679)) ([`569038d`](https://github.com/DS4SD/docling/commit/569038df4205703f87517ea58da7902d143e7699))
+* Add Weaviate RAG recipe notebook ([#451](https://github.com/DS4SD/docling/issues/451)) ([`2b591f9`](https://github.com/DS4SD/docling/commit/2b591f98726ed0d883236dd0550201b95203eebb))
+* Document Haystack & Vectara support ([#628](https://github.com/DS4SD/docling/issues/628)) ([`fc645ea`](https://github.com/DS4SD/docling/commit/fc645ea531ddc67959640b428007851d641c923e))
+
 ## [v2.14.0](https://github.com/DS4SD/docling/releases/tag/v2.14.0) - 2024-12-18

 ### Feature

@@ -29,7 +29,7 @@ Docling parses documents and exports them to the desired format with ease and sp
 * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
 * 📑 Advanced PDF document understanding including page layout, reading order & table structures
 * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
-* 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications
+* 🤖 Plug-and-play [integrations](https://ds4sd.github.io/docling/integrations/) incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
 * 🔍 OCR support for scanned PDFs
 * 💻 Simple and convenient CLI

@@ -39,7 +39,6 @@ Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty
 * ♾️ Equation & code extraction
 * 📝 Metadata extraction, including title, authors, references & language
-* 🦜🔗 Native LangChain extension

 ## Installation

@@ -132,7 +132,7 @@ class DoclingParsePageBackend(PdfPageBackend):
         return cells

     def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
-        AREA_THRESHOLD = 32 * 32
+        AREA_THRESHOLD = 0  # 32 * 32

         for i in range(len(self._dpage["images"])):
             bitmap = self._dpage["images"][i]

@@ -163,7 +163,7 @@ class DoclingParsePageBackend(PdfPageBackend):
                 l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
             )
         else:
-            padbox = cropbox.to_bottom_left_origin(page_size.height)
+            padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
             padbox.r = page_size.width - padbox.r
             padbox.t = page_size.height - padbox.t

@@ -140,7 +140,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
         return cells

     def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
-        AREA_THRESHOLD = 32 * 32
+        AREA_THRESHOLD = 0  # 32 * 32

         images = self._dpage["sanitized"]["images"]["data"]
         images_header = self._dpage["sanitized"]["images"]["header"]

@@ -178,7 +178,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
                 l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
             )
         else:
-            padbox = cropbox.to_bottom_left_origin(page_size.height)
+            padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
             padbox.r = page_size.width - padbox.r
             padbox.t = page_size.height - padbox.t

@@ -37,10 +37,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):

         try:
             if isinstance(self.path_or_stream, BytesIO):
-                text_stream = self.path_or_stream.getvalue().decode("utf-8")
+                text_stream = self.path_or_stream.getvalue()
                 self.soup = BeautifulSoup(text_stream, "html.parser")
             if isinstance(self.path_or_stream, Path):
-                with open(self.path_or_stream, "r", encoding="utf-8") as f:
+                with open(self.path_or_stream, "rb") as f:
                     html_content = f.read()
                     self.soup = BeautifulSoup(html_content, "html.parser")
         except Exception as e:

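For context, a minimal standalone sketch (not part of the diff) of why handing BeautifulSoup raw bytes instead of a forced UTF-8 decode helps: with bytes it can sniff the declared encoding itself, e.g. from the meta charset tag. The sample HTML below is made up.

from bs4 import BeautifulSoup

# Hypothetical ISO-8859-1 page; decoding it as UTF-8 would raise or garble "café".
raw = '<html><head><meta charset="iso-8859-1"></head><body>café</body></html>'.encode("iso-8859-1")

# Passing bytes lets BeautifulSoup detect the declared encoding on its own.
soup = BeautifulSoup(raw, "html.parser")
print(soup.body.get_text())  # café
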
@@ -16,7 +16,7 @@ from docling_core.types.doc import (
     TableCell,
     TableData,
 )
-from PIL import Image
+from PIL import Image, UnidentifiedImageError
 from pptx import Presentation
 from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER

@@ -120,6 +120,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
             bullet_type = "None"
             list_text = ""
             list_label = GroupLabel.LIST
+            doc_label = DocItemLabel.LIST_ITEM
             prov = self.generate_prov(shape, slide_ind, shape.text.strip())

             # Identify if shape contains lists

@@ -276,6 +277,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
             im_dpi, _ = image.dpi

         # Open it with PIL
+        try:
             pil_image = Image.open(BytesIO(image_bytes))

             # shape has picture

@@ -286,6 +288,8 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
                 caption=None,
                 prov=prov,
             )
+        except (UnidentifiedImageError, OSError) as e:
+            _log.warning(f"Warning: image cannot be loaded by Pillow: {e}")
         return

     def handle_tables(self, shape, parent_slide, slide_ind, doc):

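A hedged, self-contained illustration of the pattern this hunk adopts: Pillow raises UnidentifiedImageError (or OSError) on corrupt or unsupported image bytes, and catching it lets a backend skip the picture instead of failing the whole slide. The input bytes are made up.

import logging
from io import BytesIO

from PIL import Image, UnidentifiedImageError

_log = logging.getLogger(__name__)


def load_picture(image_bytes: bytes):
    """Return a PIL image, or None if the bytes are not a readable image."""
    try:
        return Image.open(BytesIO(image_bytes))
    except (UnidentifiedImageError, OSError) as e:
        _log.warning(f"Warning: image cannot be loaded by Pillow: {e}")
        return None


print(load_picture(b"not-really-an-image"))  # None, with a warning logged
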
@@ -39,7 +39,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
         return self.valid

     def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
-        AREA_THRESHOLD = 32 * 32
+        AREA_THRESHOLD = 0  # 32 * 32
         for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
             pos = obj.get_pos()
             cropbox = BoundingBox.from_tuple(

@@ -210,7 +210,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
                 l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
             )
         else:
-            padbox = cropbox.to_bottom_left_origin(page_size.height)
+            padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
             padbox.r = page_size.width - padbox.r
             padbox.t = page_size.height - padbox.t

@@ -164,6 +164,11 @@ def convert(
     to_formats: List[OutputFormat] = typer.Option(
         None, "--to", help="Specify output formats. Defaults to Markdown."
     ),
+    headers: str = typer.Option(
+        None,
+        "--headers",
+        help="Specify http request headers used when fetching url input sources in the form of a JSON string",
+    ),
     image_export_mode: Annotated[
         ImageRefMode,
         typer.Option(

@@ -279,12 +284,19 @@ def convert(
     if from_formats is None:
         from_formats = [e for e in InputFormat]

+    parsed_headers: Optional[Dict[str, str]] = None
+    if headers is not None:
+        headers_t = TypeAdapter(Dict[str, str])
+        parsed_headers = headers_t.validate_json(headers)
+
     with tempfile.TemporaryDirectory() as tempdir:
         input_doc_paths: List[Path] = []
         for src in input_sources:
             try:
                 # check if we can fetch some remote url
-                source = resolve_source_to_path(source=src, workdir=Path(tempdir))
+                source = resolve_source_to_path(
+                    source=src, headers=parsed_headers, workdir=Path(tempdir)
+                )
                 input_doc_paths.append(source)
             except FileNotFoundError:
                 err_console.print(

@@ -390,7 +402,7 @@ def convert(
     start_time = time.time()

     conv_results = doc_converter.convert_all(
-        input_doc_paths, raises_on_error=abort_on_error
+        input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
     )

     output.mkdir(parents=True, exist_ok=True)

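A small sketch of the header-parsing step the CLI now performs: the --headers value is a JSON string validated into a Dict[str, str] with pydantic's TypeAdapter before being handed to the URL-fetching code. The header values below are placeholders.

from typing import Dict, Optional

from pydantic import TypeAdapter

raw = '{"Authorization": "Bearer <token>", "User-Agent": "my-agent"}'  # e.g. the --headers value

parsed_headers: Optional[Dict[str, str]] = None
if raw is not None:
    headers_t = TypeAdapter(Dict[str, str])
    parsed_headers = headers_t.validate_json(raw)  # raises ValidationError on malformed input

print(parsed_headers)
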
@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING, Dict, List, Optional, Union
 from docling_core.types.doc import (
     BoundingBox,
     DocItemLabel,
+    NodeItem,
     PictureDataType,
     Size,
     TableCell,

@@ -201,6 +202,13 @@ class AssembledUnit(BaseModel):
     headers: List[PageElement] = []


+class ItemAndImageEnrichmentElement(BaseModel):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    item: NodeItem
+    image: Image
+
+
 class Page(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)

@@ -219,12 +227,28 @@ class Page(BaseModel):
         {}
     )  # Cache of images in different scales. By default it is cleared during assembling.

-    def get_image(self, scale: float = 1.0) -> Optional[Image]:
+    def get_image(
+        self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
+    ) -> Optional[Image]:
         if self._backend is None:
             return self._image_cache.get(scale, None)

         if not scale in self._image_cache:
-            self._image_cache[scale] = self._backend.get_page_image(scale=scale)
+            if cropbox is None:
+                self._image_cache[scale] = self._backend.get_page_image(scale=scale)
+            else:
+                return self._backend.get_page_image(scale=scale, cropbox=cropbox)
+
+        if cropbox is None:
             return self._image_cache[scale]
+        else:
+            page_im = self._image_cache[scale]
+            assert self.size is not None
+            return page_im.crop(
+                cropbox.to_top_left_origin(page_height=self.size.height)
+                .scaled(scale=scale)
+                .as_tuple()
+            )

     @property
     def image(self) -> Optional[Image]:

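A hedged usage sketch of the extended Page.get_image signature introduced above: passing a cropbox returns only that region of the page image at the requested scale. The bounding-box values are illustrative; a Page normally comes out of a conversion result while its backend is still attached.

from typing import Optional

from docling_core.types.doc import BoundingBox, CoordOrigin
from PIL.Image import Image

from docling.datamodel.base_models import Page


def crop_region(page: Page, scale: float = 2.0) -> Optional[Image]:
    # Illustrative region in page coordinates (top-left origin); values are made up.
    region = BoundingBox(l=50, t=100, r=300, b=180, coord_origin=CoordOrigin.TOPLEFT)
    # With cropbox set, only that part of the page image is returned, at the requested scale.
    return page.get_image(scale=scale, cropbox=region)
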
@@ -227,13 +227,18 @@ class _DummyBackend(AbstractDocumentBackend):
 class _DocumentConversionInput(BaseModel):

     path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
+    headers: Optional[Dict[str, str]] = None
     limits: Optional[DocumentLimits] = DocumentLimits()

     def docs(
         self, format_options: Dict[InputFormat, "FormatOption"]
     ) -> Iterable[InputDocument]:
         for item in self.path_or_stream_iterator:
-            obj = resolve_source_to_stream(item) if isinstance(item, str) else item
+            obj = (
+                resolve_source_to_stream(item, self.headers)
+                if isinstance(item, str)
+                else item
+            )
             format = self._guess_format(obj)
             backend: Type[AbstractDocumentBackend]
             if format not in format_options.keys():

@@ -139,7 +139,7 @@ class EasyOcrOptions(OcrOptions):

     use_gpu: Optional[bool] = None

-    confidence_threshold: float = 0.65
+    confidence_threshold: float = 0.5

     model_storage_directory: Optional[str] = None
     recog_network: Optional[str] = "standard"

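A hedged sketch of how a user could pin this threshold explicitly instead of relying on the new default (0.65 restores the previous behaviour); it assumes EasyOcrOptions is importable from docling.datamodel.pipeline_options alongside PdfPipelineOptions.

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import EasyOcrOptions, PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions(do_ocr=True)
pipeline_options.ocr_options = EasyOcrOptions(confidence_threshold=0.65)  # previous default

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
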
@@ -176,6 +176,7 @@ class DocumentConverter:
     def convert(
         self,
         source: Union[Path, str, DocumentStream],  # TODO review naming
+        headers: Optional[Dict[str, str]] = None,
         raises_on_error: bool = True,
         max_num_pages: int = sys.maxsize,
         max_file_size: int = sys.maxsize,

@@ -185,6 +186,7 @@ class DocumentConverter:
             raises_on_error=raises_on_error,
             max_num_pages=max_num_pages,
             max_file_size=max_file_size,
+            headers=headers,
         )
         return next(all_res)

@@ -192,6 +194,7 @@ class DocumentConverter:
     def convert_all(
         self,
         source: Iterable[Union[Path, str, DocumentStream]],  # TODO review naming
+        headers: Optional[Dict[str, str]] = None,
         raises_on_error: bool = True,  # True: raises on first conversion error; False: does not raise on conv error
         max_num_pages: int = sys.maxsize,
         max_file_size: int = sys.maxsize,

@@ -201,8 +204,7 @@ class DocumentConverter:
             max_file_size=max_file_size,
         )
         conv_input = _DocumentConversionInput(
-            path_or_stream_iterator=source,
-            limits=limits,
+            path_or_stream_iterator=source, limits=limits, headers=headers
         )
         conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)

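A hedged end-to-end sketch of the new headers parameter on DocumentConverter.convert and convert_all: the mapping is forwarded to the URL-fetching step, so protected sources can be retrieved. The URL and token are placeholders.

from docling.document_converter import DocumentConverter

converter = DocumentConverter()
result = converter.convert(
    "https://example.com/protected/report.pdf",   # placeholder URL
    headers={"Authorization": "Bearer <token>"},  # forwarded when fetching the URL
)
print(result.document.export_to_markdown()[:300])
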
@@ -1,9 +1,10 @@
 from abc import ABC, abstractmethod
-from typing import Any, Iterable
+from typing import Any, Generic, Iterable, Optional

-from docling_core.types.doc import DoclingDocument, NodeItem
+from docling_core.types.doc import DoclingDocument, NodeItem, TextItem
+from typing_extensions import TypeVar

-from docling.datamodel.base_models import Page
+from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
 from docling.datamodel.document import ConversionResult


@@ -15,14 +16,54 @@ class BasePageModel(ABC):
     pass


-class BaseEnrichmentModel(ABC):
+EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)
+
+
+class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):

     @abstractmethod
     def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
         pass

     @abstractmethod
-    def __call__(
-        self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
-    ) -> Iterable[Any]:
+    def prepare_element(
+        self, conv_res: ConversionResult, element: NodeItem
+    ) -> Optional[EnrichElementT]:
         pass
+
+    @abstractmethod
+    def __call__(
+        self, doc: DoclingDocument, element_batch: Iterable[EnrichElementT]
+    ) -> Iterable[NodeItem]:
+        pass
+
+
+class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]):
+
+    def prepare_element(
+        self, conv_res: ConversionResult, element: NodeItem
+    ) -> Optional[NodeItem]:
+        if self.is_processable(doc=conv_res.document, element=element):
+            return element
+        return None
+
+
+class BaseItemAndImageEnrichmentModel(
+    GenericEnrichmentModel[ItemAndImageEnrichmentElement]
+):
+
+    images_scale: float
+
+    def prepare_element(
+        self, conv_res: ConversionResult, element: NodeItem
+    ) -> Optional[ItemAndImageEnrichmentElement]:
+        if not self.is_processable(doc=conv_res.document, element=element):
+            return None
+
+        assert isinstance(element, TextItem)
+        element_prov = element.prov[0]
+        page_ix = element_prov.page_no - 1
+        cropped_image = conv_res.pages[page_ix].get_image(
+            scale=self.images_scale, cropbox=element_prov.bbox
+        )
+        return ItemAndImageEnrichmentElement(item=element, image=cropped_image)

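These new interfaces (GenericEnrichmentModel, BaseEnrichmentModel, BaseItemAndImageEnrichmentModel) are exercised end to end by the docs/examples/develop_formula_understanding.py example added later in this commit, which subclasses BaseItemAndImageEnrichmentModel and receives each formula item together with its cropped page image.
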
@@ -8,7 +8,7 @@ import numpy as np
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from PIL import Image, ImageDraw
 from rtree import index
-from scipy.ndimage import find_objects, label
+from scipy.ndimage import binary_dilation, find_objects, label

 from docling.datamodel.base_models import Cell, OcrCell, Page
 from docling.datamodel.document import ConversionResult

@@ -43,6 +43,12 @@ class BaseOcrModel(BasePageModel):

             np_image = np.array(image)

+            # Dilate the image by 10 pixels to merge nearby bitmap rectangles
+            structure = np.ones(
+                (20, 20)
+            )  # Create a 20x20 structure element (10 pixels in all directions)
+            np_image = binary_dilation(np_image > 0, structure=structure)
+
             # Find the connected components
             labeled_image, num_features = label(
                 np_image > 0

@@ -72,7 +78,7 @@ class BaseOcrModel(BasePageModel):
             bitmap_rects = []
         coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)

-        # return full-page rectangle if sufficiently covered with bitmaps
+        # return full-page rectangle if page is dominantly covered with bitmaps
         if self.options.force_full_page_ocr or coverage > max(
             BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold
         ):

@@ -85,17 +91,11 @@ class BaseOcrModel(BasePageModel):
                     coord_origin=CoordOrigin.TOPLEFT,
                 )
             ]
-        # return individual rectangles if the bitmap coverage is smaller
-        else:  # coverage <= BITMAP_COVERAGE_TRESHOLD:
-
-            # skip OCR if the bitmap area on the page is smaller than the options threshold
-            ocr_rects = [
-                rect
-                for rect in ocr_rects
-                if rect.area() / (page.size.width * page.size.height)
-                > self.options.bitmap_area_threshold
-            ]
+        # return individual rectangles if the bitmap coverage is above the threshold
+        elif coverage > self.options.bitmap_area_threshold:
             return ocr_rects
+        else:  # overall coverage of bitmaps is too low, drop all bitmap rectangles.
+            return []

     # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
     def _filter_ocr_cells(self, ocr_cells, programmatic_cells):

@@ -138,18 +138,34 @@ class BaseOcrModel(BasePageModel):

     def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
         image = copy.deepcopy(page.image)
+        scale_x = image.width / page.size.width
+        scale_y = image.height / page.size.height
+
         draw = ImageDraw.Draw(image, "RGBA")

         # Draw OCR rectangles as yellow filled rect
         for rect in ocr_rects:
             x0, y0, x1, y1 = rect.as_tuple()
+            y0 *= scale_x
+            y1 *= scale_y
+            x0 *= scale_x
+            x1 *= scale_x
+
             shade_color = (255, 255, 0, 40)  # transparent yellow
             draw.rectangle([(x0, y0), (x1, y1)], fill=shade_color, outline=None)

         # Draw OCR and programmatic cells
         for tc in page.cells:
             x0, y0, x1, y1 = tc.bbox.as_tuple()
-            color = "red"
+            y0 *= scale_x
+            y1 *= scale_y
+            x0 *= scale_x
+            x1 *= scale_x
+
+            if y1 <= y0:
+                y1, y0 = y0, y1
+
+            color = "gray"
             if isinstance(tc, OcrCell):
                 color = "magenta"
             draw.rectangle([(x0, y0), (x1, y1)], outline=color)

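A hedged, self-contained illustration of the dilation trick used above: growing each bitmap mask by roughly 10 pixels in every direction makes nearby rectangles touch, so scipy's connected-component labeling merges them into a single OCR region. The array contents are made up.

import numpy as np
from scipy.ndimage import binary_dilation, find_objects, label

# Two small bitmap blobs, 6 pixels apart: without dilation they label as 2 components.
mask = np.zeros((40, 40), dtype=bool)
mask[5:10, 5:10] = True
mask[5:10, 16:21] = True

_, n_before = label(mask)
dilated = binary_dilation(mask, structure=np.ones((20, 20)))  # grow ~10 px each way
labeled, n_after = label(dilated)

print(n_before, n_after)       # 2 1
print(find_objects(labeled))   # one merged bounding slice
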
@@ -67,29 +67,9 @@ class LayoutModel(BasePageModel):
         - Right: Clusters including FORM, KEY_VALUE_REGION, and PICTURE.
           Includes label names and confidence scores for each cluster.
         """
-        label_to_color = {
-            DocItemLabel.TEXT: (255, 255, 153),  # Light Yellow
-            DocItemLabel.CAPTION: (255, 204, 153),  # Light Orange
-            DocItemLabel.LIST_ITEM: (153, 153, 255),  # Light Purple
-            DocItemLabel.FORMULA: (192, 192, 192),  # Gray
-            DocItemLabel.TABLE: (255, 204, 204),  # Light Pink
-            DocItemLabel.PICTURE: (255, 204, 164),  # Light Beige
-            DocItemLabel.SECTION_HEADER: (255, 153, 153),  # Light Red
-            DocItemLabel.PAGE_HEADER: (204, 255, 204),  # Light Green
-            DocItemLabel.PAGE_FOOTER: (
-                204,
-                255,
-                204,
-            ),  # Light Green (same as Page-Header)
-            DocItemLabel.TITLE: (255, 153, 153),  # Light Red (same as Section-Header)
-            DocItemLabel.FOOTNOTE: (200, 200, 255),  # Light Blue
-            DocItemLabel.DOCUMENT_INDEX: (220, 220, 220),  # Light Gray
-            DocItemLabel.CODE: (125, 125, 125),  # Gray
-            DocItemLabel.CHECKBOX_SELECTED: (255, 182, 193),  # Pale Green
-            DocItemLabel.CHECKBOX_UNSELECTED: (255, 182, 193),  # Light Pink
-            DocItemLabel.FORM: (200, 255, 255),  # Light Cyan
-            DocItemLabel.KEY_VALUE_REGION: (183, 65, 14),  # Rusty orange
-        }
+        scale_x = page.image.width / page.size.width
+        scale_y = page.image.height / page.size.height
+
         # Filter clusters for left and right images
         exclude_labels = {
             DocItemLabel.FORM,

@@ -118,6 +98,11 @@ class LayoutModel(BasePageModel):
             cell_color = (0, 0, 0, 40)  # Transparent black for cells
             for tc in c.cells:
                 cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
+                cx0 *= scale_x
+                cx1 *= scale_x
+                cy0 *= scale_x
+                cy1 *= scale_y
+
                 draw.rectangle(
                     [(cx0, cy0), (cx1, cy1)],
                     outline=None,

@@ -125,8 +110,16 @@ class LayoutModel(BasePageModel):
                 )
             # Draw cluster rectangle
             x0, y0, x1, y1 = c.bbox.as_tuple()
-            cluster_fill_color = (*list(label_to_color.get(c.label)), 70)
-            cluster_outline_color = (*list(label_to_color.get(c.label)), 255)
+            x0 *= scale_x
+            x1 *= scale_x
+            y0 *= scale_x
+            y1 *= scale_y
+
+            cluster_fill_color = (*list(DocItemLabel.get_color(c.label)), 70)
+            cluster_outline_color = (
+                *list(DocItemLabel.get_color(c.label)),
+                255,
+            )
             draw.rectangle(
                 [(x0, y0), (x1, y1)],
                 outline=cluster_outline_color,

@@ -22,7 +22,7 @@ _log = logging.getLogger(__name__)


 class PageAssembleOptions(BaseModel):
-    keep_images: bool = False
+    pass


 class PageAssembleModel(BasePageModel):

@@ -174,11 +174,4 @@ class PageAssembleModel(BasePageModel):
                     elements=elements, headers=headers, body=body
                 )

-                # Remove page images (can be disabled)
-                if not self.options.keep_images:
-                    page._image_cache = {}
-
-                # Unload backend
-                page._backend.unload()
-
                 yield page

@@ -66,23 +66,43 @@ class TableStructureModel(BasePageModel):
         show: bool = False,
     ):
         assert page._backend is not None
+        assert page.size is not None
+
         image = (
             page._backend.get_page_image()
         )  # make new image to avoid drawing on the saved ones
+
+        scale_x = image.width / page.size.width
+        scale_y = image.height / page.size.height
+
         draw = ImageDraw.Draw(image)

         for table_element in tbl_list:
             x0, y0, x1, y1 = table_element.cluster.bbox.as_tuple()
+            y0 *= scale_x
+            y1 *= scale_y
+            x0 *= scale_x
+            x1 *= scale_x
+
             draw.rectangle([(x0, y0), (x1, y1)], outline="red")

             for cell in table_element.cluster.cells:
                 x0, y0, x1, y1 = cell.bbox.as_tuple()
+                x0 *= scale_x
+                x1 *= scale_x
+                y0 *= scale_x
+                y1 *= scale_y
+
                 draw.rectangle([(x0, y0), (x1, y1)], outline="green")

             for tc in table_element.table_cells:
                 if tc.bbox is not None:
                     x0, y0, x1, y1 = tc.bbox.as_tuple()
+                    x0 *= scale_x
+                    x1 *= scale_x
+                    y0 *= scale_x
+                    y1 *= scale_y
+
                     if tc.column_header:
                         width = 3
                     else:

@@ -28,6 +28,7 @@ _log = logging.getLogger(__name__)
 class BasePipeline(ABC):
     def __init__(self, pipeline_options: PipelineOptions):
         self.pipeline_options = pipeline_options
+        self.keep_images = False
         self.build_pipe: List[Callable] = []
         self.enrichment_pipe: List[BaseEnrichmentModel] = []

@@ -40,7 +41,7 @@ class BasePipeline(ABC):
                 conv_res, "pipeline_total", scope=ProfilingScope.DOCUMENT
             ):
                 # These steps are building and assembling the structure of the
-                # output DoclingDocument
+                # output DoclingDocument.
                 conv_res = self._build_document(conv_res)
                 conv_res = self._assemble_document(conv_res)
                 # From this stage, all operations should rely only on conv_res.output

@@ -50,6 +51,8 @@ class BasePipeline(ABC):
             conv_res.status = ConversionStatus.FAILURE
             if raises_on_error:
                 raise e
+        finally:
+            self._unload(conv_res)

         return conv_res

@@ -62,21 +65,22 @@ class BasePipeline(ABC):

     def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:

-        def _filter_elements(
-            doc: DoclingDocument, model: BaseEnrichmentModel
+        def _prepare_elements(
+            conv_res: ConversionResult, model: BaseEnrichmentModel
         ) -> Iterable[NodeItem]:
-            for element, _level in doc.iterate_items():
-                if model.is_processable(doc=doc, element=element):
-                    yield element
+            for doc_element, _level in conv_res.document.iterate_items():
+                prepared_element = model.prepare_element(
+                    conv_res=conv_res, element=doc_element
+                )
+                if prepared_element is not None:
+                    yield prepared_element

         with TimeRecorder(conv_res, "doc_enrich", scope=ProfilingScope.DOCUMENT):
             for model in self.enrichment_pipe:
                 for element_batch in chunkify(
-                    _filter_elements(conv_res.document, model),
+                    _prepare_elements(conv_res, model),
                     settings.perf.elements_batch_size,
                 ):
-                    # TODO: currently we assume the element itself is modified, because
-                    # we don't have an interface to save the element back to the document
                     for element in model(
                         doc=conv_res.document, element_batch=element_batch
                     ):  # Must exhaust!

@@ -88,6 +92,9 @@ class BasePipeline(ABC):
     def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
         pass

+    def _unload(self, conv_res: ConversionResult):
+        pass
+
     @classmethod
     @abstractmethod
     def get_default_options(cls) -> PipelineOptions:

@@ -107,6 +114,10 @@ class BasePipeline(ABC):

 class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.

+    def __init__(self, pipeline_options: PipelineOptions):
+        super().__init__(pipeline_options)
+        self.keep_backend = False
+
     def _apply_on_pages(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:

@@ -148,7 +159,14 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
                 pipeline_pages = self._apply_on_pages(conv_res, init_pages)

                 for p in pipeline_pages:  # Must exhaust!
-                    pass
+                    # Cleanup cached images
+                    if not self.keep_images:
+                        p._image_cache = {}
+
+                    # Cleanup page backends
+                    if not self.keep_backend and p._backend is not None:
+                        p._backend.unload()

                 end_batch_time = time.monotonic()
                 total_elapsed_time += end_batch_time - start_batch_time

@@ -177,8 +195,13 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
                 )
                 raise e

-        finally:
-            # Always unload the PDF backend, even in case of failure
+        return conv_res
+
+    def _unload(self, conv_res: ConversionResult) -> ConversionResult:
+        for page in conv_res.pages:
+            if page._backend is not None:
+                page._backend.unload()
+
         if conv_res.input._backend:
             conv_res.input._backend.unload()

|
|||||||
import logging
|
import logging
|
||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Iterable, Optional
|
||||||
|
|
||||||
from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
|
from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
|
||||||
|
|
||||||
@ -17,6 +17,7 @@ from docling.datamodel.pipeline_options import (
|
|||||||
TesseractCliOcrOptions,
|
TesseractCliOcrOptions,
|
||||||
TesseractOcrOptions,
|
TesseractOcrOptions,
|
||||||
)
|
)
|
||||||
|
from docling.models.base_model import BasePageModel
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
from docling.models.base_ocr_model import BaseOcrModel
|
||||||
from docling.models.ds_glm_model import GlmModel, GlmOptions
|
from docling.models.ds_glm_model import GlmModel, GlmOptions
|
||||||
from docling.models.easyocr_model import EasyOcrModel
|
from docling.models.easyocr_model import EasyOcrModel
|
||||||
@ -50,7 +51,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
else:
|
else:
|
||||||
self.artifacts_path = Path(pipeline_options.artifacts_path)
|
self.artifacts_path = Path(pipeline_options.artifacts_path)
|
||||||
|
|
||||||
keep_images = (
|
self.keep_images = (
|
||||||
self.pipeline_options.generate_page_images
|
self.pipeline_options.generate_page_images
|
||||||
or self.pipeline_options.generate_picture_images
|
or self.pipeline_options.generate_picture_images
|
||||||
or self.pipeline_options.generate_table_images
|
or self.pipeline_options.generate_table_images
|
||||||
@ -87,7 +88,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
accelerator_options=pipeline_options.accelerator_options,
|
accelerator_options=pipeline_options.accelerator_options,
|
||||||
),
|
),
|
||||||
# Page assemble
|
# Page assemble
|
||||||
PageAssembleModel(options=PageAssembleOptions(keep_images=keep_images)),
|
PageAssembleModel(options=PageAssembleOptions()),
|
||||||
]
|
]
|
||||||
|
|
||||||
self.enrichment_pipe = [
|
self.enrichment_pipe = [
|
||||||
|
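A hedged sketch of what this refactor means for users: image retention is now driven by the pipeline-level keep_images flag, which StandardPdfPipeline derives from the existing PdfPipelineOptions switches, so requesting page or picture images keeps the page image cache alive through assembly. The input path is a placeholder.

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()
pipeline_options.generate_page_images = True   # any of these switches turns keep_images on
pipeline_options.generate_picture_images = True
pipeline_options.images_scale = 2.0

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
result = converter.convert("path/to/some.pdf")  # placeholder input
page_image = result.pages[0].image             # page images survive assembly when kept
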
@@ -54,12 +54,12 @@ tokens), &
 chunks with same headings & captions) — users can opt out of this step via param
 `merge_peers` (by default `True`)

-👉 Example: see [here](../../examples/hybrid_chunking).
+👉 Example: see [here](../examples/hybrid_chunking.ipynb).

 ## Hierarchical Chunker

 The `HierarchicalChunker` implementation uses the document structure information from
-the [`DoclingDocument`](../docling_document) to create one chunk for each individual
+the [`DoclingDocument`](./docling_document.md) to create one chunk for each individual
 detected document element, by default only merging together list items (can be opted out
 via param `merge_list_items`). It also takes care of attaching all relevant document
 metadata, including headers and captions.

@@ -5,7 +5,11 @@ from pathlib import Path

 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import InputFormat
-from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.datamodel.pipeline_options import (
+    AcceleratorDevice,
+    AcceleratorOptions,
+    PdfPipelineOptions,
+)
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.models.ocr_mac_model import OcrMacOptions
 from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions

@@ -76,7 +80,7 @@ def main():
     pipeline_options.table_structure_options.do_cell_matching = True
     pipeline_options.ocr_options.lang = ["es"]
     pipeline_options.accelerator_options = AcceleratorOptions(
-        num_threads=4, device=Device.AUTO
+        num_threads=4, device=AcceleratorDevice.AUTO
     )

     doc_converter = DocumentConverter(

docs/examples/develop_formula_understanding.py: new file, 88 lines

@@ -0,0 +1,88 @@
+import logging
+from pathlib import Path
+from typing import Iterable
+
+from docling_core.types.doc import DocItemLabel, DoclingDocument, NodeItem, TextItem
+
+from docling.datamodel.base_models import InputFormat, ItemAndImageEnrichmentElement
+from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling.models.base_model import BaseItemAndImageEnrichmentModel
+from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
+
+
+class ExampleFormulaUnderstandingPipelineOptions(PdfPipelineOptions):
+    do_formula_understanding: bool = True
+
+
+# A new enrichment model using both the document element and its image as input
+class ExampleFormulaUnderstandingEnrichmentModel(BaseItemAndImageEnrichmentModel):
+    images_scale = 2.6
+
+    def __init__(self, enabled: bool):
+        self.enabled = enabled
+
+    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
+        return (
+            self.enabled
+            and isinstance(element, TextItem)
+            and element.label == DocItemLabel.FORMULA
+        )
+
+    def __call__(
+        self,
+        doc: DoclingDocument,
+        element_batch: Iterable[ItemAndImageEnrichmentElement],
+    ) -> Iterable[NodeItem]:
+        if not self.enabled:
+            return
+
+        for enrich_element in element_batch:
+            enrich_element.image.show()
+
+            yield enrich_element.item
+
+
+# How the pipeline can be extended.
+class ExampleFormulaUnderstandingPipeline(StandardPdfPipeline):
+
+    def __init__(self, pipeline_options: ExampleFormulaUnderstandingPipelineOptions):
+        super().__init__(pipeline_options)
+        self.pipeline_options: ExampleFormulaUnderstandingPipelineOptions
+
+        self.enrichment_pipe = [
+            ExampleFormulaUnderstandingEnrichmentModel(
+                enabled=self.pipeline_options.do_formula_understanding
+            )
+        ]
+
+        if self.pipeline_options.do_formula_understanding:
+            self.keep_backend = True
+
+    @classmethod
+    def get_default_options(cls) -> ExampleFormulaUnderstandingPipelineOptions:
+        return ExampleFormulaUnderstandingPipelineOptions()
+
+
+# Example main. In the final version, we simply have to set do_formula_understanding to true.
+def main():
+    logging.basicConfig(level=logging.INFO)
+
+    input_doc_path = Path("./tests/data/2203.01017v2.pdf")
+
+    pipeline_options = ExampleFormulaUnderstandingPipelineOptions()
+    pipeline_options.do_formula_understanding = True
+
+    doc_converter = DocumentConverter(
+        format_options={
+            InputFormat.PDF: PdfFormatOption(
+                pipeline_cls=ExampleFormulaUnderstandingPipeline,
+                pipeline_options=pipeline_options,
+            )
+        }
+    )
+    result = doc_converter.convert(input_doc_path)
+
+
+if __name__ == "__main__":
+    main()

@@ -4,7 +4,30 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Hybrid Chunking"
+    "# Hybrid chunking"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Overview"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Hybrid chunking applies tokenization-aware refinements on top of document-based hierarchical chunking.\n",
+    "\n",
+    "For more details, see [here](../../concepts/chunking#hybrid-chunker)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Setup"
    ]
   },
   {
@@ -21,7 +44,7 @@
    }
   ],
   "source": [
-    "%pip install -qU 'docling-core[chunking]' sentence-transformers transformers lancedb"
+    "%pip install -qU docling transformers"
   ]
  },
 {
@@ -48,16 +71,12 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Chunking"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Notice how `tokenizer` and `embed_model` further below are single-sourced from `EMBED_MODEL_ID`.\n",
+    "## Chunking\n",
     "\n",
-    "This is important for making sure the chunker and the embedding model are using the same tokenizer."
+    "### Basic usage\n",
+    "\n",
+    "For a basic usage scenario, we can just instantiate a `HybridChunker`, which will use\n",
+    "the default parameters."
    ]
   },
   {
@ -65,20 +84,102 @@
|
|||||||
"execution_count": 3,
|
"execution_count": 3,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from docling.chunking import HybridChunker\n",
|
||||||
|
"\n",
|
||||||
|
"chunker = HybridChunker()\n",
|
||||||
|
"chunk_iter = chunker.chunk(dl_doc=doc)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Note that the text you would typically want to embed is the context-enriched one as\n",
|
||||||
|
"returned by the `serialize()` method:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"=== 0 ===\n",
|
||||||
|
"chunk.text:\n",
|
||||||
|
"'International Business Machines Corporation (using the trademark IBM), nicknamed Big Blue, is an American multinational technology company headquartered in Armonk, New York and present in over 175 countries.\\nIt is a publicly traded company and one of the 30 companies in the Dow Jones Industrial Aver…'\n",
|
||||||
|
"chunker.serialize(chunk):\n",
|
||||||
|
"'IBM\\nInternational Business Machines Corporation (using the trademark IBM), nicknamed Big Blue, is an American multinational technology company headquartered in Armonk, New York and present in over 175 countries.\\nIt is a publicly traded company and one of the 30 companies in the Dow Jones Industrial …'\n",
|
||||||
|
"\n",
|
||||||
|
"=== 1 ===\n",
|
||||||
|
"chunk.text:\n",
|
||||||
|
"'IBM originated with several technological innovations developed and commercialized in the late 19th century. Julius E. Pitrap patented the computing scale in 1885;[17] Alexander Dey invented the dial recorder (1888);[18] Herman Hollerith patented the Electric Tabulating Machine (1889);[19] and Willa…'\n",
|
||||||
|
"chunker.serialize(chunk):\n",
|
||||||
|
"'IBM\\n1910s–1950s\\nIBM originated with several technological innovations developed and commercialized in the late 19th century. Julius E. Pitrap patented the computing scale in 1885;[17] Alexander Dey invented the dial recorder (1888);[18] Herman Hollerith patented the Electric Tabulating Machine (1889…'\n",
|
||||||
|
"\n",
|
||||||
|
"=== 2 ===\n",
|
||||||
|
"chunk.text:\n",
|
||||||
|
"'Collectively, the companies manufactured a wide array of machinery for sale and lease, ranging from commercial scales and industrial time recorders, meat and cheese slicers, to tabulators and punched cards. Thomas J. Watson, Sr., fired from the National Cash Register Company by John Henry Patterson,…'\n",
|
||||||
|
"chunker.serialize(chunk):\n",
|
||||||
|
"'IBM\\n1910s–1950s\\nCollectively, the companies manufactured a wide array of machinery for sale and lease, ranging from commercial scales and industrial time recorders, meat and cheese slicers, to tabulators and punched cards. Thomas J. Watson, Sr., fired from the National Cash Register Company by John …'\n",
|
||||||
|
"\n",
|
||||||
|
"=== 3 ===\n",
|
||||||
|
"chunk.text:\n",
|
||||||
|
"'In 1961, IBM developed the SABRE reservation system for American Airlines and introduced the highly successful Selectric typewriter.…'\n",
|
||||||
|
"chunker.serialize(chunk):\n",
|
||||||
|
"'IBM\\n1960s–1980s\\nIn 1961, IBM developed the SABRE reservation system for American Airlines and introduced the highly successful Selectric typewriter.…'\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"for i, chunk in enumerate(chunk_iter):\n",
|
||||||
|
" print(f\"=== {i} ===\")\n",
|
||||||
|
" print(f\"chunk.text:\\n{repr(f'{chunk.text[:300]}…')}\")\n",
|
||||||
|
"\n",
|
||||||
|
" enriched_text = chunker.serialize(chunk=chunk)\n",
|
||||||
|
" print(f\"chunker.serialize(chunk):\\n{repr(f'{enriched_text[:300]}…')}\")\n",
|
||||||
|
"\n",
|
||||||
|
" print()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Advanced usage\n",
|
||||||
|
"\n",
|
||||||
|
"For more control on the chunking, we can parametrize through the `HybridChunker`\n",
|
||||||
|
"arguments illustrated below.\n",
|
||||||
|
"\n",
|
||||||
|
"Notice how `tokenizer` and `embed_model` further below are single-sourced from\n",
|
||||||
|
"`EMBED_MODEL_ID`.\n",
|
||||||
|
"This is important for making sure the chunker and the embedding model are using the same\n",
|
||||||
|
"tokenizer."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from transformers import AutoTokenizer\n",
|
"from transformers import AutoTokenizer\n",
|
||||||
"\n",
|
"\n",
|
||||||
"from docling.chunking import HybridChunker\n",
|
"from docling.chunking import HybridChunker\n",
|
||||||
"\n",
|
"\n",
|
||||||
"EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
|
"EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
|
||||||
"MAX_TOKENS = 64\n",
|
"MAX_TOKENS = 64 # set to a small number for illustrative purposes\n",
|
||||||
"\n",
|
"\n",
|
||||||
"tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_ID)\n",
|
"tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_ID)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"chunker = HybridChunker(\n",
|
"chunker = HybridChunker(\n",
|
||||||
" tokenizer=tokenizer, # can also just pass model name instead of tokenizer instance\n",
|
" tokenizer=tokenizer, # instance or model name, defaults to \"sentence-transformers/all-MiniLM-L6-v2\"\n",
|
||||||
" max_tokens=MAX_TOKENS, # optional, by default derived from `tokenizer`\n",
|
" max_tokens=MAX_TOKENS, # optional, by default derived from `tokenizer`\n",
|
||||||
" # merge_peers=True, # optional, defaults to True\n",
|
" merge_peers=True, # optional, defaults to True\n",
|
||||||
")\n",
|
")\n",
|
||||||
"chunk_iter = chunker.chunk(dl_doc=doc)\n",
|
"chunk_iter = chunker.chunk(dl_doc=doc)\n",
|
||||||
"chunks = list(chunk_iter)"
|
"chunks = list(chunk_iter)"
|
||||||
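As a minimal sketch (assuming the `chunker`, `tokenizer`, `MAX_TOKENS`, and `chunks` objects from the cell above), one can re-tokenize each serialized chunk with the same tokenizer to see the token counts the chunker is working against:

# Illustrative only; assumes `chunker`, `tokenizer`, `MAX_TOKENS`, and `chunks` from the cell above.
for i, chunk in enumerate(chunks):
    enriched_text = chunker.serialize(chunk=chunk)
    n_tokens = len(tokenizer.tokenize(enriched_text))
    # Most chunks should come in at or under MAX_TOKENS; the metadata-enriched form may occasionally exceed it.
    print(f"chunk {i}: {n_tokens} tokens (budget {MAX_TOKENS})")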
@ -88,7 +189,7 @@
|
|||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"Points to notice:\n",
|
"Points to notice looking at the output chunks below:\n",
|
||||||
"- Where possible, we fit the limit of 64 tokens for the metadata-enriched serialization form (see chunk 2)\n",
|
"- Where possible, we fit the limit of 64 tokens for the metadata-enriched serialization form (see chunk 2)\n",
|
||||||
"- Where neeeded, we stop before the limit, e.g. see cases of 63 as it would otherwise run into a comma (see chunk 6)\n",
|
"- Where neeeded, we stop before the limit, e.g. see cases of 63 as it would otherwise run into a comma (see chunk 6)\n",
|
||||||
"- Where possible, we merge undersized peer chunks (see chunk 0)\n",
|
"- Where possible, we merge undersized peer chunks (see chunk 0)\n",
|
||||||
@ -97,7 +198,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 4,
|
"execution_count": 6,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@ -245,174 +346,6 @@
|
|||||||
"\n",
|
"\n",
|
||||||
" print()"
|
" print()"
|
||||||
]
|
]
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"## Vector Retrieval"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 5,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stderr",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
|
|
||||||
"To disable this warning, you can either:\n",
|
|
||||||
"\t- Avoid using `tokenizers` before the fork if possible\n",
|
|
||||||
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"from sentence_transformers import SentenceTransformer\n",
|
|
||||||
"\n",
|
|
||||||
"embed_model = SentenceTransformer(EMBED_MODEL_ID)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 6,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/html": [
|
|
||||||
"<div>\n",
|
|
||||||
"<style scoped>\n",
|
|
||||||
" .dataframe tbody tr th:only-of-type {\n",
|
|
||||||
" vertical-align: middle;\n",
|
|
||||||
" }\n",
|
|
||||||
"\n",
|
|
||||||
" .dataframe tbody tr th {\n",
|
|
||||||
" vertical-align: top;\n",
|
|
||||||
" }\n",
|
|
||||||
"\n",
|
|
||||||
" .dataframe thead th {\n",
|
|
||||||
" text-align: right;\n",
|
|
||||||
" }\n",
|
|
||||||
"</style>\n",
|
|
||||||
"<table border=\"1\" class=\"dataframe\">\n",
|
|
||||||
" <thead>\n",
|
|
||||||
" <tr style=\"text-align: right;\">\n",
|
|
||||||
" <th></th>\n",
|
|
||||||
" <th>vector</th>\n",
|
|
||||||
" <th>text</th>\n",
|
|
||||||
" <th>headings</th>\n",
|
|
||||||
" <th>captions</th>\n",
|
|
||||||
" <th>_distance</th>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" </thead>\n",
|
|
||||||
" <tbody>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>0</th>\n",
|
|
||||||
" <td>[-0.1269039, -0.01948185, -0.07718097, -0.1116...</td>\n",
|
|
||||||
" <td>language, and the UPC barcode. The company has...</td>\n",
|
|
||||||
" <td>[IBM]</td>\n",
|
|
||||||
" <td>None</td>\n",
|
|
||||||
" <td>1.164613</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>1</th>\n",
|
|
||||||
" <td>[-0.10198064, 0.0055981805, -0.05095279, -0.13...</td>\n",
|
|
||||||
" <td>IBM originated with several technological inno...</td>\n",
|
|
||||||
" <td>[IBM, 1910s–1950s]</td>\n",
|
|
||||||
" <td>None</td>\n",
|
|
||||||
" <td>1.245144</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>2</th>\n",
|
|
||||||
" <td>[-0.057121325, -0.034115084, -0.018113216, -0....</td>\n",
|
|
||||||
" <td>As one of the world's oldest and largest techn...</td>\n",
|
|
||||||
" <td>[IBM]</td>\n",
|
|
||||||
" <td>None</td>\n",
|
|
||||||
" <td>1.355586</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>3</th>\n",
|
|
||||||
" <td>[-0.04429054, -0.058111433, -0.009330196, -0.0...</td>\n",
|
|
||||||
" <td>IBM is the largest industrial research organiz...</td>\n",
|
|
||||||
" <td>[IBM]</td>\n",
|
|
||||||
" <td>None</td>\n",
|
|
||||||
" <td>1.398617</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>4</th>\n",
|
|
||||||
" <td>[-0.11920792, 0.053496413, -0.042391937, -0.03...</td>\n",
|
|
||||||
" <td>Awards.[16]</td>\n",
|
|
||||||
" <td>[IBM]</td>\n",
|
|
||||||
" <td>None</td>\n",
|
|
||||||
" <td>1.446295</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" </tbody>\n",
|
|
||||||
"</table>\n",
|
|
||||||
"</div>"
|
|
||||||
],
|
|
||||||
"text/plain": [
|
|
||||||
" vector \\\n",
|
|
||||||
"0 [-0.1269039, -0.01948185, -0.07718097, -0.1116... \n",
|
|
||||||
"1 [-0.10198064, 0.0055981805, -0.05095279, -0.13... \n",
|
|
||||||
"2 [-0.057121325, -0.034115084, -0.018113216, -0.... \n",
|
|
||||||
"3 [-0.04429054, -0.058111433, -0.009330196, -0.0... \n",
|
|
||||||
"4 [-0.11920792, 0.053496413, -0.042391937, -0.03... \n",
|
|
||||||
"\n",
|
|
||||||
" text headings \\\n",
|
|
||||||
"0 language, and the UPC barcode. The company has... [IBM] \n",
|
|
||||||
"1 IBM originated with several technological inno... [IBM, 1910s–1950s] \n",
|
|
||||||
"2 As one of the world's oldest and largest techn... [IBM] \n",
|
|
||||||
"3 IBM is the largest industrial research organiz... [IBM] \n",
|
|
||||||
"4 Awards.[16] [IBM] \n",
|
|
||||||
"\n",
|
|
||||||
" captions _distance \n",
|
|
||||||
"0 None 1.164613 \n",
|
|
||||||
"1 None 1.245144 \n",
|
|
||||||
"2 None 1.355586 \n",
|
|
||||||
"3 None 1.398617 \n",
|
|
||||||
"4 None 1.446295 "
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 6,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"from pathlib import Path\n",
|
|
||||||
"from tempfile import mkdtemp\n",
|
|
||||||
"\n",
|
|
||||||
"import lancedb\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"def make_lancedb_index(db_uri, index_name, chunks, embedding_model):\n",
|
|
||||||
" db = lancedb.connect(db_uri)\n",
|
|
||||||
" data = []\n",
|
|
||||||
" for chunk in chunks:\n",
|
|
||||||
" embeddings = embedding_model.encode(chunker.serialize(chunk=chunk))\n",
|
|
||||||
" data_item = {\n",
|
|
||||||
" \"vector\": embeddings,\n",
|
|
||||||
" \"text\": chunk.text,\n",
|
|
||||||
" \"headings\": chunk.meta.headings,\n",
|
|
||||||
" \"captions\": chunk.meta.captions,\n",
|
|
||||||
" }\n",
|
|
||||||
" data.append(data_item)\n",
|
|
||||||
" tbl = db.create_table(index_name, data=data, exist_ok=True)\n",
|
|
||||||
" return tbl\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"db_uri = str(Path(mkdtemp()) / \"docling.db\")\n",
|
|
||||||
"index = make_lancedb_index(db_uri, doc.name, chunks, embed_model)\n",
|
|
||||||
"\n",
|
|
||||||
"sample_query = \"invent\"\n",
|
|
||||||
"sample_embedding = embed_model.encode(sample_query)\n",
|
|
||||||
"results = index.search(sample_embedding).limit(5)\n",
|
|
||||||
"\n",
|
|
||||||
"results.to_pandas()"
|
|
||||||
]
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
@ -14,6 +14,17 @@
|
|||||||
"# RAG with Haystack"
|
"# RAG with Haystack"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"| Step | Tech | Execution | \n",
|
||||||
|
"| --- | --- | --- |\n",
|
||||||
|
"| Embedding | Hugging Face / Sentence Transformers | 💻 Local |\n",
|
||||||
|
"| Vector store | Milvus | 💻 Local |\n",
|
||||||
|
"| Gen AI | Hugging Face Inference API | 🌐 Remote | "
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
@ -26,7 +37,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"This example leverages the\n",
|
"This example leverages the\n",
|
||||||
"[Haystack Docling extension](https://github.com/DS4SD/docling-haystack), along with\n",
|
"[Haystack Docling extension](../../integrations/haystack/), along with\n",
|
||||||
"Milvus-based document store and retriever instances, as well as sentence-transformers\n",
|
"Milvus-based document store and retriever instances, as well as sentence-transformers\n",
|
||||||
"embeddings.\n",
|
"embeddings.\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -90,6 +101,7 @@
|
|||||||
"from docling_haystack.converter import ExportType\n",
|
"from docling_haystack.converter import ExportType\n",
|
||||||
"from dotenv import load_dotenv\n",
|
"from dotenv import load_dotenv\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
"\n",
|
||||||
"def _get_env_from_colab_or_os(key):\n",
|
"def _get_env_from_colab_or_os(key):\n",
|
||||||
" try:\n",
|
" try:\n",
|
||||||
" from google.colab import userdata\n",
|
" from google.colab import userdata\n",
|
||||||
@ -102,6 +114,7 @@
|
|||||||
" pass\n",
|
" pass\n",
|
||||||
" return os.getenv(key)\n",
|
" return os.getenv(key)\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
"\n",
|
||||||
"load_dotenv()\n",
|
"load_dotenv()\n",
|
||||||
"HF_TOKEN = _get_env_from_colab_or_os(\"HF_TOKEN\")\n",
|
"HF_TOKEN = _get_env_from_colab_or_os(\"HF_TOKEN\")\n",
|
||||||
"PATHS = [\"https://arxiv.org/pdf/2408.09869\"] # Docling Technical Report\n",
|
"PATHS = [\"https://arxiv.org/pdf/2408.09869\"] # Docling Technical Report\n",
|
||||||
|
@ -4,7 +4,63 @@
|
|||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"# RAG with LangChain 🦜🔗"
|
"<a href=\"https://colab.research.google.com/github/DS4SD/docling/blob/main/docs/examples/rag_langchain.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# RAG with LangChain"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"| Step | Tech | Execution | \n",
|
||||||
|
"| --- | --- | --- |\n",
|
||||||
|
"| Embedding | Hugging Face / Sentence Transformers | 💻 Local |\n",
|
||||||
|
"| Vector store | Milvus | 💻 Local |\n",
|
||||||
|
"| Gen AI | Hugging Face Inference API | 🌐 Remote | "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"This example leverages the\n",
|
||||||
|
"[LangChain Docling integration](../../integrations/langchain/), along with a Milvus\n",
|
||||||
|
"vector store, as well as sentence-transformers embeddings.\n",
|
||||||
|
"\n",
|
||||||
|
"The presented `DoclingLoader` component enables you to:\n",
|
||||||
|
"- use various document types in your LLM applications with ease and speed, and\n",
|
||||||
|
"- leverage Docling's rich format for advanced, document-native grounding.\n",
|
||||||
|
"\n",
|
||||||
|
"`DoclingLoader` supports two different export modes:\n",
|
||||||
|
"- `ExportType.MARKDOWN`: if you want to capture each input document as a separate\n",
|
||||||
|
" LangChain document, or\n",
|
||||||
|
"- `ExportType.DOC_CHUNKS` (default): if you want to have each input document chunked and\n",
|
||||||
|
" to then capture each individual chunk as a separate LangChain document downstream.\n",
|
||||||
|
"\n",
|
||||||
|
"The example allows exploring both modes via parameter `EXPORT_TYPE`; depending on the\n",
|
||||||
|
"value set, the example pipeline is then set up accordingly."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
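As a minimal sketch of the two export modes (assuming `langchain-docling` is installed and using the Docling Technical Report URL that appears further below), one could compare them directly:

from langchain_docling import DoclingLoader
from langchain_docling.loader import ExportType

FILE_PATH = "https://arxiv.org/pdf/2408.09869"  # Docling Technical Report

# One LangChain document per chunk (the default):
chunk_docs = DoclingLoader(file_path=FILE_PATH, export_type=ExportType.DOC_CHUNKS).load()

# One Markdown LangChain document per input file:
md_docs = DoclingLoader(file_path=FILE_PATH, export_type=ExportType.MARKDOWN).load()

print(len(chunk_docs), len(md_docs))  # many chunk documents vs. one Markdown document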
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Setup"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"- 👉 For best conversion speed, use GPU acceleration whenever available; e.g. if running on Colab, use GPU-enabled runtime.\n",
|
||||||
|
"- Notebook uses HuggingFace's Inference API; for increased LLM quota, token can be provided via env var `HF_TOKEN`.\n",
|
||||||
|
"- Requirements can be installed as shown below (`--no-warn-conflicts` meant for Colab's pre-populated Python env; feel free to remove for stricter usage):"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -21,81 +77,105 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"# requirements for this example:\n",
|
"%pip install -q --progress-bar off --no-warn-conflicts langchain-docling langchain-core langchain-huggingface langchain_milvus langchain python-dotenv"
|
||||||
"%pip install -qq docling docling-core python-dotenv langchain-text-splitters langchain-huggingface langchain-milvus"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": 2,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"True"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 2,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"import os\n",
|
"import os\n",
|
||||||
|
"from pathlib import Path\n",
|
||||||
|
"from tempfile import mkdtemp\n",
|
||||||
"\n",
|
"\n",
|
||||||
"from dotenv import load_dotenv\n",
|
"from dotenv import load_dotenv\n",
|
||||||
|
"from langchain_core.prompts import PromptTemplate\n",
|
||||||
|
"from langchain_docling.loader import ExportType\n",
|
||||||
"\n",
|
"\n",
|
||||||
"load_dotenv()"
|
"\n",
|
||||||
|
"def _get_env_from_colab_or_os(key):\n",
|
||||||
|
" try:\n",
|
||||||
|
" from google.colab import userdata\n",
|
||||||
|
"\n",
|
||||||
|
" try:\n",
|
||||||
|
" return userdata.get(key)\n",
|
||||||
|
" except userdata.SecretNotFoundError:\n",
|
||||||
|
" pass\n",
|
||||||
|
" except ImportError:\n",
|
||||||
|
" pass\n",
|
||||||
|
" return os.getenv(key)\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"load_dotenv()\n",
|
||||||
|
"\n",
|
||||||
|
"# https://github.com/huggingface/transformers/issues/5486:\n",
|
||||||
|
"os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
|
||||||
|
"\n",
|
||||||
|
"HF_TOKEN = _get_env_from_colab_or_os(\"HF_TOKEN\")\n",
|
||||||
|
"FILE_PATH = [\"https://arxiv.org/pdf/2408.09869\"] # Docling Technical Report\n",
|
||||||
|
"EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
|
||||||
|
"GEN_MODEL_ID = \"mistralai/Mixtral-8x7B-Instruct-v0.1\"\n",
|
||||||
|
"EXPORT_TYPE = ExportType.DOC_CHUNKS\n",
|
||||||
|
"QUESTION = \"Which are the main AI models in Docling?\"\n",
|
||||||
|
"PROMPT = PromptTemplate.from_template(\n",
|
||||||
|
" \"Context information is below.\\n---------------------\\n{context}\\n---------------------\\nGiven the context information and not prior knowledge, answer the query.\\nQuery: {input}\\nAnswer:\\n\",\n",
|
||||||
|
")\n",
|
||||||
|
"TOP_K = 3\n",
|
||||||
|
"MILVUS_URI = str(Path(mkdtemp()) / \"docling.db\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"## Setup"
|
"## Document loading\n",
|
||||||
]
|
"\n",
|
||||||
},
|
"Now we can instantiate our loader and load documents."
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"### Loader and splitter"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"Below we set up:\n",
|
|
||||||
"- a `Loader` which will be used to create LangChain documents, and\n",
|
|
||||||
"- a splitter, which will be used to split these documents"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 3,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Token indices sequence length is longer than the specified maximum sequence length for this model (1041 > 512). Running this sequence through the model will result in indexing errors\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"from typing import Iterator\n",
|
"from langchain_docling import DoclingLoader\n",
|
||||||
"\n",
|
"\n",
|
||||||
"from langchain_core.document_loaders import BaseLoader\n",
|
"from docling.chunking import HybridChunker\n",
|
||||||
"from langchain_core.documents import Document as LCDocument\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"from docling.document_converter import DocumentConverter\n",
|
"loader = DoclingLoader(\n",
|
||||||
|
" file_path=FILE_PATH,\n",
|
||||||
|
" export_type=EXPORT_TYPE,\n",
|
||||||
|
" chunker=HybridChunker(tokenizer=EMBED_MODEL_ID),\n",
|
||||||
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"class DoclingPDFLoader(BaseLoader):\n",
|
"docs = loader.load()"
|
||||||
"\n",
|
]
|
||||||
" def __init__(self, file_path: str | list[str]) -> None:\n",
|
},
|
||||||
" self._file_paths = file_path if isinstance(file_path, list) else [file_path]\n",
|
{
|
||||||
" self._converter = DocumentConverter()\n",
|
"cell_type": "markdown",
|
||||||
"\n",
|
"metadata": {},
|
||||||
" def lazy_load(self) -> Iterator[LCDocument]:\n",
|
"source": [
|
||||||
" for source in self._file_paths:\n",
|
"> Note: a message saying `\"Token indices sequence length is longer than the specified\n",
|
||||||
" dl_doc = self._converter.convert(source).document\n",
|
"maximum sequence length...\"` can be ignored in this case — details\n",
|
||||||
" text = dl_doc.export_to_markdown()\n",
|
"[here](https://github.com/DS4SD/docling-core/issues/119#issuecomment-2577418826)."
|
||||||
" yield LCDocument(page_content=text)"
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Determining the splits:"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -104,29 +184,57 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"FILE_PATH = \"https://raw.githubusercontent.com/DS4SD/docling/main/tests/data/2206.01062.pdf\" # DocLayNet paper"
|
"if EXPORT_TYPE == ExportType.DOC_CHUNKS:\n",
|
||||||
]
|
" splits = docs\n",
|
||||||
},
|
"elif EXPORT_TYPE == ExportType.MARKDOWN:\n",
|
||||||
{
|
" from langchain_text_splitters import MarkdownHeaderTextSplitter\n",
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 5,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"loader = DoclingPDFLoader(file_path=FILE_PATH)\n",
|
" splitter = MarkdownHeaderTextSplitter(\n",
|
||||||
"text_splitter = RecursiveCharacterTextSplitter(\n",
|
" headers_to_split_on=[\n",
|
||||||
" chunk_size=1000,\n",
|
" (\"#\", \"Header_1\"),\n",
|
||||||
" chunk_overlap=200,\n",
|
" (\"##\", \"Header_2\"),\n",
|
||||||
")"
|
" (\"###\", \"Header_3\"),\n",
|
||||||
|
" ],\n",
|
||||||
|
" )\n",
|
||||||
|
" splits = [split for doc in docs for split in splitter.split_text(doc.page_content)]\n",
|
||||||
|
"else:\n",
|
||||||
|
" raise ValueError(f\"Unexpected export type: {EXPORT_TYPE}\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"We now used the above-defined objects to get the document splits:"
|
"Inspecting some sample splits:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"- d.page_content='arXiv:2408.09869v5 [cs.CL] 9 Dec 2024'\n",
|
||||||
|
"- d.page_content='Docling Technical Report\\nVersion 1.0\\nChristoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar\\nAI4K Group, IBM Research R¨uschlikon, Switzerland'\n",
|
||||||
|
"- d.page_content='Abstract\\nThis technical report introduces Docling , an easy to use, self-contained, MITlicensed open-source package for PDF document conversion. It is powered by state-of-the-art specialized AI models for layout analysis (DocLayNet) and table structure recognition (TableFormer), and runs efficiently on commodity hardware in a small resource budget. The code interface allows for easy extensibility and addition of new features and models.'\n",
|
||||||
|
"...\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"for d in splits[:3]:\n",
|
||||||
|
" print(f\"- {d.page_content=}\")\n",
|
||||||
|
"print(\"...\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Ingestion"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -135,93 +243,27 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"docs = loader.load()\n",
|
"import json\n",
|
||||||
"splits = text_splitter.split_documents(docs)"
|
"from pathlib import Path\n",
|
||||||
]
|
"from tempfile import mkdtemp\n",
|
||||||
},
|
"\n",
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"### Embeddings"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 7,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from langchain_huggingface.embeddings import HuggingFaceEmbeddings\n",
|
"from langchain_huggingface.embeddings import HuggingFaceEmbeddings\n",
|
||||||
"\n",
|
|
||||||
"HF_EMBED_MODEL_ID = \"BAAI/bge-small-en-v1.5\"\n",
|
|
||||||
"embeddings = HuggingFaceEmbeddings(model_name=HF_EMBED_MODEL_ID)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"### Vector store"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 8,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from tempfile import TemporaryDirectory\n",
|
|
||||||
"\n",
|
|
||||||
"from langchain_milvus import Milvus\n",
|
"from langchain_milvus import Milvus\n",
|
||||||
"\n",
|
"\n",
|
||||||
"MILVUS_URI = os.environ.get(\n",
|
"embedding = HuggingFaceEmbeddings(model_name=EMBED_MODEL_ID)\n",
|
||||||
" \"MILVUS_URI\", f\"{(tmp_dir := TemporaryDirectory()).name}/milvus_demo.db\"\n",
|
|
||||||
")\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"milvus_uri = str(Path(mkdtemp()) / \"docling.db\") # or set as needed\n",
|
||||||
"vectorstore = Milvus.from_documents(\n",
|
"vectorstore = Milvus.from_documents(\n",
|
||||||
" splits,\n",
|
" documents=splits,\n",
|
||||||
" embeddings,\n",
|
" embedding=embedding,\n",
|
||||||
" connection_args={\"uri\": MILVUS_URI},\n",
|
" collection_name=\"docling_demo\",\n",
|
||||||
|
" connection_args={\"uri\": milvus_uri},\n",
|
||||||
|
" index_params={\"index_type\": \"FLAT\"},\n",
|
||||||
" drop_old=True,\n",
|
" drop_old=True,\n",
|
||||||
")"
|
")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"### LLM"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 9,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\n",
|
|
||||||
"Token is valid (permission: write).\n",
|
|
||||||
"Your token has been saved to /Users/pva/.cache/huggingface/token\n",
|
|
||||||
"Login successful\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"from langchain_huggingface import HuggingFaceEndpoint\n",
|
|
||||||
"\n",
|
|
||||||
"HF_API_KEY = os.environ.get(\"HF_API_KEY\")\n",
|
|
||||||
"HF_LLM_MODEL_ID = \"mistralai/Mistral-7B-Instruct-v0.3\"\n",
|
|
||||||
"\n",
|
|
||||||
"llm = HuggingFaceEndpoint(\n",
|
|
||||||
" repo_id=HF_LLM_MODEL_ID,\n",
|
|
||||||
" huggingfacehub_api_token=HF_API_KEY,\n",
|
|
||||||
")"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
@ -231,55 +273,89 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 10,
|
"execution_count": 7,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"from typing import Iterable\n",
|
"from langchain.chains import create_retrieval_chain\n",
|
||||||
|
"from langchain.chains.combine_documents import create_stuff_documents_chain\n",
|
||||||
|
"from langchain_huggingface import HuggingFaceEndpoint\n",
|
||||||
"\n",
|
"\n",
|
||||||
"from langchain_core.documents import Document as LCDocument\n",
|
"retriever = vectorstore.as_retriever(search_kwargs={\"k\": TOP_K})\n",
|
||||||
"from langchain_core.output_parsers import StrOutputParser\n",
|
"llm = HuggingFaceEndpoint(\n",
|
||||||
"from langchain_core.prompts import PromptTemplate\n",
|
" repo_id=GEN_MODEL_ID,\n",
|
||||||
"from langchain_core.runnables import RunnablePassthrough\n",
|
" huggingfacehub_api_token=HF_TOKEN,\n",
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"def format_docs(docs: Iterable[LCDocument]):\n",
|
|
||||||
" return \"\\n\\n\".join(doc.page_content for doc in docs)\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"retriever = vectorstore.as_retriever()\n",
|
|
||||||
"\n",
|
|
||||||
"prompt = PromptTemplate.from_template(\n",
|
|
||||||
" \"Context information is below.\\n---------------------\\n{context}\\n---------------------\\nGiven the context information and not prior knowledge, answer the query.\\nQuery: {question}\\nAnswer:\\n\"\n",
|
|
||||||
")\n",
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"rag_chain = (\n",
|
"\n",
|
||||||
" {\"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n",
|
"def clip_text(text, threshold=100):\n",
|
||||||
" | prompt\n",
|
" return f\"{text[:threshold]}...\" if len(text) > threshold else text"
|
||||||
" | llm\n",
|
|
||||||
" | StrOutputParser()\n",
|
|
||||||
")"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 11,
|
"execution_count": 8,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"data": {
|
"name": "stdout",
|
||||||
"text/plain": [
|
"output_type": "stream",
|
||||||
"'- 80,863 pages were human annotated for DocLayNet.'"
|
"text": [
|
||||||
|
"Question:\n",
|
||||||
|
"Which are the main AI models in Docling?\n",
|
||||||
|
"\n",
|
||||||
|
"Answer:\n",
|
||||||
|
"Docling initially releases two AI models, a layout analysis model and TableFormer. The layout analysis model is an accurate object-detector for page elements, and TableFormer is a state-of-the-art tab...\n",
|
||||||
|
"\n",
|
||||||
|
"Source 1:\n",
|
||||||
|
" text: \"3.2 AI models\\nAs part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure re...\"\n",
|
||||||
|
" dl_meta: {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/50', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 3, 'bbox': {'l': 108.0, 't': 405.1419982910156, 'r': 504.00299072265625, 'b': 330.7799987792969, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 608]}]}], 'headings': ['3.2 AI models'], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 11465328351749295394, 'filename': '2408.09869v5.pdf'}}\n",
|
||||||
|
" source: https://arxiv.org/pdf/2408.09869\n",
|
||||||
|
"\n",
|
||||||
|
"Source 2:\n",
|
||||||
|
" text: \"3 Processing pipeline\\nDocling implements a linear pipeline of operations, which execute sequentially on each given document (see Fig. 1). Each document is first parsed by a PDF backend, which retrieves the programmatic text tokens, consisting of string content and its coordinates on the page, and also renders a bitmap image of each page to support ...\"\n",
|
||||||
|
" dl_meta: {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/26', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 2, 'bbox': {'l': 108.0, 't': 273.01800537109375, 'r': 504.00299072265625, 'b': 176.83799743652344, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 796]}]}], 'headings': ['3 Processing pipeline'], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 11465328351749295394, 'filename': '2408.09869v5.pdf'}}\n",
|
||||||
|
" source: https://arxiv.org/pdf/2408.09869\n",
|
||||||
|
"\n",
|
||||||
|
"Source 3:\n",
|
||||||
|
" text: \"6 Future work and contributions\\nDocling is designed to allow easy extension of the model library and pipelines. In the future, we plan to extend Docling with several more models, such as a figure-classifier model, an equationrecognition model, a code-recognition model and more. This will help improve the quality of conversion for specific types of ...\"\n",
|
||||||
|
" dl_meta: {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/76', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 5, 'bbox': {'l': 108.0, 't': 322.468994140625, 'r': 504.00299072265625, 'b': 259.0169982910156, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 543]}]}, {'self_ref': '#/texts/77', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 5, 'bbox': {'l': 108.0, 't': 251.6540069580078, 'r': 504.00299072265625, 'b': 198.99200439453125, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 402]}]}], 'headings': ['6 Future work and contributions'], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 11465328351749295394, 'filename': '2408.09869v5.pdf'}}\n",
|
||||||
|
" source: https://arxiv.org/pdf/2408.09869\n"
|
||||||
]
|
]
|
||||||
},
|
|
||||||
"execution_count": 11,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"rag_chain.invoke(\"How many pages were human annotated for DocLayNet?\")"
|
"question_answer_chain = create_stuff_documents_chain(llm, PROMPT)\n",
|
||||||
|
"rag_chain = create_retrieval_chain(retriever, question_answer_chain)\n",
|
||||||
|
"resp_dict = rag_chain.invoke({\"input\": QUESTION})\n",
|
||||||
|
"\n",
|
||||||
|
"clipped_answer = clip_text(resp_dict[\"answer\"], threshold=200)\n",
|
||||||
|
"print(f\"Question:\\n{resp_dict['input']}\\n\\nAnswer:\\n{clipped_answer}\")\n",
|
||||||
|
"for i, doc in enumerate(resp_dict[\"context\"]):\n",
|
||||||
|
" print()\n",
|
||||||
|
" print(f\"Source {i+1}:\")\n",
|
||||||
|
" print(f\" text: {json.dumps(clip_text(doc.page_content, threshold=350))}\")\n",
|
||||||
|
" for key in doc.metadata:\n",
|
||||||
|
" if key != \"pk\":\n",
|
||||||
|
" val = doc.metadata.get(key)\n",
|
||||||
|
" clipped_val = clip_text(val) if isinstance(val, str) else val\n",
|
||||||
|
" print(f\" {key}: {clipped_val}\")"
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
@ -298,7 +374,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.12.4"
|
"version": "3.12.8"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -11,7 +11,18 @@
|
|||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"# RAG with LlamaIndex 🦙"
|
"# RAG with LlamaIndex"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"| Step | Tech | Execution | \n",
|
||||||
|
"| --- | --- | --- |\n",
|
||||||
|
"| Embedding | Hugging Face / Sentence Transformers | 💻 Local |\n",
|
||||||
|
"| Vector store | Milvus | 💻 Local |\n",
|
||||||
|
"| Gen AI | Hugging Face Inference API | 🌐 Remote | "
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -462,7 +473,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.12.4"
|
"version": "3.12.7"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
752
docs/examples/rag_weaviate.ipynb
Normal file
752
docs/examples/rag_weaviate.ipynb
Normal file
@ -0,0 +1,752 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"[](https://colab.research.google.com/github/DS4SD/docling/blob/main/docs/examples/rag_weaviate.ipynb)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"id": "Ag9kcX2B_atc"
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"# RAG with Weaviate"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"\n",
|
||||||
|
"| Step | Tech | Execution | \n",
|
||||||
|
"| --- | --- | --- |\n",
|
||||||
|
"| Embedding | Open AI | 🌐 Remote |\n",
|
||||||
|
"| Vector store | Weavieate | 💻 Local |\n",
|
||||||
|
"| Gen AI | Open AI | 🌐 Remote |\n",
|
||||||
|
"\n",
|
||||||
|
"## A recipe 🧑🍳 🐥 💚\n",
|
||||||
|
"\n",
|
||||||
|
"This is a code recipe that uses [Weaviate](https://weaviate.io/) to perform RAG over PDF documents parsed by [Docling](https://ds4sd.github.io/docling/).\n",
|
||||||
|
"\n",
|
||||||
|
"In this notebook, we accomplish the following:\n",
|
||||||
|
"* Parse the top machine learning papers on [arXiv](https://arxiv.org/) using Docling\n",
|
||||||
|
"* Perform hierarchical chunking of the documents using Docling\n",
|
||||||
|
"* Generate text embeddings with OpenAI\n",
|
||||||
|
"* Perform RAG using [Weaviate](https://weaviate.io/developers/weaviate/search/generative)\n",
|
||||||
|
"\n",
|
||||||
|
"To run this notebook, you'll need:\n",
|
||||||
|
"* An [OpenAI API key](https://platform.openai.com/docs/quickstart)\n",
|
||||||
|
"* Access to GPU/s\n",
|
||||||
|
"\n",
|
||||||
|
"Note: For best results, please use **GPU acceleration** to run this notebook. Here are two options for running this notebook:\n",
|
||||||
|
"1. **Locally on a MacBook with an Apple Silicon chip.** Converting all documents in the notebook takes ~2 minutes on a MacBook M2 due to Docling's usage of MPS accelerators.\n",
|
||||||
|
"2. **Run this notebook on Google Colab.** Converting all documents in the notebook takes ~8 mintutes on a Google Colab T4 GPU."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"id": "4YgT7tpXCUl0"
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"### Install Docling and Weaviate client\n",
|
||||||
|
"\n",
|
||||||
|
"Note: If Colab prompts you to restart the session after running the cell below, click \"restart\" and proceed with running the rest of the notebook."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true,
|
||||||
|
"id": "u076oUSF_YUG"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"%%capture\n",
|
||||||
|
"%pip install docling~=\"2.7.0\"\n",
|
||||||
|
"%pip install -U weaviate-client~=\"4.9.4\"\n",
|
||||||
|
"%pip install rich\n",
|
||||||
|
"%pip install torch\n",
|
||||||
|
"\n",
|
||||||
|
"import warnings\n",
|
||||||
|
"\n",
|
||||||
|
"warnings.filterwarnings(\"ignore\")\n",
|
||||||
|
"\n",
|
||||||
|
"import logging\n",
|
||||||
|
"\n",
|
||||||
|
"# Suppress Weaviate client logs\n",
|
||||||
|
"logging.getLogger(\"weaviate\").setLevel(logging.ERROR)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"id": "2q2F9RUmR8Wj"
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"## 🐥 Part 1: Docling\n",
|
||||||
|
"\n",
|
||||||
|
"Part of what makes Docling so remarkable is the fact that it can run on commodity hardware. This means that this notebook can be run on a local machine with GPU acceleration. If you're using a MacBook with a silicon chip, Docling integrates seamlessly with Metal Performance Shaders (MPS). MPS provides out-of-the-box GPU acceleration for macOS, seamlessly integrating with PyTorch and TensorFlow, offering energy-efficient performance on Apple Silicon, and broad compatibility with all Metal-supported GPUs.\n",
|
||||||
|
"\n",
|
||||||
|
"The code below checks to see if a GPU is available, either via CUDA or MPS."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"MPS GPU is enabled.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import torch\n",
|
||||||
|
"\n",
|
||||||
|
"# Check if GPU or MPS is available\n",
|
||||||
|
"if torch.cuda.is_available():\n",
|
||||||
|
" device = torch.device(\"cuda\")\n",
|
||||||
|
" print(f\"CUDA GPU is enabled: {torch.cuda.get_device_name(0)}\")\n",
|
||||||
|
"elif torch.backends.mps.is_available():\n",
|
||||||
|
" device = torch.device(\"mps\")\n",
|
||||||
|
" print(\"MPS GPU is enabled.\")\n",
|
||||||
|
"else:\n",
|
||||||
|
" raise EnvironmentError(\n",
|
||||||
|
" \"No GPU or MPS device found. Please check your environment and ensure GPU or MPS support is configured.\"\n",
|
||||||
|
" )"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"id": "wHTsy4a8JFPl"
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"Here, we've collected 10 influential machine learning papers published as PDFs on arXiv. Because Docling does not yet have title extraction for PDFs, we manually add the titles in a corresponding list.\n",
|
||||||
|
"\n",
|
||||||
|
"Note: Converting all 10 papers should take around 8 minutes with a T4 GPU."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {
|
||||||
|
"id": "Vy5SMPiGDMy-"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Influential machine learning papers\n",
|
||||||
|
"source_urls = [\n",
|
||||||
|
" \"https://arxiv.org/pdf/1706.03762\",\n",
|
||||||
|
" \"https://arxiv.org/pdf/1810.04805\",\n",
|
||||||
|
" \"https://arxiv.org/pdf/1406.2661\",\n",
|
||||||
|
" \"https://arxiv.org/pdf/1409.0473\",\n",
|
||||||
|
" \"https://arxiv.org/pdf/1412.6980\",\n",
|
||||||
|
" \"https://arxiv.org/pdf/1312.6114\",\n",
|
||||||
|
" \"https://arxiv.org/pdf/1312.5602\",\n",
|
||||||
|
" \"https://arxiv.org/pdf/1512.03385\",\n",
|
||||||
|
" \"https://arxiv.org/pdf/1409.3215\",\n",
|
||||||
|
" \"https://arxiv.org/pdf/1301.3781\",\n",
|
||||||
|
"]\n",
|
||||||
|
"\n",
|
||||||
|
"# And their corresponding titles (because Docling doesn't have title extraction yet!)\n",
|
||||||
|
"source_titles = [\n",
|
||||||
|
" \"Attention Is All You Need\",\n",
|
||||||
|
" \"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding\",\n",
|
||||||
|
" \"Generative Adversarial Nets\",\n",
|
||||||
|
" \"Neural Machine Translation by Jointly Learning to Align and Translate\",\n",
|
||||||
|
" \"Adam: A Method for Stochastic Optimization\",\n",
|
||||||
|
" \"Auto-Encoding Variational Bayes\",\n",
|
||||||
|
" \"Playing Atari with Deep Reinforcement Learning\",\n",
|
||||||
|
" \"Deep Residual Learning for Image Recognition\",\n",
|
||||||
|
" \"Sequence to Sequence Learning with Neural Networks\",\n",
|
||||||
|
" \"A Neural Probabilistic Language Model\",\n",
|
||||||
|
"]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"id": "5fi8wzHrCoLa"
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"### Convert PDFs to Docling documents\n",
|
||||||
|
"\n",
|
||||||
|
"Here we use Docling's `.convert_all()` to parse a batch of PDFs. The result is a list of Docling documents that we can use for text extraction.\n",
|
||||||
|
"\n",
|
||||||
|
"Note: Please ignore the `ERR#` message."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {
|
||||||
|
"colab": {
|
||||||
|
"base_uri": "https://localhost:8080/",
|
||||||
|
"height": 67,
|
||||||
|
"referenced_widgets": [
|
||||||
|
"6d049f786a2f4ad7857a6cf2d95b5ba2",
|
||||||
|
"db2a7b9f549e4f0fb1ff3fce655d76a2",
|
||||||
|
"630967a2db4c4714b4c15d1358a0fcae",
|
||||||
|
"b3da9595ab7c4995a00e506e7b5202e3",
|
||||||
|
"243ecaf36ee24cafbd1c33d148f2ca78",
|
||||||
|
"5b7e22df1b464ca894126736e6f72207",
|
||||||
|
"02f6af5993bb4a6a9dbca77952f675d2",
|
||||||
|
"dea323b3de0e43118f338842c94ac065",
|
||||||
|
"bd198d2c0c4c4933a6e6544908d0d846",
|
||||||
|
"febd5c498e4f4f5dbde8dec3cd935502",
|
||||||
|
"ab4f282c0d37451092c60e6566e8e945"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"id": "Sr44xGR1PNSc",
|
||||||
|
"outputId": "b5cca9ee-d7c0-4c8f-c18a-0ac4787984e9"
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 84072.91it/s]\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"ERR#: COULD NOT CONVERT TO RS THIS TABLE TO COMPUTE SPANS\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from docling.datamodel.document import ConversionResult\n",
|
||||||
|
"from docling.document_converter import DocumentConverter\n",
|
||||||
|
"\n",
|
||||||
|
"# Instantiate the doc converter\n",
|
||||||
|
"doc_converter = DocumentConverter()\n",
|
||||||
|
"\n",
|
||||||
|
"# Directly pass list of files or streams to `convert_all`\n",
|
||||||
|
"conv_results_iter = doc_converter.convert_all(source_urls) # previously `convert`\n",
|
||||||
|
"\n",
|
||||||
|
"# Iterate over the generator to get a list of Docling documents\n",
|
||||||
|
"docs = [result.document for result in conv_results_iter]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"id": "xHun_P-OCtKd"
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"### Post-process extracted document data\n",
|
||||||
|
"#### Perform hierarchical chunking on documents\n",
|
||||||
|
"\n",
|
||||||
|
"We use Docling's `HierarchicalChunker()` to perform hierarchy-aware chunking of our list of documents. This is meant to preserve some of the structure and relationships within the document, which enables more accurate and relevant retrieval in our RAG pipeline."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {
|
||||||
|
"id": "L17ju9xibuIo"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from docling_core.transforms.chunker import HierarchicalChunker\n",
|
||||||
|
"\n",
|
||||||
|
"# Initialize lists for text, and titles\n",
|
||||||
|
"texts, titles = [], []\n",
|
||||||
|
"\n",
|
||||||
|
"chunker = HierarchicalChunker()\n",
|
||||||
|
"\n",
|
||||||
|
"# Process each document in the list\n",
|
||||||
|
"for doc, title in zip(docs, source_titles): # Pair each document with its title\n",
|
||||||
|
" chunks = list(\n",
|
||||||
|
" chunker.chunk(doc)\n",
|
||||||
|
" ) # Perform hierarchical chunking and get text from chunks\n",
|
||||||
|
" for chunk in chunks:\n",
|
||||||
|
" texts.append(chunk.text)\n",
|
||||||
|
" titles.append(title)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"id": "khbU9R1li2Kj"
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"Because we're splitting the documents into chunks, we'll concatenate the article title to the beginning of each chunk for additional context."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {
|
||||||
|
"id": "HNwYV9P57OwF"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Concatenate title and text\n",
|
||||||
|
"for i in range(len(texts)):\n",
|
||||||
|
" texts[i] = f\"{titles[i]} {texts[i]}\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"id": "uhLlCpQODaT3"
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"## 💚 Part 2: Weaviate\n",
|
||||||
|
"### Create and configure an embedded Weaviate collection"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"id": "ho7xYQTZK5Wk"
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"We'll be using the OpenAI API for both generating the text embeddings and for the generative model in our RAG pipeline. The code below dynamically fetches your API key based on whether you're running this notebook in Google Colab and running it as a regular Jupyter notebook. All you need to do is replace `openai_api_key_var` with the name of your environmental variable name or Colab secret name for the API key.\n",
|
||||||
|
"\n",
|
||||||
|
"If you're running this notebook in Google Colab, make sure you [add](https://medium.com/@parthdasawant/how-to-use-secrets-in-google-colab-450c38e3ec75) your API key as a secret."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {
|
||||||
|
"id": "PD53jOT4roj2"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# OpenAI API key variable name\n",
|
||||||
|
"openai_api_key_var = \"OPENAI_API_KEY\" # Replace with the name of your secret/env var\n",
|
||||||
|
"\n",
|
||||||
|
"# Fetch OpenAI API key\n",
|
||||||
|
"try:\n",
|
||||||
|
" # If running in Colab, fetch API key from Secrets\n",
|
||||||
|
" import google.colab\n",
|
||||||
|
" from google.colab import userdata\n",
|
||||||
|
"\n",
|
||||||
|
" openai_api_key = userdata.get(openai_api_key_var)\n",
|
||||||
|
" if not openai_api_key:\n",
|
||||||
|
" raise ValueError(f\"Secret '{openai_api_key_var}' not found in Colab secrets.\")\n",
|
||||||
|
"except ImportError:\n",
|
||||||
|
" # If not running in Colab, fetch API key from environment variable\n",
|
||||||
|
" import os\n",
|
||||||
|
"\n",
|
||||||
|
" openai_api_key = os.getenv(openai_api_key_var)\n",
|
||||||
|
" if not openai_api_key:\n",
|
||||||
|
" raise EnvironmentError(\n",
|
||||||
|
" f\"Environment variable '{openai_api_key_var}' is not set. \"\n",
|
||||||
|
" \"Please define it before running this script.\"\n",
|
||||||
|
" )"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"id": "8G5jZSh6ti3e"
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"[Embedded Weaviate](https://weaviate.io/developers/weaviate/installation/embedded) allows you to spin up a Weaviate instance directly from your application code, without having to use a Docker container. If you're interested in other deployment methods, like using Docker-Compose or Kubernetes, check out this [page](https://weaviate.io/developers/weaviate/installation) in the Weaviate docs."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"colab": {
|
||||||
|
"base_uri": "https://localhost:8080/"
|
||||||
|
},
|
||||||
|
"id": "hFUBEZiJUMic",
|
||||||
|
"outputId": "0b6534c9-66c9-4a47-9754-103bcc030019"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import weaviate\n",
|
||||||
|
"\n",
|
||||||
|
"# Connect to Weaviate embedded\n",
|
||||||
|
"client = weaviate.connect_to_embedded(headers={\"X-OpenAI-Api-Key\": openai_api_key})"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "4nu9qM75hrsd"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import weaviate.classes.config as wc\n",
|
||||||
|
"from weaviate.classes.config import DataType, Property\n",
|
||||||
|
"\n",
|
||||||
|
"# Define the collection name\n",
|
||||||
|
"collection_name = \"docling\"\n",
|
||||||
|
"\n",
|
||||||
|
"# Delete the collection if it already exists\n",
|
||||||
|
"if client.collections.exists(collection_name):\n",
|
||||||
|
" client.collections.delete(collection_name)\n",
|
||||||
|
"\n",
|
||||||
|
"# Create the collection\n",
|
||||||
|
"collection = client.collections.create(\n",
|
||||||
|
" name=collection_name,\n",
|
||||||
|
" vectorizer_config=wc.Configure.Vectorizer.text2vec_openai(\n",
|
||||||
|
" model=\"text-embedding-3-large\", # Specify your embedding model here\n",
|
||||||
|
" ),\n",
|
||||||
|
" # Enable generative model from Cohere\n",
|
||||||
|
" generative_config=wc.Configure.Generative.openai(\n",
|
||||||
|
" model=\"gpt-4o\" # Specify your generative model for RAG here\n",
|
||||||
|
" ),\n",
|
||||||
|
" # Define properties of metadata\n",
|
||||||
|
" properties=[\n",
|
||||||
|
" wc.Property(name=\"text\", data_type=wc.DataType.TEXT),\n",
|
||||||
|
" wc.Property(name=\"title\", data_type=wc.DataType.TEXT, skip_vectorization=True),\n",
|
||||||
|
" ],\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"id": "RgMcZDB9Dzfs"
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"### Wrangle data into an acceptable format for Weaviate\n",
|
||||||
|
"\n",
|
||||||
|
"Transform our data from lists to a list of dictionaries for insertion into our Weaviate collection."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {
|
||||||
|
"id": "kttDgwZEsIJQ"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Initialize the data object\n",
|
||||||
|
"data = []\n",
|
||||||
|
"\n",
|
||||||
|
"# Create a dictionary for each row by iterating through the corresponding lists\n",
|
||||||
|
"for text, title in zip(texts, titles):\n",
|
||||||
|
" data_point = {\n",
|
||||||
|
" \"text\": text,\n",
|
||||||
|
" \"title\": title,\n",
|
||||||
|
" }\n",
|
||||||
|
" data.append(data_point)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"id": "-4amqRaoD5g0"
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"### Insert data into Weaviate and generate embeddings\n",
|
||||||
|
"\n",
|
||||||
|
"Embeddings will be generated upon insertion to our Weaviate collection."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"colab": {
|
||||||
|
"base_uri": "https://localhost:8080/"
|
||||||
|
},
|
||||||
|
"id": "g8VCYnhbaxcz",
|
||||||
|
"outputId": "cc900e56-9fb6-4d4e-ab18-ebd12b1f4201"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Insert text chunks and metadata into vector DB collection\n",
|
||||||
|
"response = collection.data.insert_many(data)\n",
|
||||||
|
"\n",
|
||||||
|
"if response.has_errors:\n",
|
||||||
|
" print(response.errors)\n",
|
||||||
|
"else:\n",
|
||||||
|
" print(\"Insert complete.\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"id": "KI01PxjuD_XR"
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"### Query the data\n",
|
||||||
|
"\n",
|
||||||
|
"Here, we perform a simple similarity search to return the most similar embedded chunks to our search query."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {
|
||||||
|
"colab": {
|
||||||
|
"base_uri": "https://localhost:8080/"
|
||||||
|
},
|
||||||
|
"id": "zbz6nWJc5CSj",
|
||||||
|
"outputId": "16aced21-4496-4c91-cc12-d5c9ac983351"
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"{'text': 'BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding A distinctive feature of BERT is its unified architecture across different tasks. There is mini-', 'title': 'BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding'}\n",
|
||||||
|
"0.6578550338745117\n",
|
||||||
|
"{'text': 'BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding We introduce a new language representation model called BERT , which stands for B idirectional E ncoder R epresentations from T ransformers. Unlike recent language representation models (Peters et al., 2018a; Radford et al., 2018), BERT is designed to pretrain deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result, the pre-trained BERT model can be finetuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial taskspecific architecture modifications.', 'title': 'BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding'}\n",
|
||||||
|
"0.6696287989616394\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from weaviate.classes.query import MetadataQuery\n",
|
||||||
|
"\n",
|
||||||
|
"response = collection.query.near_text(\n",
|
||||||
|
" query=\"bert\",\n",
|
||||||
|
" limit=2,\n",
|
||||||
|
" return_metadata=MetadataQuery(distance=True),\n",
|
||||||
|
" return_properties=[\"text\", \"title\"],\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"for o in response.objects:\n",
|
||||||
|
" print(o.properties)\n",
|
||||||
|
" print(o.metadata.distance)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"id": "elo32iMnEC18"
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"### Perform RAG on parsed articles\n",
|
||||||
|
"\n",
|
||||||
|
"Weaviate's `generate` module allows you to perform RAG over your embedded data without having to use a separate framework.\n",
|
||||||
|
"\n",
|
||||||
|
"We specify a prompt that includes the field we want to search through in the database (in this case it's `text`), a query that includes our search term, and the number of retrieved results to use in the generation."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {
|
||||||
|
"colab": {
|
||||||
|
"base_uri": "https://localhost:8080/",
|
||||||
|
"height": 233
|
||||||
|
},
|
||||||
|
"id": "7r2LMSX9bO4y",
|
||||||
|
"outputId": "84639adf-7783-4d43-94d9-711fb313a168"
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">╭──────────────────────────────────────────────────── Prompt ─────────────────────────────────────────────────────╮</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│</span> Explain how bert works, using only the retrieved context. <span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n",
|
||||||
|
"</pre>\n"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"\u001b[1;31m╭─\u001b[0m\u001b[1;31m───────────────────────────────────────────────────\u001b[0m\u001b[1;31m Prompt \u001b[0m\u001b[1;31m────────────────────────────────────────────────────\u001b[0m\u001b[1;31m─╮\u001b[0m\n",
|
||||||
|
"\u001b[1;31m│\u001b[0m Explain how bert works, using only the retrieved context. \u001b[1;31m│\u001b[0m\n",
|
||||||
|
"\u001b[1;31m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">╭─────────────────────────────────────────────── Generated Content ───────────────────────────────────────────────╮</span>\n",
|
||||||
|
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> BERT, which stands for Bidirectional Encoder Representations from Transformers, is a language representation <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
|
||||||
|
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> model designed to pretrain deep bidirectional representations from unlabeled text. It conditions on both left <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
|
||||||
|
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> and right context in all layers, unlike traditional left-to-right or right-to-left language models. This <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
|
||||||
|
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> pre-training involves two unsupervised tasks. The pre-trained BERT model can then be fine-tuned with just one <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
|
||||||
|
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> additional output layer to create state-of-the-art models for various tasks, such as question answering and <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
|
||||||
|
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> language inference, without needing substantial task-specific architecture modifications. A distinctive feature <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
|
||||||
|
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> of BERT is its unified architecture across different tasks. <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
|
||||||
|
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n",
|
||||||
|
"</pre>\n"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"\u001b[1;32m╭─\u001b[0m\u001b[1;32m──────────────────────────────────────────────\u001b[0m\u001b[1;32m Generated Content \u001b[0m\u001b[1;32m──────────────────────────────────────────────\u001b[0m\u001b[1;32m─╮\u001b[0m\n",
|
||||||
|
"\u001b[1;32m│\u001b[0m BERT, which stands for Bidirectional Encoder Representations from Transformers, is a language representation \u001b[1;32m│\u001b[0m\n",
|
||||||
|
"\u001b[1;32m│\u001b[0m model designed to pretrain deep bidirectional representations from unlabeled text. It conditions on both left \u001b[1;32m│\u001b[0m\n",
|
||||||
|
"\u001b[1;32m│\u001b[0m and right context in all layers, unlike traditional left-to-right or right-to-left language models. This \u001b[1;32m│\u001b[0m\n",
|
||||||
|
"\u001b[1;32m│\u001b[0m pre-training involves two unsupervised tasks. The pre-trained BERT model can then be fine-tuned with just one \u001b[1;32m│\u001b[0m\n",
|
||||||
|
"\u001b[1;32m│\u001b[0m additional output layer to create state-of-the-art models for various tasks, such as question answering and \u001b[1;32m│\u001b[0m\n",
|
||||||
|
"\u001b[1;32m│\u001b[0m language inference, without needing substantial task-specific architecture modifications. A distinctive feature \u001b[1;32m│\u001b[0m\n",
|
||||||
|
"\u001b[1;32m│\u001b[0m of BERT is its unified architecture across different tasks. \u001b[1;32m│\u001b[0m\n",
|
||||||
|
"\u001b[1;32m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from rich.console import Console\n",
|
||||||
|
"from rich.panel import Panel\n",
|
||||||
|
"\n",
|
||||||
|
"# Create a prompt where context from the Weaviate collection will be injected\n",
|
||||||
|
"prompt = \"Explain how {text} works, using only the retrieved context.\"\n",
|
||||||
|
"query = \"bert\"\n",
|
||||||
|
"\n",
|
||||||
|
"response = collection.generate.near_text(\n",
|
||||||
|
" query=query, limit=3, grouped_task=prompt, return_properties=[\"text\", \"title\"]\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"# Prettify the output using Rich\n",
|
||||||
|
"console = Console()\n",
|
||||||
|
"\n",
|
||||||
|
"console.print(\n",
|
||||||
|
" Panel(f\"{prompt}\".replace(\"{text}\", query), title=\"Prompt\", border_style=\"bold red\")\n",
|
||||||
|
")\n",
|
||||||
|
"console.print(\n",
|
||||||
|
" Panel(response.generated, title=\"Generated Content\", border_style=\"bold green\")\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 14,
|
||||||
|
"metadata": {
|
||||||
|
"colab": {
|
||||||
|
"base_uri": "https://localhost:8080/",
|
||||||
|
"height": 233
|
||||||
|
},
|
||||||
|
"id": "Dtju3oCiDOdD",
|
||||||
|
"outputId": "2f0f0cf8-0305-40cc-8409-07036c101938"
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">╭──────────────────────────────────────────────────── Prompt ─────────────────────────────────────────────────────╮</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│</span> Explain how a generative adversarial net works, using only the retrieved context. <span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">│</span>\n",
|
||||||
|
"<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n",
|
||||||
|
"</pre>\n"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"\u001b[1;31m╭─\u001b[0m\u001b[1;31m───────────────────────────────────────────────────\u001b[0m\u001b[1;31m Prompt \u001b[0m\u001b[1;31m────────────────────────────────────────────────────\u001b[0m\u001b[1;31m─╮\u001b[0m\n",
|
||||||
|
"\u001b[1;31m│\u001b[0m Explain how a generative adversarial net works, using only the retrieved context. \u001b[1;31m│\u001b[0m\n",
|
||||||
|
"\u001b[1;31m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">╭─────────────────────────────────────────────── Generated Content ───────────────────────────────────────────────╮</span>\n",
|
||||||
|
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> Generative Adversarial Nets (GANs) operate within an adversarial framework where two models are trained <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
|
||||||
|
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> simultaneously: a generative model (G) and a discriminative model (D). The generative model aims to capture the <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
|
||||||
|
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> data distribution and generate samples that mimic real data, while the discriminative model's task is to <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
|
||||||
|
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> distinguish between samples from the real data and those generated by G. This setup is akin to a game where the <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
|
||||||
|
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> generative model acts like counterfeiters trying to produce indistinguishable fake currency, and the <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
|
||||||
|
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> discriminative model acts like the police trying to detect these counterfeits. <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
|
||||||
|
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
|
||||||
|
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> The training process involves a minimax two-player game where G tries to maximize the probability of D making a <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
|
||||||
|
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> mistake, while D tries to minimize it. When both models are defined by multilayer perceptrons, they can be <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
|
||||||
|
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> trained using backpropagation without the need for Markov chains or approximate inference networks. The <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
|
||||||
|
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> ultimate goal is for G to perfectly replicate the training data distribution, making D's output equal to 1/2 <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
|
||||||
|
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> everywhere, indicating it cannot distinguish between real and generated data. This framework allows for <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
|
||||||
|
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> specific training algorithms and optimization techniques, such as backpropagation and dropout, to be <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
|
||||||
|
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span> effectively utilized. <span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">│</span>\n",
|
||||||
|
"<span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n",
|
||||||
|
"</pre>\n"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"\u001b[1;32m╭─\u001b[0m\u001b[1;32m──────────────────────────────────────────────\u001b[0m\u001b[1;32m Generated Content \u001b[0m\u001b[1;32m──────────────────────────────────────────────\u001b[0m\u001b[1;32m─╮\u001b[0m\n",
|
||||||
|
"\u001b[1;32m│\u001b[0m Generative Adversarial Nets (GANs) operate within an adversarial framework where two models are trained \u001b[1;32m│\u001b[0m\n",
|
||||||
|
"\u001b[1;32m│\u001b[0m simultaneously: a generative model (G) and a discriminative model (D). The generative model aims to capture the \u001b[1;32m│\u001b[0m\n",
|
||||||
|
"\u001b[1;32m│\u001b[0m data distribution and generate samples that mimic real data, while the discriminative model's task is to \u001b[1;32m│\u001b[0m\n",
|
||||||
|
"\u001b[1;32m│\u001b[0m distinguish between samples from the real data and those generated by G. This setup is akin to a game where the \u001b[1;32m│\u001b[0m\n",
|
||||||
|
"\u001b[1;32m│\u001b[0m generative model acts like counterfeiters trying to produce indistinguishable fake currency, and the \u001b[1;32m│\u001b[0m\n",
|
||||||
|
"\u001b[1;32m│\u001b[0m discriminative model acts like the police trying to detect these counterfeits. \u001b[1;32m│\u001b[0m\n",
|
||||||
|
"\u001b[1;32m│\u001b[0m \u001b[1;32m│\u001b[0m\n",
|
||||||
|
"\u001b[1;32m│\u001b[0m The training process involves a minimax two-player game where G tries to maximize the probability of D making a \u001b[1;32m│\u001b[0m\n",
|
||||||
|
"\u001b[1;32m│\u001b[0m mistake, while D tries to minimize it. When both models are defined by multilayer perceptrons, they can be \u001b[1;32m│\u001b[0m\n",
|
||||||
|
"\u001b[1;32m│\u001b[0m trained using backpropagation without the need for Markov chains or approximate inference networks. The \u001b[1;32m│\u001b[0m\n",
|
||||||
|
"\u001b[1;32m│\u001b[0m ultimate goal is for G to perfectly replicate the training data distribution, making D's output equal to 1/2 \u001b[1;32m│\u001b[0m\n",
|
||||||
|
"\u001b[1;32m│\u001b[0m everywhere, indicating it cannot distinguish between real and generated data. This framework allows for \u001b[1;32m│\u001b[0m\n",
|
||||||
|
"\u001b[1;32m│\u001b[0m specific training algorithms and optimization techniques, such as backpropagation and dropout, to be \u001b[1;32m│\u001b[0m\n",
|
||||||
|
"\u001b[1;32m│\u001b[0m effectively utilized. \u001b[1;32m│\u001b[0m\n",
|
||||||
|
"\u001b[1;32m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Create a prompt where context from the Weaviate collection will be injected\n",
|
||||||
|
"prompt = \"Explain how {text} works, using only the retrieved context.\"\n",
|
||||||
|
"query = \"a generative adversarial net\"\n",
|
||||||
|
"\n",
|
||||||
|
"response = collection.generate.near_text(\n",
|
||||||
|
" query=query, limit=3, grouped_task=prompt, return_properties=[\"text\", \"title\"]\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"# Prettify the output using Rich\n",
|
||||||
|
"console = Console()\n",
|
||||||
|
"\n",
|
||||||
|
"console.print(\n",
|
||||||
|
" Panel(f\"{prompt}\".replace(\"{text}\", query), title=\"Prompt\", border_style=\"bold red\")\n",
|
||||||
|
")\n",
|
||||||
|
"console.print(\n",
|
||||||
|
" Panel(response.generated, title=\"Generated Content\", border_style=\"bold green\")\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
"cell_type": "markdown",
"metadata": {
"id": "7tGz49nfUegG"
},
"source": [
"We can see that our RAG pipeline performs relatively well for simple queries, especially given the small size of the dataset. Scaling this method for converting a larger sample of PDFs would require more compute (GPUs) and a more advanced deployment of Weaviate (like Docker, Kubernetes, or Weaviate Cloud). For more information on available Weaviate configurations, check out the [documentation](https://weaviate.io/developers/weaviate/starter-guides/which-weaviate)."
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
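The closing note above points to heavier-weight Weaviate deployments (Docker, Kubernetes, or Weaviate Cloud) for larger PDF collections. As a rough sketch only, assuming a Weaviate instance already running locally via Docker and the same v4 Python client used in this notebook, switching away from Embedded Weaviate is mostly a matter of changing how the client connects; the collection name "docling" below is a placeholder, and the insert/query/generate calls stay as shown above.

```python
import weaviate

# Connect to a locally running (e.g. Docker-based) Weaviate instance
# instead of the embedded instance used in this notebook.
client = weaviate.connect_to_local()

try:
    collection = client.collections.get("docling")  # placeholder collection name
    # ... reuse the insert_many / near_text / generate calls from the cells above ...
finally:
    client.close()
```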
@ -12,7 +12,17 @@
|
|||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"# Hybrid RAG with Qdrant"
|
"# Retrieval with Qdrant"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"| Step | Tech | Execution | \n",
|
||||||
|
"| --- | --- | --- |\n",
|
||||||
|
"| Embedding | FastEmbed | 💻 Local |\n",
|
||||||
|
"| Vector store | Qdrant | 💻 Local |"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -47,22 +57,19 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 1,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"\n",
|
|
||||||
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n",
|
|
||||||
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
|
|
||||||
"Note: you may need to restart the kernel to use updated packages.\n"
|
"Note: you may need to restart the kernel to use updated packages.\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"%pip install --no-warn-conflicts -q qdrant-client docling docling-core fastembed"
|
"%pip install --no-warn-conflicts -q qdrant-client docling fastembed"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -74,13 +81,13 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 2,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from docling_core.transforms.chunker import HierarchicalChunker\n",
|
|
||||||
"from qdrant_client import QdrantClient\n",
|
"from qdrant_client import QdrantClient\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
"from docling.chunking import HybridChunker\n",
|
||||||
"from docling.datamodel.base_models import InputFormat\n",
|
"from docling.datamodel.base_models import InputFormat\n",
|
||||||
"from docling.document_converter import DocumentConverter"
|
"from docling.document_converter import DocumentConverter"
|
||||||
]
|
]
|
||||||
@ -95,36 +102,16 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": 3,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"data": {
|
"name": "stderr",
|
||||||
"application/vnd.jupyter.widget-view+json": {
|
"output_type": "stream",
|
||||||
"model_id": "c1077c6634d9434584c41cc12f9107c9",
|
"text": [
|
||||||
"version_major": 2,
|
"/Users/pva/work/github.com/DS4SD/docling/.venv/lib/python3.12/site-packages/huggingface_hub/utils/tqdm.py:155: UserWarning: Cannot enable progress bars: environment variable `HF_HUB_DISABLE_PROGRESS_BARS=1` is set and has priority.\n",
|
||||||
"version_minor": 0
|
" warnings.warn(\n"
|
||||||
},
|
|
||||||
"text/plain": [
|
|
||||||
"Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s]"
|
|
||||||
]
|
]
|
||||||
},
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "display_data"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"application/vnd.jupyter.widget-view+json": {
|
|
||||||
"model_id": "67069c07b73448d491944452159d10bc",
|
|
||||||
"version_major": 2,
|
|
||||||
"version_minor": 0
|
|
||||||
},
|
|
||||||
"text/plain": [
|
|
||||||
"Fetching 29 files: 0%| | 0/29 [00:00<?, ?it/s]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "display_data"
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
@ -149,7 +136,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 4,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -157,7 +144,7 @@
|
|||||||
" \"https://www.sagacify.com/news/a-guide-to-chunking-strategies-for-retrieval-augmented-generation-rag\"\n",
|
" \"https://www.sagacify.com/news/a-guide-to-chunking-strategies-for-retrieval-augmented-generation-rag\"\n",
|
||||||
")\n",
|
")\n",
|
||||||
"documents, metadatas = [], []\n",
|
"documents, metadatas = [], []\n",
|
||||||
"for chunk in HierarchicalChunker().chunk(result.document):\n",
|
"for chunk in HybridChunker().chunk(result.document):\n",
|
||||||
" documents.append(chunk.text)\n",
|
" documents.append(chunk.text)\n",
|
||||||
" metadatas.append(chunk.meta.export_json_dict())"
|
" metadatas.append(chunk.meta.export_json_dict())"
|
||||||
]
|
]
|
||||||
@ -173,95 +160,119 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 4,
|
"execution_count": 5,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"['e74ae15be5eb4805858307846318e784',\n",
|
|
||||||
" 'f83f6125b0fa4a0595ae6a0777c9d90d',\n",
|
|
||||||
" '9cf63c7f30764715bf3804a19db36d7d',\n",
|
|
||||||
" '007dbe6d355b4b49af3b736cbd63a4d8',\n",
|
|
||||||
" 'e5e31f21f2e84aa68beca0dfc532cbe9',\n",
|
|
||||||
" '69c10816af204bb28630a1f957d8dd3e',\n",
|
|
||||||
" 'b63546b9b1744063bdb076b234d883ca',\n",
|
|
||||||
" '90ad15ba8fa6494489e1d3221e30bfcf',\n",
|
|
||||||
" '13517debb483452ea40fc7aa04c08c50',\n",
|
|
||||||
" '84ccab5cfab74e27a55acef1c63e3fad',\n",
|
|
||||||
" 'e8aa2ef46d234c5a8a9da64b701d60b4',\n",
|
|
||||||
" '190bea5ba43c45e792197c50898d1d90',\n",
|
|
||||||
" 'a730319ea65645ca81e735ace0bcc72e',\n",
|
|
||||||
" '415e7f6f15864e30b836e23ae8d71b43',\n",
|
|
||||||
" '5569bce4e65541868c762d149c6f491e',\n",
|
|
||||||
" '74d9b234e9c04ebeb8e4e1ca625789ac',\n",
|
|
||||||
" '308b1c5006a94a679f4c8d6f2396993c',\n",
|
|
||||||
" 'aaa5ec6d385a418388e660c425bf1dbe',\n",
|
|
||||||
" '630be8e43e4e4472a9cdb9af9462a43a',\n",
|
|
||||||
" '643b316224de4770a5349bf69cf93471',\n",
|
|
||||||
" 'da9265e6f6c2485493d15223eefdf411',\n",
|
|
||||||
" 'a916e447d52c4084b5ce81a0c5a65b07',\n",
|
|
||||||
" '2883c620858e4e728b88e127155a4f2c',\n",
|
|
||||||
" '2a998f0e9c124af99027060b94027874',\n",
|
|
||||||
" 'be551fbd2b9e42f48ebae0cbf1f481bc',\n",
|
|
||||||
" '95b7f7608e974ca6847097ee4590fba1',\n",
|
|
||||||
" '309db4f3863b4e3aaf16d5f346c309f3',\n",
|
|
||||||
" 'c818383267f64fd68b2237b024bd724e',\n",
|
|
||||||
" '1f16e78338c94238892171b400051cd4',\n",
|
|
||||||
" '25c680c3e064462cab071ea9bf1bad8c',\n",
|
|
||||||
" 'f41ab7e480a248c6bb87019341c7ca74',\n",
|
|
||||||
" 'd440128bed6d4dcb987152b48ecd9a8a',\n",
|
|
||||||
" 'c110d5dfdc5849808851788c2404dd15']"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 4,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"client.add(COLLECTION_NAME, documents=documents, metadata=metadatas, batch_size=64)"
|
"_ = client.add(\n",
|
||||||
|
" collection_name=COLLECTION_NAME,\n",
|
||||||
|
" documents=documents,\n",
|
||||||
|
" metadata=metadatas,\n",
|
||||||
|
" batch_size=64,\n",
|
||||||
|
")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"## Query Documents"
|
"## Retrieval"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 5,
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"points = client.query(\n",
|
||||||
|
" collection_name=COLLECTION_NAME,\n",
|
||||||
|
" query_text=\"Can I split documents?\",\n",
|
||||||
|
" limit=10,\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"<=== Retrieved documents ===>\n",
|
"=== 0 ===\n",
|
||||||
"Document Specific Chunking is a strategy that respects the document's structure. Rather than using a set number of characters or a recursive process, it creates chunks that align with the logical sections of the document, like paragraphs or subsections. This approach maintains the original author's organization of content and helps keep the text coherent. It makes the retrieved information more relevant and useful, particularly for structured documents with clearly defined sections.\n",
|
"Have you ever wondered how we, humans, would chunk? Here's a breakdown of a possible way a human would process a new document:\n",
|
||||||
"Document Specific Chunking can handle a variety of document formats, such as:\n",
|
|
||||||
"Consequently, there are also splitters available for this purpose.\n",
|
|
||||||
"1. We start at the top of the document, treating the first part as a chunk.\n",
|
"1. We start at the top of the document, treating the first part as a chunk.\n",
|
||||||
" 2. We continue down the document, deciding if a new sentence or piece of information belongs with the first chunk or should start a new one.\n",
|
" 2. We continue down the document, deciding if a new sentence or piece of information belongs with the first chunk or should start a new one.\n",
|
||||||
" 3. We keep this up until we reach the end of the document.\n",
|
" 3. We keep this up until we reach the end of the document.\n",
|
||||||
"Have you ever wondered how we, humans, would chunk? Here's a breakdown of a possible way a human would process a new document:\n",
|
"The ultimate dream? Having an agent do this for you. But slow down! This approach is still being tested and isn't quite ready for the big leagues due to the time it takes to process multiple LLM calls and the cost of those calls. There's no implementation available in public libraries just yet. However, Greg Kamradt has his version available here.\n",
|
||||||
"The goal of chunking is, as its name says, to chunk the information into multiple smaller pieces in order to store it in a more efficient and meaningful way. This allows the retrieval to capture pieces of information that are more related to the question at hand, and the generation to be more precise, but also less costly, as only a part of a document will be included in the LLM prompt, instead of the whole document.\n",
|
"\n",
|
||||||
"To put these strategies into action, there's a whole array of tools and libraries at your disposal. For example, llama_index is a fantastic tool that lets you create document indices and retrieve chunked documents. Let's not forget LangChain, another remarkable tool that makes implementing chunking strategies a breeze, particularly when dealing with multi-language data. Diving into these tools and understanding how they can work in harmony with the chunking strategies we've discussed is a crucial part of mastering Retrieval Augmented Generation.\n",
|
"=== 1 ===\n",
|
||||||
"Semantic chunking involves taking the embeddings of every sentence in the document, comparing the similarity of all sentences with each other, and then grouping sentences with the most similar embeddings together.\n",
|
"Document Specific Chunking is a strategy that respects the document's structure. Rather than using a set number of characters or a recursive process, it creates chunks that align with the logical sections of the document, like paragraphs or subsections. This approach maintains the original author's organization of content and helps keep the text coherent. It makes the retrieved information more relevant and useful, particularly for structured documents with clearly defined sections.\n",
|
||||||
|
"Document Specific Chunking can handle a variety of document formats, such as:\n",
|
||||||
|
"Markdown\n",
|
||||||
|
"HTML\n",
|
||||||
|
"Python\n",
|
||||||
|
"etc\n",
|
||||||
|
"Here we’ll take Markdown as our example and use a modified version of our first sample text:\n",
|
||||||
|
"\n",
|
||||||
|
"The result is the following:\n",
|
||||||
"You can see here that with a chunk size of 105, the Markdown structure of the document is taken into account, and the chunks thus preserve the semantics of the text!\n",
|
"You can see here that with a chunk size of 105, the Markdown structure of the document is taken into account, and the chunks thus preserve the semantics of the text!\n",
|
||||||
"And there you have it! These chunking strategies are like a personal toolbox when it comes to implementing Retrieval Augmented Generation. They're a ton of ways to slice and dice text, each with its unique features and quirks. This variety gives you the freedom to pick the strategy that suits your project best, allowing you to tailor your approach to perfectly fit the unique needs of your work.\n"
|
"\n",
|
||||||
|
"=== 2 ===\n",
|
||||||
|
"And there you have it! These chunking strategies are like a personal toolbox when it comes to implementing Retrieval Augmented Generation. They're a ton of ways to slice and dice text, each with its unique features and quirks. This variety gives you the freedom to pick the strategy that suits your project best, allowing you to tailor your approach to perfectly fit the unique needs of your work.\n",
|
||||||
|
"To put these strategies into action, there's a whole array of tools and libraries at your disposal. For example, llama_index is a fantastic tool that lets you create document indices and retrieve chunked documents. Let's not forget LangChain, another remarkable tool that makes implementing chunking strategies a breeze, particularly when dealing with multi-language data. Diving into these tools and understanding how they can work in harmony with the chunking strategies we've discussed is a crucial part of mastering Retrieval Augmented Generation.\n",
|
||||||
|
"By the way, if you're eager to experiment with your own examples using the chunking visualisation tool featured in this blog, feel free to give it a try! You can access it right here. Enjoy, and happy chunking! 😉\n",
|
||||||
|
"\n",
|
||||||
|
"=== 3 ===\n",
|
||||||
|
"Retrieval Augmented Generation (RAG) has been a hot topic in understanding, interpreting, and generating text with AI for the last few months. It's like a wonderful union of retrieval-based and generative models, creating a playground for researchers, data scientists, and natural language processing enthusiasts, like you and me.\n",
|
||||||
|
"To truly control the results produced by our RAG, we need to understand chunking strategies and their role in the process of retrieving and generating text. Indeed, each chunking strategy enhances RAG's effectiveness in its unique way.\n",
|
||||||
|
"The goal of chunking is, as its name says, to chunk the information into multiple smaller pieces in order to store it in a more efficient and meaningful way. This allows the retrieval to capture pieces of information that are more related to the question at hand, and the generation to be more precise, but also less costly, as only a part of a document will be included in the LLM prompt, instead of the whole document.\n",
|
||||||
|
"Let's explore some chunking strategies together.\n",
|
||||||
|
"The methods mentioned in the article you're about to read usually make use of two key parameters. First, we have [chunk_size]— which controls the size of your text chunks. Then there's [chunk_overlap], which takes care of how much text overlaps between one chunk and the next.\n",
|
||||||
|
"\n",
|
||||||
|
"=== 4 ===\n",
|
||||||
|
"Semantic Chunking considers the relationships within the text. It divides the text into meaningful, semantically complete chunks. This approach ensures the information's integrity during retrieval, leading to a more accurate and contextually appropriate outcome.\n",
|
||||||
|
"Semantic chunking involves taking the embeddings of every sentence in the document, comparing the similarity of all sentences with each other, and then grouping sentences with the most similar embeddings together.\n",
|
||||||
|
"By focusing on the text's meaning and context, Semantic Chunking significantly enhances the quality of retrieval. It's a top-notch choice when maintaining the semantic integrity of the text is vital.\n",
|
||||||
|
"However, this method does require more effort and is notably slower than the previous ones.\n",
|
||||||
|
"On our example text, since it is quite short and does not expose varied subjects, this method would only generate a single chunk.\n",
|
||||||
|
"\n",
|
||||||
|
"=== 5 ===\n",
|
||||||
|
"Language models used in the rest of your possible RAG pipeline have a token limit, which should not be exceeded. When dividing your text into chunks, it's advisable to count the number of tokens. Plenty of tokenizers are available. To ensure accuracy, use the same tokenizer for counting tokens as the one used in the language model.\n",
|
||||||
|
"Consequently, there are also splitters available for this purpose.\n",
|
||||||
|
"For instance, by using the [SpacyTextSplitter] from LangChain, the following chunks are created:\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"=== 6 ===\n",
|
||||||
|
"First things first, we have Character Chunking. This strategy divides the text into chunks based on a fixed number of characters. Its simplicity makes it a great starting point, but it can sometimes disrupt the text's flow, breaking sentences or words in unexpected places. Despite its limitations, it's a great stepping stone towards more advanced methods.\n",
|
||||||
|
"Now let’s see that in action with an example. Imagine a text that reads:\n",
|
||||||
|
"If we decide to set our chunk size to 100 and no chunk overlap, we'd end up with the following chunks. As you can see, Character Chunking can lead to some intriguing, albeit sometimes nonsensical, results, cutting some of the sentences in their middle.\n",
|
||||||
|
"By choosing a smaller chunk size, we would obtain more chunks, and by setting a bigger chunk overlap, we could obtain something like this:\n",
|
||||||
|
"\n",
|
||||||
|
"Also, by default this method creates chunks character by character based on the empty character [’ ’]. But you can specify a different one in order to chunk on something else, even a complete word! For instance, by specifying [' '] as the separator, you can avoid cutting words in their middle.\n",
|
||||||
|
"\n",
|
||||||
|
"=== 7 ===\n",
|
||||||
|
"Next, let's take a look at Recursive Character Chunking. Based on the basic concept of Character Chunking, this advanced version takes it up a notch by dividing the text into chunks until a certain condition is met, such as reaching a minimum chunk size. This method ensures that the chunking process aligns with the text's structure, preserving more meaning. Its adaptability makes Recursive Character Chunking great for texts with varied structures.\n",
|
||||||
|
"Again, let’s use the same example in order to illustrate this method. With a chunk size of 100, and the default settings for the other parameters, we obtain the following chunks:\n",
|
||||||
|
"\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"points = client.query(COLLECTION_NAME, query_text=\"Can I split documents?\", limit=10)\n",
|
"for i, point in enumerate(points):\n",
|
||||||
"\n",
|
" print(f\"=== {i} ===\")\n",
|
||||||
"print(\"<=== Retrieved documents ===>\")\n",
|
" print(point.document)\n",
|
||||||
"for point in points:\n",
|
" print()"
|
||||||
" print(point.document)"
|
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
@ -280,7 +291,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.13.0"
|
"version": "3.12.7"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
75
docs/examples/translate.py
Normal file
@ -0,0 +1,75 @@
import logging
import time
from pathlib import Path

from docling_core.types.doc import ImageRefMode, PictureItem, TableItem, TextItem

from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

_log = logging.getLogger(__name__)

IMAGE_RESOLUTION_SCALE = 2.0


# FIXME: put in your favorite translation code ....
def translate(text: str, src: str = "en", dest: str = "de"):

    _log.warning("!!! IMPLEMENT HERE YOUR FAVORITE TRANSLATION CODE!!!")
    # from googletrans import Translator

    # Initialize the translator
    # translator = Translator()

    # Translate text from English to German
    # text = "Hello, how are you?"
    # translated = translator.translate(text, src="en", dest="de")

    return text


def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_path = Path("./tests/data/2206.01062.pdf")
    output_dir = Path("scratch")

    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
    # will destroy them for cleaning up memory.
    # This is done by setting PdfPipelineOptions.images_scale, which also defines the scale of images.
    # scale=1 corresponds to a standard 72 DPI image
    # The PdfPipelineOptions.generate_* are the selectors for the document elements which will be enriched
    # with the image field
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    start_time = time.time()

    conv_res = doc_converter.convert(input_doc_path)
    conv_doc = conv_res.document
    doc_filename = conv_res.input.file.stem  # stem of the input file name, used for the output files
    output_dir.mkdir(parents=True, exist_ok=True)  # make sure the output directory exists

    # Save markdown with embedded pictures in original text
    md_filename = output_dir / f"{doc_filename}-with-images-orig.md"
    conv_doc.save_as_markdown(md_filename, image_mode=ImageRefMode.EMBEDDED)

    for element, _level in conv_res.document.iterate_items():
        if isinstance(element, TextItem):
            element.orig = element.text
            element.text = translate(text=element.text)

        elif isinstance(element, TableItem):
            for cell in element.data.table_cells:
                cell.text = translate(text=cell.text)  # translate each cell's own text

    # Save markdown with embedded pictures in translated text
    md_filename = output_dir / f"{doc_filename}-with-images-translated.md"
    conv_doc.save_as_markdown(md_filename, image_mode=ImageRefMode.EMBEDDED)


if __name__ == "__main__":
    main()
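The `translate()` helper in this example is deliberately a stub. As a minimal sketch of one way to fill it in, assuming the third-party `googletrans` package that the stub's own comments reference (and a release of it with a synchronous `translate()` method), the body could look like this:

```python
from googletrans import Translator  # assumed extra dependency, not part of docling

_translator = Translator()


def translate(text: str, src: str = "en", dest: str = "de"):
    # Avoid needless calls for empty or whitespace-only strings
    if not text.strip():
        return text
    # googletrans returns an object whose .text attribute holds the translated string
    return _translator.translate(text, src=src, dest=dest).text
```

Any other translation backend can be plugged in the same way, as long as it maps a string to a translated string.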
@ -21,7 +21,7 @@ Docling parses documents and exports them to the desired format with ease and sp
|
|||||||
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
|
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
|
||||||
* 📑 Advanced PDF document understanding incl. page layout, reading order & table structures
|
* 📑 Advanced PDF document understanding incl. page layout, reading order & table structures
|
||||||
* 🧩 Unified, expressive [DoclingDocument](./concepts/docling_document.md) representation format
|
* 🧩 Unified, expressive [DoclingDocument](./concepts/docling_document.md) representation format
|
||||||
* 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications
|
* 🤖 Plug-and-play [integrations](https://ds4sd.github.io/docling/integrations/) incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
||||||
* 🔍 OCR support for scanned PDFs
|
* 🔍 OCR support for scanned PDFs
|
||||||
* 💻 Simple and convenient CLI
|
* 💻 Simple and convenient CLI
|
||||||
|
|
||||||
@ -29,7 +29,15 @@ Docling parses documents and exports them to the desired format with ease and sp
|
|||||||
|
|
||||||
* ♾️ Equation & code extraction
|
* ♾️ Equation & code extraction
|
||||||
* 📝 Metadata extraction, including title, authors, references & language
|
* 📝 Metadata extraction, including title, authors, references & language
|
||||||
* 🦜🔗 Native LangChain extension
|
|
||||||
|
## Get started
|
||||||
|
|
||||||
|
<div class="grid">
|
||||||
|
<a href="concepts/" class="card"><b>Concepts</b><br />Learn Docling fundamendals</a>
|
||||||
|
<a href="examples/" class="card"><b>Examples</b><br />Try out recipes for various use cases, including conversion, RAG, and more</a>
|
||||||
|
<a href="integrations/" class="card"><b>Integrations</b><br />Check out integrations with popular frameworks and tools</a>
|
||||||
|
<a href="reference/document_converter/" class="card"><b>Reference</b><br />See more API details</a>
|
||||||
|
</div>
|
||||||
|
|
||||||
## IBM ❤️ Open Source AI
|
## IBM ❤️ Open Source AI
|
||||||
|
|
||||||
|
10
docs/integrations/crewai.md
Normal file
@ -0,0 +1,10 @@
Docling is available in [CrewAI](https://www.crewai.com/) as the `CrewDoclingSource`
knowledge source.

- 💻 [Crew AI GitHub][github]
- 📖 [Crew AI knowledge docs][docs]
- 📦 [Crew AI PyPI][package]

[github]: https://github.com/crewAIInc/crewAI/
[docs]: https://docs.crewai.com/concepts/knowledge
[package]: https://pypi.org/project/crewai/
11
docs/integrations/haystack.md
Normal file
@ -0,0 +1,11 @@
Docling is available as a converter in [Haystack](https://haystack.deepset.ai/):

- 📖 [Docling Haystack integration docs][docs]
- 💻 [Docling Haystack integration GitHub][github]
- 🧑🏽‍🍳 [Docling Haystack integration example][example]
- 📦 [Docling Haystack integration PyPI][pypi]

[github]: https://github.com/DS4SD/docling-haystack
[docs]: https://haystack.deepset.ai/integrations/docling
[pypi]: https://pypi.org/project/docling-haystack
[example]: ../examples/rag_haystack.ipynb
14
docs/integrations/langchain.md
Normal file
@ -0,0 +1,14 @@
Docling is available as an official [LangChain](https://python.langchain.com/) extension.

To get started, check out the [step-by-step guide in LangChain][guide].

- 📖 [LangChain Docling integration docs][docs]
- 💻 [LangChain Docling integration GitHub][github]
- 🧑🏽‍🍳 [LangChain Docling integration example][example]
- 📦 [LangChain Docling integration PyPI][pypi]

[docs]: https://python.langchain.com/docs/integrations/providers/docling/
[github]: https://github.com/DS4SD/docling-langchain
[guide]: https://python.langchain.com/docs/integrations/document_loaders/docling/
[example]: ../examples/rag_langchain.ipynb
[pypi]: https://pypi.org/project/langchain-docling/
6
docs/integrations/nvidia.md
Normal file
@ -0,0 +1,6 @@
Docling is powering the NVIDIA *PDF to Podcast* agentic AI blueprint:

- [🏠 PDF to Podcast home](https://build.nvidia.com/nvidia/pdf-to-podcast)
- [💻 PDF to Podcast GitHub](https://github.com/NVIDIA-AI-Blueprints/pdf-to-podcast)
- [📣 PDF to Podcast announcement](https://nvidianews.nvidia.com/news/nvidia-launches-ai-foundation-models-for-rtx-ai-pcs)
- [✍️ PDF to Podcast blog post](https://blogs.nvidia.com/blog/agentic-ai-blueprints/)
5
docs/integrations/opencontracts.md
Normal file
@ -0,0 +1,5 @@
Docling is available as an ingestion engine for [OpenContracts](https://github.com/JSv4/OpenContracts), allowing you to use Docling's OCR engine(s), chunker(s), labels, etc. and load them into a platform supporting bulk data extraction, text annotating, and question-answering:

- 💻 [OpenContracts GitHub](https://github.com/JSv4/OpenContracts)
- 📖 [OpenContracts Docs](https://jsv4.github.io/OpenContracts/)
- ▶️ [OpenContracts x Docling PDF annotation screen capture](https://github.com/JSv4/OpenContracts/blob/main/docs/assets/images/gifs/PDF%20Annotation%20Flow.gif)
@ -1,10 +1,8 @@
|
|||||||
Docling is powering document processing in [Red Hat Enterprise Linux AI][home] (RHEL AI),
|
Docling is powering document processing in [Red Hat Enterprise Linux AI (RHEL AI)](https://rhel.ai),
|
||||||
enabling users to unlock the knowledge hidden in documents and present it to
|
enabling users to unlock the knowledge hidden in documents and present it to
|
||||||
InstructLab's fine-tuning for aligning AI models to the user's specific data.
|
InstructLab's fine-tuning for aligning AI models to the user's specific data.
|
||||||
|
|
||||||
More details can be found in this [blog post][blog].
|
- 📣 [RHEL AI 1.3 announcement](https://www.redhat.com/en/about/press-releases/red-hat-delivers-next-wave-gen-ai-innovation-new-red-hat-enterprise-linux-ai-capabilities)
|
||||||
|
- ✍️ RHEL blog posts:
|
||||||
- 🏠 [RHEL AI home][home]
|
- [RHEL AI 1.3 Docling context aware chunking: What you need to know](https://www.redhat.com/en/blog/rhel-13-docling-context-aware-chunking-what-you-need-know)
|
||||||
|
- [Docling: The missing document processing companion for generative AI](https://www.redhat.com/en/blog/docling-missing-document-processing-companion-generative-ai)
|
||||||
[home]: https://www.redhat.com/en/technologies/linux-platforms/enterprise-linux/ai
|
|
||||||
[blog]: https://www.redhat.com/en/blog/docling-missing-document-processing-companion-generative-ai
|
|
||||||
|
5
docs/integrations/vectara.md
Normal file
@ -0,0 +1,5 @@
Docling is available as a document parser in [Vectara](https://www.vectara.com/).

- 💻 [Vectara GitHub org](https://github.com/vectara)
- [vectara-ingest GitHub repo](https://github.com/vectara/vectara-ingest)
- 📖 [Vectara docs](https://docs.vectara.com/)
@ -32,6 +32,7 @@ This is an automatic generated API reference of the DoclingDocument type.
|
|||||||
- CoordOrigin
|
- CoordOrigin
|
||||||
- ImageRefMode
|
- ImageRefMode
|
||||||
- Size
|
- Size
|
||||||
|
docstring_style: sphinx
|
||||||
show_if_no_docstring: true
|
show_if_no_docstring: true
|
||||||
show_submodules: true
|
show_submodules: true
|
||||||
docstring_section_style: list
|
docstring_section_style: list
|
||||||
|
@ -95,8 +95,8 @@ doc_converter = (
|
|||||||
|
|
||||||
More options are shown in the following example units:
|
More options are shown in the following example units:
|
||||||
|
|
||||||
- [run_with_formats.py](../examples/run_with_formats/)
|
- [run_with_formats.py](examples/run_with_formats.py)
|
||||||
- [custom_convert.py](../examples/custom_convert/)
|
- [custom_convert.py](examples/custom_convert.py)
|
||||||
|
|
||||||
### Converting documents
|
### Converting documents
|
||||||
|
|
||||||
@ -226,4 +226,4 @@ leverages the new `DoclingDocument` and provides a new, richer chunk output form
|
|||||||
- any applicable headings for context
|
- any applicable headings for context
|
||||||
- any applicable captions for context
|
- any applicable captions for context
|
||||||
|
|
||||||
For an example, check out [Chunking usage](../usage/#chunking).
|
For an example, check out [Chunking usage](usage.md#chunking).
|
||||||
|
47
mkdocs.yml
@ -65,7 +65,7 @@ nav:
|
|||||||
- Chunking: concepts/chunking.md
|
- Chunking: concepts/chunking.md
|
||||||
- Examples:
|
- Examples:
|
||||||
- Examples: examples/index.md
|
- Examples: examples/index.md
|
||||||
- Conversion:
|
- 🔀 Conversion:
|
||||||
- "Simple conversion": examples/minimal.py
|
- "Simple conversion": examples/minimal.py
|
||||||
- "Custom conversion": examples/custom_convert.py
|
- "Custom conversion": examples/custom_convert.py
|
||||||
- "Batch conversion": examples/batch_convert.py
|
- "Batch conversion": examples/batch_convert.py
|
||||||
@ -76,27 +76,38 @@ nav:
|
|||||||
- "Multimodal export": examples/export_multimodal.py
|
- "Multimodal export": examples/export_multimodal.py
|
||||||
- "Force full page OCR": examples/full_page_ocr.py
|
- "Force full page OCR": examples/full_page_ocr.py
|
||||||
- "Accelerator options": examples/run_with_accelerator.py
|
- "Accelerator options": examples/run_with_accelerator.py
|
||||||
- Chunking:
|
- "Simple translation": examples/translate.py
|
||||||
|
- ✂️ Chunking:
|
||||||
- "Hybrid chunking": examples/hybrid_chunking.ipynb
|
- "Hybrid chunking": examples/hybrid_chunking.ipynb
|
||||||
- RAG / QA:
|
- 💬 RAG / QA:
|
||||||
- "RAG with Haystack": examples/rag_haystack.ipynb
|
- examples/rag_haystack.ipynb
|
||||||
- "RAG with LlamaIndex 🦙": examples/rag_llamaindex.ipynb
|
- examples/rag_llamaindex.ipynb
|
||||||
- "RAG with LangChain 🦜🔗": examples/rag_langchain.ipynb
|
- examples/rag_langchain.ipynb
|
||||||
- "Hybrid RAG with Qdrant": examples/hybrid_rag_qdrant.ipynb
|
- examples/rag_weaviate.ipynb
|
||||||
|
- RAG with Granite [↗]: https://github.com/ibm-granite-community/granite-snack-cookbook/blob/main/recipes/RAG/Granite_Docling_RAG.ipynb
|
||||||
|
- examples/retrieval_qdrant.ipynb
|
||||||
- Integrations:
|
- Integrations:
|
||||||
- Integrations: integrations/index.md
|
- Integrations: integrations/index.md
|
||||||
- "🐝 Bee": integrations/bee.md
|
- 🤖 Agentic / AI dev frameworks:
|
||||||
- "Cloudera": integrations/cloudera.md
|
- "Bee Agent Framework": integrations/bee.md
|
||||||
- "Data Prep Kit": integrations/data_prep_kit.md
|
- "Crew AI": integrations/crewai.md
|
||||||
- "DocETL": integrations/docetl.md
|
- "Haystack": integrations/haystack.md
|
||||||
- "🐶 InstructLab": integrations/instructlab.md
|
- "LangChain": integrations/langchain.md
|
||||||
- "Kotaemon": integrations/kotaemon.md
|
- "LlamaIndex": integrations/llamaindex.md
|
||||||
- "🦙 LlamaIndex": integrations/llamaindex.md
|
|
||||||
- "Prodigy": integrations/prodigy.md
|
|
||||||
- "Red Hat Enterprise Linux AI": integrations/rhel_ai.md
|
|
||||||
- "spaCy": integrations/spacy.md
|
|
||||||
- "txtai": integrations/txtai.md
|
- "txtai": integrations/txtai.md
|
||||||
# - "LangChain 🦜🔗": integrations/langchain.md
|
- ⭐️ Featured:
|
||||||
|
- "Data Prep Kit": integrations/data_prep_kit.md
|
||||||
|
- "InstructLab": integrations/instructlab.md
|
||||||
|
- "NVIDIA": integrations/nvidia.md
|
||||||
|
- "Prodigy": integrations/prodigy.md
|
||||||
|
- "RHEL AI": integrations/rhel_ai.md
|
||||||
|
- "spaCy": integrations/spacy.md
|
||||||
|
- 🗂️ More integrations:
|
||||||
|
- "Cloudera": integrations/cloudera.md
|
||||||
|
- "DocETL": integrations/docetl.md
|
||||||
|
- "Kotaemon": integrations/kotaemon.md
|
||||||
|
- "OpenContracts": integrations/opencontracts.md
|
||||||
|
- "Vectara": integrations/vectara.md
|
||||||
- Reference:
|
- Reference:
|
||||||
- Python API:
|
- Python API:
|
||||||
- Document Converter: reference/document_converter.md
|
- Document Converter: reference/document_converter.md
|
||||||
|
2314
poetry.lock
generated
File diff suppressed because it is too large
@ -1,6 +1,6 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "docling"
|
name = "docling"
|
||||||
version = "2.14.0" # DO NOT EDIT, updated automatically
|
version = "2.15.1" # DO NOT EDIT, updated automatically
|
||||||
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
||||||
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
||||||
license = "MIT"
|
license = "MIT"
|
||||||
@ -25,16 +25,16 @@ packages = [{include = "docling"}]
|
|||||||
# actual dependencies:
|
# actual dependencies:
|
||||||
######################
|
######################
|
||||||
python = "^3.9"
|
python = "^3.9"
|
||||||
docling-core = { version = "^2.12.1", extras = ["chunking"] }
|
docling-core = { version = "^2.13.1", extras = ["chunking"] }
|
||||||
pydantic = "^2.0.0"
|
pydantic = "^2.0.0"
|
||||||
docling-ibm-models = "^3.1.0"
|
docling-ibm-models = "^3.1.0"
|
||||||
deepsearch-glm = "^1.0.0"
|
deepsearch-glm = "^1.0.0"
|
||||||
docling-parse = "^3.0.0"
|
docling-parse = "^3.1.0"
|
||||||
filetype = "^1.2.0"
|
filetype = "^1.2.0"
|
||||||
pypdfium2 = "^4.30.0"
|
pypdfium2 = "^4.30.0"
|
||||||
pydantic-settings = "^2.3.0"
|
pydantic-settings = "^2.3.0"
|
||||||
huggingface_hub = ">=0.23,<1"
|
huggingface_hub = ">=0.23,<1"
|
||||||
requests = "^2.32.3"
|
requests = "^2.32.2"
|
||||||
easyocr = "^1.7"
|
easyocr = "^1.7"
|
||||||
tesserocr = { version = "^2.7.1", optional = true }
|
tesserocr = { version = "^2.7.1", optional = true }
|
||||||
certifi = ">=2024.7.4"
|
certifi = ">=2024.7.4"
|
||||||