Merge remote-tracking branch 'origin/main' into dev/add-split-page-html-export

Michele Dolfi 2025-04-11 13:06:00 +02:00
commit e803967831
18 changed files with 858 additions and 215 deletions


@ -37,7 +37,7 @@ jobs:
run: |
for file in docs/examples/*.py; do
# Skip examples that are excluded from this CI run
if [[ "$(basename "$file")" =~ ^(batch_convert|minimal_vlm_pipeline|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api).py ]]; then
if [[ "$(basename "$file")" =~ ^(batch_convert|minimal_vlm_pipeline|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api|vlm_pipeline_api_model).py ]]; then
echo "Skipping $file"
continue
fi


@ -1,3 +1,22 @@
## [v2.29.0](https://github.com/docling-project/docling/releases/tag/v2.29.0) - 2025-04-10
### Feature
* Handle `<code>` tags as code blocks ([#1320](https://github.com/docling-project/docling/issues/1320)) ([`0499cd1`](https://github.com/docling-project/docling/commit/0499cd1c1e93f74260754476a8423059915f59c2))
* **docx:** Add text formatting and hyperlink support ([#630](https://github.com/docling-project/docling/issues/630)) ([`bfcab3d`](https://github.com/docling-project/docling/commit/bfcab3d6778e6f622bb4a6b241bdb4bab22ba378))
### Fix
* **docx:** Adding new latex symbols, simplifying how equations are added to text ([#1295](https://github.com/docling-project/docling/issues/1295)) ([`14e9c0c`](https://github.com/docling-project/docling/commit/14e9c0ce9a7559fac96ba5ed82befa12a7f53bfa))
* **pptx:** Check if picture shape has an image attached ([#1316](https://github.com/docling-project/docling/issues/1316)) ([`dc3bf9c`](https://github.com/docling-project/docling/commit/dc3bf9ceacb7048a97ceb8b7aa80bfccc8a05ca5))
* **docx:** Improve text parsing ([#1268](https://github.com/docling-project/docling/issues/1268)) ([`d2d6874`](https://github.com/docling-project/docling/commit/d2d68747f9c31be897f3e63c160c835086d37014))
* Tesseract OCR CLI can't process images composed with numbers only ([#1201](https://github.com/docling-project/docling/issues/1201)) ([`b3d111a`](https://github.com/docling-project/docling/commit/b3d111a3cdb90b653ddaaa356f9299e9cd39b340))
### Documentation
* Add plugins docs ([#1319](https://github.com/docling-project/docling/issues/1319)) ([`2e99e5a`](https://github.com/docling-project/docling/commit/2e99e5a54fafd901d8f26b56b25bb006c0e8e8b0))
* Add visual grounding example ([#1270](https://github.com/docling-project/docling/issues/1270)) ([`71148eb`](https://github.com/docling-project/docling/commit/71148eb381747a6b899c84b72946ba9bde665a40))
## [v2.28.4](https://github.com/docling-project/docling/releases/tag/v2.28.4) - 2025-03-29
### Fix


@ -1,36 +1,50 @@
import logging
from io import BytesIO
from pathlib import Path
from typing import Dict, Set, Tuple, Union
from typing import Any, Union, cast
from docling_core.types.doc import (
BoundingBox,
CoordOrigin,
DocItem,
DoclingDocument,
DocumentOrigin,
GroupLabel,
ImageRef,
ProvenanceItem,
Size,
TableCell,
TableData,
)
# from lxml import etree
from openpyxl import Workbook, load_workbook
from openpyxl.cell.cell import Cell
from openpyxl import load_workbook
from openpyxl.drawing.image import Image
from openpyxl.drawing.spreadsheet_drawing import TwoCellAnchor
from openpyxl.worksheet.worksheet import Worksheet
from PIL import Image as PILImage
from pydantic import BaseModel, NonNegativeInt, PositiveInt
from typing_extensions import override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.backend.abstract_backend import (
DeclarativeDocumentBackend,
PaginatedDocumentBackend,
)
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
from typing import Any, List
from PIL import Image as PILImage
from pydantic import BaseModel
class ExcelCell(BaseModel):
"""Represents an Excel cell.
Attributes:
row: The row number of the cell.
col: The column number of the cell.
text: The text content of the cell.
row_span: The number of rows the cell spans.
col_span: The number of columns the cell spans.
"""
row: int
col: int
text: str
@ -39,19 +53,57 @@ class ExcelCell(BaseModel):
class ExcelTable(BaseModel):
"""Represents an Excel table on a worksheet.
Attributes:
anchor: The column and row indices of the upper-left cell of the table
(0-based index).
num_rows: The number of rows in the table.
num_cols: The number of columns in the table.
data: The data in the table, represented as a list of ExcelCell objects.
"""
anchor: tuple[NonNegativeInt, NonNegativeInt]
num_rows: int
num_cols: int
data: List[ExcelCell]
data: list[ExcelCell]
class MsExcelDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
"""Backend for parsing Excel workbooks.
The backend converts an Excel workbook into a DoclingDocument object.
Each worksheet is converted into a separate page.
The following elements are parsed:
- Cell contents, parsed as tables. If two groups of cells are disconnected
from each other, they will be parsed as two different tables.
- Images, parsed as PictureItem objects.
The DoclingDocument tables and pictures have their provenance information, including
the position in their original Excel worksheet. The position is represented by a
bounding box object with the cell indices as units (0-based index). The size of this
bounding box is the number of columns and rows that the table or picture spans.
"""
@override
def __init__(
self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
) -> None:
"""Initialize the MsExcelDocumentBackend object.
Parameters:
in_doc: The input document object.
path_or_stream: The path or stream to the Excel file.
Raises:
RuntimeError: An error occurred parsing the file.
"""
super().__init__(in_doc, path_or_stream)
# Initialise the parents for the hierarchy
self.max_levels = 10
self.parents: Dict[int, Any] = {}
self.parents: dict[int, Any] = {}
for i in range(-1, self.max_levels):
self.parents[i] = None
@ -63,35 +115,47 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
elif isinstance(self.path_or_stream, Path):
self.workbook = load_workbook(filename=str(self.path_or_stream))
self.valid = True
self.valid = self.workbook is not None
except Exception as e:
self.valid = False
raise RuntimeError(
f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
f"MsExcelDocumentBackend could not load document with hash {self.document_hash}"
) from e
@override
def is_valid(self) -> bool:
_log.info(f"valid: {self.valid}")
_log.debug(f"valid: {self.valid}")
return self.valid
@classmethod
@override
def supports_pagination(cls) -> bool:
return True
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()
self.path_or_stream = None
@override
def page_count(self) -> int:
if self.is_valid() and self.workbook:
return len(self.workbook.sheetnames)
else:
return 0
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
@override
def supported_formats(cls) -> set[InputFormat]:
return {InputFormat.XLSX}
@override
def convert(self) -> DoclingDocument:
# Parses the XLSX into a structured document model.
"""Parse the Excel workbook into a DoclingDocument object.
Raises:
RuntimeError: Unable to run the conversion since the backend object failed to
initialize.
Returns:
The DoclingDocument object representing the Excel workbook.
"""
origin = DocumentOrigin(
filename=self.file.name or "file.xlsx",
mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
@ -110,6 +174,14 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
return doc
def _convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
"""Parse the Excel workbook and attach its structure to a DoclingDocument.
Args:
doc: A DoclingDocument object.
Returns:
A DoclingDocument object with the parsed items.
"""
if self.workbook is not None:
@ -117,22 +189,34 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
for sheet_name in self.workbook.sheetnames:
_log.info(f"Processing sheet: {sheet_name}")
# Access the sheet by name
sheet = self.workbook[sheet_name]
page_no = self.workbook.index(sheet) + 1
# do not rely on sheet.max_column, sheet.max_row if there are images
page = doc.add_page(page_no=page_no, size=Size(width=0, height=0))
self.parents[0] = doc.add_group(
parent=None,
label=GroupLabel.SECTION,
name=f"sheet: {sheet_name}",
)
doc = self._convert_sheet(doc, sheet)
width, height = self._find_page_size(doc, page_no)
page.size = Size(width=width, height=height)
else:
_log.error("Workbook is not initialized.")
return doc
def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet):
def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet) -> DoclingDocument:
"""Parse an Excel worksheet and attach its structure to a DoclingDocument
Args:
doc: The DoclingDocument to be updated.
sheet: The Excel worksheet to be parsed.
Returns:
The updated DoclingDocument.
"""
doc = self._find_tables_in_sheet(doc, sheet)
@ -140,11 +224,25 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
return doc
def _find_tables_in_sheet(self, doc: DoclingDocument, sheet: Worksheet):
def _find_tables_in_sheet(
self, doc: DoclingDocument, sheet: Worksheet
) -> DoclingDocument:
"""Find all tables in an Excel sheet and attach them to a DoclingDocument.
Args:
doc: The DoclingDocument to be updated.
sheet: The Excel worksheet to be parsed.
Returns:
The updated DoclingDocument.
"""
if self.workbook is not None:
tables = self._find_data_tables(sheet)
for excel_table in tables:
origin_col = excel_table.anchor[0]
origin_row = excel_table.anchor[1]
num_rows = excel_table.num_rows
num_cols = excel_table.num_cols
@ -169,18 +267,38 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
)
table_data.table_cells.append(cell)
doc.add_table(data=table_data, parent=self.parents[0])
page_no = self.workbook.index(sheet) + 1
doc.add_table(
data=table_data,
parent=self.parents[0],
prov=ProvenanceItem(
page_no=page_no,
charspan=(0, 0),
bbox=BoundingBox.from_tuple(
(
origin_col,
origin_row,
origin_col + num_cols,
origin_row + num_rows,
),
origin=CoordOrigin.TOPLEFT,
),
),
)
return doc
def _find_data_tables(self, sheet: Worksheet) -> List[ExcelTable]:
"""
Find all compact rectangular data tables in a sheet.
"""
# _log.info("find_data_tables")
def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
"""Find all compact rectangular data tables in an Excel worksheet.
tables = [] # List to store found tables
visited: set[Tuple[int, int]] = set() # Track already visited cells
Args:
sheet: The Excel worksheet to be parsed.
Returns:
A list of ExcelTable objects representing the data tables.
"""
tables: list[ExcelTable] = [] # List to store found tables
visited: set[tuple[int, int]] = set() # Track already visited cells
# Iterate over all cells in the sheet
for ri, row in enumerate(sheet.iter_rows(values_only=False)):
@ -191,9 +309,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
continue
# If the cell starts a new table, find its bounds
table_bounds, visited_cells = self._find_table_bounds(
sheet, ri, rj, visited
)
table_bounds, visited_cells = self._find_table_bounds(sheet, ri, rj)
visited.update(visited_cells) # Mark these cells as visited
tables.append(table_bounds)
@ -205,22 +321,25 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
sheet: Worksheet,
start_row: int,
start_col: int,
visited: set[Tuple[int, int]],
):
"""
Determine the bounds of a compact rectangular table.
) -> tuple[ExcelTable, set[tuple[int, int]]]:
"""Determine the bounds of a compact rectangular table.
Args:
sheet: The Excel worksheet to be parsed.
start_row: The row number of the starting cell.
start_col: The column number of the starting cell.
Returns:
- A dictionary with the bounds and data.
- A set of visited cell coordinates.
A tuple with an Excel table and a set of cell coordinates.
"""
_log.info("find_table_bounds")
_log.debug("find_table_bounds")
max_row = self._find_table_bottom(sheet, start_row, start_col)
max_col = self._find_table_right(sheet, start_row, start_col)
# Collect the data within the bounds
data = []
visited_cells = set()
visited_cells: set[tuple[int, int]] = set()
for ri in range(start_row, max_row + 1):
for rj in range(start_col, max_col + 1):
@ -230,7 +349,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
row_span = 1
col_span = 1
# _log.info(sheet.merged_cells.ranges)
for merged_range in sheet.merged_cells.ranges:
if (
@ -254,7 +372,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
col_span=col_span,
)
)
# _log.info(f"cell: {ri}, {rj} -> {ri - start_row}, {rj - start_col}, {row_span}, {col_span}: {str(cell.value)}")
# Mark all cells in the span as visited
for span_row in range(ri, ri + row_span):
@ -263,6 +380,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
return (
ExcelTable(
anchor=(start_col, start_row),
num_rows=max_row + 1 - start_row,
num_cols=max_col + 1 - start_col,
data=data,
@ -270,10 +388,20 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
visited_cells,
)
def _find_table_bottom(self, sheet: Worksheet, start_row: int, start_col: int):
"""Function to find the bottom boundary of the table"""
def _find_table_bottom(
self, sheet: Worksheet, start_row: int, start_col: int
) -> int:
"""Find the bottom boundary of a table.
max_row = start_row
Args:
sheet: The Excel worksheet to be parsed.
start_row: The starting row of the table.
start_col: The starting column of the table.
Returns:
The row index representing the bottom boundary of the table.
"""
max_row: int = start_row
while max_row < sheet.max_row - 1:
# Get the cell value or check if it is part of a merged cell
@ -296,10 +424,20 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
return max_row
def _find_table_right(self, sheet: Worksheet, start_row: int, start_col: int):
"""Function to find the right boundary of the table"""
def _find_table_right(
self, sheet: Worksheet, start_row: int, start_col: int
) -> int:
"""Find the right boundary of a table.
max_col = start_col
Args:
sheet: The Excel worksheet to be parsed.
start_row: The starting row of the table.
start_col: The starting column of the table.
Returns:
The column index representing the right boundary of the table.
"""
max_col: int = start_col
while max_col < sheet.max_column - 1:
# Get the cell value or check if it is part of a merged cell
@ -325,19 +463,63 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
def _find_images_in_sheet(
self, doc: DoclingDocument, sheet: Worksheet
) -> DoclingDocument:
"""Find images in the Excel sheet and attach them to the DoclingDocument.
Args:
doc: The DoclingDocument to be updated.
sheet: The Excel worksheet to be parsed.
Returns:
The updated DoclingDocument.
"""
if self.workbook is not None:
# Iterate over byte images in the sheet
for idx, image in enumerate(sheet._images): # type: ignore
for item in sheet._images: # type: ignore[attr-defined]
try:
pil_image = PILImage.open(image.ref)
image: Image = cast(Image, item)
pil_image = PILImage.open(image.ref) # type: ignore[arg-type]
page_no = self.workbook.index(sheet) + 1
anchor = (0, 0, 0, 0)
if isinstance(image.anchor, TwoCellAnchor):
anchor = (
image.anchor._from.col,
image.anchor._from.row,
image.anchor.to.col + 1,
image.anchor.to.row + 1,
)
doc.add_picture(
parent=self.parents[0],
image=ImageRef.from_pil(image=pil_image, dpi=72),
caption=None,
prov=ProvenanceItem(
page_no=page_no,
charspan=(0, 0),
bbox=BoundingBox.from_tuple(
anchor, origin=CoordOrigin.TOPLEFT
),
),
)
except Exception:
_log.error("Could not extract the image from the Excel sheet")
return doc
@staticmethod
def _find_page_size(
doc: DoclingDocument, page_no: PositiveInt
) -> tuple[float, float]:
left: float = -1.0
top: float = -1.0
right: float = -1.0
bottom: float = -1.0
for item, _ in doc.iterate_items(traverse_pictures=True, page_no=page_no):
if not isinstance(item, DocItem):
continue
for provenance in item.prov:
bbox = provenance.bbox
left = min(left, bbox.l) if left != -1 else bbox.l
right = max(right, bbox.r) if right != -1 else bbox.r
top = min(top, bbox.t) if top != -1 else bbox.t
bottom = max(bottom, bbox.b) if bottom != -1 else bbox.b
return (right - left, bottom - top)
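The class docstring above describes the new paging and provenance model for XLSX input: one page per worksheet, with tables and pictures carrying cell-index bounding boxes. A minimal sketch of how this surfaces through the high-level converter (the workbook path is hypothetical; the DocumentConverter API itself is untouched by this commit):

```python
from docling.document_converter import DocumentConverter

# Hypothetical workbook path; any local .xlsx file works.
result = DocumentConverter().convert("tests/data/xlsx/test-01.xlsx")
doc = result.document

# One page per worksheet, sized in cell units (columns x rows).
for page_no, page in doc.pages.items():
    print(page_no, page.size.width, page.size.height)

# Tables and pictures now carry provenance with a cell-index bounding box.
for table in doc.tables:
    prov = table.prov[0]
    print(prov.page_no, prov.bbox.as_tuple())
```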


@ -850,7 +850,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
def _handle_pictures(
self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
) -> None:
def get_docx_image(drawing_blip):
def get_docx_image(drawing_blip: Any) -> Optional[bytes]:
image_data: Optional[bytes] = None
rId = drawing_blip[0].get(
"{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
)
@ -862,8 +863,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
level = self._get_level()
# Open the BytesIO object with PIL to create an Image
image_data: Optional[bytes] = get_docx_image(drawing_blip)
if image_data is None:
_log.warning("Warning: image cannot be found")
doc.add_picture(
parent=self.parents[level - 1],
caption=None,
)
else:
try:
image_data = get_docx_image(drawing_blip)
image_bytes = BytesIO(image_data)
pil_image = Image.open(image_bytes)
doc.add_picture(


@ -40,6 +40,7 @@ from docling.datamodel.pipeline_options import (
VlmModelType,
VlmPipelineOptions,
granite_vision_vlm_conversion_options,
granite_vision_vlm_ollama_conversion_options,
smoldocling_vlm_conversion_options,
smoldocling_vlm_mlx_conversion_options,
)
@ -541,10 +542,16 @@ def convert(
backend=backend, # pdf_backend
)
elif pipeline == PdfPipeline.VLM:
pipeline_options = VlmPipelineOptions()
pipeline_options = VlmPipelineOptions(
enable_remote_services=enable_remote_services,
)
if vlm_model == VlmModelType.GRANITE_VISION:
pipeline_options.vlm_options = granite_vision_vlm_conversion_options
elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
pipeline_options.vlm_options = (
granite_vision_vlm_ollama_conversion_options
)
elif vlm_model == VlmModelType.SMOLDOCLING:
pipeline_options.vlm_options = smoldocling_vlm_conversion_options
if sys.platform == "darwin":
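The new CLI branch above maps the granite_vision_ollama choice onto the API-based VLM options and forwards the remote-services flag. Roughly the same configuration expressed programmatically, as a sketch using only names introduced in this commit (the full pipeline wiring appears in the new example further down):

```python
from docling.datamodel.pipeline_options import (
    VlmPipelineOptions,
    granite_vision_vlm_ollama_conversion_options,
)

# Remote endpoints must be opted into explicitly, mirroring --enable-remote-services.
pipeline_options = VlmPipelineOptions(enable_remote_services=True)
pipeline_options.vlm_options = granite_vision_vlm_ollama_conversion_options
```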


@ -263,3 +263,35 @@ class Page(BaseModel):
@property
def image(self) -> Optional[Image]:
return self.get_image(scale=self._default_image_scale)
## OpenAI API Request / Response Models ##
class OpenAiChatMessage(BaseModel):
role: str
content: str
class OpenAiResponseChoice(BaseModel):
index: int
message: OpenAiChatMessage
finish_reason: str
class OpenAiResponseUsage(BaseModel):
prompt_tokens: int
completion_tokens: int
total_tokens: int
class OpenAiApiResponse(BaseModel):
model_config = ConfigDict(
protected_namespaces=(),
)
id: str
model: Optional[str] = None # returned by openai
choices: List[OpenAiResponseChoice]
created: int
usage: OpenAiResponseUsage
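These models mirror the shape of an OpenAI-compatible chat completion response and are consumed by the new api_image_request helper. A quick validation sketch against a made-up payload:

```python
from docling.datamodel.base_models import OpenAiApiResponse

# Hypothetical OpenAI-compatible completion payload.
sample = """{
  "id": "chatcmpl-123",
  "model": "granite3.2-vision:2b",
  "created": 1712822400,
  "choices": [
    {"index": 0, "finish_reason": "stop",
     "message": {"role": "assistant", "content": "Hello from the model."}}
  ],
  "usage": {"prompt_tokens": 512, "completion_tokens": 12, "total_tokens": 524}
}"""

resp = OpenAiApiResponse.model_validate_json(sample)
print(resp.choices[0].message.content)  # -> Hello from the model.
```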


@ -213,8 +213,8 @@ class PictureDescriptionBaseOptions(BaseOptions):
batch_size: int = 8
scale: float = 2
bitmap_area_threshold: float = (
0.2 # percentage of the area for a bitmap to processed with the models
picture_area_threshold: float = (
0.05 # percentage of the area for a picture to be processed with the models
)
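For context, the renamed threshold is normally tuned through the picture-description options of the PDF pipeline. A configuration sketch, assuming the do_picture_description flag, the picture_description_options field, and the smolvlm_picture_description preset available in current docling releases (none of those names are part of this diff):

```python
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    smolvlm_picture_description,  # assumed preset, not part of this diff
)

pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True  # assumed flag, not part of this diff
pipeline_options.picture_description_options = smolvlm_picture_description
# Only send pictures covering at least 10% of the page to the model (new default: 0.05).
pipeline_options.picture_description_options.picture_area_threshold = 0.10
```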
@ -266,6 +266,7 @@ class ResponseFormat(str, Enum):
class InferenceFramework(str, Enum):
MLX = "mlx"
TRANSFORMERS = "transformers"
OPENAI = "openai"
class HuggingFaceVlmOptions(BaseVlmOptions):
@ -284,6 +285,19 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
return self.repo_id.replace("/", "--")
class ApiVlmOptions(BaseVlmOptions):
kind: Literal["api_model_options"] = "api_model_options"
url: AnyUrl = AnyUrl(
"http://localhost:11434/v1/chat/completions"
) # Default to ollama
headers: Dict[str, str] = {}
params: Dict[str, Any] = {}
scale: float = 2.0
timeout: float = 60
response_format: ResponseFormat
smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
prompt="Convert this page to docling.",
@ -307,10 +321,20 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
inference_framework=InferenceFramework.TRANSFORMERS,
)
granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
url=AnyUrl("http://localhost:11434/v1/chat/completions"),
params={"model": "granite3.2-vision:2b"},
prompt="OCR the full page to markdown.",
scale=1.0,
timeout=120,
response_format=ResponseFormat.MARKDOWN,
)
class VlmModelType(str, Enum):
SMOLDOCLING = "smoldocling"
GRANITE_VISION = "granite_vision"
GRANITE_VISION_OLLAMA = "granite_vision_ollama"
# Define an enum for the backend options
@ -362,7 +386,9 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
False # (To be used with vlms, or other generative models)
)
# If True, text from backend will be used instead of generated text
vlm_options: Union[HuggingFaceVlmOptions] = smoldocling_vlm_conversion_options
vlm_options: Union[HuggingFaceVlmOptions, ApiVlmOptions] = (
smoldocling_vlm_conversion_options
)
class PdfPipelineOptions(PaginatedPipelineOptions):


@ -0,0 +1,67 @@
from typing import Iterable
from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import ApiVlmOptions
from docling.exceptions import OperationNotAllowed
from docling.models.base_model import BasePageModel
from docling.utils.api_image_request import api_image_request
from docling.utils.profiling import TimeRecorder
class ApiVlmModel(BasePageModel):
def __init__(
self,
enabled: bool,
enable_remote_services: bool,
vlm_options: ApiVlmOptions,
):
self.enabled = enabled
self.vlm_options = vlm_options
if self.enabled:
if not enable_remote_services:
raise OperationNotAllowed(
"Connections to remote services is only allowed when set explicitly. "
"pipeline_options.enable_remote_services=True, or using the CLI "
"--enable-remote-services."
)
self.timeout = self.vlm_options.timeout
self.prompt_content = (
f"This is a page from a document.\n{self.vlm_options.prompt}"
)
self.params = {
**self.vlm_options.params,
"temperature": 0,
}
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
for page in page_batch:
assert page._backend is not None
if not page._backend.is_valid():
yield page
else:
with TimeRecorder(conv_res, "vlm"):
assert page.size is not None
hi_res_image = page.get_image(scale=self.vlm_options.scale)
assert hi_res_image is not None
if hi_res_image:
if hi_res_image.mode != "RGB":
hi_res_image = hi_res_image.convert("RGB")
page_tags = api_image_request(
image=hi_res_image,
prompt=self.prompt_content,
url=self.vlm_options.url,
timeout=self.timeout,
headers=self.vlm_options.headers,
**self.params,
)
page.predictions.vlm_response = VlmPrediction(text=page_tags)
yield page


@ -1,12 +1,7 @@
import base64
import io
import logging
from pathlib import Path
from typing import Iterable, List, Optional, Type, Union
from typing import Iterable, Optional, Type, Union
import requests
from PIL import Image
from pydantic import BaseModel, ConfigDict
from docling.datamodel.pipeline_options import (
AcceleratorOptions,
@ -15,37 +10,7 @@ from docling.datamodel.pipeline_options import (
)
from docling.exceptions import OperationNotAllowed
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
_log = logging.getLogger(__name__)
class ChatMessage(BaseModel):
role: str
content: str
class ResponseChoice(BaseModel):
index: int
message: ChatMessage
finish_reason: str
class ResponseUsage(BaseModel):
prompt_tokens: int
completion_tokens: int
total_tokens: int
class ApiResponse(BaseModel):
model_config = ConfigDict(
protected_namespaces=(),
)
id: str
model: Optional[str] = None # returned by openai
choices: List[ResponseChoice]
created: int
usage: ResponseUsage
from docling.utils.api_image_request import api_image_request
class PictureDescriptionApiModel(PictureDescriptionBaseModel):
@ -83,43 +48,11 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
# Note: technically we could make a batch request here,
# but not all APIs will allow for it. For example, vllm won't allow more than 1.
for image in images:
img_io = io.BytesIO()
image.save(img_io, "PNG")
image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
messages = [
{
"role": "user",
"content": [
{
"type": "text",
"text": self.options.prompt,
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/png;base64,{image_base64}"
},
},
],
}
]
payload = {
"messages": messages,
**self.options.params,
}
r = requests.post(
str(self.options.url),
headers=self.options.headers,
json=payload,
yield api_image_request(
image=image,
prompt=self.options.prompt,
url=self.options.url,
timeout=self.options.timeout,
headers=self.options.headers,
**self.options.params,
)
if not r.ok:
_log.error(f"Error calling the API. Reponse was {r.text}")
r.raise_for_status()
api_resp = ApiResponse.model_validate_json(r.text)
generated_text = api_resp.choices[0].message.content.strip()
yield generated_text


@ -63,6 +63,18 @@ class PictureDescriptionBaseModel(
elements: List[PictureItem] = []
for el in element_batch:
assert isinstance(el.item, PictureItem)
describe_image = True
# Don't describe the image if it's smaller than the threshold
if len(el.item.prov) > 0:
prov = el.item.prov[0] # PictureItems have at most a single provenance
page = doc.pages.get(prov.page_no)
if page is not None:
page_area = page.size.width * page.size.height
if page_area > 0:
area_fraction = prov.bbox.area() / page_area
if area_fraction < self.options.picture_area_threshold:
describe_image = False
if describe_image:
elements.append(el.item)
images.append(el.image)
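To make the new size filter concrete, a worked example with made-up dimensions (a logo-sized picture on a US-letter page rendered at 72 dpi):

```python
# Hypothetical numbers: page and picture bounding box in points.
page_width, page_height = 612.0, 792.0
bbox_width, bbox_height = 100.0, 80.0

page_area = page_width * page_height      # 484704.0
picture_area = bbox_width * bbox_height   # 8000.0
area_fraction = picture_area / page_area  # ~0.0165

picture_area_threshold = 0.05             # new default in PictureDescriptionBaseOptions
describe_image = area_fraction >= picture_area_threshold
print(describe_image)                     # False -> the picture is not described
```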


@ -2,7 +2,7 @@ import logging
import sys
import warnings
from pathlib import Path
from typing import Optional
from typing import Optional, cast
from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
@ -226,7 +226,11 @@ class StandardPdfPipeline(PaginatedPipeline):
and self.pipeline_options.generate_table_images
):
page_ix = element.prov[0].page_no - 1
page = conv_res.pages[page_ix]
page = next(
(p for p in conv_res.pages if p.page_no == page_ix),
cast("Page", None),
)
assert page is not None
assert page.size is not None
assert page.image is not None


@ -15,11 +15,14 @@ from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import InputFormat, Page
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import (
ApiVlmOptions,
HuggingFaceVlmOptions,
InferenceFramework,
ResponseFormat,
VlmPipelineOptions,
)
from docling.datamodel.settings import settings
from docling.models.api_vlm_model import ApiVlmModel
from docling.models.hf_mlx_model import HuggingFaceMlxModel
from docling.models.hf_vlm_model import HuggingFaceVlmModel
from docling.pipeline.base_pipeline import PaginatedPipeline
@ -57,16 +60,23 @@ class VlmPipeline(PaginatedPipeline):
self.keep_images = self.pipeline_options.generate_page_images
if (
self.pipeline_options.vlm_options.inference_framework
== InferenceFramework.MLX
):
if isinstance(pipeline_options.vlm_options, ApiVlmOptions):
self.build_pipe = [
ApiVlmModel(
enabled=True, # must be always enabled for this pipeline to make sense.
enable_remote_services=self.pipeline_options.enable_remote_services,
vlm_options=cast(ApiVlmOptions, self.pipeline_options.vlm_options),
),
]
elif isinstance(self.pipeline_options.vlm_options, HuggingFaceVlmOptions):
vlm_options = cast(HuggingFaceVlmOptions, self.pipeline_options.vlm_options)
if vlm_options.inference_framework == InferenceFramework.MLX:
self.build_pipe = [
HuggingFaceMlxModel(
enabled=True, # must be always enabled for this pipeline to make sense.
artifacts_path=artifacts_path,
accelerator_options=pipeline_options.accelerator_options,
vlm_options=self.pipeline_options.vlm_options,
vlm_options=vlm_options,
),
]
else:
@ -75,7 +85,7 @@ class VlmPipeline(PaginatedPipeline):
enabled=True, # must be always enabled for this pipeline to make sense.
artifacts_path=artifacts_path,
accelerator_options=pipeline_options.accelerator_options,
vlm_options=self.pipeline_options.vlm_options,
vlm_options=vlm_options,
),
]


@ -0,0 +1,61 @@
import base64
import logging
from io import BytesIO
from typing import Dict, Optional
import requests
from PIL import Image
from pydantic import AnyUrl
from docling.datamodel.base_models import OpenAiApiResponse
_log = logging.getLogger(__name__)
def api_image_request(
image: Image.Image,
prompt: str,
url: AnyUrl,
timeout: float = 20,
headers: Optional[Dict[str, str]] = None,
**params,
) -> str:
img_io = BytesIO()
image.save(img_io, "PNG")
image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
messages = [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
},
{
"type": "text",
"text": prompt,
},
],
}
]
payload = {
"messages": messages,
**params,
}
headers = headers or {}
r = requests.post(
str(url),
headers=headers,
json=payload,
timeout=timeout,
)
if not r.ok:
_log.error(f"Error calling the API. Response was {r.text}")
r.raise_for_status()
api_resp = OpenAiApiResponse.model_validate_json(r.text)
generated_text = api_resp.choices[0].message.content.strip()
return generated_text
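A usage sketch of the helper against a local Ollama endpoint (the URL and model name mirror the defaults used elsewhere in this commit; the image path is hypothetical):

```python
from PIL import Image
from pydantic import AnyUrl

from docling.utils.api_image_request import api_image_request

page_image = Image.open("page.png")  # hypothetical local image
markdown = api_image_request(
    image=page_image,
    prompt="OCR the full page to markdown.",
    url=AnyUrl("http://localhost:11434/v1/chat/completions"),
    timeout=90,
    model="granite3.2-vision:2b",  # forwarded into the JSON payload via **params
)
print(markdown)
```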


@ -0,0 +1,111 @@
import logging
import os
from pathlib import Path
import requests
from dotenv import load_dotenv
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
ApiVlmOptions,
ResponseFormat,
VlmPipelineOptions,
granite_vision_vlm_ollama_conversion_options,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline
def ollama_vlm_options(model: str, prompt: str):
options = ApiVlmOptions(
url="http://localhost:11434/v1/chat/completions", # the default Ollama endpoint
params=dict(
model=model,
),
prompt=prompt,
timeout=90,
scale=1.0,
response_format=ResponseFormat.MARKDOWN,
)
return options
def watsonx_vlm_options(model: str, prompt: str):
load_dotenv()
api_key = os.environ.get("WX_API_KEY")
project_id = os.environ.get("WX_PROJECT_ID")
def _get_iam_access_token(api_key: str) -> str:
res = requests.post(
url="https://iam.cloud.ibm.com/identity/token",
headers={
"Content-Type": "application/x-www-form-urlencoded",
},
data=f"grant_type=urn:ibm:params:oauth:grant-type:apikey&apikey={api_key}",
)
res.raise_for_status()
api_out = res.json()
print(f"{api_out=}")
return api_out["access_token"]
options = ApiVlmOptions(
url="https://us-south.ml.cloud.ibm.com/ml/v1/text/chat?version=2023-05-29",
params=dict(
model_id=model,
project_id=project_id,
parameters=dict(
max_new_tokens=400,
),
),
headers={
"Authorization": "Bearer " + _get_iam_access_token(api_key=api_key),
},
prompt=prompt,
timeout=60,
response_format=ResponseFormat.MARKDOWN,
)
return options
def main():
logging.basicConfig(level=logging.INFO)
# input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
input_doc_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf")
pipeline_options = VlmPipelineOptions(
enable_remote_services=True # <-- this is required!
)
# The ApiVlmOptions() allows interfacing with APIs that support
# the multi-modal chat interface. A few examples of how to configure them follow.
# One possibility is self-hosting a model, e.g. via Ollama.
# Example using the Granite Vision model with Ollama:
pipeline_options.vlm_options = ollama_vlm_options(
model="granite3.2-vision:2b",
prompt="OCR the full page to markdown.",
)
# Another possibility is using online services, e.g. watsonx.ai.
# Using it requires setting the env variables WX_API_KEY and WX_PROJECT_ID.
# Uncomment the following lines for this option:
# pipeline_options.vlm_options = watsonx_vlm_options(
# model="ibm/granite-vision-3-2-2b", prompt="OCR the full page to markdown."
# )
# Create the DocumentConverter and launch the conversion.
doc_converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_options=pipeline_options,
pipeline_cls=VlmPipeline,
)
}
)
result = doc_converter.convert(input_doc_path)
print(result.document.export_to_markdown())
if __name__ == "__main__":
main()


@ -75,6 +75,8 @@ nav:
- "Custom conversion": examples/custom_convert.py
- "Batch conversion": examples/batch_convert.py
- "Multi-format conversion": examples/run_with_formats.py
- "VLM pipeline with SmolDocling": examples/minimal_vlm_pipeline.py
- "VLM pipeline with remote model": examples/vlm_pipeline_api_model.py
- "Figure export": examples/export_figures.py
- "Table export": examples/export_tables.py
- "Multimodal export": examples/export_multimodal.py


@ -1,6 +1,6 @@
[tool.poetry]
name = "docling"
version = "2.28.4" # DO NOT EDIT, updated automatically
version = "2.29.0" # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
authors = [
"Christoph Auer <cau@zurich.ibm.com>",


@ -97,7 +97,22 @@
"children": [],
"content_layer": "body",
"label": "picture",
"prov": [],
"prov": [
{
"page_no": 3,
"bbox": {
"l": 8.0,
"t": 18.0,
"r": 13.0,
"b": 36.0,
"coord_origin": "TOPLEFT"
},
"charspan": [
0,
0
]
}
],
"captions": [],
"references": [],
"footnotes": [],
@ -122,7 +137,22 @@
"children": [],
"content_layer": "body",
"label": "table",
"prov": [],
"prov": [
{
"page_no": 1,
"bbox": {
"l": 0.0,
"t": 0.0,
"r": 3.0,
"b": 7.0,
"coord_origin": "TOPLEFT"
},
"charspan": [
0,
0
]
}
],
"captions": [],
"references": [],
"footnotes": [],
@ -661,7 +691,22 @@
"children": [],
"content_layer": "body",
"label": "table",
"prov": [],
"prov": [
{
"page_no": 2,
"bbox": {
"l": 0.0,
"t": 0.0,
"r": 4.0,
"b": 9.0,
"coord_origin": "TOPLEFT"
},
"charspan": [
0,
0
]
}
],
"captions": [],
"references": [],
"footnotes": [],
@ -1564,7 +1609,22 @@
"children": [],
"content_layer": "body",
"label": "table",
"prov": [],
"prov": [
{
"page_no": 2,
"bbox": {
"l": 6.0,
"t": 4.0,
"r": 9.0,
"b": 9.0,
"coord_origin": "TOPLEFT"
},
"charspan": [
0,
0
]
}
],
"captions": [],
"references": [],
"footnotes": [],
@ -1955,7 +2015,22 @@
"children": [],
"content_layer": "body",
"label": "table",
"prov": [],
"prov": [
{
"page_no": 2,
"bbox": {
"l": 2.0,
"t": 13.0,
"r": 5.0,
"b": 18.0,
"coord_origin": "TOPLEFT"
},
"charspan": [
0,
0
]
}
],
"captions": [],
"references": [],
"footnotes": [],
@ -2346,7 +2421,22 @@
"children": [],
"content_layer": "body",
"label": "table",
"prov": [],
"prov": [
{
"page_no": 3,
"bbox": {
"l": 0.0,
"t": 0.0,
"r": 3.0,
"b": 7.0,
"coord_origin": "TOPLEFT"
},
"charspan": [
0,
0
]
}
],
"captions": [],
"references": [],
"footnotes": [],
@ -2813,7 +2903,22 @@
"children": [],
"content_layer": "body",
"label": "table",
"prov": [],
"prov": [
{
"page_no": 3,
"bbox": {
"l": 4.0,
"t": 6.0,
"r": 7.0,
"b": 13.0,
"coord_origin": "TOPLEFT"
},
"charspan": [
0,
0
]
}
],
"captions": [],
"references": [],
"footnotes": [],
@ -3275,5 +3380,27 @@
],
"key_value_items": [],
"form_items": [],
"pages": {}
"pages": {
"1": {
"size": {
"width": 3.0,
"height": 7.0
},
"page_no": 1
},
"2": {
"size": {
"width": 9.0,
"height": 18.0
},
"page_no": 2
},
"3": {
"size": {
"width": 13.0,
"height": 36.0
},
"page_no": 3
}
}
}


@ -1,13 +1,18 @@
import os
import logging
from pathlib import Path
import pytest
from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult, DoclingDocument
from docling.datamodel.document import ConversionResult, DoclingDocument, InputDocument
from docling.document_converter import DocumentConverter
from .test_data_gen_flag import GEN_TEST_DATA
from .verify_utils import verify_document, verify_export
_log = logging.getLogger(__name__)
GENERATE = GEN_TEST_DATA
@ -28,13 +33,15 @@ def get_converter():
return converter
def test_e2e_xlsx_conversions():
@pytest.fixture(scope="module")
def documents() -> list[tuple[Path, DoclingDocument]]:
documents: list[tuple[Path, DoclingDocument]] = []
xlsx_paths = get_xlsx_paths()
converter = get_converter()
for xlsx_path in xlsx_paths:
print(f"converting {xlsx_path}")
_log.debug(f"converting {xlsx_path}")
gt_path = (
xlsx_path.parent.parent / "groundtruth" / "docling_v2" / xlsx_path.name
@ -44,6 +51,14 @@ def test_e2e_xlsx_conversions():
doc: DoclingDocument = conv_result.document
assert doc, f"Failed to convert document from file {gt_path}"
documents.append((gt_path, doc))
return documents
def test_e2e_xlsx_conversions(documents) -> None:
for gt_path, doc in documents:
pred_md: str = doc.export_to_markdown()
assert verify_export(pred_md, str(gt_path) + ".md"), "export to md"
@ -57,3 +72,30 @@ def test_e2e_xlsx_conversions():
assert verify_document(
doc, str(gt_path) + ".json", GENERATE
), "document document"
def test_pages(documents) -> None:
"""Test the page count and page size of converted documents.
Args:
documents: The paths and converted documents.
"""
# number of pages from the backend method
path = [item for item in get_xlsx_paths() if item.stem == "test-01"][0]
in_doc = InputDocument(
path_or_stream=path,
format=InputFormat.XLSX,
filename=path.stem,
backend=MsExcelDocumentBackend,
)
backend = MsExcelDocumentBackend(in_doc=in_doc, path_or_stream=path)
assert backend.page_count() == 3
# number of pages from the converted document
doc = [item for path, item in documents if path.stem == "test-01"][0]
assert len(doc.pages) == 3
# page sizes as number of cells
assert doc.pages.get(1).size.as_tuple() == (3.0, 7.0)
assert doc.pages.get(2).size.as_tuple() == (9.0, 18.0)
assert doc.pages.get(3).size.as_tuple() == (13.0, 36.0)