Merge branch 'main' of github.com:DS4SD/docling into dev/add-asr-pipeline-v2

This commit is contained in:
Christoph Auer 2025-06-23 09:08:58 +02:00
commit caf18e634b
98 changed files with 340943 additions and 330462 deletions

.github/dco.yml (new file)

@@ -0,0 +1,2 @@
allowRemediationCommits:
  individual: true

.github/workflows/dco-advisor.yml (new file)

@@ -0,0 +1,192 @@
name: DCO Advisor Bot
on:
pull_request_target:
types: [opened, reopened, synchronize]
permissions:
pull-requests: write
issues: write
jobs:
dco_advisor:
runs-on: ubuntu-latest
steps:
- name: Handle DCO check result
uses: actions/github-script@v7
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
const pr = context.payload.pull_request || context.payload.check_run?.pull_requests?.[0];
if (!pr) return;
const prNumber = pr.number;
const baseRef = pr.base.ref;
const headSha =
context.payload.check_run?.head_sha ||
pr.head?.sha;
const username = pr.user.login;
console.log("HEAD SHA:", headSha);
const sleep = ms => new Promise(resolve => setTimeout(resolve, ms));
// Poll until DCO check has a conclusion (max 6 attempts, 30s)
let dcoCheck = null;
for (let attempt = 0; attempt < 6; attempt++) {
const { data: checks } = await github.rest.checks.listForRef({
owner: context.repo.owner,
repo: context.repo.repo,
ref: headSha
});
console.log("All check runs:");
checks.check_runs.forEach(run => {
console.log(`- ${run.name} (${run.status}/${run.conclusion}) @ ${run.head_sha}`);
});
dcoCheck = checks.check_runs.find(run =>
run.name.toLowerCase().includes("dco") &&
!run.name.toLowerCase().includes("dco_advisor") &&
run.head_sha === headSha
);
if (dcoCheck?.conclusion) break;
console.log(`Waiting for DCO check... (${attempt + 1})`);
await sleep(5000); // wait 5 seconds
}
if (!dcoCheck || !dcoCheck.conclusion) {
console.log("DCO check did not complete in time.");
return;
}
const isFailure = ["failure", "action_required"].includes(dcoCheck.conclusion);
console.log(`DCO check conclusion for ${headSha}: ${dcoCheck.conclusion} (treated as ${isFailure ? "failure" : "success"})`);
// Parse DCO output for commit SHAs and author
let badCommits = [];
let authorName = "";
let authorEmail = "";
let moreInfo = `More info: [DCO check report](${dcoCheck?.html_url})`;
if (isFailure) {
const { data: commits } = await github.rest.pulls.listCommits({
owner: context.repo.owner,
repo: context.repo.repo,
pull_number: prNumber,
});
for (const commit of commits) {
const commitMessage = commit.commit.message;
const signoffMatch = commitMessage.match(/^Signed-off-by:\s+.+<.+>$/m);
if (!signoffMatch) {
console.log(`Bad commit found ${commit.sha}`)
badCommits.push({
sha: commit.sha,
authorName: commit.commit.author.name,
authorEmail: commit.commit.author.email,
});
}
}
}
// If multiple authors are present, you could adapt the message accordingly
// For now, we'll just use the first one
if (badCommits.length > 0) {
authorName = badCommits[0].authorName;
authorEmail = badCommits[0].authorEmail;
}
// Generate remediation commit message if needed
let remediationSnippet = "";
if (badCommits.length && authorEmail) {
remediationSnippet = `git commit --allow-empty -s -m "DCO Remediation Commit for ${authorName} <${authorEmail}>\n\n` +
badCommits.map(c => `I, ${c.authorName} <${c.authorEmail}>, hereby add my Signed-off-by to this commit: ${c.sha}`).join('\n') +
`"`;
} else {
remediationSnippet = "# Unable to auto-generate remediation message. Please check the DCO check details.";
}
// Build comment
const commentHeader = '<!-- dco-advice-bot -->';
let body = "";
if (isFailure) {
body = [
commentHeader,
'❌ **DCO Check Failed**',
'',
`Hi @${username}, your pull request has failed the Developer Certificate of Origin (DCO) check.`,
'',
'This repository supports **remediation commits**, so you can fix this without rewriting history — but you must follow the required message format.',
'',
'---',
'',
'### 🛠 Quick Fix: Add a remediation commit',
'Run this command:',
'',
'```bash',
remediationSnippet,
'git push',
'```',
'',
'---',
'',
'<details>',
'<summary>🔧 Advanced: Sign off each commit directly</summary>',
'',
'**For the latest commit:**',
'```bash',
'git commit --amend --signoff',
'git push --force-with-lease',
'```',
'',
'**For multiple commits:**',
'```bash',
`git rebase --signoff origin/${baseRef}`,
'git push --force-with-lease',
'```',
'',
'</details>',
'',
moreInfo
].join('\n');
} else {
body = [
commentHeader,
'✅ **DCO Check Passed**',
'',
`Thanks @${username}, all your commits are properly signed off. 🎉`
].join('\n');
}
// Get existing comments on the PR
const { data: comments } = await github.rest.issues.listComments({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: prNumber
});
// Look for a previous bot comment
const existingComment = comments.find(c =>
c.body.includes("<!-- dco-advice-bot -->")
);
if (existingComment) {
await github.rest.issues.updateComment({
owner: context.repo.owner,
repo: context.repo.repo,
comment_id: existingComment.id,
body: body
});
} else {
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: prNumber,
body: body
});
}

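For reference, the remediation message the bot asks for follows a fixed format: one `Signed-off-by` attestation per unsigned commit. A minimal Python sketch of that format is below; the author name, email, and SHA are placeholders, not real PR data, and the real workflow builds the same string inside the github-script step above.

```python
# Sketch only: mirrors the remediation message assembled by the workflow above.
# Author name/email and SHAs below are placeholders, not real PR data.
def build_remediation_message(author_name: str, author_email: str, bad_commits: list) -> str:
    lines = [f"DCO Remediation Commit for {author_name} <{author_email}>", ""]
    lines += [
        f"I, {c['name']} <{c['email']}>, hereby add my Signed-off-by to this commit: {c['sha']}"
        for c in bad_commits
    ]
    return "\n".join(lines)


message = build_remediation_message(
    "Jane Doe",
    "jane@example.com",
    [{"name": "Jane Doe", "email": "jane@example.com", "sha": "abc1234"}],
)
print(message)  # then: git commit --allow-empty -s -m "<message>" && git push
```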

@@ -1,3 +1,26 @@
## [v2.37.0](https://github.com/docling-project/docling/releases/tag/v2.37.0) - 2025-06-16
### Feature
* Make Page.parsed_page the only source of truth for text cells, add OCR cells to it ([#1745](https://github.com/docling-project/docling/issues/1745)) ([`7d3302c`](https://github.com/docling-project/docling/commit/7d3302cb48dd91cd29673d7c4eaf7326736d0685))
* Support xlsm files ([#1520](https://github.com/docling-project/docling/issues/1520)) ([`df14022`](https://github.com/docling-project/docling/commit/df140227c3b8bcad0c68bf3d129930cccd96a07e))
### Fix
* Pptx line break and space handling ([#1664](https://github.com/docling-project/docling/issues/1664)) ([`f28d23c`](https://github.com/docling-project/docling/commit/f28d23cf03d059619d1d3482594596ab7c87d197))
* **asciidoc:** Set default size when missing in image directive ([#1769](https://github.com/docling-project/docling/issues/1769)) ([`b886e4d`](https://github.com/docling-project/docling/commit/b886e4df312447d39f58cf6e3c45b0f863940321))
* Handle NoneType error in MsPowerpointDocumentBackend ([#1747](https://github.com/docling-project/docling/issues/1747)) ([`7a275c7`](https://github.com/docling-project/docling/commit/7a275c763731d9c96b7cf32f2e27b8dc8bebacd7))
* Prov for merged-elems ([#1728](https://github.com/docling-project/docling/issues/1728)) ([`6613b9e`](https://github.com/docling-project/docling/commit/6613b9e98bc8b89791dc0334de8970ff243aba82))
* **tesseract:** Initialize df_osd to avoid uninitialized variable error ([#1718](https://github.com/docling-project/docling/issues/1718)) ([`e979750`](https://github.com/docling-project/docling/commit/e979750ce93b2fae89dbb60ff06333f80c1c2908))
* Allow custom torch_dtype in vlm models ([#1735](https://github.com/docling-project/docling/issues/1735)) ([`f7f3113`](https://github.com/docling-project/docling/commit/f7f31137f10999fefdb70da7e5ef56536f650400))
* Improve extraction from textboxes in Word docs ([#1701](https://github.com/docling-project/docling/issues/1701)) ([`9dbcb3d`](https://github.com/docling-project/docling/commit/9dbcb3d7d4f27d1c935c8681c57ed59524452d53))
* Add WEBP to the list of image file extensions ([#1711](https://github.com/docling-project/docling/issues/1711)) ([`a2b83fe`](https://github.com/docling-project/docling/commit/a2b83fe4aea66c273a83bf17177e87d45d3f18d1))
### Documentation
* Update vlm models api examples with LM Studio ([#1759](https://github.com/docling-project/docling/issues/1759)) ([`0432a31`](https://github.com/docling-project/docling/commit/0432a31b2f7c9fe944c3a1d4b608ef938b4f2299))
* Add open webui ([#1734](https://github.com/docling-project/docling/issues/1734)) ([`49b10e7`](https://github.com/docling-project/docling/commit/49b10e74191d4d580c9305ac08d9898a79346d7d))
## [v2.36.1](https://github.com/docling-project/docling/releases/tag/v2.36.1) - 2025-06-04
### Fix


@@ -2,7 +2,7 @@ import logging
 import re
 from io import BytesIO
 from pathlib import Path
-from typing import Set, Union
+from typing import Final, Set, Union

 from docling_core.types.doc import (
     DocItemLabel,
@@ -22,6 +22,9 @@ from docling.datamodel.document import InputDocument
 _log = logging.getLogger(__name__)

+DEFAULT_IMAGE_WIDTH: Final = 128
+DEFAULT_IMAGE_HEIGHT: Final = 128
+

 class AsciiDocBackend(DeclarativeDocumentBackend):
     def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
@@ -200,9 +203,11 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                 item = self._parse_picture(line)

-                size = None
+                size: Size
                 if "width" in item and "height" in item:
                     size = Size(width=int(item["width"]), height=int(item["height"]))
+                else:
+                    size = Size(width=DEFAULT_IMAGE_WIDTH, height=DEFAULT_IMAGE_HEIGHT)

                 uri = None
                 if (
@@ -264,14 +269,16 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         return doc

-    def _get_current_level(self, parents):
+    @staticmethod
+    def _get_current_level(parents):
         for k, v in parents.items():
             if v is None and k > 0:
                 return k - 1

         return 0

-    def _get_current_parent(self, parents):
+    @staticmethod
+    def _get_current_parent(parents):
         for k, v in parents.items():
             if v is None and k > 0:
                 return parents[k - 1]
@@ -279,17 +286,21 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         return None

     # ========= Title
-    def _is_title(self, line):
+    @staticmethod
+    def _is_title(line):
         return re.match(r"^= ", line)

-    def _parse_title(self, line):
+    @staticmethod
+    def _parse_title(line):
         return {"type": "title", "text": line[2:].strip(), "level": 0}

     # ========= Section headers
-    def _is_section_header(self, line):
+    @staticmethod
+    def _is_section_header(line):
         return re.match(r"^==+\s+", line)

-    def _parse_section_header(self, line):
+    @staticmethod
+    def _parse_section_header(line):
         match = re.match(r"^(=+)\s+(.*)", line)

         marker = match.group(1)  # The list marker (e.g., "*", "-", "1.")
@@ -303,10 +314,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         }

     # ========= Lists
-    def _is_list_item(self, line):
+    @staticmethod
+    def _is_list_item(line):
         return re.match(r"^(\s)*(\*|-|\d+\.|\w+\.) ", line)

-    def _parse_list_item(self, line):
+    @staticmethod
+    def _parse_list_item(line):
         """Extract the item marker (number or bullet symbol) and the text of the item."""
         match = re.match(r"^(\s*)(\*|-|\d+\.)\s+(.*)", line)
@@ -342,14 +355,17 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         }

     # ========= Tables
-    def _is_table_line(self, line):
+    @staticmethod
+    def _is_table_line(line):
         return re.match(r"^\|.*\|", line)

-    def _parse_table_line(self, line):
+    @staticmethod
+    def _parse_table_line(line):
         # Split table cells and trim extra spaces
         return [cell.strip() for cell in line.split("|") if cell.strip()]

-    def _populate_table_as_grid(self, table_data):
+    @staticmethod
+    def _populate_table_as_grid(table_data):
         num_rows = len(table_data)

         # Adjust the table data into a grid format
@@ -380,10 +396,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         return data

     # ========= Pictures
-    def _is_picture(self, line):
+    @staticmethod
+    def _is_picture(line):
         return re.match(r"^image::", line)

-    def _parse_picture(self, line):
+    @staticmethod
+    def _parse_picture(line):
         """
         Parse an image macro, extracting its path and attributes.
         Syntax: image::path/to/image.png[Alt Text, width=200, height=150, align=center]
@@ -406,10 +424,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         return {"type": "picture", "uri": line}

     # ========= Captions
-    def _is_caption(self, line):
+    @staticmethod
+    def _is_caption(line):
         return re.match(r"^\.(.+)", line)

-    def _parse_caption(self, line):
+    @staticmethod
+    def _parse_caption(line):
         mtch = re.match(r"^\.(.+)", line)
         if mtch:
             text = mtch.group(1)
@@ -418,5 +438,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         return {"type": "caption", "text": ""}

     # ========= Plain text
-    def _parse_text(self, line):
+    @staticmethod
+    def _parse_text(line):
         return {"type": "text", "text": line.strip()}

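The hunk above makes the AsciiDoc backend fall back to a 128x128 size when an `image::` directive carries no `width`/`height` attributes. A standalone sketch of that behaviour follows; the regex and attribute handling are simplified assumptions, not the backend's exact parser.

```python
import re

DEFAULT_IMAGE_WIDTH = 128
DEFAULT_IMAGE_HEIGHT = 128


def parse_image_size(line: str):
    """Return (width, height) for an AsciiDoc image macro, with defaults when missing."""
    attrs = {}
    match = re.match(r"^image::(?P<uri>\S+)\[(?P<body>.*)\]$", line.strip())
    if match:
        for part in match.group("body").split(","):
            if "=" in part:
                key, value = part.split("=", 1)
                attrs[key.strip()] = value.strip()
    if "width" in attrs and "height" in attrs:
        return int(attrs["width"]), int(attrs["height"])
    return DEFAULT_IMAGE_WIDTH, DEFAULT_IMAGE_HEIGHT


print(parse_image_size("image::pics/cat.png[A cat, width=200, height=150]"))  # (200, 150)
print(parse_image_size("image::pics/cat.png[A cat]"))                         # (128, 128)
```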

@@ -7,12 +7,17 @@ from typing import List, Optional, Union
 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin, Size
-from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
+from docling_core.types.doc.page import (
+    BoundingRectangle,
+    SegmentedPdfPage,
+    TextCell,
+)
 from docling_parse.pdf_parsers import pdf_parser_v1
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage

 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
+from docling.backend.pypdfium2_backend import get_pdf_page_geometry
 from docling.datamodel.document import InputDocument

 _log = logging.getLogger(__name__)
@@ -36,6 +41,51 @@ class DoclingParsePageBackend(PdfPageBackend):
     def is_valid(self) -> bool:
         return self.valid

+    def _compute_text_cells(self) -> List[TextCell]:
+        """Compute text cells from docling-parse data."""
+        cells: List[TextCell] = []
+        cell_counter = 0
+
+        if not self.valid:
+            return cells
+
+        page_size = self.get_size()
+        parser_width = self._dpage["width"]
+        parser_height = self._dpage["height"]
+
+        for i in range(len(self._dpage["cells"])):
+            rect = self._dpage["cells"][i]["box"]["device"]
+            x0, y0, x1, y1 = rect
+
+            if x1 < x0:
+                x0, x1 = x1, x0
+            if y1 < y0:
+                y0, y1 = y1, y0
+
+            text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
+            cells.append(
+                TextCell(
+                    index=cell_counter,
+                    text=text_piece,
+                    orig=text_piece,
+                    from_ocr=False,
+                    rect=BoundingRectangle.from_bounding_box(
+                        BoundingBox(
+                            l=x0 * page_size.width / parser_width,
+                            b=y0 * page_size.height / parser_height,
+                            r=x1 * page_size.width / parser_width,
+                            t=y1 * page_size.height / parser_height,
+                            coord_origin=CoordOrigin.BOTTOMLEFT,
+                        )
+                    ).to_top_left_origin(page_size.height),
+                )
+            )
+            cell_counter += 1
+
+        return cells
+
     def get_text_in_rect(self, bbox: BoundingBox) -> str:
         if not self.valid:
             return ""
@@ -70,75 +120,27 @@ class DoclingParsePageBackend(PdfPageBackend):
         return text_piece

     def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
-        return None
+        if not self.valid:
+            return None
+
+        text_cells = self._compute_text_cells()
+
+        # Get the PDF page geometry from pypdfium2
+        dimension = get_pdf_page_geometry(self._ppage)
+
+        # Create SegmentedPdfPage
+        return SegmentedPdfPage(
+            dimension=dimension,
+            textline_cells=text_cells,
+            char_cells=[],
+            word_cells=[],
+            has_lines=len(text_cells) > 0,
+            has_words=False,
+            has_chars=False,
+        )

     def get_text_cells(self) -> Iterable[TextCell]:
-        cells: List[TextCell] = []
-        cell_counter = 0
-
-        if not self.valid:
-            return cells
-
-        page_size = self.get_size()
-        parser_width = self._dpage["width"]
-        parser_height = self._dpage["height"]
-
-        for i in range(len(self._dpage["cells"])):
-            rect = self._dpage["cells"][i]["box"]["device"]
-            x0, y0, x1, y1 = rect
-
-            if x1 < x0:
-                x0, x1 = x1, x0
-            if y1 < y0:
-                y0, y1 = y1, y0
-
-            text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
-            cells.append(
-                TextCell(
-                    index=cell_counter,
-                    text=text_piece,
-                    orig=text_piece,
-                    from_ocr=False,
-                    rect=BoundingRectangle.from_bounding_box(
-                        BoundingBox(
-                            # l=x0, b=y0, r=x1, t=y1,
-                            l=x0 * page_size.width / parser_width,
-                            b=y0 * page_size.height / parser_height,
-                            r=x1 * page_size.width / parser_width,
-                            t=y1 * page_size.height / parser_height,
-                            coord_origin=CoordOrigin.BOTTOMLEFT,
-                        )
-                    ).to_top_left_origin(page_size.height),
-                )
-            )
-            cell_counter += 1
-
-        def draw_clusters_and_cells():
-            image = (
-                self.get_page_image()
-            )  # make new image to avoid drawing on the saved ones
-            draw = ImageDraw.Draw(image)
-            for c in cells:
-                x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
-                cell_color = (
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                )
-                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
-            image.show()
-
-        # before merge:
-        # draw_clusters_and_cells()
-
-        # cells = merge_horizontal_cells(cells)
-
-        # after merge:
-        # draw_clusters_and_cells()
-
-        return cells
+        return self._compute_text_cells()

     def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
         AREA_THRESHOLD = 0  # 32 * 32

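Both the refactored `get_text_cells` and the new `get_segmented_page` above go through `_compute_text_cells`, whose core arithmetic is rescaling the parser's "device" boxes to page coordinates and flipping them to a top-left origin. That arithmetic in isolation, with plain tuples instead of docling-core types:

```python
def parser_box_to_page_box(rect, parser_size, page_size):
    """Rescale a (x0, y0, x1, y1) device box from parser space to page space
    and flip it from a bottom-left to a top-left origin."""
    x0, y0, x1, y1 = rect
    # normalize possibly swapped corners
    if x1 < x0:
        x0, x1 = x1, x0
    if y1 < y0:
        y0, y1 = y1, y0
    sx = page_size[0] / parser_size[0]
    sy = page_size[1] / parser_size[1]
    left, right = x0 * sx, x1 * sx
    bottom, top = y0 * sy, y1 * sy
    page_height = page_size[1]
    # top-left origin: y grows downwards
    return (left, page_height - top, right, page_height - bottom)


print(parser_box_to_page_box((10, 20, 110, 40), (1000, 800), (500, 400)))
# (5.0, 380.0, 55.0, 390.0)
```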

@@ -7,12 +7,19 @@ from typing import TYPE_CHECKING, List, Optional, Union
 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
+from docling_core.types.doc.page import (
+    BoundingRectangle,
+    PdfPageBoundaryType,
+    PdfPageGeometry,
+    SegmentedPdfPage,
+    TextCell,
+)
 from docling_parse.pdf_parsers import pdf_parser_v2
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage

 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
+from docling.backend.pypdfium2_backend import get_pdf_page_geometry
 from docling.datamodel.base_models import Size
 from docling.utils.locks import pypdfium2_lock
@@ -40,6 +47,55 @@ class DoclingParseV2PageBackend(PdfPageBackend):
     def is_valid(self) -> bool:
         return self.valid

+    def _compute_text_cells(self) -> List[TextCell]:
+        """Compute text cells from docling-parse v2 data."""
+        cells: List[TextCell] = []
+        cell_counter = 0
+
+        if not self.valid:
+            return cells
+
+        page_size = self.get_size()
+
+        parser_width = self._dpage["sanitized"]["dimension"]["width"]
+        parser_height = self._dpage["sanitized"]["dimension"]["height"]
+
+        cells_data = self._dpage["sanitized"]["cells"]["data"]
+        cells_header = self._dpage["sanitized"]["cells"]["header"]
+
+        for i, cell_data in enumerate(cells_data):
+            x0 = cell_data[cells_header.index("x0")]
+            y0 = cell_data[cells_header.index("y0")]
+            x1 = cell_data[cells_header.index("x1")]
+            y1 = cell_data[cells_header.index("y1")]
+
+            if x1 < x0:
+                x0, x1 = x1, x0
+            if y1 < y0:
+                y0, y1 = y1, y0
+
+            text_piece = cell_data[cells_header.index("text")]
+            cells.append(
+                TextCell(
+                    index=cell_counter,
+                    text=text_piece,
+                    orig=text_piece,
+                    from_ocr=False,
+                    rect=BoundingRectangle.from_bounding_box(
+                        BoundingBox(
+                            l=x0 * page_size.width / parser_width,
+                            b=y0 * page_size.height / parser_height,
+                            r=x1 * page_size.width / parser_width,
+                            t=y1 * page_size.height / parser_height,
+                            coord_origin=CoordOrigin.BOTTOMLEFT,
+                        )
+                    ).to_top_left_origin(page_size.height),
+                )
+            )
+            cell_counter += 1
+
+        return cells
+
     def get_text_in_rect(self, bbox: BoundingBox) -> str:
         if not self.valid:
             return ""
@@ -81,73 +137,27 @@ class DoclingParseV2PageBackend(PdfPageBackend):
         return text_piece

     def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
-        return None
+        if not self.valid:
+            return None
+
+        text_cells = self._compute_text_cells()
+
+        # Get the PDF page geometry from pypdfium2
+        dimension = get_pdf_page_geometry(self._ppage)
+
+        # Create SegmentedPdfPage
+        return SegmentedPdfPage(
+            dimension=dimension,
+            textline_cells=text_cells,
+            char_cells=[],
+            word_cells=[],
+            has_textlines=len(text_cells) > 0,
+            has_words=False,
+            has_chars=False,
+        )

     def get_text_cells(self) -> Iterable[TextCell]:
-        cells: List[TextCell] = []
-        cell_counter = 0
-
-        if not self.valid:
-            return cells
-
-        page_size = self.get_size()
-
-        parser_width = self._dpage["sanitized"]["dimension"]["width"]
-        parser_height = self._dpage["sanitized"]["dimension"]["height"]
-
-        cells_data = self._dpage["sanitized"]["cells"]["data"]
-        cells_header = self._dpage["sanitized"]["cells"]["header"]
-
-        for i, cell_data in enumerate(cells_data):
-            x0 = cell_data[cells_header.index("x0")]
-            y0 = cell_data[cells_header.index("y0")]
-            x1 = cell_data[cells_header.index("x1")]
-            y1 = cell_data[cells_header.index("y1")]
-
-            if x1 < x0:
-                x0, x1 = x1, x0
-            if y1 < y0:
-                y0, y1 = y1, y0
-
-            text_piece = cell_data[cells_header.index("text")]
-            cells.append(
-                TextCell(
-                    index=cell_counter,
-                    text=text_piece,
-                    orig=text_piece,
-                    from_ocr=False,
-                    rect=BoundingRectangle.from_bounding_box(
-                        BoundingBox(
-                            # l=x0, b=y0, r=x1, t=y1,
-                            l=x0 * page_size.width / parser_width,
-                            b=y0 * page_size.height / parser_height,
-                            r=x1 * page_size.width / parser_width,
-                            t=y1 * page_size.height / parser_height,
-                            coord_origin=CoordOrigin.BOTTOMLEFT,
-                        )
-                    ).to_top_left_origin(page_size.height),
-                )
-            )
-            cell_counter += 1
-
-        def draw_clusters_and_cells():
-            image = (
-                self.get_page_image()
-            )  # make new image to avoid drawing on the saved ones
-            draw = ImageDraw.Draw(image)
-            for c in cells:
-                x0, y0, x1, y1 = c.bbox.as_tuple()
-                cell_color = (
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                )
-                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
-            image.show()
-
-        # draw_clusters_and_cells()
-
-        return cells
+        return self._compute_text_cells()

     def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
         AREA_THRESHOLD = 0  # 32 * 32

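The v2 parser payload used here stores cells as flat rows plus a shared header list, so fields are looked up by column name. A tiny sketch of that access pattern; the sample header and row are made up, the field names follow the diff:

```python
# Columnar cell data as produced by docling-parse v2: one shared header list,
# one flat value list per cell. Caching column indices avoids repeated .index() calls.
cells_header = ["x0", "y0", "x1", "y1", "text"]
cells_data = [[72.0, 700.1, 180.5, 712.3, "Hello world"]]

col = {name: cells_header.index(name) for name in cells_header}
for row in cells_data:
    bbox = (row[col["x0"]], row[col["y0"]], row[col["x1"]], row[col["y1"]])
    print(bbox, row[col["text"]])
```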

@@ -59,20 +59,6 @@
         return self._dpage

     def get_text_cells(self) -> Iterable[TextCell]:
-        page_size = self.get_size()
-
-        [tc.to_top_left_origin(page_size.height) for tc in self._dpage.textline_cells]
-
-        # for cell in self._dpage.textline_cells:
-        #     rect = cell.rect
-        #
-        #     assert (
-        #         rect.to_bounding_box().l <= rect.to_bounding_box().r
-        #     ), f"left is > right on bounding box {rect.to_bounding_box()} of rect {rect}"
-        #     assert (
-        #         rect.to_bounding_box().t <= rect.to_bounding_box().b
-        #     ), f"top is > bottom on bounding box {rect.to_bounding_box()} of rect {rect}"
-
         return self._dpage.textline_cells

     def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
@@ -171,12 +157,28 @@
         self, page_no: int, create_words: bool = True, create_textlines: bool = True
     ) -> DoclingParseV4PageBackend:
         with pypdfium2_lock:
+            seg_page = self.dp_doc.get_page(
+                page_no + 1,
+                create_words=create_words,
+                create_textlines=create_textlines,
+            )
+
+            # In Docling, all TextCell instances are expected with top-left origin.
+            [
+                tc.to_top_left_origin(seg_page.dimension.height)
+                for tc in seg_page.textline_cells
+            ]
+            [
+                tc.to_top_left_origin(seg_page.dimension.height)
+                for tc in seg_page.char_cells
+            ]
+            [
+                tc.to_top_left_origin(seg_page.dimension.height)
+                for tc in seg_page.word_cells
+            ]
+
             return DoclingParseV4PageBackend(
-                self.dp_doc.get_page(
-                    page_no + 1,
-                    create_words=create_words,
-                    create_textlines=create_textlines,
-                ),
+                seg_page,
                 self._pdoc[page_no],
             )


@@ -1,17 +1,15 @@
 import logging
 import re
 import warnings
+from copy import deepcopy
 from io import BytesIO
 from pathlib import Path
 from typing import List, Optional, Set, Union

 import marko
 import marko.element
-import marko.ext
-import marko.ext.gfm
 import marko.inline
 from docling_core.types.doc import (
-    DocItem,
     DocItemLabel,
     DoclingDocument,
     DocumentOrigin,
@@ -21,7 +19,9 @@ from docling_core.types.doc import (
     TableData,
     TextItem,
 )
+from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
 from marko import Markdown
+from pydantic import AnyUrl, TypeAdapter

 from docling.backend.abstract_backend import DeclarativeDocumentBackend
 from docling.backend.html_backend import HTMLDocumentBackend
@@ -71,7 +71,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         self.in_table = False
         self.md_table_buffer: list[str] = []
-        self.inline_texts: list[str] = []
         self._html_blocks: int = 0

         try:
@@ -156,25 +155,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             doc.add_table(data=table_data)
         return

-    def _process_inline_text(
-        self, parent_item: Optional[NodeItem], doc: DoclingDocument
-    ):
-        txt = " ".join(self.inline_texts)
-        if len(txt) > 0:
-            doc.add_text(
-                label=DocItemLabel.PARAGRAPH,
-                parent=parent_item,
-                text=txt,
-            )
-        self.inline_texts = []
-
     def _iterate_elements(  # noqa: C901
         self,
+        *,
         element: marko.element.Element,
         depth: int,
         doc: DoclingDocument,
         visited: Set[marko.element.Element],
         parent_item: Optional[NodeItem] = None,
+        formatting: Optional[Formatting] = None,
+        hyperlink: Optional[Union[AnyUrl, Path]] = None,
     ):
         if element in visited:
             return
@@ -183,44 +173,32 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         # Check for different element types and process relevant details
         if isinstance(element, marko.block.Heading) and len(element.children) > 0:
             self._close_table(doc)
-            self._process_inline_text(parent_item, doc)
             _log.debug(
                 f" - Heading level {element.level}, content: {element.children[0].children}"  # type: ignore
             )
-            if element.level == 1:
-                doc_label = DocItemLabel.TITLE
-            else:
-                doc_label = DocItemLabel.SECTION_HEADER
-
-            # Header could have arbitrary inclusion of bold, italic or emphasis,
-            # hence we need to traverse the tree to get full text of a header
-            strings: List[str] = []
-
-            # Define a recursive function to traverse the tree
-            def traverse(node: marko.block.BlockElement):
-                # Check if the node has a "children" attribute
-                if hasattr(node, "children"):
-                    # If "children" is a list, continue traversal
-                    if isinstance(node.children, list):
-                        for child in node.children:
-                            traverse(child)
-                    # If "children" is text, add it to header text
-                    elif isinstance(node.children, str):
-                        strings.append(node.children)
-
-            traverse(element)
-            snippet_text = "".join(strings)
-            if len(snippet_text) > 0:
-                if doc_label == DocItemLabel.SECTION_HEADER:
-                    parent_item = doc.add_heading(
-                        text=snippet_text,
-                        level=element.level - 1,
-                        parent=parent_item,
-                    )
-                else:
-                    parent_item = doc.add_text(
-                        label=doc_label, parent=parent_item, text=snippet_text
-                    )
+            if len(element.children) == 1:
+                child = element.children[0]
+                snippet_text = str(child.children)  # type: ignore
+                visited.add(child)
+            else:
+                snippet_text = ""  # inline group will be created
+
+            if element.level == 1:
+                parent_item = doc.add_title(
+                    text=snippet_text,
+                    parent=parent_item,
+                    formatting=formatting,
+                    hyperlink=hyperlink,
+                )
+            else:
+                parent_item = doc.add_heading(
+                    text=snippet_text,
+                    level=element.level - 1,
+                    parent=parent_item,
+                    formatting=formatting,
+                    hyperlink=hyperlink,
+                )

         elif isinstance(element, marko.block.List):
             has_non_empty_list_items = False
@@ -230,7 +208,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                     break

             self._close_table(doc)
-            self._process_inline_text(parent_item, doc)
             _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
             if has_non_empty_list_items:
                 label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
@@ -240,41 +217,60 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         elif (
             isinstance(element, marko.block.ListItem)
-            and len(element.children) > 0
-            and isinstance((first_child := element.children[0]), marko.block.Paragraph)
+            and len(element.children) == 1
+            and isinstance((child := element.children[0]), marko.block.Paragraph)
+            and len(child.children) > 0
         ):
             self._close_table(doc)
-            self._process_inline_text(parent_item, doc)
             _log.debug(" - List item")

-            snippet_text = str(first_child.children[0].children)  # type: ignore
-            is_numbered = False
-            if (
-                parent_item is not None
-                and isinstance(parent_item, DocItem)
-                and parent_item.label == GroupLabel.ORDERED_LIST
-            ):
-                is_numbered = True
-            doc.add_list_item(
-                enumerated=is_numbered, parent=parent_item, text=snippet_text
+            if len(child.children) == 1:
+                snippet_text = str(child.children[0].children)  # type: ignore
+                visited.add(child)
+            else:
+                snippet_text = ""  # inline group will be created
+
+            is_numbered = isinstance(parent_item, OrderedList)
+            if not isinstance(parent_item, (OrderedList, UnorderedList)):
+                _log.warning("ListItem would have not had a list parent, adding one.")
+                parent_item = doc.add_unordered_list(parent=parent_item)
+            parent_item = doc.add_list_item(
+                enumerated=is_numbered,
+                parent=parent_item,
+                text=snippet_text,
+                formatting=formatting,
+                hyperlink=hyperlink,
             )
-            visited.add(first_child)

         elif isinstance(element, marko.inline.Image):
             self._close_table(doc)
-            self._process_inline_text(parent_item, doc)
             _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")

             fig_caption: Optional[TextItem] = None
             if element.title is not None and element.title != "":
                 fig_caption = doc.add_text(
-                    label=DocItemLabel.CAPTION, text=element.title
+                    label=DocItemLabel.CAPTION,
+                    text=element.title,
+                    formatting=formatting,
+                    hyperlink=hyperlink,
                 )

             doc.add_picture(parent=parent_item, caption=fig_caption)

-        elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
-            self._process_inline_text(parent_item, doc)
+        elif isinstance(element, marko.inline.Emphasis):
+            _log.debug(f" - Emphasis: {element.children}")
+            formatting = deepcopy(formatting) if formatting else Formatting()
+            formatting.italic = True
+
+        elif isinstance(element, marko.inline.StrongEmphasis):
+            _log.debug(f" - StrongEmphasis: {element.children}")
+            formatting = deepcopy(formatting) if formatting else Formatting()
+            formatting.bold = True
+
+        elif isinstance(element, marko.inline.Link):
+            _log.debug(f" - Link: {element.children}")
+            hyperlink = TypeAdapter(Optional[Union[AnyUrl, Path]]).validate_python(
+                element.dest
+            )

         elif isinstance(element, marko.inline.RawText):
             _log.debug(f" - Paragraph (raw text): {element.children}")
@@ -287,28 +283,41 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                     self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
                 else:
                     self.md_table_buffer.append(snippet_text)
-            else:
+            elif snippet_text:
                 self._close_table(doc)
-                # most likely just inline text
-                self.inline_texts.append(str(element.children))
+                doc.add_text(
+                    label=DocItemLabel.TEXT,
+                    parent=parent_item,
+                    text=snippet_text,
+                    formatting=formatting,
+                    hyperlink=hyperlink,
+                )

         elif isinstance(element, marko.inline.CodeSpan):
             self._close_table(doc)
-            self._process_inline_text(parent_item, doc)
             _log.debug(f" - Code Span: {element.children}")
             snippet_text = str(element.children).strip()
-            doc.add_code(parent=parent_item, text=snippet_text)
+            doc.add_code(
+                parent=parent_item,
+                text=snippet_text,
+                formatting=formatting,
+                hyperlink=hyperlink,
+            )

         elif (
             isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
             and len(element.children) > 0
-            and isinstance((first_child := element.children[0]), marko.inline.RawText)
-            and len(snippet_text := (first_child.children.strip())) > 0
+            and isinstance((child := element.children[0]), marko.inline.RawText)
+            and len(snippet_text := (child.children.strip())) > 0
         ):
             self._close_table(doc)
-            self._process_inline_text(parent_item, doc)
             _log.debug(f" - Code Block: {element.children}")
-            doc.add_code(parent=parent_item, text=snippet_text)
+            doc.add_code(
+                parent=parent_item,
+                text=snippet_text,
+                formatting=formatting,
+                hyperlink=hyperlink,
+            )

         elif isinstance(element, marko.inline.LineBreak):
             if self.in_table:
@@ -317,7 +326,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         elif isinstance(element, marko.block.HTMLBlock):
             self._html_blocks += 1
-            self._process_inline_text(parent_item, doc)
             self._close_table(doc)
             _log.debug(f"HTML Block: {element}")
             if (
@@ -327,14 +335,25 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 # wrap in markers to enable post-processing in convert()
                 text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
-                doc.add_code(parent=parent_item, text=text_to_add)
+                doc.add_code(
+                    parent=parent_item,
+                    text=text_to_add,
+                    formatting=formatting,
+                    hyperlink=hyperlink,
+                )

         else:
             if not isinstance(element, str):
                 self._close_table(doc)
                 _log.debug(f"Some other element: {element}")

+        if (
+            isinstance(element, (marko.block.Paragraph, marko.block.Heading))
+            and len(element.children) > 1
+        ):
+            parent_item = doc.add_inline_group(parent=parent_item)
+
         processed_block_types = (
-            marko.block.Heading,
+            # marko.block.Heading,
             marko.block.CodeBlock,
             marko.block.FencedCode,
             marko.inline.RawText,
@@ -351,6 +370,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                     doc=doc,
                     visited=visited,
                     parent_item=parent_item,
+                    formatting=formatting,
+                    hyperlink=hyperlink,
                 )

     def is_valid(self) -> bool:
@@ -392,7 +413,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 parent_item=None,
                 visited=set(),
             )
-            self._process_inline_text(None, doc)  # handle last hanging inline text
             self._close_table(doc=doc)  # handle any last hanging table

         # if HTML blocks were detected, export to HTML and delegate to HTML backend

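The traversal above now threads `formatting` and `hyperlink` down the marko AST instead of flattening inline text. Below is a self-contained sketch of that idea, carrying a small state object through a recursive walk. It uses only marko (as the backend does) and none of the docling-core APIs, and assumes `Markdown().parse()` yields inline nodes as it does for the backend above:

```python
from dataclasses import dataclass, replace
from typing import Optional

import marko.inline
from marko import Markdown


@dataclass(frozen=True)
class InlineState:
    bold: bool = False
    italic: bool = False
    hyperlink: Optional[str] = None


def walk(element, state: InlineState = InlineState()) -> None:
    """Print each raw-text run with the formatting state inherited from its ancestors."""
    if isinstance(element, marko.inline.StrongEmphasis):
        state = replace(state, bold=True)
    elif isinstance(element, marko.inline.Emphasis):
        state = replace(state, italic=True)
    elif isinstance(element, marko.inline.Link):
        state = replace(state, hyperlink=element.dest)
    elif isinstance(element, marko.inline.RawText):
        print(f"{element.children!r} -> {state}")
        return
    for child in getattr(element, "children", []) or []:
        if not isinstance(child, str):
            walk(child, state)


doc = Markdown().parse("A **bold [link](https://example.com)** and *italics*.")
walk(doc)
```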

@@ -20,6 +20,7 @@ from docling_core.types.doc.document import ContentLayer
 from PIL import Image, UnidentifiedImageError
 from pptx import Presentation
 from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
+from pptx.oxml.text import CT_TextLineBreak

 from docling.backend.abstract_backend import (
     DeclarativeDocumentBackend,
@@ -120,136 +121,91 @@
         return prov

-    def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):  # noqa: C901
-        is_a_list = False
+    def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
         is_list_group_created = False
         enum_list_item_value = 0
         new_list = None
-        bullet_type = "None"
-        list_label = GroupLabel.LIST
         doc_label = DocItemLabel.LIST_ITEM
         prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)

-        # Identify if shape contains lists
-        for paragraph in shape.text_frame.paragraphs:
-            # Check if paragraph is a bullet point using the `element` XML
-            p = paragraph._element
-            if (
-                p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
-                is not None
-            ):
-                bullet_type = "Bullet"
-                is_a_list = True
-            elif (
-                p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
-                is not None
-            ):
-                bullet_type = "Numbered"
-                is_a_list = True
-            else:
-                is_a_list = False
-
-            if paragraph.level > 0:
-                # Most likely a sub-list
-                is_a_list = True
-
-            if is_a_list:
-                # Determine if this is an unordered list or an ordered list.
-                # Set GroupLabel.ORDERED_LIST when it fits.
-                if bullet_type == "Numbered":
-                    list_label = GroupLabel.ORDERED_LIST
-
-        if is_a_list:
-            _log.debug("LIST DETECTED!")
-        else:
-            _log.debug("No List")
-
-        # If there is a list inside of the shape, create a new docling list to assign list items to
-        # if is_a_list:
-        #     new_list = doc.add_group(
-        #         label=list_label, name=f"list", parent=parent_slide
-        #     )
+        def is_list_item(paragraph):
+            """Check if the paragraph is a list item."""
+            p = paragraph._element
+            if (
+                p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
+                is not None
+            ):
+                return (True, "Bullet")
+            elif (
+                p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
+                is not None
+            ):
+                return (True, "Numbered")
+            elif paragraph.level > 0:
+                # Most likely a sub-list
+                return (True, "None")
+            else:
+                return (False, "None")

         # Iterate through paragraphs to build up text
         for paragraph in shape.text_frame.paragraphs:
-            # p_text = paragraph.text.strip()
+            is_a_list, bullet_type = is_list_item(paragraph)
             p = paragraph._element
-            enum_list_item_value += 1
-            inline_paragraph_text = ""
-            inline_list_item_text = ""

-            for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}):
-                if len(e.text.strip()) > 0:
-                    e_is_a_list_item = False
-                    is_numbered = False
-                    if (
-                        p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
-                        is not None
-                    ):
-                        bullet_type = "Bullet"
-                        e_is_a_list_item = True
-                    elif (
-                        p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
-                        is not None
-                    ):
-                        bullet_type = "Numbered"
-                        is_numbered = True
-                        e_is_a_list_item = True
-                    else:
-                        e_is_a_list_item = False
-
-                    if e_is_a_list_item:
-                        if len(inline_paragraph_text) > 0:
-                            # output accumulated inline text:
-                            doc.add_text(
-                                label=doc_label,
-                                parent=parent_slide,
-                                text=inline_paragraph_text,
-                                prov=prov,
-                            )
-                        # Set marker and enumerated arguments if this is an enumeration element.
-                        inline_list_item_text += e.text
-                        # print(e.text)
-                    else:
-                        # Assign proper label to the text, depending if it's a Title or Section Header
-                        # For other types of text, assign - PARAGRAPH
-                        doc_label = DocItemLabel.PARAGRAPH
-                        if shape.is_placeholder:
-                            placeholder_type = shape.placeholder_format.type
-                            if placeholder_type in [
-                                PP_PLACEHOLDER.CENTER_TITLE,
-                                PP_PLACEHOLDER.TITLE,
-                            ]:
-                                # It's a title
-                                doc_label = DocItemLabel.TITLE
-                            elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
-                                DocItemLabel.SECTION_HEADER
-                        enum_list_item_value = 0
-                        inline_paragraph_text += e.text
-
-            if len(inline_paragraph_text) > 0:
-                # output accumulated inline text:
-                doc.add_text(
-                    label=doc_label,
-                    parent=parent_slide,
-                    text=inline_paragraph_text,
-                    prov=prov,
-                )
-
-            if len(inline_list_item_text) > 0:
-                enum_marker = ""
-                if is_numbered:
-                    enum_marker = str(enum_list_item_value) + "."
-                if not is_list_group_created:
-                    new_list = doc.add_group(
-                        label=list_label, name="list", parent=parent_slide
-                    )
-                    is_list_group_created = True
-                doc.add_list_item(
-                    marker=enum_marker,
-                    enumerated=is_numbered,
-                    parent=new_list,
-                    text=inline_list_item_text,
-                    prov=prov,
-                )
+            # Convert line breaks to spaces and accumulate text
+            p_text = ""
+            for e in p.content_children:
+                if isinstance(e, CT_TextLineBreak):
+                    p_text += " "
+                else:
+                    p_text += e.text
+
+            if is_a_list:
+                enum_marker = ""
+                enumerated = bullet_type == "Numbered"
+
+                if not is_list_group_created:
+                    new_list = doc.add_group(
+                        label=GroupLabel.ORDERED_LIST
+                        if enumerated
+                        else GroupLabel.LIST,
+                        name="list",
+                        parent=parent_slide,
+                    )
+                    is_list_group_created = True
+                    enum_list_item_value = 0
+
+                if enumerated:
+                    enum_list_item_value += 1
+                    enum_marker = str(enum_list_item_value) + "."
+
+                doc.add_list_item(
+                    marker=enum_marker,
+                    enumerated=enumerated,
+                    parent=new_list,
+                    text=p_text,
+                    prov=prov,
+                )
+            else:  # is paragraph not a list item
+                # Assign proper label to the text, depending if it's a Title or Section Header
+                # For other types of text, assign - PARAGRAPH
+                doc_label = DocItemLabel.PARAGRAPH
+                if shape.is_placeholder:
+                    placeholder_type = shape.placeholder_format.type
+                    if placeholder_type in [
+                        PP_PLACEHOLDER.CENTER_TITLE,
+                        PP_PLACEHOLDER.TITLE,
+                    ]:
+                        # It's a title
+                        doc_label = DocItemLabel.TITLE
+                    elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
+                        DocItemLabel.SECTION_HEADER
+
+                # output accumulated inline text:
+                doc.add_text(
+                    label=doc_label,
+                    parent=parent_slide,
+                    text=p_text,
+                    prov=prov,
+                )
         return

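The new `is_list_item` helper classifies each paragraph straight from its DrawingML XML (`a:buChar` for bullets, `a:buAutoNum` for numbering, the indentation level for sub-lists), and line breaks inside runs are now folded into spaces. A standalone sketch of the classification check with lxml; the namespace URI is the standard DrawingML one and the XML snippet is made up:

```python
from lxml import etree

A_NS = {"a": "http://schemas.openxmlformats.org/drawingml/2006/main"}


def classify_paragraph(p, level: int = 0):
    """Return (is_list_item, bullet_type) using the same XML checks as the diff."""
    if p.find(".//a:buChar", namespaces=A_NS) is not None:
        return True, "Bullet"
    if p.find(".//a:buAutoNum", namespaces=A_NS) is not None:
        return True, "Numbered"
    if level > 0:
        return True, "None"  # most likely a sub-list
    return False, "None"


xml = (
    '<a:p xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">'
    '<a:pPr><a:buChar char="-"/></a:pPr><a:r><a:t>item</a:t></a:r></a:p>'
)
print(classify_paragraph(etree.fromstring(xml)))  # (True, 'Bullet')
```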

@@ -14,7 +14,7 @@ from docling_core.types.doc import (
     TableCell,
     TableData,
 )
-from docling_core.types.doc.document import Formatting
+from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
 from docx import Document
 from docx.document import Document as DocxDocument
 from docx.oxml.table import CT_Tc
@@ -84,7 +84,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             self.valid = True
         except Exception as e:
             raise RuntimeError(
-                f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
+                f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
             ) from e

     @override
@@ -251,9 +251,15 @@
                     self._handle_tables(element, docx_obj, doc)
                 except Exception:
                     _log.debug("could not parse a table, broken docx table")
+            # Check for Image
             elif drawing_blip:
                 self._handle_pictures(docx_obj, drawing_blip, doc)
+                # Check for Text after the Image
+                if (
+                    tag_name in ["p"]
+                    and element.find(".//w:t", namespaces=namespaces) is not None
+                ):
+                    self._handle_text_elements(element, docx_obj, doc)
             # Check for the sdt containers, like table of contents
             elif tag_name in ["sdt"]:
                 sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
@@ -268,6 +274,7 @@
                 self._handle_text_elements(element, docx_obj, doc)
             else:
                 _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
+
         return doc

     def _str_to_int(
@@ -578,7 +585,7 @@
         all_paragraphs = []

         # Sort paragraphs within each container, then process containers
-        for container_id, paragraphs in container_paragraphs.items():
+        for paragraphs in container_paragraphs.values():
             # Sort by vertical position within each container
             sorted_container_paragraphs = sorted(
                 paragraphs,
@@ -689,14 +696,13 @@
         doc: DoclingDocument,
     ) -> None:
         paragraph = Paragraph(element, docx_obj)
-        paragraph_elements = self._get_paragraph_elements(paragraph)
         text, equations = self._handle_equations_in_text(
             element=element, text=paragraph.text
         )

         if text is None:
             return
+        paragraph_elements = self._get_paragraph_elements(paragraph)
         text = text.strip()

         # Common styles for bullet and numbered lists.
@@ -912,6 +918,44 @@
         )
         return

+    def _add_formatted_list_item(
+        self,
+        doc: DoclingDocument,
+        elements: list,
+        marker: str,
+        enumerated: bool,
+        level: int,
+    ) -> None:
+        # This should not happen by construction
+        if not isinstance(self.parents[level], (OrderedList, UnorderedList)):
+            return
+        if len(elements) == 1:
+            text, format, hyperlink = elements[0]
+            doc.add_list_item(
+                marker=marker,
+                enumerated=enumerated,
+                parent=self.parents[level],
+                text=text,
+                formatting=format,
+                hyperlink=hyperlink,
+            )
+        else:
+            new_item = doc.add_list_item(
+                marker=marker,
+                enumerated=enumerated,
+                parent=self.parents[level],
+                text="",
+            )
+            new_parent = doc.add_group(label=GroupLabel.INLINE, parent=new_item)
+            for text, format, hyperlink in elements:
+                doc.add_text(
+                    label=DocItemLabel.TEXT,
+                    parent=new_parent,
+                    text=text,
+                    formatting=format,
+                    hyperlink=hyperlink,
+                )
+
     def _add_list_item(
         self,
         *,
@@ -921,6 +965,9 @@
         elements: list,
         is_numbered: bool = False,
     ) -> None:
+        # TODO: this method is always called with is_numbered. Numbered lists should be properly addressed.
+        if not elements:
+            return None
         enum_marker = ""

         level = self._get_level()
@@ -937,21 +984,9 @@
             if is_numbered:
                 enum_marker = str(self.listIter) + "."
                 is_numbered = True
-            new_parent = self._create_or_reuse_parent(
-                doc=doc,
-                prev_parent=self.parents[level],
-                paragraph_elements=elements,
+            self._add_formatted_list_item(
+                doc, elements, enum_marker, is_numbered, level
             )
-            for text, format, hyperlink in elements:
-                doc.add_list_item(
-                    marker=enum_marker,
-                    enumerated=is_numbered,
-                    parent=new_parent,
-                    text=text,
-                    formatting=format,
-                    hyperlink=hyperlink,
-                )
         elif (
             self._prev_numid() == numid
             and self.level_at_new_list is not None
@@ -981,28 +1016,20 @@
             if is_numbered:
                 enum_marker = str(self.listIter) + "."
                 is_numbered = True
-            new_parent = self._create_or_reuse_parent(
-                doc=doc,
-                prev_parent=self.parents[self.level_at_new_list + ilevel],
-                paragraph_elements=elements,
+            self._add_formatted_list_item(
+                doc,
+                elements,
+                enum_marker,
+                is_numbered,
+                self.level_at_new_list + ilevel,
             )
-            for text, format, hyperlink in elements:
-                doc.add_list_item(
-                    marker=enum_marker,
-                    enumerated=is_numbered,
-                    parent=new_parent,
-                    text=text,
-                    formatting=format,
-                    hyperlink=hyperlink,
-                )
         elif (
             self._prev_numid() == numid
             and self.level_at_new_list is not None
             and prev_indent is not None
             and ilevel < prev_indent
         ):  # Close list
-            for k, v in self.parents.items():
+            for k in self.parents:
                 if k > self.level_at_new_list + ilevel:
                     self.parents[k] = None
@@ -1011,20 +1038,13 @@
             if is_numbered:
                 enum_marker = str(self.listIter) + "."
                 is_numbered = True
-            new_parent = self._create_or_reuse_parent(
-                doc=doc,
-                prev_parent=self.parents[self.level_at_new_list + ilevel],
-                paragraph_elements=elements,
+            self._add_formatted_list_item(
+                doc,
+                elements,
+                enum_marker,
+                is_numbered,
+                self.level_at_new_list + ilevel,
             )
-            for text, format, hyperlink in elements:
-                doc.add_list_item(
-                    marker=enum_marker,
-                    enumerated=is_numbered,
-                    parent=new_parent,
-                    text=text,
-                    formatting=format,
-                    hyperlink=hyperlink,
-                )

             self.listIter = 0
         elif self._prev_numid() == numid or prev_indent == ilevel:
@@ -1033,21 +1053,10 @@
             if is_numbered:
                 enum_marker = str(self.listIter) + "."
                 is_numbered = True
-            new_parent = self._create_or_reuse_parent(
-                doc=doc,
-                prev_parent=self.parents[level - 1],
-                paragraph_elements=elements,
+            self._add_formatted_list_item(
+                doc, elements, enum_marker, is_numbered, level - 1
             )
-            for text, format, hyperlink in elements:
-                # Add the list item to the parent group
-                doc.add_list_item(
-                    marker=enum_marker,
-                    enumerated=is_numbered,
-                    parent=new_parent,
-                    text=text,
-                    formatting=format,
-                    hyperlink=hyperlink,
-                )
         return

     def _handle_tables(

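The new `_add_formatted_list_item` keeps a single-run item as a plain list item and, for multi-run items, adds an empty list item with an inline group holding one text node per run. A hedged, docling-free sketch of that branching with made-up container types:

```python
from dataclasses import dataclass, field
from typing import List, Optional, Tuple

Run = Tuple[str, Optional[str], Optional[str]]  # (text, formatting, hyperlink)


@dataclass
class Node:
    kind: str
    text: str = ""
    children: List["Node"] = field(default_factory=list)


def add_list_item(parent: Node, runs: List[Run]) -> None:
    """One run -> a plain list item; several runs -> empty item + inline group."""
    if len(runs) == 1:
        text, _fmt, _link = runs[0]
        parent.children.append(Node("list_item", text))
        return
    item = Node("list_item", "")
    inline = Node("inline_group")
    item.children.append(inline)
    for text, _fmt, _link in runs:
        inline.children.append(Node("text", text))
    parent.children.append(item)


root = Node("list")
add_list_item(root, [("plain item", None, None)])
add_list_item(root, [("bold part", "bold", None), (" and a link", None, "https://example.com")])
print(root)
```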

@ -8,7 +8,13 @@ from typing import TYPE_CHECKING, List, Optional, Union
import pypdfium2 as pdfium import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c import pypdfium2.raw as pdfium_c
from docling_core.types.doc import BoundingBox, CoordOrigin, Size from docling_core.types.doc import BoundingBox, CoordOrigin, Size
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell from docling_core.types.doc.page import (
BoundingRectangle,
PdfPageBoundaryType,
PdfPageGeometry,
SegmentedPdfPage,
TextCell,
)
from PIL import Image, ImageDraw from PIL import Image, ImageDraw
from pypdfium2 import PdfTextPage from pypdfium2 import PdfTextPage
from pypdfium2._helpers.misc import PdfiumError from pypdfium2._helpers.misc import PdfiumError
@ -16,6 +22,76 @@ from pypdfium2._helpers.misc import PdfiumError
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.utils.locks import pypdfium2_lock from docling.utils.locks import pypdfium2_lock
def get_pdf_page_geometry(
ppage: pdfium.PdfPage,
angle: float = 0.0,
boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX,
) -> PdfPageGeometry:
"""
Create PdfPageGeometry from a pypdfium2 PdfPage object.
Args:
ppage: pypdfium2 PdfPage object
angle: Page rotation angle in degrees (default: 0.0)
boundary_type: The boundary type for the page (default: CROP_BOX)
Returns:
PdfPageGeometry with all the different bounding boxes properly set
"""
with pypdfium2_lock:
# Get the main bounding box (intersection of crop_box and media_box)
bbox_tuple = ppage.get_bbox()
bbox = BoundingBox.from_tuple(bbox_tuple, CoordOrigin.BOTTOMLEFT)
# Get all the different page boxes from pypdfium2
media_box_tuple = ppage.get_mediabox()
crop_box_tuple = ppage.get_cropbox()
art_box_tuple = ppage.get_artbox()
bleed_box_tuple = ppage.get_bleedbox()
trim_box_tuple = ppage.get_trimbox()
# Convert to BoundingBox objects using existing from_tuple method
# pypdfium2 returns (x0, y0, x1, y1) in PDF coordinate system (bottom-left origin)
# Use bbox as fallback when specific box types are not defined
media_bbox = (
BoundingBox.from_tuple(media_box_tuple, CoordOrigin.BOTTOMLEFT)
if media_box_tuple
else bbox
)
crop_bbox = (
BoundingBox.from_tuple(crop_box_tuple, CoordOrigin.BOTTOMLEFT)
if crop_box_tuple
else bbox
)
art_bbox = (
BoundingBox.from_tuple(art_box_tuple, CoordOrigin.BOTTOMLEFT)
if art_box_tuple
else bbox
)
bleed_bbox = (
BoundingBox.from_tuple(bleed_box_tuple, CoordOrigin.BOTTOMLEFT)
if bleed_box_tuple
else bbox
)
trim_bbox = (
BoundingBox.from_tuple(trim_box_tuple, CoordOrigin.BOTTOMLEFT)
if trim_box_tuple
else bbox
)
return PdfPageGeometry(
angle=angle,
rect=BoundingRectangle.from_bounding_box(bbox),
boundary_type=boundary_type,
art_bbox=art_bbox,
bleed_bbox=bleed_bbox,
crop_bbox=crop_bbox,
media_bbox=media_bbox,
trim_bbox=trim_bbox,
)
if TYPE_CHECKING: if TYPE_CHECKING:
from docling.datamodel.document import InputDocument from docling.datamodel.document import InputDocument
@ -41,38 +117,8 @@ class PyPdfiumPageBackend(PdfPageBackend):
def is_valid(self) -> bool: def is_valid(self) -> bool:
return self.valid return self.valid
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]: def _compute_text_cells(self) -> List[TextCell]:
AREA_THRESHOLD = 0 # 32 * 32 """Compute text cells from pypdfium."""
page_size = self.get_size()
with pypdfium2_lock:
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
pos = obj.get_pos()
cropbox = BoundingBox.from_tuple(
pos, origin=CoordOrigin.BOTTOMLEFT
).to_top_left_origin(page_height=page_size.height)
if cropbox.area() > AREA_THRESHOLD:
cropbox = cropbox.scaled(scale=scale)
yield cropbox
def get_text_in_rect(self, bbox: BoundingBox) -> str:
with pypdfium2_lock:
if not self.text_page:
self.text_page = self._ppage.get_textpage()
if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
bbox = bbox.to_bottom_left_origin(self.get_size().height)
with pypdfium2_lock:
text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
return text_piece
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
return None
def get_text_cells(self) -> Iterable[TextCell]:
with pypdfium2_lock: with pypdfium2_lock:
if not self.text_page: if not self.text_page:
self.text_page = self._ppage.get_textpage() self.text_page = self._ppage.get_textpage()
@ -203,30 +249,58 @@ class PyPdfiumPageBackend(PdfPageBackend):
                return merged_cells

-        def draw_clusters_and_cells():
-            image = (
-                self.get_page_image()
-            )  # make new image to avoid drawing on the saved ones
-            draw = ImageDraw.Draw(image)
-            for c in cells:
-                x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
-                cell_color = (
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                )
-                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
-            image.show()
-
-        # before merge:
-        # draw_clusters_and_cells()
-
-        cells = merge_horizontal_cells(cells)
-
-        # after merge:
-        # draw_clusters_and_cells()
-
-        return cells
+        return merge_horizontal_cells(cells)
+
+    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
+        AREA_THRESHOLD = 0  # 32 * 32
+        page_size = self.get_size()
+        with pypdfium2_lock:
+            for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
+                pos = obj.get_pos()
+                cropbox = BoundingBox.from_tuple(
+                    pos, origin=CoordOrigin.BOTTOMLEFT
+                ).to_top_left_origin(page_height=page_size.height)
+                if cropbox.area() > AREA_THRESHOLD:
+                    cropbox = cropbox.scaled(scale=scale)
+                    yield cropbox
+
+    def get_text_in_rect(self, bbox: BoundingBox) -> str:
+        with pypdfium2_lock:
+            if not self.text_page:
+                self.text_page = self._ppage.get_textpage()
+        if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
+            bbox = bbox.to_bottom_left_origin(self.get_size().height)
+        with pypdfium2_lock:
+            text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
+        return text_piece
+
+    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
+        if not self.valid:
+            return None
+
+        text_cells = self._compute_text_cells()
+
+        # Get the PDF page geometry from pypdfium2
+        dimension = get_pdf_page_geometry(self._ppage)
+
+        # Create SegmentedPdfPage
+        return SegmentedPdfPage(
+            dimension=dimension,
+            textline_cells=text_cells,
+            char_cells=[],
+            word_cells=[],
+            has_textlines=len(text_cells) > 0,
+            has_words=False,
+            has_chars=False,
+        )
+
+    def get_text_cells(self) -> Iterable[TextCell]:
+        return self._compute_text_cells()

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
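A rough sketch of how the reworked accessors fit together (the backend variable name is an assumption):

# Hypothetical sketch: text cells are computed once and reused by both accessors.
seg = page_backend.get_segmented_page()  # page_backend: a valid PyPdfiumPageBackend instance (assumed)
if seg is not None:
    print(len(seg.textline_cells), seg.dimension.boundary_type)
cells = page_backend.get_text_cells()  # same cells, produced via _compute_text_cells()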

View File

@ -235,7 +235,6 @@ class Page(BaseModel):
    page_no: int
    # page_hash: Optional[str] = None
    size: Optional[Size] = None
-    cells: List[TextCell] = []
    parsed_page: Optional[SegmentedPdfPage] = None
    predictions: PagePredictions = PagePredictions()
    assembled: Optional[AssembledUnit] = None
@@ -248,12 +247,27 @@ class Page(BaseModel):
        float, Image
    ] = {}  # Cache of images in different scales. By default it is cleared during assembling.

+    @property
+    def cells(self) -> List[TextCell]:
+        """Return text cells as a read-only view of parsed_page.textline_cells."""
+        if self.parsed_page is not None:
+            return self.parsed_page.textline_cells
+        else:
+            return []

    def get_image(
-        self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
+        self,
+        scale: float = 1.0,
+        max_size: Optional[int] = None,
+        cropbox: Optional[BoundingBox] = None,
    ) -> Optional[Image]:
        if self._backend is None:
            return self._image_cache.get(scale, None)
+        if max_size:
+            assert self.size is not None
+            scale = min(scale, max_size / max(self.size.as_tuple()))
        if scale not in self._image_cache:
            if cropbox is None:
                self._image_cache[scale] = self._backend.get_page_image(scale=scale)
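A short sketch of the new Page surface (values are illustrative; assumes parsed_page was populated by the preprocessing step):

# Hypothetical sketch: cells is now a read-only view, and max_size caps the rendered image.
print(len(page.cells))                          # == len(page.parsed_page.textline_cells), or 0 if unset
img = page.get_image(scale=2.0, max_size=1024)  # scale is reduced so the longer page side stays at most ~1024 px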

View File

@ -302,7 +302,9 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
        ),
    )

-    generate_parsed_pages: bool = False
+    generate_parsed_pages: Literal[True] = (
+        True  # Always True since parsed_page is now mandatory
+    )


class ProcessingPipeline(str, Enum):

View File

@ -10,6 +10,8 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
class BaseVlmOptions(BaseModel):
    kind: str
    prompt: str
+    scale: float = 2.0
+    max_size: Optional[int] = None


class ResponseFormat(str, Enum):
@@ -49,8 +51,6 @@ class InlineVlmOptions(BaseVlmOptions):
        AcceleratorDevice.MPS,
    ]

-    scale: float = 2.0
    temperature: float = 0.0
    stop_strings: List[str] = []
    extra_generation_config: Dict[str, Any] = {}
@@ -76,7 +76,6 @@ class ApiVlmOptions(BaseVlmOptions):
    )  # Default to ollama
    headers: Dict[str, str] = {}
    params: Dict[str, Any] = {}
-    scale: float = 2.0
    timeout: float = 60
    concurrency: int = 1
    response_format: ResponseFormat
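Because scale and max_size now live on BaseVlmOptions, every VLM options class accepts them; a hedged sketch with ApiVlmOptions (the field values and endpoint are assumptions):

# Hypothetical sketch: cap the image size sent to a remote VLM.
opts = ApiVlmOptions(
    url="http://localhost:11434/v1/chat/completions",  # assumed ollama-style endpoint
    prompt="Convert this page to markdown.",
    response_format=ResponseFormat.MARKDOWN,
    scale=2.0,
    max_size=1024,  # page images are downscaled so their longer side stays around 1024 px
)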

View File

@ -48,7 +48,9 @@ class ApiVlmModel(BasePageModel):
                with TimeRecorder(conv_res, "vlm"):
                    assert page.size is not None

-                    hi_res_image = page.get_image(scale=self.vlm_options.scale)
+                    hi_res_image = page.get_image(
+                        scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
+                    )
                    assert hi_res_image is not None
                    if hi_res_image:
                        if hi_res_image.mode != "RGB":

View File

@ -86,7 +86,7 @@ class BaseItemAndImageEnrichmentModel(
            coord_origin=bbox.coord_origin,
        )

-        page_ix = element_prov.page_no - 1
+        page_ix = element_prov.page_no - conv_res.pages[0].page_no - 1
        cropped_image = conv_res.pages[page_ix].get_image(
            scale=self.images_scale, cropbox=expanded_bbox
        )
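The new index arithmetic accounts for conversions that start at an arbitrary page; a small worked example, assuming Page.page_no is 0-based as elsewhere in this diff:

# Hypothetical numbers: a conversion covering document pages 5-8.
# conv_res.pages[0].page_no == 4 and element_prov.page_no == 6, so
# page_ix = 6 - 4 - 1 == 1  -> conv_res.pages[1], i.e. document page 6.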

View File

@ -7,6 +7,7 @@ from typing import List, Optional, Type
import numpy as np
from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import TextCell
from PIL import Image, ImageDraw
from rtree import index
from scipy.ndimage import binary_dilation, find_objects, label
@@ -107,7 +108,9 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
            return []

    # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
-    def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
+    def _filter_ocr_cells(
+        self, ocr_cells: List[TextCell], programmatic_cells: List[TextCell]
+    ) -> List[TextCell]:
        # Create R-tree index for programmatic cells
        p = index.Property()
        p.dimension = 2
@@ -130,19 +133,38 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
        ]
        return filtered_ocr_cells

-    def post_process_cells(self, ocr_cells, programmatic_cells):
-        r"""
-        Post-process the ocr and programmatic cells and return the final list of cells
-        """
-        if self.options.force_full_page_ocr:
-            # If a full page OCR is forced, use only the OCR cells
-            cells = ocr_cells
-            return cells
-
-        ## Remove OCR cells which overlap with programmatic cells.
-        filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, programmatic_cells)
-        programmatic_cells.extend(filtered_ocr_cells)
-        return programmatic_cells
+    def post_process_cells(self, ocr_cells: List[TextCell], page: Page) -> None:
+        r"""
+        Post-process the OCR cells and update the page object.
+        Updates parsed_page.textline_cells directly since page.cells is now read-only.
+        """
+        # Get existing cells from the read-only property
+        existing_cells = page.cells
+
+        # Combine existing and OCR cells with overlap filtering
+        final_cells = self._combine_cells(existing_cells, ocr_cells)
+
+        assert page.parsed_page is not None
+
+        # Update parsed_page.textline_cells directly
+        page.parsed_page.textline_cells = final_cells
+        page.parsed_page.has_lines = len(final_cells) > 0
+
+    def _combine_cells(
+        self, existing_cells: List[TextCell], ocr_cells: List[TextCell]
+    ) -> List[TextCell]:
+        """Combine existing and OCR cells with filtering and re-indexing."""
+        if self.options.force_full_page_ocr:
+            combined = ocr_cells
+        else:
+            filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, existing_cells)
+            combined = list(existing_cells) + filtered_ocr_cells
+        # Re-index in-place
+        for i, cell in enumerate(combined):
+            cell.index = i
+        return combined

    def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
        image = copy.deepcopy(page.image)
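A hedged sketch of the call pattern the OCR models below now follow (variable names as in those models):

# Hypothetical sketch inside an OCR model's __call__:
self.post_process_cells(all_ocr_cells, page)          # writes into page.parsed_page.textline_cells
assert page.cells is page.parsed_page.textline_cells  # page.cells is only a view now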

View File

@ -177,7 +177,7 @@ class EasyOcrModel(BaseOcrModel):
                    all_ocr_cells.extend(cells)

                # Post-process the cells
-                page.cells = self.post_process_cells(all_ocr_cells, page.cells)
+                self.post_process_cells(all_ocr_cells, page)

                # DEBUG code:
                if settings.debug.visualize_ocr:

View File

@ -176,9 +176,9 @@ class LayoutModel(BasePageModel):
                    # Apply postprocessing
                    processed_clusters, processed_cells = LayoutPostprocessor(
-                        page.cells, clusters, page.size
+                        page, clusters
                    ).postprocess()
-                    # processed_clusters, processed_cells = clusters, page.cells
+                    # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally

                    with warnings.catch_warnings():
                        warnings.filterwarnings(
@@ -198,7 +198,6 @@ class LayoutModel(BasePageModel):
                            )
                        )

-                    page.cells = processed_cells
                    page.predictions.layout = LayoutPrediction(
                        clusters=processed_clusters
                    )

View File

@ -132,7 +132,7 @@ class OcrMacModel(BaseOcrModel):
                    all_ocr_cells.extend(cells)

                # Post-process the cells
-                page.cells = self.post_process_cells(all_ocr_cells, page.cells)
+                self.post_process_cells(all_ocr_cells, page)

                # DEBUG code:
                if settings.debug.visualize_ocr:

View File

@ -2,7 +2,7 @@ import re
import warnings
from collections.abc import Iterable
from pathlib import Path
-from typing import Optional
+from typing import Literal, Optional

import numpy as np
from PIL import ImageDraw
@@ -17,7 +17,6 @@ from docling.utils.profiling import TimeRecorder
class PagePreprocessingOptions(BaseModel):
    images_scale: Optional[float]
-    create_parsed_page: bool


class PagePreprocessingModel(BasePageModel):
@@ -66,10 +65,8 @@ class PagePreprocessingModel(BasePageModel):
    def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
        assert page._backend is not None

-        page.cells = list(page._backend.get_text_cells())
-
-        if self.options.create_parsed_page:
-            page.parsed_page = page._backend.get_segmented_page()
+        page.parsed_page = page._backend.get_segmented_page()
+        assert page.parsed_page is not None

        # Rate the text quality from the PDF parser, and aggregate on page
        text_scores = []

View File

@ -134,7 +134,7 @@ class RapidOcrModel(BaseOcrModel):
                    all_ocr_cells.extend(cells)

                # Post-process the cells
-                page.cells = self.post_process_cells(all_ocr_cells, page.cells)
+                self.post_process_cells(all_ocr_cells, page)

                # DEBUG code:
                if settings.debug.visualize_ocr:

View File

@ -124,7 +124,7 @@ class ReadingOrderModel:
            page_no = page.page_no + 1
            size = page.size

-            assert size is not None
+            assert size is not None, "Page size is not initialized."
            out_doc.add_page(page_no=page_no, size=size)

View File

@ -306,7 +306,7 @@ class TesseractOcrCliModel(BaseOcrModel):
                        all_ocr_cells.append(cell)

                # Post-process the cells
-                page.cells = self.post_process_cells(all_ocr_cells, page.cells)
+                self.post_process_cells(all_ocr_cells, page)

                # DEBUG code:
                if settings.debug.visualize_ocr:

View File

@ -235,7 +235,7 @@ class TesseractOcrModel(BaseOcrModel):
                    all_ocr_cells.extend(cells)

                # Post-process the cells
-                page.cells = self.post_process_cells(all_ocr_cells, page.cells)
+                self.post_process_cells(all_ocr_cells, page)

                # DEBUG code:
                if settings.debug.visualize_ocr:

View File

@ -123,7 +123,9 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
                with TimeRecorder(conv_res, "vlm"):
                    assert page.size is not None

-                    hi_res_image = page.get_image(scale=self.vlm_options.scale)
+                    hi_res_image = page.get_image(
+                        scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
+                    )

                    # Define prompt structure
                    prompt = self.formulate_prompt()

View File

@ -73,7 +73,9 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
                with TimeRecorder(conv_res, f"vlm-mlx-{self.vlm_options.repo_id}"):
                    assert page.size is not None

-                    hi_res_image = page.get_image(scale=self.vlm_options.scale)
+                    hi_res_image = page.get_image(
+                        scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
+                    )
                    if hi_res_image is not None:
                        im_width, im_height = hi_res_image.size

View File

@ -193,6 +193,17 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
                )
                raise e

+        # Filter out uninitialized pages (those with size=None) that may remain
+        # after timeout or processing failures to prevent assertion errors downstream
+        initial_page_count = len(conv_res.pages)
+        conv_res.pages = [page for page in conv_res.pages if page.size is not None]
+
+        if len(conv_res.pages) < initial_page_count:
+            _log.info(
+                f"Filtered out {initial_page_count - len(conv_res.pages)} uninitialized pages "
+                f"due to timeout or processing failures"
+            )
+
        return conv_res

    def _unload(self, conv_res: ConversionResult) -> ConversionResult:

View File

@ -72,7 +72,6 @@ class StandardPdfPipeline(PaginatedPipeline):
            PagePreprocessingModel(
                options=PagePreprocessingOptions(
                    images_scale=pipeline_options.images_scale,
-                    create_parsed_page=pipeline_options.generate_parsed_pages,
                )
            ),
            # OCR

View File

@ -8,7 +8,7 @@ from docling_core.types.doc import DocItemLabel, Size
from docling_core.types.doc.page import TextCell
from rtree import index

-from docling.datamodel.base_models import BoundingBox, Cluster
+from docling.datamodel.base_models import BoundingBox, Cluster, Page

_log = logging.getLogger(__name__)
@@ -194,11 +194,11 @@ class LayoutPostprocessor:
        DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
    }

-    def __init__(self, cells: List[TextCell], clusters: List[Cluster], page_size: Size):
-        """Initialize processor with cells and clusters."""
-        """Initialize processor with cells and spatial indices."""
-        self.cells = cells
-        self.page_size = page_size
+    def __init__(self, page: Page, clusters: List[Cluster]) -> None:
+        """Initialize processor with page and clusters."""
+        self.cells = page.cells
+        self.page = page
+        self.page_size = page.size
        self.all_clusters = clusters
        self.regular_clusters = [
            c for c in clusters if c.label not in self.SPECIAL_TYPES
@@ -240,6 +240,10 @@ class LayoutPostprocessor:
            for child in cluster.children:
                child.cells = self._sort_cells(child.cells)

+        assert self.page.parsed_page is not None
+        self.page.parsed_page.textline_cells = self.cells
+        self.page.parsed_page.has_lines = len(self.cells) > 0
+
        return final_clusters, self.cells

    def _process_regular_clusters(self) -> List[Cluster]:
@@ -301,6 +305,7 @@ class LayoutPostprocessor:
        special_clusters = self._handle_cross_type_overlaps(special_clusters)

        # Calculate page area from known page size
+        assert self.page_size is not None
        page_area = self.page_size.width * self.page_size.height
        if page_area > 0:
            # Filter out full-page pictures
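A sketch of the updated call site, matching the layout model change above (variable names assumed):

# Hypothetical sketch: the postprocessor now receives the Page and writes results back itself.
processed_clusters, processed_cells = LayoutPostprocessor(page, clusters).postprocess()
# page.parsed_page.textline_cells == processed_cells afterwards; no page.cells assignment is needed.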

View File

@ -121,14 +121,15 @@ def export_documents(
def main():
    logging.basicConfig(level=logging.INFO)

+    data_folder = Path(__file__).parent / "../../tests/data"
    input_doc_paths = [
-        Path("./tests/data/pdf/2206.01062.pdf"),
-        Path("./tests/data/pdf/2203.01017v2.pdf"),
-        Path("./tests/data/pdf/2305.03393v1.pdf"),
-        Path("./tests/data/pdf/redp5110_sampled.pdf"),
+        data_folder / "pdf/2206.01062.pdf",
+        data_folder / "pdf/2203.01017v2.pdf",
+        data_folder / "pdf/2305.03393v1.pdf",
+        data_folder / "pdf/redp5110_sampled.pdf",
    ]

-    # buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
+    # buf = BytesIO((data_folder / "pdf/2206.01062.pdf").open("rb").read())
    # docs = [DocumentStream(name="my_doc.pdf", stream=buf)]
    # input = DocumentConversionInput.from_streams(docs)

View File

@ -16,7 +16,8 @@ _log = logging.getLogger(__name__)
def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2206.01062.pdf"

    ###########################################################################

View File

@ -71,7 +71,8 @@ class ExampleFormulaUnderstandingPipeline(StandardPdfPipeline):
def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/pdf/2203.01017v2.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2203.01017v2.pdf"

    pipeline_options = ExampleFormulaUnderstandingPipelineOptions()
    pipeline_options.do_formula_understanding = True

View File

@ -76,7 +76,8 @@ class ExamplePictureClassifierPipeline(StandardPdfPipeline):
def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2206.01062.pdf"

    pipeline_options = ExamplePictureClassifierPipelineOptions()
    pipeline_options.images_scale = 2.0

View File

@ -16,7 +16,8 @@ IMAGE_RESOLUTION_SCALE = 2.0
def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2206.01062.pdf"
    output_dir = Path("scratch")

    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter

View File

@ -19,7 +19,8 @@ IMAGE_RESOLUTION_SCALE = 2.0
def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2206.01062.pdf"
    output_dir = Path("scratch")

    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter

View File

@ -12,7 +12,8 @@ _log = logging.getLogger(__name__)
def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2206.01062.pdf"
    output_dir = Path("scratch")

    doc_converter = DocumentConverter()

View File

@ -9,7 +9,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
def main():
-    input_doc = Path("./tests/data/pdf/2206.01062.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2206.01062.pdf"

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
@@ -32,7 +33,7 @@ def main():
        }
    )

-    doc = converter.convert(input_doc).document
+    doc = converter.convert(input_doc_path).document
    md = doc.export_to_markdown()
    print(md)

View File

@ -96,7 +96,8 @@ def watsonx_vlm_options():
def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2206.01062.pdf"

    pipeline_options = PdfPipelineOptions(
        enable_remote_services=True  # <-- this is required!

View File

@ -10,7 +10,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
def main():
-    input_doc = Path("./tests/data/pdf/2206.01062.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2206.01062.pdf"

    # Explicitly set the accelerator
    # accelerator_options = AcceleratorOptions(
@@ -47,7 +48,7 @@ def main():
    settings.debug.profile_pipeline_timings = True

    # Convert the document
-    conversion_result = converter.convert(input_doc)
+    conversion_result = converter.convert(input_doc_path)
    doc = conversion_result.document

    # List with total time per document

View File

@ -9,7 +9,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
def main():
-    input_doc = Path("./tests/data/pdf/2206.01062.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2206.01062.pdf"

    # Set lang=["auto"] with a tesseract OCR engine: TesseractOcrOptions, TesseractCliOcrOptions
    # ocr_options = TesseractOcrOptions(lang=["auto"])
@@ -27,7 +28,7 @@ def main():
        }
    )

-    doc = converter.convert(input_doc).document
+    doc = converter.convert(input_doc_path).document
    md = doc.export_to_markdown()
    print(md)

View File

@ -30,7 +30,8 @@ def translate(text: str, src: str = "en", dest: str = "de"):
def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2206.01062.pdf"
    output_dir = Path("scratch")

    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter

View File

@ -95,8 +95,8 @@ def watsonx_vlm_options(model: str, prompt: str):
def main():
    logging.basicConfig(level=logging.INFO)

-    # input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
-    input_doc_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2305.03393v1-pg9.pdf"

    pipeline_options = VlmPipelineOptions(
        enable_remote_services=True  # <-- this is required!

View File

@ -1,6 +1,6 @@
[project]
name = "docling"
-version = "2.36.1"  # DO NOT EDIT, updated automatically
+version = "2.37.0"  # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
license = "MIT"
keywords = [

29
tests/data/asciidoc/test_03.asciidoc vendored Normal file
View File

@ -0,0 +1,29 @@
:_mod-docs-content-type: PROCEDURE
:experimental:
[id="renaming-a-bookmark_{context}"]
= Renaming a bookmark
You can rename a bookmark to distinguish it from other bookmarks. If you have bookmarks to several folders that all share the same name, you can tell the bookmarks apart if you rename them.
Renaming the bookmark does not rename the folder.
.Procedure
. Right-click the bookmark in the side bar.
. Select *Rename…*.
+
image::rename-bookmark-menu.png[Rename bookmark menu]
. In the *Name* field, enter the new name for the bookmark.
+
image::rename-bookmark-text.png[Bookmark name field]
. Click btn:[Rename].
.Verification
* Check that the side bar lists the bookmark under the new name.
+
image::renamed-bookmark.png[Renamed bookmark]

BIN
tests/data/docx/word_image_anchors.docx vendored Normal file

Binary file not shown.

View File

@ -0,0 +1,20 @@
# Contribution guideline example
This is simple.
Foo *emphasis* **strong emphasis** ***both*** .
Create your feature branch: `git checkout -b feature/AmazingFeature` .
1. Pull the [**repository**](https://github.com/docling-project/docling) .
2. Create your feature branch ( `git checkout -b feature/AmazingFeature` )
3. Commit your changes ( `git commit -m 'Add some AmazingFeature'` )
4. Push to the branch ( `git push origin feature/AmazingFeature` )
5. Open a Pull Request
##
*Second* section
- **First** : Lorem ipsum.
- **Second** : Dolor `sit` amet.

View File

@ -0,0 +1,565 @@
body:
children:
- $ref: '#/texts/0'
- $ref: '#/texts/1'
- $ref: '#/groups/0'
- $ref: '#/groups/1'
- $ref: '#/groups/2'
- $ref: '#/texts/27'
- $ref: '#/groups/8'
content_layer: body
label: unspecified
name: _root_
self_ref: '#/body'
form_items: []
furniture:
children: []
content_layer: furniture
label: unspecified
name: _root_
self_ref: '#/furniture'
groups:
- children:
- $ref: '#/texts/2'
- $ref: '#/texts/3'
- $ref: '#/texts/4'
- $ref: '#/texts/5'
- $ref: '#/texts/6'
content_layer: body
label: inline
name: group
parent:
$ref: '#/body'
self_ref: '#/groups/0'
- children:
- $ref: '#/texts/7'
- $ref: '#/texts/8'
- $ref: '#/texts/9'
content_layer: body
label: inline
name: group
parent:
$ref: '#/body'
self_ref: '#/groups/1'
- children:
- $ref: '#/texts/10'
- $ref: '#/texts/14'
- $ref: '#/texts/18'
- $ref: '#/texts/22'
- $ref: '#/texts/26'
content_layer: body
label: ordered_list
name: list
parent:
$ref: '#/body'
self_ref: '#/groups/2'
- children:
- $ref: '#/texts/11'
- $ref: '#/texts/12'
- $ref: '#/texts/13'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/10'
self_ref: '#/groups/3'
- children:
- $ref: '#/texts/15'
- $ref: '#/texts/16'
- $ref: '#/texts/17'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/14'
self_ref: '#/groups/4'
- children:
- $ref: '#/texts/19'
- $ref: '#/texts/20'
- $ref: '#/texts/21'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/18'
self_ref: '#/groups/5'
- children:
- $ref: '#/texts/23'
- $ref: '#/texts/24'
- $ref: '#/texts/25'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/22'
self_ref: '#/groups/6'
- children:
- $ref: '#/texts/28'
- $ref: '#/texts/29'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/27'
self_ref: '#/groups/7'
- children:
- $ref: '#/texts/30'
- $ref: '#/texts/33'
content_layer: body
label: list
name: list
parent:
$ref: '#/body'
self_ref: '#/groups/8'
- children:
- $ref: '#/texts/31'
- $ref: '#/texts/32'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/30'
self_ref: '#/groups/9'
- children:
- $ref: '#/texts/34'
- $ref: '#/texts/35'
- $ref: '#/texts/36'
- $ref: '#/texts/37'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/33'
self_ref: '#/groups/10'
key_value_items: []
name: inline_and_formatting
origin:
binary_hash: 9342273634728023910
filename: inline_and_formatting.md
mimetype: text/markdown
pages: {}
pictures: []
schema_name: DoclingDocument
tables: []
texts:
- children: []
content_layer: body
label: title
orig: Contribution guideline example
parent:
$ref: '#/body'
prov: []
self_ref: '#/texts/0'
text: Contribution guideline example
- children: []
content_layer: body
label: text
orig: This is simple.
parent:
$ref: '#/body'
prov: []
self_ref: '#/texts/1'
text: This is simple.
- children: []
content_layer: body
label: text
orig: Foo
parent:
$ref: '#/groups/0'
prov: []
self_ref: '#/texts/2'
text: Foo
- children: []
content_layer: body
formatting:
bold: false
italic: true
strikethrough: false
underline: false
label: text
orig: emphasis
parent:
$ref: '#/groups/0'
prov: []
self_ref: '#/texts/3'
text: emphasis
- children: []
content_layer: body
formatting:
bold: true
italic: false
strikethrough: false
underline: false
label: text
orig: strong emphasis
parent:
$ref: '#/groups/0'
prov: []
self_ref: '#/texts/4'
text: strong emphasis
- children: []
content_layer: body
formatting:
bold: true
italic: true
strikethrough: false
underline: false
label: text
orig: both
parent:
$ref: '#/groups/0'
prov: []
self_ref: '#/texts/5'
text: both
- children: []
content_layer: body
label: text
orig: .
parent:
$ref: '#/groups/0'
prov: []
self_ref: '#/texts/6'
text: .
- children: []
content_layer: body
label: text
orig: 'Create your feature branch:'
parent:
$ref: '#/groups/1'
prov: []
self_ref: '#/texts/7'
text: 'Create your feature branch:'
- captions: []
children: []
code_language: unknown
content_layer: body
footnotes: []
label: code
orig: git checkout -b feature/AmazingFeature
parent:
$ref: '#/groups/1'
prov: []
references: []
self_ref: '#/texts/8'
text: git checkout -b feature/AmazingFeature
- children: []
content_layer: body
label: text
orig: .
parent:
$ref: '#/groups/1'
prov: []
self_ref: '#/texts/9'
text: .
- children:
- $ref: '#/groups/3'
content_layer: body
enumerated: true
label: list_item
marker: '-'
orig: ''
parent:
$ref: '#/groups/2'
prov: []
self_ref: '#/texts/10'
text: ''
- children: []
content_layer: body
label: text
orig: Pull the
parent:
$ref: '#/groups/3'
prov: []
self_ref: '#/texts/11'
text: Pull the
- children: []
content_layer: body
formatting:
bold: true
italic: false
strikethrough: false
underline: false
hyperlink: https://github.com/docling-project/docling
label: text
orig: repository
parent:
$ref: '#/groups/3'
prov: []
self_ref: '#/texts/12'
text: repository
- children: []
content_layer: body
label: text
orig: .
parent:
$ref: '#/groups/3'
prov: []
self_ref: '#/texts/13'
text: .
- children:
- $ref: '#/groups/4'
content_layer: body
enumerated: true
label: list_item
marker: '-'
orig: ''
parent:
$ref: '#/groups/2'
prov: []
self_ref: '#/texts/14'
text: ''
- children: []
content_layer: body
label: text
orig: Create your feature branch (
parent:
$ref: '#/groups/4'
prov: []
self_ref: '#/texts/15'
text: Create your feature branch (
- captions: []
children: []
code_language: unknown
content_layer: body
footnotes: []
label: code
orig: git checkout -b feature/AmazingFeature
parent:
$ref: '#/groups/4'
prov: []
references: []
self_ref: '#/texts/16'
text: git checkout -b feature/AmazingFeature
- children: []
content_layer: body
label: text
orig: )
parent:
$ref: '#/groups/4'
prov: []
self_ref: '#/texts/17'
text: )
- children:
- $ref: '#/groups/5'
content_layer: body
enumerated: true
label: list_item
marker: '-'
orig: ''
parent:
$ref: '#/groups/2'
prov: []
self_ref: '#/texts/18'
text: ''
- children: []
content_layer: body
label: text
orig: Commit your changes (
parent:
$ref: '#/groups/5'
prov: []
self_ref: '#/texts/19'
text: Commit your changes (
- captions: []
children: []
code_language: unknown
content_layer: body
footnotes: []
label: code
orig: git commit -m 'Add some AmazingFeature'
parent:
$ref: '#/groups/5'
prov: []
references: []
self_ref: '#/texts/20'
text: git commit -m 'Add some AmazingFeature'
- children: []
content_layer: body
label: text
orig: )
parent:
$ref: '#/groups/5'
prov: []
self_ref: '#/texts/21'
text: )
- children:
- $ref: '#/groups/6'
content_layer: body
enumerated: true
label: list_item
marker: '-'
orig: ''
parent:
$ref: '#/groups/2'
prov: []
self_ref: '#/texts/22'
text: ''
- children: []
content_layer: body
label: text
orig: Push to the branch (
parent:
$ref: '#/groups/6'
prov: []
self_ref: '#/texts/23'
text: Push to the branch (
- captions: []
children: []
code_language: unknown
content_layer: body
footnotes: []
label: code
orig: git push origin feature/AmazingFeature
parent:
$ref: '#/groups/6'
prov: []
references: []
self_ref: '#/texts/24'
text: git push origin feature/AmazingFeature
- children: []
content_layer: body
label: text
orig: )
parent:
$ref: '#/groups/6'
prov: []
self_ref: '#/texts/25'
text: )
- children: []
content_layer: body
enumerated: true
label: list_item
marker: '-'
orig: Open a Pull Request
parent:
$ref: '#/groups/2'
prov: []
self_ref: '#/texts/26'
text: Open a Pull Request
- children:
- $ref: '#/groups/7'
content_layer: body
label: section_header
level: 1
orig: ''
parent:
$ref: '#/body'
prov: []
self_ref: '#/texts/27'
text: ''
- children: []
content_layer: body
formatting:
bold: false
italic: true
strikethrough: false
underline: false
label: text
orig: Second
parent:
$ref: '#/groups/7'
prov: []
self_ref: '#/texts/28'
text: Second
- children: []
content_layer: body
label: text
orig: section
parent:
$ref: '#/groups/7'
prov: []
self_ref: '#/texts/29'
text: section
- children:
- $ref: '#/groups/9'
content_layer: body
enumerated: false
label: list_item
marker: '-'
orig: ''
parent:
$ref: '#/groups/8'
prov: []
self_ref: '#/texts/30'
text: ''
- children: []
content_layer: body
formatting:
bold: true
italic: false
strikethrough: false
underline: false
label: text
orig: First
parent:
$ref: '#/groups/9'
prov: []
self_ref: '#/texts/31'
text: First
- children: []
content_layer: body
label: text
orig: ': Lorem ipsum.'
parent:
$ref: '#/groups/9'
prov: []
self_ref: '#/texts/32'
text: ': Lorem ipsum.'
- children:
- $ref: '#/groups/10'
content_layer: body
enumerated: false
label: list_item
marker: '-'
orig: ''
parent:
$ref: '#/groups/8'
prov: []
self_ref: '#/texts/33'
text: ''
- children: []
content_layer: body
formatting:
bold: true
italic: false
strikethrough: false
underline: false
label: text
orig: Second
parent:
$ref: '#/groups/10'
prov: []
self_ref: '#/texts/34'
text: Second
- children: []
content_layer: body
label: text
orig: ': Dolor'
parent:
$ref: '#/groups/10'
prov: []
self_ref: '#/texts/35'
text: ': Dolor'
- captions: []
children: []
code_language: unknown
content_layer: body
footnotes: []
label: code
orig: sit
parent:
$ref: '#/groups/10'
prov: []
references: []
self_ref: '#/texts/36'
text: sit
- children: []
content_layer: body
label: text
orig: amet.
parent:
$ref: '#/groups/10'
prov: []
self_ref: '#/texts/37'
text: amet.
version: 1.3.0

View File

@ -0,0 +1,3 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: chapter: group slide-0
item-2 at level 2: title: X-Library The fully customisable ... llection exclusively for our customers

View File

@ -0,0 +1,86 @@
{
"schema_name": "DoclingDocument",
"version": "1.3.0",
"name": "powerpoint_bad_text",
"origin": {
"mimetype": "application/vnd.ms-powerpoint",
"binary_hash": 1443005848482130016,
"filename": "powerpoint_bad_text.pptx"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/groups/0"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/0"
}
],
"content_layer": "body",
"name": "slide-0",
"label": "chapter"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "title",
"prov": [
{
"page_no": 1,
"bbox": {
"l": 1041400.0,
"t": 4582390.0,
"r": 8083550.0,
"b": 1689099.0,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
118
]
}
],
"orig": "X-Library The fully customisable and copyright-free standard content template collection exclusively for our customers",
"text": "X-Library The fully customisable and copyright-free standard content template collection exclusively for our customers"
}
],
"pictures": [],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {
"1": {
"size": {
"width": 12190413.0,
"height": 6858000.0
},
"page_no": 1
}
}
}

View File

@ -0,0 +1 @@
# X-Library The fully customisable and copyright-free standard content template collection exclusively for our customers

View File

@ -0,0 +1,23 @@
:\_mod-docs-content-type: PROCEDURE :experimental:
# Renaming a bookmark
[id="renaming-a-bookmark\_{context}"]
You can rename a bookmark to distinguish it from other bookmarks. If you have bookmarks to several folders that all share the same name, you can tell the bookmarks apart if you rename them.
Renaming the bookmark does not rename the folder.
- Check that the side bar lists the bookmark under the new name.
Procedure . Right-click the bookmark in the side bar. . Select *Rename…*. +
<!-- image -->
In the *Name* field, enter the new name for the bookmark. +
<!-- image -->
Click btn:[Rename]. .Verification
<!-- image -->

View File

@ -17,14 +17,16 @@ item-0 at level 0: unspecified: group _root_
item-16 at level 2: list_item: Italic bullet 1
item-17 at level 2: list_item: Bold bullet 2
item-18 at level 2: list_item: Underline bullet 3
-item-19 at level 2: inline: group group
-item-20 at level 3: list_item: Some
-item-21 at level 3: list_item: italic
-item-22 at level 3: list_item: bold
-item-23 at level 3: list_item: underline
-item-24 at level 2: list: group list
-item-25 at level 3: inline: group group
-item-26 at level 4: list_item: Nested
-item-27 at level 4: list_item: italic
-item-28 at level 4: list_item: bold
-item-29 at level 1: paragraph:
+item-19 at level 2: list_item:
+item-20 at level 3: inline: group group
+item-21 at level 4: text: Some
+item-22 at level 4: text: italic
+item-23 at level 4: text: bold
+item-24 at level 4: text: underline
+item-25 at level 2: list: group list
+item-26 at level 3: list_item:
+item-27 at level 4: inline: group group
+item-28 at level 5: text: Nested
+item-29 at level 5: text: italic
+item-30 at level 5: text: bold
+item-31 at level 1: paragraph:

View File

@ -42,7 +42,7 @@
"$ref": "#/groups/1" "$ref": "#/groups/1"
}, },
{ {
"$ref": "#/texts/23" "$ref": "#/texts/25"
} }
], ],
"content_layer": "body", "content_layer": "body",
@ -98,7 +98,7 @@
"$ref": "#/texts/15" "$ref": "#/texts/15"
}, },
{ {
"$ref": "#/groups/2" "$ref": "#/texts/16"
}, },
{ {
"$ref": "#/groups/3" "$ref": "#/groups/3"
@ -111,12 +111,9 @@
{ {
"self_ref": "#/groups/2", "self_ref": "#/groups/2",
"parent": { "parent": {
"$ref": "#/groups/1" "$ref": "#/texts/16"
}, },
"children": [ "children": [
{
"$ref": "#/texts/16"
},
{ {
"$ref": "#/texts/17" "$ref": "#/texts/17"
}, },
@ -125,6 +122,9 @@
}, },
{ {
"$ref": "#/texts/19" "$ref": "#/texts/19"
},
{
"$ref": "#/texts/20"
} }
], ],
"content_layer": "body", "content_layer": "body",
@ -138,7 +138,7 @@
}, },
"children": [ "children": [
{ {
"$ref": "#/groups/4" "$ref": "#/texts/21"
} }
], ],
"content_layer": "body", "content_layer": "body",
@ -148,17 +148,17 @@
{ {
"self_ref": "#/groups/4", "self_ref": "#/groups/4",
"parent": { "parent": {
"$ref": "#/groups/3" "$ref": "#/texts/21"
}, },
"children": [ "children": [
{
"$ref": "#/texts/20"
},
{
"$ref": "#/texts/21"
},
{ {
"$ref": "#/texts/22" "$ref": "#/texts/22"
},
{
"$ref": "#/texts/23"
},
{
"$ref": "#/texts/24"
} }
], ],
"content_layer": "body", "content_layer": "body",
@ -461,20 +461,18 @@
{ {
"self_ref": "#/texts/16", "self_ref": "#/texts/16",
"parent": { "parent": {
"$ref": "#/groups/2" "$ref": "#/groups/1"
}, },
"children": [], "children": [
{
"$ref": "#/groups/2"
}
],
"content_layer": "body", "content_layer": "body",
"label": "list_item", "label": "list_item",
"prov": [], "prov": [],
"orig": "Some", "orig": "",
"text": "Some", "text": "",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
},
"enumerated": false, "enumerated": false,
"marker": "-" "marker": "-"
}, },
@ -485,18 +483,16 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "list_item", "label": "text",
"prov": [], "prov": [],
"orig": "italic", "orig": "Some",
"text": "italic", "text": "Some",
"formatting": { "formatting": {
"bold": false, "bold": false,
"italic": true, "italic": false,
"underline": false, "underline": false,
"strikethrough": false "strikethrough": false
}, }
"enumerated": false,
"marker": "-"
}, },
{ {
"self_ref": "#/texts/18", "self_ref": "#/texts/18",
@ -505,67 +501,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "list_item", "label": "text",
"prov": [],
"orig": "bold",
"text": "bold",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/19",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "underline",
"text": "underline",
"formatting": {
"bold": false,
"italic": false,
"underline": true,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/20",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Nested",
"text": "Nested",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/21",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [], "prov": [],
"orig": "italic", "orig": "italic",
"text": "italic", "text": "italic",
@ -574,7 +510,59 @@
"italic": true, "italic": true,
"underline": false, "underline": false,
"strikethrough": false "strikethrough": false
}
},
{
"self_ref": "#/texts/19",
"parent": {
"$ref": "#/groups/2"
}, },
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "bold",
"text": "bold",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/20",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "underline",
"text": "underline",
"formatting": {
"bold": false,
"italic": false,
"underline": true,
"strikethrough": false
}
},
{
"self_ref": "#/texts/21",
"parent": {
"$ref": "#/groups/3"
},
"children": [
{
"$ref": "#/groups/4"
}
],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "",
"text": "",
"enumerated": false, "enumerated": false,
"marker": "-" "marker": "-"
}, },
@ -585,7 +573,43 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "list_item", "label": "text",
"prov": [],
"orig": "Nested",
"text": "Nested",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/23",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "italic",
"text": "italic",
"formatting": {
"bold": false,
"italic": true,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/24",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [], "prov": [],
"orig": "bold", "orig": "bold",
"text": "bold", "text": "bold",
@ -594,12 +618,10 @@
"italic": false, "italic": false,
"underline": false, "underline": false,
"strikethrough": false "strikethrough": false
}, }
"enumerated": false,
"marker": "-"
}, },
{ {
"self_ref": "#/texts/23", "self_ref": "#/texts/25",
"parent": { "parent": {
"$ref": "#/body" "$ref": "#/body"
}, },

View File

@ -0,0 +1,16 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: paragraph: Transcript
item-2 at level 1: paragraph: February 20, 2025, 8:32PM
item-3 at level 1: picture
item-4 at level 1: inline: group group
item-5 at level 2: paragraph: This is test 1
item-6 at level 2: paragraph: 0:08
Correct, he is not.
item-7 at level 1: paragraph:
item-8 at level 1: picture
item-9 at level 1: inline: group group
item-10 at level 2: paragraph: This is test 2
item-11 at level 2: paragraph: 0:16
Yeah, exactly.
item-12 at level 1: paragraph:
item-13 at level 1: paragraph:

View File

@ -0,0 +1,286 @@
{
"schema_name": "DoclingDocument",
"version": "1.3.0",
"name": "word_image_anchors",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"binary_hash": 2428692234257307633,
"filename": "word_image_anchors.docx"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
},
{
"$ref": "#/texts/1"
},
{
"$ref": "#/pictures/0"
},
{
"$ref": "#/groups/0"
},
{
"$ref": "#/texts/4"
},
{
"$ref": "#/pictures/1"
},
{
"$ref": "#/groups/1"
},
{
"$ref": "#/texts/7"
},
{
"$ref": "#/texts/8"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/2"
},
{
"$ref": "#/texts/3"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/1",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/5"
},
{
"$ref": "#/texts/6"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "Transcript",
"text": "Transcript",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "February 20, 2025, 8:32PM",
"text": "February 20, 2025, 8:32PM",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "This is test 1",
"text": "This is test 1",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/3",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "0:08\nCorrect, he is not.",
"text": "0:08\nCorrect, he is not.",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/4",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/5",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "This is test 2",
"text": "This is test 2",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/6",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "0:16\nYeah, exactly.",
"text": "0:16\nYeah, exactly.",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/7",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/8",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
}
],
"pictures": [
{
"self_ref": "#/pictures/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "picture",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"image": {
"mimetype": "image/png",
"dpi": 72,
"size": {
"width": 100.0,
"height": 100.0
},
"uri": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAGQAAABkCAYAAABw4pVUAAAAz0lEQVR4nO3bUW0CURRF0TukQvDSauBr0mACE1VBAzYQg5Lpdw0wO2EtA+cl+/6+GQAAAAAAAAAAAADe1DIR53X9mcNcdhnf5nm93Y8T8DElyzyuv/evlx/CMqeJOOz9AP4TJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiWp8+t/k8f6/bDrvPl28CAAAAAAAAAAAAAAAAzLv5A5bTEG2TIIlOAAAAAElFTkSuQmCC"
},
"annotations": []
},
{
"self_ref": "#/pictures/1",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "picture",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"image": {
"mimetype": "image/png",
"dpi": 72,
"size": {
"width": 100.0,
"height": 100.0
},
"uri": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAGQAAABkCAYAAABw4pVUAAAJIElEQVR4nO2dbWxb1RnH/8+1c5O4bITEwJrRF1ZAI6gtL9oK29oxihAdTQOVoGhbKyS0MDWZJk1CQ+q0aR/4xLYvJNGabdK07MukrSUNaxEvg7aUlteuLUoHrUTbseylSRSgpLGd3Ac9596kSWzHvva1fXzv/UmW4jaxj5+/z73nPOec50/QnM3t5xdbUWOlZeBGgK8jNpYC3AxQHOAGEMXAXKN+mSgF5nGAxgAeBmiIyToH0GnDwklj0jqxq/fK/0BjCJrR2jn8ZcPCXSBaC9DtAC/39h3oDMBHwHzQMvD3ga74P6ERWgjS1jG8BjAeALgVQEuZ334QoAHA2t3fHX8dQRWktX0obpi1jzDjewSshgYwcIwIf7KSiT8M9DYPB0KQts7RlWDuANCuSw/NAAPoBVF3f1fjCZQRKq8QeBzgragqqA+Ep8olDJXj0kSm+XNi6kQVw8RdnEz+otSXspIK0rZ9eDuIngTQAH8wBuYd/T3xnqoSRIauERi/ZuYN8CFEtG8K1o9LMWT2XJBN20e+TwZ1gdmEnyFKssWde3qafuvpy3r5Ym0dI78B8BiCxc7+7qYfaCXIxvbRpZEa7gOwDsHkwFSKtj7b23iu4oLYs2z6M4BlCDZnAd5S7Gy/KEHu3z5yDxN2AVhUzOv4iE+JsfmZnqbnyy7Iph+O3kcWD2g8264UzAa17nm68W+F/DEV0TOeC8XIChPj3kJ6ChV4z3gpvEzl5FOA17u9pxhuR1PODTwUIzeLJFZ2zEokiDO0Dfpoyg3LnJh5L4gz6QvqPKMY1jmx804QSYcEcAbuJY85MSz+pq7WuGEc831uqtQQJS1Yq3MlJHP2EMnahmJ4ALOpYpkDI9d6hl9T6JVAYqnWiAq5ZKlNCDW1p3y0uKQLY1YqcX22lcesPUSWXUMxSkKDE9v8e4izM+R4adoToiBalWnjROYeonaHhJSULDFO6yFh76hsL0nvIfYmtpBykCHWlGFk9X8d0uqrbqjBj7YtQlODq3QbLAtIphgffcL44N+TeO1oEgfeSkJT2Eolrpo94orO/l/ZawuuvBjFYBhAXS2px9VxE2tWmdjWZmHvgQnsemECmkEq5sAvp/9hztdPNj7DZxAB8SsMfLc1hscfvQz1dXp93+bH3Ji98KTLLvRSEDGAO1abaH8wBp2QmNuLfmk9RM5n+BvDAL6y0sTa23RLzV2K/ax7iDosUzUcOprEmydS6udoBFixJIovLYng2msiMGuyX5YW1RNuXBHFwbd1utGr2D8xI4ik2MFlP7lUFBcnGK+8kZh5/uJh+2e5ibc/FMs6OpN7yjVXR6AZLaKBpOZVq9WZPp/w+vEkXjqcwOQUqoppDeyvkTpg6R+GzltIpeQQVBXhaOD0azntGgyYgQ//p2P3sTUw5By490ePK8u1X4zANDPf2D+6YOGtd+3BgF7wctHCkEP58BG3ttTgG7eZat6RKa0iYrwzqKMggGgRlQoJVGWX20xcvyyKDetqcfsqE7F6yiiGCPH7v45DV0SLqJSr0CCX6Jq776hVj3yQZKMMi/v2XFTDZX3h66JO7RD4kQvjjKMnU3j2lQm898EkdEe0kB7SDJ9yWYzwtVtMLGuOqEnkvoMJ3XtIs2FX1fEvEQNYujiCrZti+NVPPq9m8vpCcUOVOAoAREDzlRGVVpGRmJ5wQ9SpN4Vq49Cs5KJQZwI3LJcEYxRLFkcyDnsFyXFta4vh/OgF/Ou/mk0QiWLRmeJfVcbFeclF4blX7ecy+vrOxno0Xp5ZlSVfiOBba2rxx37NhsDMNe4WrKuEFw8nsHd/Qg13MxGJAC0r5qxea4OhyuL5kGPvpTD2cfZLsfQemUxqBVHKsGsU+o9TZyeRWCDjK72kvlazCTHzuAx7x+BDWlZEEVtgQ8PUFHAxodtghsZk2FuRUnal5tabanD557LfIkUM6UV6wcPSQ4bgMzbeWYcNa+vUWnsmZJR/bkizIa+ChqJS15ZYs2tpHsj+qju/eim5KMGXeYg8FpqHCOMTjBOn9BvLiBZRKTKMKuTrt5jq4RbpHYOnJ/H8oblzGD2g04ZUfEaAODs0pd+E0EG0iEr57Sl/zg/Tesbpc5P43V/G9UuZOIgWhl0LXcpv+5ePLzAGXp7Az57+RON1ETojWjhTVT4CwDcbHZIpVjfuMx9O4cjxJPa/mdR8HWRGA2crKfNBED0MjTj+fgqP/tSXc9bMiAbT+7LEJaDS7Qk6lqOBEsQp9zBY6UYFmMHpkhuzhldi2RBSGS7FfpYg1u4KtSYEl2I/J2eyqWPkH34+RaUj4lmyp7vp5unnc2aEYmZSkVYFGJoX8zmCiLOMY2YSUh7YiXlmQZzz0r1lakwI0Du/KlB6Eouou5wtCjSUHus0QezaG+SqkmZIIVBf/tWACE8V9B4h+ZMlxhkFEeXEc8nFy4e4QGKbzWQs60KIGGApz6UQrxlzYgtXgqi7P/MOz5sTdJh3LOT0lnN3w/2do3vDyqTemYk909X47YV+J+farbiRSRFgj9oUXIiSKpY5yCmIpIXFjcyzhgUUtrgzH5u9vHY3ONZwOz1pWTDZma+9nqsdcm0dI/tDhwTXHOjvbvpmvr/sav+PWMPZbmQheXLWiRlKIojt08dbbDufkDwsj7a49TZ0vUNOPJXEGi5M0+c0BdtciKdhQVsWxX1MrOFCUbLb5hXqZVjwHlLx6RNruPDylWYseW+hHoZCUZt67W8Brw9v9DPWq+uLcfkUQnNiv5kTzya07y4eT88hSMOY0R6I3BdRUj6rl2IInh8MkRSBuJFJZhM+hYj2yWfMNx3i6rVRQpQBFtGTPrJOGpP1jP6eeE+p3qCkR6ek4WKA5YflYCbuks9SSjGEsh2/tZ17xOaHXeV2Kg/1yYaEbGvgnr8byoxjqSTOMu06GMdkQTIQvbJvqlxCTFOxgCg3H7P2EfHP0GWDNwPHZK+tbO9caN27lGjxDbX9M8SyQbkElNsUYNA+n2HtLiQZ6EtB0syQLdxl10KX8tteV92WE8d8RM70yTGyfJZVAy0I5iHlt6XisxQZlrq2TlnbZrt4Jzc4JQrtqnhS+0uVm5IKR1JUh4akXIWqkGDhpJwDt4+B68tnvr6L5zB8YjIAAAAASUVORK5CYII="
},
"annotations": []
}
],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {}
}

View File

@ -0,0 +1,13 @@
**Transcript**
February 20, 2025, 8:32PM
<!-- image -->
**This is test 1** 0:08
Correct, he is not.
<!-- image -->
**This is test 2** 0:16
Yeah, exactly.

18
tests/data/md/inline_and_formatting.md vendored Normal file
View File

@ -0,0 +1,18 @@
# Contribution guideline example
This is simple.
Foo *emphasis* **strong emphasis** ***both***.
Create your feature branch: `git checkout -b feature/AmazingFeature`.
1. Pull the [**repository**](https://github.com/docling-project/docling).
2. Create your feature branch (`git checkout -b feature/AmazingFeature`)
3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
4. Push to the branch (`git push origin feature/AmazingFeature`)
5. Open a Pull Request
## *Second* section <!-- inline groups in headings not yet supported by serializers -->
- **First**: Lorem ipsum.
- **Second**: Dolor `sit` amet.

BIN
tests/data/pptx/powerpoint_bad_text.pptx vendored Normal file

Binary file not shown.

View File

@ -5,84 +5,159 @@
    "width": 2000.0,
    "height": 2829.0
  },
This hunk replaces the flat "cells" array and the "parsed_page": null entry with a populated "parsed_page" object. The three OCR text cells (black, text_direction "left_to_right", confidence 1.0, from_ocr true, TOPLEFT rects) move unchanged into "parsed_page.textline_cells":
  0: "Docling bundles PDF document conversion to", x 246.4065456254215 to 1691.991797818404, y 258.9040166758338 to 329.06770715202435
  1: "JSON and Markdown in an easy self contained", x 234.08627147881114 to 1696.0985042090742, y 349.4151792972422 to 419.5788697734327
  2: "package", x 242.29979922858777 to 513.3470125989277, y 439.9752910477536 to 509.8779072023336
"parsed_page" also gains "dimension" (angle 0.0, full-page rect and art/bleed/crop/media/trim bboxes of 2000.0 x 2829.0, coord_origin BOTTOMLEFT, boundary_type "crop_box"), one full-page "bitmap_resources" entry (index 0, uri null), empty "char_cells" and "word_cells", and "has_chars": false, "has_words": false, "has_lines": true, "image": null, "lines": [].
  "predictions": {
    "layout": {
      "clusters": [

View File

@ -5,84 +5,143 @@
    "width": 595.201171875,
    "height": 841.9216918945312
  },
This hunk replaces the flat "cells" array and the "parsed_page": null entry with a populated "parsed_page" object. The three OCR text cells (black, text_direction "left_to_right", confidence 1.0, from_ocr true, TOPLEFT rects) move unchanged into "parsed_page.textline_cells":
  0: "Docling bundles PDF document conversion to", x 73.34702132031646 to 503.64955224479564, y 76.99999977896756 to 97.99999977896755
  1: "JSON and Markdown in an easy self contained", x 69.6796630536824 to 504.8720051760782, y 104.00000011573796 to 124.83139494707741
  2: "package", x 71.84193505100733 to 153.088934155825, y 129.797125232046 to 152.90926970226084
"parsed_page" also gains "dimension" (angle 0.0, full-page rect and art/bleed/crop/media/trim bboxes of 595.201171875 x 841.9216918945312, coord_origin BOTTOMLEFT, boundary_type "crop_box"), empty "bitmap_resources", "char_cells" and "word_cells", and "has_chars": false, "has_words": false, "has_lines": true, "image": null, "lines": [].
  "predictions": {
    "layout": {
      "clusters": [

View File

@ -5,84 +5,143 @@
    "width": 595.201171875,
    "height": 841.9216918945312
  },
This hunk replaces the flat "cells" array and the "parsed_page": null entry with a populated "parsed_page" object. The three OCR text cells (black, text_direction "left_to_right", confidence 1.0, from_ocr true, TOPLEFT rects) move unchanged into "parsed_page.textline_cells":
  0: "Docling bundles PDF document conversion to", x 89.2388782764286 to 521.9863147998661, y 744.0929853494625 to 764.898293373551
  1: "JSON and Markdown in an easy self contained", x 89.23887497045128 to 523.208764293368, y 717.1685676116198 to 739.1977118987292
  2: "package", x 441.2561096985719 to 522.0347860494834, y 690.0429592741025 to 710.0268078458798
"parsed_page" also gains "dimension" (angle 0.0, full-page rect and art/bleed/crop/media/trim bboxes of 595.201171875 x 841.9216918945312, coord_origin BOTTOMLEFT, boundary_type "crop_box"), empty "bitmap_resources", "char_cells" and "word_cells", and "has_chars": false, "has_words": false, "has_lines": true, "image": null, "lines": [].
  "predictions": {
    "layout": {
      "clusters": [

View File

@ -5,84 +5,143 @@
    "width": 841.9216918945312,
    "height": 595.201171875
  },
This hunk replaces the flat "cells" array and the "parsed_page": null entry with a populated "parsed_page" object. The three OCR text cells (black, text_direction "left_to_right", confidence 1.0, from_ocr true, TOPLEFT rects) move unchanged into "parsed_page.textline_cells":
  0: "Docling bundles PDF document conversion to", x 744.0930045534915 to 764.8982839673505, y 73.34702001188118 to 504.87200373583954
  1: "JSON and Markdown in an easy self contained", x 717.168585936602 to 737.9738558137178, y 70.90211682372312 to 504.8720061466397
  2: "package", x 690.2441821046808 to 709.8255852011977, y 72.124570639845 to 152.80629773131633
"parsed_page" also gains "dimension" (angle 0.0, full-page rect and art/bleed/crop/media/trim bboxes of 595.201171875 x 841.9216918945312, coord_origin BOTTOMLEFT, boundary_type "crop_box"), empty "bitmap_resources", "char_cells" and "word_cells", and "has_chars": false, "has_words": false, "has_lines": true, "image": null, "lines": [].
  "predictions": {
    "layout": {
      "clusters": [

View File

@ -5,84 +5,143 @@
    "width": 841.9216918945312,
    "height": 595.201171875
  },
This hunk replaces the flat "cells" array and the "parsed_page": null entry with a populated "parsed_page" object. The three OCR text cells (black, text_direction "left_to_right", confidence 1.0, from_ocr true, TOPLEFT rects) move unchanged into "parsed_page.textline_cells":
  0: "Docling bundles PDF document conversion to", x 77.10171545548258 to 96.68315797053792, y 89.2388734673729 to 520.7638571913312
  1: "JSON and Markdown in an easy self contained", x 100.64168123325977 to 126.08064862014129, y 89.1266754140729 to 523.3236155182395
  2: "package", x 131.21306574279092 to 152.19606490864376, y 441.0071698212682 to 521.0762158417759
"parsed_page" also gains "dimension" (angle 0.0, full-page rect and art/bleed/crop/media/trim bboxes of 595.201171875 x 841.9216918945312, coord_origin BOTTOMLEFT, boundary_type "crop_box"), empty "bitmap_resources", "char_cells" and "word_cells", and "has_chars": false, "has_words": false, "has_lines": true, "image": null, "lines": [].
  "predictions": {
    "layout": {
      "clusters": [

View File

@ -5,84 +5,143 @@
    "width": 595.201171875,
    "height": 841.9216918945312
  },
This hunk replaces the flat "cells" array and the "parsed_page": null entry with a populated "parsed_page" object. The three OCR text cells (black, text_direction "left_to_right", confidence 1.0, from_ocr true, TOPLEFT rects) move unchanged into "parsed_page.textline_cells":
  0: "Docling bundles PDF document conversion to", x 73.34702132031646 to 503.64955224479564, y 76.99999977896756 to 97.99999977896755
  1: "JSON and Markdown in an easy self contained", x 69.6796630536824 to 504.8720051760782, y 104.00000011573796 to 124.83139494707741
  2: "package", x 71.84193505100733 to 153.088934155825, y 129.797125232046 to 152.90926970226084
"parsed_page" also gains "dimension" (angle 0.0, full-page rect and art/bleed/crop/media/trim bboxes of 595.201171875 x 841.9216918945312, coord_origin BOTTOMLEFT, boundary_type "crop_box"), empty "bitmap_resources", "char_cells" and "word_cells", and "has_chars": false, "has_words": false, "has_lines": true, "image": null, "lines": [].
  "predictions": {
    "layout": {
      "clusters": [

View File

@ -5,84 +5,143 @@
    "width": 595.201171875,
    "height": 841.9216918945312
  },
This hunk replaces the flat "cells" array and the "parsed_page": null entry with a populated "parsed_page" object. The three OCR text cells (black, text_direction "left_to_right", confidence 1.0, from_ocr true, TOPLEFT rects) move unchanged into "parsed_page.textline_cells":
  0: "Docling bundles PDF document conversion to", x 89.2388782764286 to 521.9863147998661, y 744.0929853494625 to 764.898293373551
  1: "JSON and Markdown in an easy self contained", x 89.23887497045128 to 523.208764293368, y 717.1685676116198 to 739.1977118987292
  2: "package", x 441.2561096985719 to 522.0347860494834, y 690.0429592741025 to 710.0268078458798
"parsed_page" also gains "dimension" (angle 0.0, full-page rect and art/bleed/crop/media/trim bboxes of 595.201171875 x 841.9216918945312, coord_origin BOTTOMLEFT, boundary_type "crop_box"), empty "bitmap_resources", "char_cells" and "word_cells", and "has_chars": false, "has_words": false, "has_lines": true, "image": null, "lines": [].
  "predictions": {
    "layout": {
      "clusters": [

View File

@ -5,84 +5,143 @@
    "width": 841.9216918945312,
    "height": 595.201171875
  },
This hunk replaces the flat "cells" array and the "parsed_page": null entry with a populated "parsed_page" object. The three OCR text cells (black, text_direction "left_to_right", confidence 1.0, from_ocr true, TOPLEFT rects) move unchanged into "parsed_page.textline_cells":
  0: "Docling bundles PDF document conversion to", x 744.0930045534915 to 764.8982839673505, y 73.34702001188118 to 504.87200373583954
  1: "JSON and Markdown in an easy self contained", x 717.168585936602 to 737.9738558137178, y 70.90211682372312 to 504.8720061466397
  2: "package", x 690.2441821046808 to 709.8255852011977, y 72.124570639845 to 152.80629773131633
"parsed_page" also gains "dimension" (angle 0.0, full-page rect and art/bleed/crop/media/trim bboxes of 595.201171875 x 841.9216918945312, coord_origin BOTTOMLEFT, boundary_type "crop_box"), empty "bitmap_resources", "char_cells" and "word_cells", and "has_chars": false, "has_words": false, "has_lines": true, "image": null, "lines": [].
  "predictions": {
    "layout": {
      "clusters": [

View File

@ -5,84 +5,143 @@
    "width": 841.9216918945312,
    "height": 595.201171875
  },
This hunk replaces the flat "cells" array and the "parsed_page": null entry with a populated "parsed_page" object. The three OCR text cells (black, text_direction "left_to_right", confidence 1.0, from_ocr true, TOPLEFT rects) move unchanged into "parsed_page.textline_cells":
  0: "Docling bundles PDF document conversion to", x 77.10171545548258 to 96.68315797053792, y 89.2388734673729 to 520.7638571913312
  1: "JSON and Markdown in an easy self contained", x 100.64168123325977 to 126.08064862014129, y 89.1266754140729 to 523.3236155182395
  2: "package", x 131.21306574279092 to 152.19606490864376, y 441.0071698212682 to 521.0762158417759
"parsed_page" also gains "dimension" (angle 0.0, full-page rect and art/bleed/crop/media/trim bboxes of 595.201171875 x 841.9216918945312, coord_origin BOTTOMLEFT, boundary_type "crop_box"), empty "bitmap_resources", "char_cells" and "word_cells", and "has_chars": false, "has_words": false, "has_lines": true, "image": null, "lines": [].
  "predictions": {
    "layout": {
      "clusters": [

View File

@ -2,7 +2,11 @@ import glob
 import os
 from pathlib import Path

-from docling.backend.asciidoc_backend import AsciiDocBackend
+from docling.backend.asciidoc_backend import (
+    DEFAULT_IMAGE_HEIGHT,
+    DEFAULT_IMAGE_WIDTH,
+    AsciiDocBackend,
+)
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument

@ -18,6 +22,24 @@ def _get_backend(fname):
     return doc_backend


+def test_parse_picture():
+    line = (
+        "image::images/example1.png[Example Image, width=200, height=150, align=center]"
+    )
+    res = AsciiDocBackend._parse_picture(line)
+    assert res
+    assert res.get("width", 0) == "200"
+    assert res.get("height", 0) == "150"
+    assert res.get("uri", "") == "images/example1.png"
+
+    line = "image::renamed-bookmark.png[Renamed bookmark]"
+    res = AsciiDocBackend._parse_picture(line)
+    assert res
+    assert "width" not in res
+    assert "height" not in res
+    assert res.get("uri", "") == "renamed-bookmark.png"
+
+
 def test_asciidocs_examples():
     fnames = sorted(glob.glob("./tests/data/asciidoc/*.asciidoc"))
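
The new test drives AsciiDocBackend._parse_picture with AsciiDoc block image macros and expects a dict carrying uri, width and height. As a rough illustration of the kind of parsing involved (a hypothetical sketch, not the backend's actual implementation), such a line can be split with a regular expression:

```python
import re
from typing import Optional

# Hypothetical stand-in for the kind of parsing _parse_picture performs; not docling's implementation.
_IMAGE_MACRO = re.compile(r"^image::(?P<uri>\S+)\[(?P<attrs>.*)\]\s*$")


def parse_image_macro(line: str) -> Optional[dict]:
    """Return {'uri': ..., 'width': ..., 'height': ...} for an AsciiDoc block image macro, else None."""
    m = _IMAGE_MACRO.match(line.strip())
    if not m:
        return None
    result = {"uri": m.group("uri")}
    for attr in m.group("attrs").split(","):
        key, _, value = attr.strip().partition("=")
        if key in ("width", "height") and value:
            result[key] = value.strip()
    return result


print(parse_image_macro(
    "image::images/example1.png[Example Image, width=200, height=150, align=center]"
))
# {'uri': 'images/example1.png', 'width': '200', 'height': '150'}
```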

View File

@ -2,7 +2,7 @@ from pathlib import Path
 from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.datamodel.base_models import InputFormat
-from docling.datamodel.document import InputDocument
+from docling.datamodel.document import DoclingDocument, InputDocument

 from .test_data_gen_flag import GEN_TEST_DATA

@ -11,12 +11,15 @@ def test_convert_valid():
     fmt = InputFormat.MD
     cls = MarkdownDocumentBackend
-    test_data_path = Path("tests") / "data"
-    relevant_paths = sorted((test_data_path / "md").rglob("*.md"))
+    root_path = Path("tests") / "data"
+    relevant_paths = sorted((root_path / "md").rglob("*.md"))
     assert len(relevant_paths) > 0
+    yaml_filter = ["inline_and_formatting"]

     for in_path in relevant_paths:
-        gt_path = test_data_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md"
+        md_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md"
+        yaml_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.yaml"

         in_doc = InputDocument(
             path_or_stream=in_path,
@ -33,9 +36,17 @@ def test_convert_valid():
         act_data = act_doc.export_to_markdown()

         if GEN_TEST_DATA:
-            with open(gt_path, mode="w", encoding="utf-8") as f:
+            with open(md_gt_path, mode="w", encoding="utf-8") as f:
                 f.write(f"{act_data}\n")
+
+            if in_path.stem in yaml_filter:
+                with open(yaml_gt_path, mode="w", encoding="utf-8") as f:
+                    act_doc.save_as_yaml(yaml_gt_path)
         else:
-            with open(gt_path, encoding="utf-8") as f:
+            with open(md_gt_path, encoding="utf-8") as f:
                 exp_data = f.read().rstrip()
-            assert exp_data == act_data
+            assert act_data == exp_data
+
+            if in_path.stem in yaml_filter:
+                exp_doc = DoclingDocument.load_from_yaml(yaml_gt_path)
+                assert act_doc == exp_doc
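
Beyond the test itself, the YAML round-trip added here is a convenient way to snapshot a converted document. A minimal sketch, assuming a hypothetical local file sample.md; save_as_yaml and load_from_yaml are the same DoclingDocument methods the test uses:

```python
from pathlib import Path

from docling.datamodel.document import DoclingDocument
from docling.document_converter import DocumentConverter

# Hypothetical input file; any Markdown document works.
source = Path("sample.md")

doc: DoclingDocument = DocumentConverter().convert(source).document

# Persist the full document model and load it back, mirroring the ground-truth check in the test.
yaml_path = source.with_suffix(".yaml")
doc.save_as_yaml(yaml_path)
reloaded = DoclingDocument.load_from_yaml(yaml_path)

assert reloaded == doc  # the round-trip should be lossless
```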

View File

@ -9,6 +9,7 @@ from docling.datamodel.document import (
     DoclingDocument,
     InputDocument,
     SectionHeaderItem,
+    TextItem,
 )
 from docling.document_converter import DocumentConverter

@ -96,18 +97,18 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
         pred_md: str = doc.export_to_markdown()
         assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), (
-            "export to md"
+            f"export to markdown failed on {docx_path}"
         )

         pred_itxt: str = doc._export_to_indented_text(
             max_text_len=70, explicit_tables=False
         )
         assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), (
-            "export to indented-text"
+            f"export to indented-text failed on {docx_path}"
         )

         assert verify_document(doc, str(gt_path) + ".json", generate=GENERATE), (
-            "document document"
+            f"DoclingDocument verification failed on {docx_path}"
         )

         if docx_path.name == "word_tables.docx":
@ -116,7 +117,7 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
                 pred_text=pred_html,
                 gtfile=str(gt_path) + ".html",
                 generate=GENERATE,
-            ), "export to html"
+            ), f"export to html failed on {docx_path}"


 flaky_path = Path("tests/data/docx/textbox.docx")

@ -131,3 +132,42 @@ def test_e2e_docx_conversions():

 @pytest.mark.xfail(strict=False)
 def test_textbox_conversion():
     _test_e2e_docx_conversions_impl(docx_paths=[flaky_path])
+
+
+def test_text_after_image_anchors():
+    """
+    Test to analyse whether text gets parsed after image anchors.
+    """
+    in_path = Path("tests/data/docx/word_image_anchors.docx")
+    in_doc = InputDocument(
+        path_or_stream=in_path,
+        format=InputFormat.DOCX,
+        backend=MsWordDocumentBackend,
+    )
+    backend = MsWordDocumentBackend(
+        in_doc=in_doc,
+        path_or_stream=in_path,
+    )
+    doc = backend.convert()
+
+    found_text_after_anchor_1 = found_text_after_anchor_2 = (
+        found_text_after_anchor_3
+    ) = found_text_after_anchor_4 = False
+    for item, _ in doc.iterate_items():
+        if isinstance(item, TextItem):
+            if item.text == "This is test 1":
+                found_text_after_anchor_1 = True
+            elif item.text == "0:08\nCorrect, he is not.":
+                found_text_after_anchor_2 = True
+            elif item.text == "This is test 2":
+                found_text_after_anchor_3 = True
+            elif item.text == "0:16\nYeah, exactly.":
+                found_text_after_anchor_4 = True
+
+    assert (
+        found_text_after_anchor_1
+        and found_text_after_anchor_2
+        and found_text_after_anchor_3
+        and found_text_after_anchor_4
+    )
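
The anchor check above boils down to walking the converted document and inspecting TextItem contents. The same pattern works for ad-hoc inspection outside pytest; a short sketch, with word_sample.docx as a hypothetical input:

```python
from pathlib import Path

from docling.datamodel.document import TextItem
from docling.document_converter import DocumentConverter

# Hypothetical input file; any .docx works.
doc = DocumentConverter().convert(Path("word_sample.docx")).document

# iterate_items() walks the document tree in reading order, yielding (item, level) pairs.
for item, _ in doc.iterate_items():
    if isinstance(item, TextItem):
        print(repr(item.text))
```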

View File

@ -60,3 +60,25 @@ def test_code_and_formula_conversion():
     gt = "a ^ { 2 } + 8 = 1 2"
     predicted = formula_blocks[0].text
     assert predicted == gt, f"mismatch in text {predicted=}, {gt=}"
+
+
+def test_formula_conversion_with_page_range():
+    pdf_path = Path("tests/data/pdf/code_and_formula.pdf")
+    converter = get_converter()
+
+    print(f"converting {pdf_path} with page range")
+
+    doc_result: ConversionResult = converter.convert(pdf_path, page_range=(2, 2))
+
+    results = doc_result.document.texts
+    formula_blocks = [
+        el
+        for el in results
+        if isinstance(el, TextItem) and el.label == DocItemLabel.FORMULA
+    ]
+    assert len(formula_blocks) == 1
+
+    gt = "a ^ { 2 } + 8 = 1 2"
+    predicted = formula_blocks[0].text
+    assert predicted == gt, f"mismatch in text {predicted=}, {gt=}"
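
The new test also exercises the page_range argument of convert: passing (2, 2) converts only page 2 of the PDF. Outside the test harness (where get_converter is test-local), a plain DocumentConverter can be used the same way; a brief sketch with the test's own PDF path:

```python
from pathlib import Path

from docling.document_converter import DocumentConverter

converter = DocumentConverter()

# page_range is 1-based and inclusive, as in the test above, so (2, 2) converts only the second page.
result = converter.convert(Path("tests/data/pdf/code_and_formula.pdf"), page_range=(2, 2))
print(result.document.export_to_markdown())
```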

View File

@ -57,14 +57,14 @@ def test_e2e_conversions():
     pdf_paths = get_pdf_paths()

     engines: List[Tuple[OcrOptions, bool]] = [
-        (EasyOcrOptions(), False),
         (TesseractOcrOptions(), True),
         (TesseractCliOcrOptions(), True),
-        (EasyOcrOptions(force_full_page_ocr=True), False),
+        (EasyOcrOptions(), False),
         (TesseractOcrOptions(force_full_page_ocr=True), True),
         (TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]), True),
         (TesseractCliOcrOptions(force_full_page_ocr=True), True),
         (TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]), True),
+        (EasyOcrOptions(force_full_page_ocr=True), False),
     ]

     # rapidocr is only available for Python >=3.6,<3.13
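
Each (OcrOptions, bool) pair above is one converter configuration that the end-to-end test runs in turn. For reference, a single one of these option sets can be wired into a regular conversion through the PDF pipeline options; a minimal sketch, with document.pdf as a hypothetical input:

```python
from pathlib import Path

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import EasyOcrOptions, PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# Force full-page OCR with EasyOCR, one of the option sets exercised by the test.
pipeline_options = PdfPipelineOptions(
    do_ocr=True,
    ocr_options=EasyOcrOptions(force_full_page_ocr=True),
)

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
result = converter.convert(Path("document.pdf"))
print(result.document.export_to_markdown())
```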

2
uv.lock generated
View File

@ -818,7 +818,7 @@ wheels = [
 [[package]]
 name = "docling"
-version = "2.36.1"
+version = "2.37.0"
 source = { editable = "." }
 dependencies = [
     { name = "beautifulsoup4" },