Merge branch 'main' of github.com:DS4SD/docling into dev/add-asr-pipeline-v2

This commit is contained in:
Christoph Auer 2025-06-23 09:08:58 +02:00
commit caf18e634b
98 changed files with 340943 additions and 330462 deletions

2
.github/dco.yml vendored Normal file
View File

@ -0,0 +1,2 @@
# DCO check settings: enable remediation commits, so that a missing sign-off
# can be fixed by adding a follow-up commit instead of rewriting history.
# NOTE(review): `individual` presumably lets the original commit author
# remediate their own commits — confirm against the DCO app documentation.
allowRemediationCommits:
individual: true

192
.github/workflows/dco-advisor.yml vendored Normal file
View File

@ -0,0 +1,192 @@
# Posts (or updates) a single PR comment explaining how to fix a failed DCO
# check, or confirming that the check passed. The job never checks out PR
# code; it only talks to the GitHub API via actions/github-script.
name: DCO Advisor Bot

on:
  pull_request_target:
    types: [opened, reopened, synchronize]

permissions:
  pull-requests: write
  issues: write

jobs:
  dco_advisor:
    runs-on: ubuntu-latest
    steps:
      - name: Handle DCO check result
        uses: actions/github-script@v7
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          script: |
            const pr = context.payload.pull_request || context.payload.check_run?.pull_requests?.[0];
            if (!pr) return;

            const prNumber = pr.number;
            const baseRef = pr.base.ref;
            const headSha =
              context.payload.check_run?.head_sha ||
              pr.head?.sha;
            const username = pr.user.login;
            console.log("HEAD SHA:", headSha);

            const sleep = ms => new Promise(resolve => setTimeout(resolve, ms));

            // Poll until the DCO check has a conclusion (max 6 attempts, 5s apart).
            const maxAttempts = 6;
            let dcoCheck = null;
            for (let attempt = 0; attempt < maxAttempts; attempt++) {
              // Ask for the largest page so the DCO run is not missed when the
              // commit has more check runs than the default page size.
              const { data: checks } = await github.rest.checks.listForRef({
                owner: context.repo.owner,
                repo: context.repo.repo,
                ref: headSha,
                per_page: 100
              });
              console.log("All check runs:");
              checks.check_runs.forEach(run => {
                console.log(`- ${run.name} (${run.status}/${run.conclusion}) @ ${run.head_sha}`);
              });
              // Match the DCO check itself, not this advisor job.
              dcoCheck = checks.check_runs.find(run =>
                run.name.toLowerCase().includes("dco") &&
                !run.name.toLowerCase().includes("dco_advisor") &&
                run.head_sha === headSha
              );
              if (dcoCheck?.conclusion) break;
              console.log(`Waiting for DCO check... (${attempt + 1})`);
              // Sleeping after the final attempt would only delay the exit.
              if (attempt < maxAttempts - 1) {
                await sleep(5000); // wait 5 seconds
              }
            }
            if (!dcoCheck || !dcoCheck.conclusion) {
              console.log("DCO check did not complete in time.");
              return;
            }

            const isFailure = ["failure", "action_required"].includes(dcoCheck.conclusion);
            console.log(`DCO check conclusion for ${headSha}: ${dcoCheck.conclusion} (treated as ${isFailure ? "failure" : "success"})`);

            // Collect the commits that lack a Signed-off-by trailer, with their authors.
            let badCommits = [];
            let authorName = "";
            let authorEmail = "";
            let moreInfo = `More info: [DCO check report](${dcoCheck?.html_url})`;
            if (isFailure) {
              // Paginate: a single request returns only the first page of
              // commits, so unsigned commits in larger PRs would be missed.
              const commits = await github.paginate(github.rest.pulls.listCommits, {
                owner: context.repo.owner,
                repo: context.repo.repo,
                pull_number: prNumber,
                per_page: 100,
              });
              for (const commit of commits) {
                const commitMessage = commit.commit.message;
                const signoffMatch = commitMessage.match(/^Signed-off-by:\s+.+<.+>$/m);
                if (!signoffMatch) {
                  console.log(`Bad commit found ${commit.sha}`)
                  badCommits.push({
                    sha: commit.sha,
                    authorName: commit.commit.author.name,
                    authorEmail: commit.commit.author.email,
                  });
                }
              }
            }

            // If multiple authors are present, you could adapt the message accordingly
            // For now, we'll just use the first one
            if (badCommits.length > 0) {
              authorName = badCommits[0].authorName;
              authorEmail = badCommits[0].authorEmail;
            }

            // Generate remediation commit message if needed
            let remediationSnippet = "";
            if (badCommits.length && authorEmail) {
              remediationSnippet = `git commit --allow-empty -s -m "DCO Remediation Commit for ${authorName} <${authorEmail}>\n\n` +
                badCommits.map(c => `I, ${c.authorName} <${c.authorEmail}>, hereby add my Signed-off-by to this commit: ${c.sha}`).join('\n') +
                `"`;
            } else {
              remediationSnippet = "# Unable to auto-generate remediation message. Please check the DCO check details.";
            }

            // Build comment. The hidden HTML marker identifies the bot's own
            // comment so later runs update it instead of posting a new one.
            const commentHeader = '<!-- dco-advice-bot -->';
            let body = "";
            if (isFailure) {
              body = [
                commentHeader,
                '❌ **DCO Check Failed**',
                '',
                `Hi @${username}, your pull request has failed the Developer Certificate of Origin (DCO) check.`,
                '',
                'This repository supports **remediation commits**, so you can fix this without rewriting history — but you must follow the required message format.',
                '',
                '---',
                '',
                '### 🛠 Quick Fix: Add a remediation commit',
                'Run this command:',
                '',
                '```bash',
                remediationSnippet,
                'git push',
                '```',
                '',
                '---',
                '',
                '<details>',
                '<summary>🔧 Advanced: Sign off each commit directly</summary>',
                '',
                '**For the latest commit:**',
                '```bash',
                'git commit --amend --signoff',
                'git push --force-with-lease',
                '```',
                '',
                '**For multiple commits:**',
                '```bash',
                `git rebase --signoff origin/${baseRef}`,
                'git push --force-with-lease',
                '```',
                '',
                '</details>',
                '',
                moreInfo
              ].join('\n');
            } else {
              body = [
                commentHeader,
                '✅ **DCO Check Passed**',
                '',
                `Thanks @${username}, all your commits are properly signed off. 🎉`
              ].join('\n');
            }

            // Get ALL existing comments on the PR. Without pagination only the
            // first page is searched, so on a busy PR the earlier bot comment
            // would not be found and a duplicate would be posted on every push.
            const comments = await github.paginate(github.rest.issues.listComments, {
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: prNumber,
              per_page: 100,
            });

            // Look for a previous bot comment (body may be absent, hence `?.`).
            const existingComment = comments.find(c =>
              c.body?.includes(commentHeader)
            );
            if (existingComment) {
              await github.rest.issues.updateComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                comment_id: existingComment.id,
                body: body
              });
            } else {
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: prNumber,
                body: body
              });
            }

View File

@ -1,3 +1,26 @@
## [v2.37.0](https://github.com/docling-project/docling/releases/tag/v2.37.0) - 2025-06-16
### Feature
* Make Page.parsed_page the only source of truth for text cells, add OCR cells to it ([#1745](https://github.com/docling-project/docling/issues/1745)) ([`7d3302c`](https://github.com/docling-project/docling/commit/7d3302cb48dd91cd29673d7c4eaf7326736d0685))
* Support xlsm files ([#1520](https://github.com/docling-project/docling/issues/1520)) ([`df14022`](https://github.com/docling-project/docling/commit/df140227c3b8bcad0c68bf3d129930cccd96a07e))
### Fix
* Pptx line break and space handling ([#1664](https://github.com/docling-project/docling/issues/1664)) ([`f28d23c`](https://github.com/docling-project/docling/commit/f28d23cf03d059619d1d3482594596ab7c87d197))
* **asciidoc:** Set default size when missing in image directive ([#1769](https://github.com/docling-project/docling/issues/1769)) ([`b886e4d`](https://github.com/docling-project/docling/commit/b886e4df312447d39f58cf6e3c45b0f863940321))
* Handle NoneType error in MsPowerpointDocumentBackend ([#1747](https://github.com/docling-project/docling/issues/1747)) ([`7a275c7`](https://github.com/docling-project/docling/commit/7a275c763731d9c96b7cf32f2e27b8dc8bebacd7))
* Prov for merged-elems ([#1728](https://github.com/docling-project/docling/issues/1728)) ([`6613b9e`](https://github.com/docling-project/docling/commit/6613b9e98bc8b89791dc0334de8970ff243aba82))
* **tesseract:** Initialize df_osd to avoid uninitialized variable error ([#1718](https://github.com/docling-project/docling/issues/1718)) ([`e979750`](https://github.com/docling-project/docling/commit/e979750ce93b2fae89dbb60ff06333f80c1c2908))
* Allow custom torch_dtype in vlm models ([#1735](https://github.com/docling-project/docling/issues/1735)) ([`f7f3113`](https://github.com/docling-project/docling/commit/f7f31137f10999fefdb70da7e5ef56536f650400))
* Improve extraction from textboxes in Word docs ([#1701](https://github.com/docling-project/docling/issues/1701)) ([`9dbcb3d`](https://github.com/docling-project/docling/commit/9dbcb3d7d4f27d1c935c8681c57ed59524452d53))
* Add WEBP to the list of image file extensions ([#1711](https://github.com/docling-project/docling/issues/1711)) ([`a2b83fe`](https://github.com/docling-project/docling/commit/a2b83fe4aea66c273a83bf17177e87d45d3f18d1))
### Documentation
* Update vlm models api examples with LM Studio ([#1759](https://github.com/docling-project/docling/issues/1759)) ([`0432a31`](https://github.com/docling-project/docling/commit/0432a31b2f7c9fe944c3a1d4b608ef938b4f2299))
* Add open webui ([#1734](https://github.com/docling-project/docling/issues/1734)) ([`49b10e7`](https://github.com/docling-project/docling/commit/49b10e74191d4d580c9305ac08d9898a79346d7d))
## [v2.36.1](https://github.com/docling-project/docling/releases/tag/v2.36.1) - 2025-06-04
### Fix

View File

@ -2,7 +2,7 @@ import logging
import re
from io import BytesIO
from pathlib import Path
from typing import Set, Union
from typing import Final, Set, Union
from docling_core.types.doc import (
DocItemLabel,
@ -22,6 +22,9 @@ from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
DEFAULT_IMAGE_WIDTH: Final = 128
DEFAULT_IMAGE_HEIGHT: Final = 128
class AsciiDocBackend(DeclarativeDocumentBackend):
def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
@ -200,9 +203,11 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
item = self._parse_picture(line)
size = None
size: Size
if "width" in item and "height" in item:
size = Size(width=int(item["width"]), height=int(item["height"]))
else:
size = Size(width=DEFAULT_IMAGE_WIDTH, height=DEFAULT_IMAGE_HEIGHT)
uri = None
if (
@ -264,14 +269,16 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
return doc
def _get_current_level(self, parents):
@staticmethod
def _get_current_level(parents):
for k, v in parents.items():
if v is None and k > 0:
return k - 1
return 0
def _get_current_parent(self, parents):
@staticmethod
def _get_current_parent(parents):
for k, v in parents.items():
if v is None and k > 0:
return parents[k - 1]
@ -279,17 +286,21 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
return None
# ========= Title
def _is_title(self, line):
@staticmethod
def _is_title(line):
return re.match(r"^= ", line)
def _parse_title(self, line):
@staticmethod
def _parse_title(line):
return {"type": "title", "text": line[2:].strip(), "level": 0}
# ========= Section headers
def _is_section_header(self, line):
@staticmethod
def _is_section_header(line):
return re.match(r"^==+\s+", line)
def _parse_section_header(self, line):
@staticmethod
def _parse_section_header(line):
match = re.match(r"^(=+)\s+(.*)", line)
marker = match.group(1) # The list marker (e.g., "*", "-", "1.")
@ -303,10 +314,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
}
# ========= Lists
def _is_list_item(self, line):
@staticmethod
def _is_list_item(line):
return re.match(r"^(\s)*(\*|-|\d+\.|\w+\.) ", line)
def _parse_list_item(self, line):
@staticmethod
def _parse_list_item(line):
"""Extract the item marker (number or bullet symbol) and the text of the item."""
match = re.match(r"^(\s*)(\*|-|\d+\.)\s+(.*)", line)
@ -342,14 +355,17 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
}
# ========= Tables
def _is_table_line(self, line):
@staticmethod
def _is_table_line(line):
return re.match(r"^\|.*\|", line)
def _parse_table_line(self, line):
@staticmethod
def _parse_table_line(line):
# Split table cells and trim extra spaces
return [cell.strip() for cell in line.split("|") if cell.strip()]
def _populate_table_as_grid(self, table_data):
@staticmethod
def _populate_table_as_grid(table_data):
num_rows = len(table_data)
# Adjust the table data into a grid format
@ -380,10 +396,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
return data
# ========= Pictures
def _is_picture(self, line):
@staticmethod
def _is_picture(line):
return re.match(r"^image::", line)
def _parse_picture(self, line):
@staticmethod
def _parse_picture(line):
"""
Parse an image macro, extracting its path and attributes.
Syntax: image::path/to/image.png[Alt Text, width=200, height=150, align=center]
@ -406,10 +424,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
return {"type": "picture", "uri": line}
# ========= Captions
def _is_caption(self, line):
@staticmethod
def _is_caption(line):
return re.match(r"^\.(.+)", line)
def _parse_caption(self, line):
@staticmethod
def _parse_caption(line):
mtch = re.match(r"^\.(.+)", line)
if mtch:
text = mtch.group(1)
@ -418,5 +438,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
return {"type": "caption", "text": ""}
# ========= Plain text
def _parse_text(self, line):
@staticmethod
def _parse_text(line):
return {"type": "text", "text": line.strip()}

View File

@ -7,12 +7,17 @@ from typing import List, Optional, Union
import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
from docling_core.types.doc.page import (
BoundingRectangle,
SegmentedPdfPage,
TextCell,
)
from docling_parse.pdf_parsers import pdf_parser_v1
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.backend.pypdfium2_backend import get_pdf_page_geometry
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
@ -36,6 +41,51 @@ class DoclingParsePageBackend(PdfPageBackend):
def is_valid(self) -> bool:
return self.valid
def _compute_text_cells(self) -> List[TextCell]:
"""Compute text cells from docling-parse data."""
cells: List[TextCell] = []
cell_counter = 0
if not self.valid:
return cells
page_size = self.get_size()
parser_width = self._dpage["width"]
parser_height = self._dpage["height"]
for i in range(len(self._dpage["cells"])):
rect = self._dpage["cells"][i]["box"]["device"]
x0, y0, x1, y1 = rect
if x1 < x0:
x0, x1 = x1, x0
if y1 < y0:
y0, y1 = y1, y0
text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
cells.append(
TextCell(
index=cell_counter,
text=text_piece,
orig=text_piece,
from_ocr=False,
rect=BoundingRectangle.from_bounding_box(
BoundingBox(
l=x0 * page_size.width / parser_width,
b=y0 * page_size.height / parser_height,
r=x1 * page_size.width / parser_width,
t=y1 * page_size.height / parser_height,
coord_origin=CoordOrigin.BOTTOMLEFT,
)
).to_top_left_origin(page_size.height),
)
)
cell_counter += 1
return cells
def get_text_in_rect(self, bbox: BoundingBox) -> str:
if not self.valid:
return ""
@ -70,75 +120,27 @@ class DoclingParsePageBackend(PdfPageBackend):
return text_piece
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
if not self.valid:
return None
text_cells = self._compute_text_cells()
# Get the PDF page geometry from pypdfium2
dimension = get_pdf_page_geometry(self._ppage)
# Create SegmentedPdfPage
return SegmentedPdfPage(
dimension=dimension,
textline_cells=text_cells,
char_cells=[],
word_cells=[],
has_lines=len(text_cells) > 0,
has_words=False,
has_chars=False,
)
def get_text_cells(self) -> Iterable[TextCell]:
cells: List[TextCell] = []
cell_counter = 0
if not self.valid:
return cells
page_size = self.get_size()
parser_width = self._dpage["width"]
parser_height = self._dpage["height"]
for i in range(len(self._dpage["cells"])):
rect = self._dpage["cells"][i]["box"]["device"]
x0, y0, x1, y1 = rect
if x1 < x0:
x0, x1 = x1, x0
if y1 < y0:
y0, y1 = y1, y0
text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
cells.append(
TextCell(
index=cell_counter,
text=text_piece,
orig=text_piece,
from_ocr=False,
rect=BoundingRectangle.from_bounding_box(
BoundingBox(
# l=x0, b=y0, r=x1, t=y1,
l=x0 * page_size.width / parser_width,
b=y0 * page_size.height / parser_height,
r=x1 * page_size.width / parser_width,
t=y1 * page_size.height / parser_height,
coord_origin=CoordOrigin.BOTTOMLEFT,
)
).to_top_left_origin(page_size.height),
)
)
cell_counter += 1
def draw_clusters_and_cells():
image = (
self.get_page_image()
) # make new image to avoid drawing on the saved ones
draw = ImageDraw.Draw(image)
for c in cells:
x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
cell_color = (
random.randint(30, 140),
random.randint(30, 140),
random.randint(30, 140),
)
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
image.show()
# before merge:
# draw_clusters_and_cells()
# cells = merge_horizontal_cells(cells)
# after merge:
# draw_clusters_and_cells()
return cells
return self._compute_text_cells()
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
AREA_THRESHOLD = 0 # 32 * 32

View File

@ -7,12 +7,19 @@ from typing import TYPE_CHECKING, List, Optional, Union
import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
from docling_core.types.doc.page import (
BoundingRectangle,
PdfPageBoundaryType,
PdfPageGeometry,
SegmentedPdfPage,
TextCell,
)
from docling_parse.pdf_parsers import pdf_parser_v2
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.backend.pypdfium2_backend import get_pdf_page_geometry
from docling.datamodel.base_models import Size
from docling.utils.locks import pypdfium2_lock
@ -40,6 +47,55 @@ class DoclingParseV2PageBackend(PdfPageBackend):
def is_valid(self) -> bool:
return self.valid
def _compute_text_cells(self) -> List[TextCell]:
"""Compute text cells from docling-parse v2 data."""
cells: List[TextCell] = []
cell_counter = 0
if not self.valid:
return cells
page_size = self.get_size()
parser_width = self._dpage["sanitized"]["dimension"]["width"]
parser_height = self._dpage["sanitized"]["dimension"]["height"]
cells_data = self._dpage["sanitized"]["cells"]["data"]
cells_header = self._dpage["sanitized"]["cells"]["header"]
for i, cell_data in enumerate(cells_data):
x0 = cell_data[cells_header.index("x0")]
y0 = cell_data[cells_header.index("y0")]
x1 = cell_data[cells_header.index("x1")]
y1 = cell_data[cells_header.index("y1")]
if x1 < x0:
x0, x1 = x1, x0
if y1 < y0:
y0, y1 = y1, y0
text_piece = cell_data[cells_header.index("text")]
cells.append(
TextCell(
index=cell_counter,
text=text_piece,
orig=text_piece,
from_ocr=False,
rect=BoundingRectangle.from_bounding_box(
BoundingBox(
l=x0 * page_size.width / parser_width,
b=y0 * page_size.height / parser_height,
r=x1 * page_size.width / parser_width,
t=y1 * page_size.height / parser_height,
coord_origin=CoordOrigin.BOTTOMLEFT,
)
).to_top_left_origin(page_size.height),
)
)
cell_counter += 1
return cells
def get_text_in_rect(self, bbox: BoundingBox) -> str:
if not self.valid:
return ""
@ -81,73 +137,27 @@ class DoclingParseV2PageBackend(PdfPageBackend):
return text_piece
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
if not self.valid:
return None
text_cells = self._compute_text_cells()
# Get the PDF page geometry from pypdfium2
dimension = get_pdf_page_geometry(self._ppage)
# Create SegmentedPdfPage
return SegmentedPdfPage(
dimension=dimension,
textline_cells=text_cells,
char_cells=[],
word_cells=[],
has_textlines=len(text_cells) > 0,
has_words=False,
has_chars=False,
)
def get_text_cells(self) -> Iterable[TextCell]:
cells: List[TextCell] = []
cell_counter = 0
if not self.valid:
return cells
page_size = self.get_size()
parser_width = self._dpage["sanitized"]["dimension"]["width"]
parser_height = self._dpage["sanitized"]["dimension"]["height"]
cells_data = self._dpage["sanitized"]["cells"]["data"]
cells_header = self._dpage["sanitized"]["cells"]["header"]
for i, cell_data in enumerate(cells_data):
x0 = cell_data[cells_header.index("x0")]
y0 = cell_data[cells_header.index("y0")]
x1 = cell_data[cells_header.index("x1")]
y1 = cell_data[cells_header.index("y1")]
if x1 < x0:
x0, x1 = x1, x0
if y1 < y0:
y0, y1 = y1, y0
text_piece = cell_data[cells_header.index("text")]
cells.append(
TextCell(
index=cell_counter,
text=text_piece,
orig=text_piece,
from_ocr=False,
rect=BoundingRectangle.from_bounding_box(
BoundingBox(
# l=x0, b=y0, r=x1, t=y1,
l=x0 * page_size.width / parser_width,
b=y0 * page_size.height / parser_height,
r=x1 * page_size.width / parser_width,
t=y1 * page_size.height / parser_height,
coord_origin=CoordOrigin.BOTTOMLEFT,
)
).to_top_left_origin(page_size.height),
)
)
cell_counter += 1
def draw_clusters_and_cells():
image = (
self.get_page_image()
) # make new image to avoid drawing on the saved ones
draw = ImageDraw.Draw(image)
for c in cells:
x0, y0, x1, y1 = c.bbox.as_tuple()
cell_color = (
random.randint(30, 140),
random.randint(30, 140),
random.randint(30, 140),
)
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
image.show()
# draw_clusters_and_cells()
return cells
return self._compute_text_cells()
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
AREA_THRESHOLD = 0 # 32 * 32

View File

@ -59,20 +59,6 @@ class DoclingParseV4PageBackend(PdfPageBackend):
return self._dpage
def get_text_cells(self) -> Iterable[TextCell]:
page_size = self.get_size()
[tc.to_top_left_origin(page_size.height) for tc in self._dpage.textline_cells]
# for cell in self._dpage.textline_cells:
# rect = cell.rect
#
# assert (
# rect.to_bounding_box().l <= rect.to_bounding_box().r
# ), f"left is > right on bounding box {rect.to_bounding_box()} of rect {rect}"
# assert (
# rect.to_bounding_box().t <= rect.to_bounding_box().b
# ), f"top is > bottom on bounding box {rect.to_bounding_box()} of rect {rect}"
return self._dpage.textline_cells
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
@ -171,12 +157,28 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
self, page_no: int, create_words: bool = True, create_textlines: bool = True
) -> DoclingParseV4PageBackend:
with pypdfium2_lock:
return DoclingParseV4PageBackend(
self.dp_doc.get_page(
seg_page = self.dp_doc.get_page(
page_no + 1,
create_words=create_words,
create_textlines=create_textlines,
),
)
# In Docling, all TextCell instances are expected with top-left origin.
[
tc.to_top_left_origin(seg_page.dimension.height)
for tc in seg_page.textline_cells
]
[
tc.to_top_left_origin(seg_page.dimension.height)
for tc in seg_page.char_cells
]
[
tc.to_top_left_origin(seg_page.dimension.height)
for tc in seg_page.word_cells
]
return DoclingParseV4PageBackend(
seg_page,
self._pdoc[page_no],
)

View File

@ -1,17 +1,15 @@
import logging
import re
import warnings
from copy import deepcopy
from io import BytesIO
from pathlib import Path
from typing import List, Optional, Set, Union
import marko
import marko.element
import marko.ext
import marko.ext.gfm
import marko.inline
from docling_core.types.doc import (
DocItem,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
@ -21,7 +19,9 @@ from docling_core.types.doc import (
TableData,
TextItem,
)
from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
from marko import Markdown
from pydantic import AnyUrl, TypeAdapter
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
@ -71,7 +71,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self.in_table = False
self.md_table_buffer: list[str] = []
self.inline_texts: list[str] = []
self._html_blocks: int = 0
try:
@ -156,25 +155,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
doc.add_table(data=table_data)
return
def _process_inline_text(
self, parent_item: Optional[NodeItem], doc: DoclingDocument
):
txt = " ".join(self.inline_texts)
if len(txt) > 0:
doc.add_text(
label=DocItemLabel.PARAGRAPH,
parent=parent_item,
text=txt,
)
self.inline_texts = []
def _iterate_elements( # noqa: C901
self,
*,
element: marko.element.Element,
depth: int,
doc: DoclingDocument,
visited: Set[marko.element.Element],
parent_item: Optional[NodeItem] = None,
formatting: Optional[Formatting] = None,
hyperlink: Optional[Union[AnyUrl, Path]] = None,
):
if element in visited:
return
@ -183,43 +173,31 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# Check for different element types and process relevant details
if isinstance(element, marko.block.Heading) and len(element.children) > 0:
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
)
if element.level == 1:
doc_label = DocItemLabel.TITLE
if len(element.children) == 1:
child = element.children[0]
snippet_text = str(child.children) # type: ignore
visited.add(child)
else:
doc_label = DocItemLabel.SECTION_HEADER
snippet_text = "" # inline group will be created
# Header could have arbitrary inclusion of bold, italic or emphasis,
# hence we need to traverse the tree to get full text of a header
strings: List[str] = []
# Define a recursive function to traverse the tree
def traverse(node: marko.block.BlockElement):
# Check if the node has a "children" attribute
if hasattr(node, "children"):
# If "children" is a list, continue traversal
if isinstance(node.children, list):
for child in node.children:
traverse(child)
# If "children" is text, add it to header text
elif isinstance(node.children, str):
strings.append(node.children)
traverse(element)
snippet_text = "".join(strings)
if len(snippet_text) > 0:
if doc_label == DocItemLabel.SECTION_HEADER:
if element.level == 1:
parent_item = doc.add_title(
text=snippet_text,
parent=parent_item,
formatting=formatting,
hyperlink=hyperlink,
)
else:
parent_item = doc.add_heading(
text=snippet_text,
level=element.level - 1,
parent=parent_item,
)
else:
parent_item = doc.add_text(
label=doc_label, parent=parent_item, text=snippet_text
formatting=formatting,
hyperlink=hyperlink,
)
elif isinstance(element, marko.block.List):
@ -230,7 +208,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
break
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
if has_non_empty_list_items:
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
@ -240,41 +217,60 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
elif (
isinstance(element, marko.block.ListItem)
and len(element.children) > 0
and isinstance((first_child := element.children[0]), marko.block.Paragraph)
and len(element.children) == 1
and isinstance((child := element.children[0]), marko.block.Paragraph)
and len(child.children) > 0
):
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(" - List item")
snippet_text = str(first_child.children[0].children) # type: ignore
is_numbered = False
if (
parent_item is not None
and isinstance(parent_item, DocItem)
and parent_item.label == GroupLabel.ORDERED_LIST
):
is_numbered = True
doc.add_list_item(
enumerated=is_numbered, parent=parent_item, text=snippet_text
if len(child.children) == 1:
snippet_text = str(child.children[0].children) # type: ignore
visited.add(child)
else:
snippet_text = "" # inline group will be created
is_numbered = isinstance(parent_item, OrderedList)
if not isinstance(parent_item, (OrderedList, UnorderedList)):
_log.warning("ListItem would have not had a list parent, adding one.")
parent_item = doc.add_unordered_list(parent=parent_item)
parent_item = doc.add_list_item(
enumerated=is_numbered,
parent=parent_item,
text=snippet_text,
formatting=formatting,
hyperlink=hyperlink,
)
visited.add(first_child)
elif isinstance(element, marko.inline.Image):
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
fig_caption: Optional[TextItem] = None
if element.title is not None and element.title != "":
fig_caption = doc.add_text(
label=DocItemLabel.CAPTION, text=element.title
label=DocItemLabel.CAPTION,
text=element.title,
formatting=formatting,
hyperlink=hyperlink,
)
doc.add_picture(parent=parent_item, caption=fig_caption)
elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
self._process_inline_text(parent_item, doc)
elif isinstance(element, marko.inline.Emphasis):
_log.debug(f" - Emphasis: {element.children}")
formatting = deepcopy(formatting) if formatting else Formatting()
formatting.italic = True
elif isinstance(element, marko.inline.StrongEmphasis):
_log.debug(f" - StrongEmphasis: {element.children}")
formatting = deepcopy(formatting) if formatting else Formatting()
formatting.bold = True
elif isinstance(element, marko.inline.Link):
_log.debug(f" - Link: {element.children}")
hyperlink = TypeAdapter(Optional[Union[AnyUrl, Path]]).validate_python(
element.dest
)
elif isinstance(element, marko.inline.RawText):
_log.debug(f" - Paragraph (raw text): {element.children}")
@ -287,28 +283,41 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
else:
self.md_table_buffer.append(snippet_text)
else:
elif snippet_text:
self._close_table(doc)
# most likely just inline text
self.inline_texts.append(str(element.children))
doc.add_text(
label=DocItemLabel.TEXT,
parent=parent_item,
text=snippet_text,
formatting=formatting,
hyperlink=hyperlink,
)
elif isinstance(element, marko.inline.CodeSpan):
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(f" - Code Span: {element.children}")
snippet_text = str(element.children).strip()
doc.add_code(parent=parent_item, text=snippet_text)
doc.add_code(
parent=parent_item,
text=snippet_text,
formatting=formatting,
hyperlink=hyperlink,
)
elif (
isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
and len(element.children) > 0
and isinstance((first_child := element.children[0]), marko.inline.RawText)
and len(snippet_text := (first_child.children.strip())) > 0
and isinstance((child := element.children[0]), marko.inline.RawText)
and len(snippet_text := (child.children.strip())) > 0
):
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(f" - Code Block: {element.children}")
doc.add_code(parent=parent_item, text=snippet_text)
doc.add_code(
parent=parent_item,
text=snippet_text,
formatting=formatting,
hyperlink=hyperlink,
)
elif isinstance(element, marko.inline.LineBreak):
if self.in_table:
@ -317,7 +326,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
elif isinstance(element, marko.block.HTMLBlock):
self._html_blocks += 1
self._process_inline_text(parent_item, doc)
self._close_table(doc)
_log.debug(f"HTML Block: {element}")
if (
@ -327,14 +335,25 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# wrap in markers to enable post-processing in convert()
text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
doc.add_code(parent=parent_item, text=text_to_add)
doc.add_code(
parent=parent_item,
text=text_to_add,
formatting=formatting,
hyperlink=hyperlink,
)
else:
if not isinstance(element, str):
self._close_table(doc)
_log.debug(f"Some other element: {element}")
if (
isinstance(element, (marko.block.Paragraph, marko.block.Heading))
and len(element.children) > 1
):
parent_item = doc.add_inline_group(parent=parent_item)
processed_block_types = (
marko.block.Heading,
# marko.block.Heading,
marko.block.CodeBlock,
marko.block.FencedCode,
marko.inline.RawText,
@ -351,6 +370,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
doc=doc,
visited=visited,
parent_item=parent_item,
formatting=formatting,
hyperlink=hyperlink,
)
def is_valid(self) -> bool:
@ -392,7 +413,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
parent_item=None,
visited=set(),
)
self._process_inline_text(None, doc) # handle last hanging inline text
self._close_table(doc=doc) # handle any last hanging table
# if HTML blocks were detected, export to HTML and delegate to HTML backend

View File

@ -20,6 +20,7 @@ from docling_core.types.doc.document import ContentLayer
from PIL import Image, UnidentifiedImageError
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
from pptx.oxml.text import CT_TextLineBreak
from docling.backend.abstract_backend import (
DeclarativeDocumentBackend,
@ -120,97 +121,72 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
return prov
def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size): # noqa: C901
is_a_list = False
def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
is_list_group_created = False
enum_list_item_value = 0
new_list = None
bullet_type = "None"
list_label = GroupLabel.LIST
doc_label = DocItemLabel.LIST_ITEM
prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
# Identify if shape contains lists
for paragraph in shape.text_frame.paragraphs:
# Check if paragraph is a bullet point using the `element` XML
def is_list_item(paragraph):
"""Check if the paragraph is a list item."""
p = paragraph._element
if (
p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
is not None
):
bullet_type = "Bullet"
is_a_list = True
return (True, "Bullet")
elif (
p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
is not None
):
bullet_type = "Numbered"
is_a_list = True
else:
is_a_list = False
if paragraph.level > 0:
return (True, "Numbered")
elif paragraph.level > 0:
# Most likely a sub-list
is_a_list = True
if is_a_list:
# Determine if this is an unordered list or an ordered list.
# Set GroupLabel.ORDERED_LIST when it fits.
if bullet_type == "Numbered":
list_label = GroupLabel.ORDERED_LIST
if is_a_list:
_log.debug("LIST DETECTED!")
return (True, "None")
else:
_log.debug("No List")
# If there is a list inside of the shape, create a new docling list to assign list items to
# if is_a_list:
# new_list = doc.add_group(
# label=list_label, name=f"list", parent=parent_slide
# )
return (False, "None")
# Iterate through paragraphs to build up text
for paragraph in shape.text_frame.paragraphs:
# p_text = paragraph.text.strip()
is_a_list, bullet_type = is_list_item(paragraph)
p = paragraph._element
enum_list_item_value += 1
inline_paragraph_text = ""
inline_list_item_text = ""
for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}):
if len(e.text.strip()) > 0:
e_is_a_list_item = False
is_numbered = False
if (
p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
is not None
):
bullet_type = "Bullet"
e_is_a_list_item = True
elif (
p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
is not None
):
bullet_type = "Numbered"
is_numbered = True
e_is_a_list_item = True
# Convert line breaks to spaces and accumulate text
p_text = ""
for e in p.content_children:
if isinstance(e, CT_TextLineBreak):
p_text += " "
else:
e_is_a_list_item = False
p_text += e.text
if e_is_a_list_item:
if len(inline_paragraph_text) > 0:
# output accumulated inline text:
doc.add_text(
label=doc_label,
if is_a_list:
enum_marker = ""
enumerated = bullet_type == "Numbered"
if not is_list_group_created:
new_list = doc.add_group(
label=GroupLabel.ORDERED_LIST
if enumerated
else GroupLabel.LIST,
name="list",
parent=parent_slide,
text=inline_paragraph_text,
)
is_list_group_created = True
enum_list_item_value = 0
if enumerated:
enum_list_item_value += 1
enum_marker = str(enum_list_item_value) + "."
doc.add_list_item(
marker=enum_marker,
enumerated=enumerated,
parent=new_list,
text=p_text,
prov=prov,
)
# Set marker and enumerated arguments if this is an enumeration element.
inline_list_item_text += e.text
# print(e.text)
else:
else: # is paragraph not a list item
# Assign proper label to the text, depending if it's a Title or Section Header
# For other types of text, assign - PARAGRAPH
doc_label = DocItemLabel.PARAGRAPH
@ -224,32 +200,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
doc_label = DocItemLabel.TITLE
elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
DocItemLabel.SECTION_HEADER
enum_list_item_value = 0
inline_paragraph_text += e.text
if len(inline_paragraph_text) > 0:
# output accumulated inline text:
doc.add_text(
label=doc_label,
parent=parent_slide,
text=inline_paragraph_text,
prov=prov,
)
if len(inline_list_item_text) > 0:
enum_marker = ""
if is_numbered:
enum_marker = str(enum_list_item_value) + "."
if not is_list_group_created:
new_list = doc.add_group(
label=list_label, name="list", parent=parent_slide
)
is_list_group_created = True
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=new_list,
text=inline_list_item_text,
text=p_text,
prov=prov,
)
return

View File

@ -14,7 +14,7 @@ from docling_core.types.doc import (
TableCell,
TableData,
)
from docling_core.types.doc.document import Formatting
from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
from docx import Document
from docx.document import Document as DocxDocument
from docx.oxml.table import CT_Tc
@ -84,7 +84,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.valid = True
except Exception as e:
raise RuntimeError(
f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
) from e
@override
@ -251,9 +251,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self._handle_tables(element, docx_obj, doc)
except Exception:
_log.debug("could not parse a table, broken docx table")
# Check for Image
elif drawing_blip:
self._handle_pictures(docx_obj, drawing_blip, doc)
# Check for Text after the Image
if (
tag_name in ["p"]
and element.find(".//w:t", namespaces=namespaces) is not None
):
self._handle_text_elements(element, docx_obj, doc)
# Check for the sdt containers, like table of contents
elif tag_name in ["sdt"]:
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
@ -268,6 +274,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self._handle_text_elements(element, docx_obj, doc)
else:
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
return doc
def _str_to_int(
@ -578,7 +585,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
all_paragraphs = []
# Sort paragraphs within each container, then process containers
for container_id, paragraphs in container_paragraphs.items():
for paragraphs in container_paragraphs.values():
# Sort by vertical position within each container
sorted_container_paragraphs = sorted(
paragraphs,
@ -689,14 +696,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
doc: DoclingDocument,
) -> None:
paragraph = Paragraph(element, docx_obj)
paragraph_elements = self._get_paragraph_elements(paragraph)
text, equations = self._handle_equations_in_text(
element=element, text=paragraph.text
)
if text is None:
return
paragraph_elements = self._get_paragraph_elements(paragraph)
text = text.strip()
# Common styles for bullet and numbered lists.
@ -912,6 +918,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
)
return
def _add_formatted_list_item(
    self,
    doc: DoclingDocument,
    elements: list,
    marker: str,
    enumerated: bool,
    level: int,
) -> None:
    """Append one list item to the list group stored at ``self.parents[level]``.

    Args:
        doc: Document that receives the new items.
        elements: ``(text, formatting, hyperlink)`` triples making up the
            paragraph of this list item.
        marker: Marker string passed through to ``doc.add_list_item``.
        enumerated: Whether the item is added as an enumerated item.
        level: Key into ``self.parents`` selecting the enclosing list group.

    A single triple becomes one formatted list item. Any other number of
    triples produces a list item with empty text that owns an INLINE group
    holding one TEXT item per triple.
    """
    list_group = self.parents[level]

    # Not expected by construction: only list groups may receive list items.
    if not isinstance(list_group, (OrderedList, UnorderedList)):
        return

    if len(elements) == 1:
        content, style, link = elements[0]
        doc.add_list_item(
            marker=marker,
            enumerated=enumerated,
            parent=list_group,
            text=content,
            formatting=style,
            hyperlink=link,
        )
        return

    # Several runs: the list item is only a container, the runs live in an
    # inline group underneath it.
    container_item = doc.add_list_item(
        marker=marker,
        enumerated=enumerated,
        parent=list_group,
        text="",
    )
    inline_group = doc.add_group(label=GroupLabel.INLINE, parent=container_item)
    for content, style, link in elements:
        doc.add_text(
            label=DocItemLabel.TEXT,
            parent=inline_group,
            text=content,
            formatting=style,
            hyperlink=link,
        )
def _add_list_item(
self,
*,
@ -921,6 +965,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
elements: list,
is_numbered: bool = False,
) -> None:
# TODO: this method is always called with is_numbered. Numbered lists should be properly addressed.
if not elements:
return None
enum_marker = ""
level = self._get_level()
@ -937,21 +984,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
new_parent = self._create_or_reuse_parent(
doc=doc,
prev_parent=self.parents[level],
paragraph_elements=elements,
self._add_formatted_list_item(
doc, elements, enum_marker, is_numbered, level
)
for text, format, hyperlink in elements:
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=new_parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
elif (
self._prev_numid() == numid
and self.level_at_new_list is not None
@ -981,20 +1016,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
new_parent = self._create_or_reuse_parent(
doc=doc,
prev_parent=self.parents[self.level_at_new_list + ilevel],
paragraph_elements=elements,
)
for text, format, hyperlink in elements:
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=new_parent,
text=text,
formatting=format,
hyperlink=hyperlink,
self._add_formatted_list_item(
doc,
elements,
enum_marker,
is_numbered,
self.level_at_new_list + ilevel,
)
elif (
self._prev_numid() == numid
@ -1002,7 +1029,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
and prev_indent is not None
and ilevel < prev_indent
): # Close list
for k, v in self.parents.items():
for k in self.parents:
if k > self.level_at_new_list + ilevel:
self.parents[k] = None
@ -1011,19 +1038,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
new_parent = self._create_or_reuse_parent(
doc=doc,
prev_parent=self.parents[self.level_at_new_list + ilevel],
paragraph_elements=elements,
)
for text, format, hyperlink in elements:
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=new_parent,
text=text,
formatting=format,
hyperlink=hyperlink,
self._add_formatted_list_item(
doc,
elements,
enum_marker,
is_numbered,
self.level_at_new_list + ilevel,
)
self.listIter = 0
@ -1033,21 +1053,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
new_parent = self._create_or_reuse_parent(
doc=doc,
prev_parent=self.parents[level - 1],
paragraph_elements=elements,
)
for text, format, hyperlink in elements:
# Add the list item to the parent group
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=new_parent,
text=text,
formatting=format,
hyperlink=hyperlink,
self._add_formatted_list_item(
doc, elements, enum_marker, is_numbered, level - 1
)
return
def _handle_tables(

View File

@ -8,7 +8,13 @@ from typing import TYPE_CHECKING, List, Optional, Union
import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
from docling_core.types.doc.page import (
BoundingRectangle,
PdfPageBoundaryType,
PdfPageGeometry,
SegmentedPdfPage,
TextCell,
)
from PIL import Image, ImageDraw
from pypdfium2 import PdfTextPage
from pypdfium2._helpers.misc import PdfiumError
@ -16,6 +22,76 @@ from pypdfium2._helpers.misc import PdfiumError
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.utils.locks import pypdfium2_lock
def get_pdf_page_geometry(
    ppage: pdfium.PdfPage,
    angle: float = 0.0,
    boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX,
) -> PdfPageGeometry:
    """
    Build a PdfPageGeometry from a pypdfium2 PdfPage.

    Args:
        ppage: pypdfium2 PdfPage object
        angle: Page rotation angle in degrees (default: 0.0)
        boundary_type: The boundary type for the page (default: CROP_BOX)

    Returns:
        PdfPageGeometry whose ``rect`` is derived from ``ppage.get_bbox()`` and
        whose media/crop/art/bleed/trim boxes fall back to that same box when
        pypdfium2 reports no value for them.
    """
    with pypdfium2_lock:
        # Main page box; also the fallback for every box that is not defined.
        page_bbox = BoundingBox.from_tuple(ppage.get_bbox(), CoordOrigin.BOTTOMLEFT)

        def _box_or_page(raw_box):
            # pypdfium2 tuples are (x0, y0, x1, y1) with a bottom-left origin.
            if raw_box:
                return BoundingBox.from_tuple(raw_box, CoordOrigin.BOTTOMLEFT)
            return page_bbox

        # Query pypdfium2 in a fixed order: media, crop, art, bleed, trim.
        media_raw = ppage.get_mediabox()
        crop_raw = ppage.get_cropbox()
        art_raw = ppage.get_artbox()
        bleed_raw = ppage.get_bleedbox()
        trim_raw = ppage.get_trimbox()

        return PdfPageGeometry(
            angle=angle,
            rect=BoundingRectangle.from_bounding_box(page_bbox),
            boundary_type=boundary_type,
            art_bbox=_box_or_page(art_raw),
            bleed_bbox=_box_or_page(bleed_raw),
            crop_bbox=_box_or_page(crop_raw),
            media_bbox=_box_or_page(media_raw),
            trim_bbox=_box_or_page(trim_raw),
        )
if TYPE_CHECKING:
from docling.datamodel.document import InputDocument
@ -41,38 +117,8 @@ class PyPdfiumPageBackend(PdfPageBackend):
def is_valid(self) -> bool:
return self.valid
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
AREA_THRESHOLD = 0 # 32 * 32
page_size = self.get_size()
with pypdfium2_lock:
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
pos = obj.get_pos()
cropbox = BoundingBox.from_tuple(
pos, origin=CoordOrigin.BOTTOMLEFT
).to_top_left_origin(page_height=page_size.height)
if cropbox.area() > AREA_THRESHOLD:
cropbox = cropbox.scaled(scale=scale)
yield cropbox
def get_text_in_rect(self, bbox: BoundingBox) -> str:
with pypdfium2_lock:
if not self.text_page:
self.text_page = self._ppage.get_textpage()
if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
bbox = bbox.to_bottom_left_origin(self.get_size().height)
with pypdfium2_lock:
text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
return text_piece
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
return None
def get_text_cells(self) -> Iterable[TextCell]:
def _compute_text_cells(self) -> List[TextCell]:
"""Compute text cells from pypdfium."""
with pypdfium2_lock:
if not self.text_page:
self.text_page = self._ppage.get_textpage()
@ -203,30 +249,58 @@ class PyPdfiumPageBackend(PdfPageBackend):
return merged_cells
def draw_clusters_and_cells():
image = (
self.get_page_image()
) # make new image to avoid drawing on the saved ones
draw = ImageDraw.Draw(image)
for c in cells:
x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
cell_color = (
random.randint(30, 140),
random.randint(30, 140),
random.randint(30, 140),
return merge_horizontal_cells(cells)
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
"""Yield bounding boxes of the page's image objects in top-left-origin coordinates.

Boxes are taken from the pypdfium page objects filtered to
FPDF_PAGEOBJ_IMAGE, converted from bottom-left origin using the page
height, and multiplied by `scale`.
"""
# Area cutoff compared against each box below; the trailing note records a 32 * 32 alternative.
AREA_THRESHOLD = 0 # 32 * 32
page_size = self.get_size()
with pypdfium2_lock:
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
pos = obj.get_pos()
cropbox = BoundingBox.from_tuple(
pos, origin=CoordOrigin.BOTTOMLEFT
).to_top_left_origin(page_height=page_size.height)
# NOTE(review): indentation is lost in this view; presumably both the scaling
# and the yield sit under this area check - confirm against the real file.
if cropbox.area() > AREA_THRESHOLD:
cropbox = cropbox.scaled(scale=scale)
yield cropbox
def get_text_in_rect(self, bbox: BoundingBox) -> str:
"""Return the text pypdfium's text page reports inside `bbox`.

The text page is created on first use and kept on `self.text_page`.
"""
with pypdfium2_lock:
if not self.text_page:
self.text_page = self._ppage.get_textpage()
# The box is converted to bottom-left origin before it is handed to get_text_bounded.
if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
bbox = bbox.to_bottom_left_origin(self.get_size().height)
with pypdfium2_lock:
text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
return text_piece
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
    """Build a SegmentedPdfPage holding this page's text-line cells.

    Returns:
        None when the backend page is not valid. Otherwise a SegmentedPdfPage
        whose geometry comes from ``get_pdf_page_geometry`` and whose
        ``textline_cells`` come from ``_compute_text_cells``. Character and
        word cells are not computed by this backend and are left empty.
    """
    if not self.valid:
        return None

    text_cells = self._compute_text_cells()

    # Get the PDF page geometry from pypdfium2
    dimension = get_pdf_page_geometry(self._ppage)

    return SegmentedPdfPage(
        dimension=dimension,
        textline_cells=text_cells,
        char_cells=[],
        word_cells=[],
        # The text-line flag is named `has_lines`, matching what the OCR and
        # layout post-processing assign on parsed_page. The previous
        # `has_textlines` keyword does not match that name.
        has_lines=len(text_cells) > 0,
        has_words=False,
        has_chars=False,
    )
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
image.show()
# before merge:
# draw_clusters_and_cells()
cells = merge_horizontal_cells(cells)
# after merge:
# draw_clusters_and_cells()
return cells
def get_text_cells(self) -> Iterable[TextCell]:
"""Return the page's text cells as produced by `_compute_text_cells`."""
return self._compute_text_cells()
def get_page_image(
self, scale: float = 1, cropbox: Optional[BoundingBox] = None

View File

@ -235,7 +235,6 @@ class Page(BaseModel):
page_no: int
# page_hash: Optional[str] = None
size: Optional[Size] = None
cells: List[TextCell] = []
parsed_page: Optional[SegmentedPdfPage] = None
predictions: PagePredictions = PagePredictions()
assembled: Optional[AssembledUnit] = None
@ -248,12 +247,27 @@ class Page(BaseModel):
float, Image
] = {} # Cache of images in different scales. By default it is cleared during assembling.
@property
def cells(self) -> List[TextCell]:
    """Text cells of the page, read from ``parsed_page.textline_cells``.

    Returns a new empty list while ``parsed_page`` is still None. Otherwise
    returns the very list object stored on ``parsed_page``.
    """
    parsed = self.parsed_page
    if parsed is None:
        return []
    return parsed.textline_cells
def get_image(
self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
self,
scale: float = 1.0,
max_size: Optional[int] = None,
cropbox: Optional[BoundingBox] = None,
) -> Optional[Image]:
if self._backend is None:
return self._image_cache.get(scale, None)
if max_size:
assert self.size is not None
scale = min(scale, max_size / max(self.size.as_tuple()))
if scale not in self._image_cache:
if cropbox is None:
self._image_cache[scale] = self._backend.get_page_image(scale=scale)

View File

@ -302,7 +302,9 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
),
)
generate_parsed_pages: bool = False
generate_parsed_pages: Literal[True] = (
True # Always True since parsed_page is now mandatory
)
class ProcessingPipeline(str, Enum):

View File

@ -10,6 +10,8 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
class BaseVlmOptions(BaseModel):
kind: str
prompt: str
scale: float = 2.0
max_size: Optional[int] = None
class ResponseFormat(str, Enum):
@ -49,8 +51,6 @@ class InlineVlmOptions(BaseVlmOptions):
AcceleratorDevice.MPS,
]
scale: float = 2.0
temperature: float = 0.0
stop_strings: List[str] = []
extra_generation_config: Dict[str, Any] = {}
@ -76,7 +76,6 @@ class ApiVlmOptions(BaseVlmOptions):
) # Default to ollama
headers: Dict[str, str] = {}
params: Dict[str, Any] = {}
scale: float = 2.0
timeout: float = 60
concurrency: int = 1
response_format: ResponseFormat

View File

@ -48,7 +48,9 @@ class ApiVlmModel(BasePageModel):
with TimeRecorder(conv_res, "vlm"):
assert page.size is not None
hi_res_image = page.get_image(scale=self.vlm_options.scale)
hi_res_image = page.get_image(
scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
)
assert hi_res_image is not None
if hi_res_image:
if hi_res_image.mode != "RGB":

View File

@ -86,7 +86,7 @@ class BaseItemAndImageEnrichmentModel(
coord_origin=bbox.coord_origin,
)
page_ix = element_prov.page_no - 1
page_ix = element_prov.page_no - conv_res.pages[0].page_no - 1
cropped_image = conv_res.pages[page_ix].get_image(
scale=self.images_scale, cropbox=expanded_bbox
)

View File

@ -7,6 +7,7 @@ from typing import List, Optional, Type
import numpy as np
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import TextCell
from PIL import Image, ImageDraw
from rtree import index
from scipy.ndimage import binary_dilation, find_objects, label
@ -107,7 +108,9 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
return []
# Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
def _filter_ocr_cells(
self, ocr_cells: List[TextCell], programmatic_cells: List[TextCell]
) -> List[TextCell]:
# Create R-tree index for programmatic cells
p = index.Property()
p.dimension = 2
@ -130,19 +133,38 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
]
return filtered_ocr_cells
def post_process_cells(self, ocr_cells, programmatic_cells):
def post_process_cells(self, ocr_cells: List[TextCell], page: Page) -> None:
r"""
Post-process the ocr and programmatic cells and return the final list of of cells
Post-process the OCR cells and update the page object.
Updates parsed_page.textline_cells directly since page.cells is now read-only.
"""
if self.options.force_full_page_ocr:
# If a full page OCR is forced, use only the OCR cells
cells = ocr_cells
return cells
# Get existing cells from the read-only property
existing_cells = page.cells
## Remove OCR cells which overlap with programmatic cells.
filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, programmatic_cells)
programmatic_cells.extend(filtered_ocr_cells)
return programmatic_cells
# Combine existing and OCR cells with overlap filtering
final_cells = self._combine_cells(existing_cells, ocr_cells)
assert page.parsed_page is not None
# Update parsed_page.textline_cells directly
page.parsed_page.textline_cells = final_cells
page.parsed_page.has_lines = len(final_cells) > 0
def _combine_cells(
self, existing_cells: List[TextCell], ocr_cells: List[TextCell]
) -> List[TextCell]:
"""Combine existing and OCR cells with filtering and re-indexing."""
if self.options.force_full_page_ocr:
combined = ocr_cells
else:
filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, existing_cells)
combined = list(existing_cells) + filtered_ocr_cells
# Re-index in-place
for i, cell in enumerate(combined):
cell.index = i
return combined
def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
image = copy.deepcopy(page.image)

View File

@ -177,7 +177,7 @@ class EasyOcrModel(BaseOcrModel):
all_ocr_cells.extend(cells)
# Post-process the cells
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
self.post_process_cells(all_ocr_cells, page)
# DEBUG code:
if settings.debug.visualize_ocr:

View File

@ -176,9 +176,9 @@ class LayoutModel(BasePageModel):
# Apply postprocessing
processed_clusters, processed_cells = LayoutPostprocessor(
page.cells, clusters, page.size
page, clusters
).postprocess()
# processed_clusters, processed_cells = clusters, page.cells
# Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
with warnings.catch_warnings():
warnings.filterwarnings(
@ -198,7 +198,6 @@ class LayoutModel(BasePageModel):
)
)
page.cells = processed_cells
page.predictions.layout = LayoutPrediction(
clusters=processed_clusters
)

View File

@ -132,7 +132,7 @@ class OcrMacModel(BaseOcrModel):
all_ocr_cells.extend(cells)
# Post-process the cells
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
self.post_process_cells(all_ocr_cells, page)
# DEBUG code:
if settings.debug.visualize_ocr:

View File

@ -2,7 +2,7 @@ import re
import warnings
from collections.abc import Iterable
from pathlib import Path
from typing import Optional
from typing import Literal, Optional
import numpy as np
from PIL import ImageDraw
@ -17,7 +17,6 @@ from docling.utils.profiling import TimeRecorder
class PagePreprocessingOptions(BaseModel):
images_scale: Optional[float]
create_parsed_page: bool
class PagePreprocessingModel(BasePageModel):
@ -66,10 +65,8 @@ class PagePreprocessingModel(BasePageModel):
def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
assert page._backend is not None
page.cells = list(page._backend.get_text_cells())
if self.options.create_parsed_page:
page.parsed_page = page._backend.get_segmented_page()
assert page.parsed_page is not None
# Rate the text quality from the PDF parser, and aggregate on page
text_scores = []

View File

@ -134,7 +134,7 @@ class RapidOcrModel(BaseOcrModel):
all_ocr_cells.extend(cells)
# Post-process the cells
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
self.post_process_cells(all_ocr_cells, page)
# DEBUG code:
if settings.debug.visualize_ocr:

View File

@ -124,7 +124,7 @@ class ReadingOrderModel:
page_no = page.page_no + 1
size = page.size
assert size is not None
assert size is not None, "Page size is not initialized."
out_doc.add_page(page_no=page_no, size=size)

View File

@ -306,7 +306,7 @@ class TesseractOcrCliModel(BaseOcrModel):
all_ocr_cells.append(cell)
# Post-process the cells
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
self.post_process_cells(all_ocr_cells, page)
# DEBUG code:
if settings.debug.visualize_ocr:

View File

@ -235,7 +235,7 @@ class TesseractOcrModel(BaseOcrModel):
all_ocr_cells.extend(cells)
# Post-process the cells
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
self.post_process_cells(all_ocr_cells, page)
# DEBUG code:
if settings.debug.visualize_ocr:

View File

@ -123,7 +123,9 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
with TimeRecorder(conv_res, "vlm"):
assert page.size is not None
hi_res_image = page.get_image(scale=self.vlm_options.scale)
hi_res_image = page.get_image(
scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
)
# Define prompt structure
prompt = self.formulate_prompt()

View File

@ -73,7 +73,9 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
with TimeRecorder(conv_res, f"vlm-mlx-{self.vlm_options.repo_id}"):
assert page.size is not None
hi_res_image = page.get_image(scale=self.vlm_options.scale)
hi_res_image = page.get_image(
scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
)
if hi_res_image is not None:
im_width, im_height = hi_res_image.size

View File

@ -193,6 +193,17 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
)
raise e
# Filter out uninitialized pages (those with size=None) that may remain
# after timeout or processing failures to prevent assertion errors downstream
initial_page_count = len(conv_res.pages)
conv_res.pages = [page for page in conv_res.pages if page.size is not None]
if len(conv_res.pages) < initial_page_count:
_log.info(
f"Filtered out {initial_page_count - len(conv_res.pages)} uninitialized pages "
f"due to timeout or processing failures"
)
return conv_res
def _unload(self, conv_res: ConversionResult) -> ConversionResult:

View File

@ -72,7 +72,6 @@ class StandardPdfPipeline(PaginatedPipeline):
PagePreprocessingModel(
options=PagePreprocessingOptions(
images_scale=pipeline_options.images_scale,
create_parsed_page=pipeline_options.generate_parsed_pages,
)
),
# OCR

View File

@ -8,7 +8,7 @@ from docling_core.types.doc import DocItemLabel, Size
from docling_core.types.doc.page import TextCell
from rtree import index
from docling.datamodel.base_models import BoundingBox, Cluster
from docling.datamodel.base_models import BoundingBox, Cluster, Page
_log = logging.getLogger(__name__)
@ -194,11 +194,11 @@ class LayoutPostprocessor:
DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
}
def __init__(self, cells: List[TextCell], clusters: List[Cluster], page_size: Size):
"""Initialize processor with cells and clusters."""
"""Initialize processor with cells and spatial indices."""
self.cells = cells
self.page_size = page_size
def __init__(self, page: Page, clusters: List[Cluster]) -> None:
"""Initialize processor with page and clusters."""
self.cells = page.cells
self.page = page
self.page_size = page.size
self.all_clusters = clusters
self.regular_clusters = [
c for c in clusters if c.label not in self.SPECIAL_TYPES
@ -240,6 +240,10 @@ class LayoutPostprocessor:
for child in cluster.children:
child.cells = self._sort_cells(child.cells)
assert self.page.parsed_page is not None
self.page.parsed_page.textline_cells = self.cells
self.page.parsed_page.has_lines = len(self.cells) > 0
return final_clusters, self.cells
def _process_regular_clusters(self) -> List[Cluster]:
@ -301,6 +305,7 @@ class LayoutPostprocessor:
special_clusters = self._handle_cross_type_overlaps(special_clusters)
# Calculate page area from known page size
assert self.page_size is not None
page_area = self.page_size.width * self.page_size.height
if page_area > 0:
# Filter out full-page pictures

View File

@ -121,14 +121,15 @@ def export_documents(
def main():
logging.basicConfig(level=logging.INFO)
data_folder = Path(__file__).parent / "../../tests/data"
input_doc_paths = [
Path("./tests/data/pdf/2206.01062.pdf"),
Path("./tests/data/pdf/2203.01017v2.pdf"),
Path("./tests/data/pdf/2305.03393v1.pdf"),
Path("./tests/data/pdf/redp5110_sampled.pdf"),
data_folder / "pdf/2206.01062.pdf",
data_folder / "pdf/2203.01017v2.pdf",
data_folder / "pdf/2305.03393v1.pdf",
data_folder / "pdf/redp5110_sampled.pdf",
]
# buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
# buf = BytesIO((data_folder / "pdf/2206.01062.pdf").open("rb").read())
# docs = [DocumentStream(name="my_doc.pdf", stream=buf)]
# input = DocumentConversionInput.from_streams(docs)

View File

@ -16,7 +16,8 @@ _log = logging.getLogger(__name__)
def main():
logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
data_folder = Path(__file__).parent / "../../tests/data"
input_doc_path = data_folder / "pdf/2206.01062.pdf"
###########################################################################

View File

@ -71,7 +71,8 @@ class ExampleFormulaUnderstandingPipeline(StandardPdfPipeline):
def main():
logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/pdf/2203.01017v2.pdf")
data_folder = Path(__file__).parent / "../../tests/data"
input_doc_path = data_folder / "pdf/2203.01017v2.pdf"
pipeline_options = ExampleFormulaUnderstandingPipelineOptions()
pipeline_options.do_formula_understanding = True

View File

@ -76,7 +76,8 @@ class ExamplePictureClassifierPipeline(StandardPdfPipeline):
def main():
logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
data_folder = Path(__file__).parent / "../../tests/data"
input_doc_path = data_folder / "pdf/2206.01062.pdf"
pipeline_options = ExamplePictureClassifierPipelineOptions()
pipeline_options.images_scale = 2.0

View File

@ -16,7 +16,8 @@ IMAGE_RESOLUTION_SCALE = 2.0
def main():
logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
data_folder = Path(__file__).parent / "../../tests/data"
input_doc_path = data_folder / "pdf/2206.01062.pdf"
output_dir = Path("scratch")
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter

View File

@ -19,7 +19,8 @@ IMAGE_RESOLUTION_SCALE = 2.0
def main():
logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
data_folder = Path(__file__).parent / "../../tests/data"
input_doc_path = data_folder / "pdf/2206.01062.pdf"
output_dir = Path("scratch")
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter

View File

@ -12,7 +12,8 @@ _log = logging.getLogger(__name__)
def main():
logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
data_folder = Path(__file__).parent / "../../tests/data"
input_doc_path = data_folder / "pdf/2206.01062.pdf"
output_dir = Path("scratch")
doc_converter = DocumentConverter()

View File

@ -9,7 +9,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
def main():
input_doc = Path("./tests/data/pdf/2206.01062.pdf")
data_folder = Path(__file__).parent / "../../tests/data"
input_doc_path = data_folder / "pdf/2206.01062.pdf"
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
@ -32,7 +33,7 @@ def main():
}
)
doc = converter.convert(input_doc).document
doc = converter.convert(input_doc_path).document
md = doc.export_to_markdown()
print(md)

View File

@ -96,7 +96,8 @@ def watsonx_vlm_options():
def main():
logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
data_folder = Path(__file__).parent / "../../tests/data"
input_doc_path = data_folder / "pdf/2206.01062.pdf"
pipeline_options = PdfPipelineOptions(
enable_remote_services=True # <-- this is required!

View File

@ -10,7 +10,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
def main():
input_doc = Path("./tests/data/pdf/2206.01062.pdf")
data_folder = Path(__file__).parent / "../../tests/data"
input_doc_path = data_folder / "pdf/2206.01062.pdf"
# Explicitly set the accelerator
# accelerator_options = AcceleratorOptions(
@ -47,7 +48,7 @@ def main():
settings.debug.profile_pipeline_timings = True
# Convert the document
conversion_result = converter.convert(input_doc)
conversion_result = converter.convert(input_doc_path)
doc = conversion_result.document
# List with total time per document

View File

@ -9,7 +9,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
def main():
input_doc = Path("./tests/data/pdf/2206.01062.pdf")
data_folder = Path(__file__).parent / "../../tests/data"
input_doc_path = data_folder / "pdf/2206.01062.pdf"
# Set lang=["auto"] with a tesseract OCR engine: TesseractOcrOptions, TesseractCliOcrOptions
# ocr_options = TesseractOcrOptions(lang=["auto"])
@ -27,7 +28,7 @@ def main():
}
)
doc = converter.convert(input_doc).document
doc = converter.convert(input_doc_path).document
md = doc.export_to_markdown()
print(md)

View File

@ -30,7 +30,8 @@ def translate(text: str, src: str = "en", dest: str = "de"):
def main():
logging.basicConfig(level=logging.INFO)
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
data_folder = Path(__file__).parent / "../../tests/data"
input_doc_path = data_folder / "pdf/2206.01062.pdf"
output_dir = Path("scratch")
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter

View File

@ -95,8 +95,8 @@ def watsonx_vlm_options(model: str, prompt: str):
def main():
logging.basicConfig(level=logging.INFO)
# input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
input_doc_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf")
data_folder = Path(__file__).parent / "../../tests/data"
input_doc_path = data_folder / "pdf/2305.03393v1-pg9.pdf"
pipeline_options = VlmPipelineOptions(
enable_remote_services=True # <-- this is required!

View File

@ -1,6 +1,6 @@
[project]
name = "docling"
version = "2.36.1" # DO NOT EDIT, updated automatically
version = "2.37.0" # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
license = "MIT"
keywords = [

29
tests/data/asciidoc/test_03.asciidoc vendored Normal file
View File

@ -0,0 +1,29 @@
:_mod-docs-content-type: PROCEDURE
:experimental:
[id="renaming-a-bookmark_{context}"]
= Renaming a bookmark
You can rename a bookmark to distinguish it from other bookmarks. If you have bookmarks to several folders that all share the same name, you can tell the bookmarks apart if you rename them.
Renaming the bookmark does not rename the folder.
.Procedure
. Right-click the bookmark in the side bar.
. Select *Rename…*.
+
image::rename-bookmark-menu.png[Rename bookmark menu]
. In the *Name* field, enter the new name for the bookmark.
+
image::rename-bookmark-text.png[Bookmark name field]
. Click btn:[Rename].
.Verification
* Check that the side bar lists the bookmark under the new name.
+
image::renamed-bookmark.png[Renamed bookmark]

BIN
tests/data/docx/word_image_anchors.docx vendored Normal file

Binary file not shown.

File diff suppressed because it is too large Load Diff

View File

@ -5,7 +5,61 @@
"width": 612.0,
"height": 792.0
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 612.0,
"r_y1": 0.0,
"r_x2": 612.0,
"r_y2": 792.0,
"r_x3": 0.0,
"r_y3": 792.0,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -10607,7 +10661,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [
@ -83405,7 +83464,61 @@
"width": 612.0,
"height": 792.0
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 612.0,
"r_y1": 0.0,
"r_x2": 612.0,
"r_y2": 792.0,
"r_x3": 0.0,
"r_y3": 792.0,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -87282,7 +87395,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [
@ -100075,7 +100193,61 @@
"width": 612.0,
"height": 792.0
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 612.0,
"r_y1": 0.0,
"r_x2": 612.0,
"r_y2": 792.0,
"r_x3": 0.0,
"r_y3": 792.0,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -103502,7 +103674,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [
@ -116054,7 +116231,61 @@
"width": 612.0,
"height": 792.0
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 612.0,
"r_y1": 0.0,
"r_x2": 612.0,
"r_y2": 792.0,
"r_x3": 0.0,
"r_y3": 792.0,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -122106,7 +122337,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [
@ -179742,7 +179978,61 @@
"width": 612.0,
"height": 792.0
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 612.0,
"r_y1": 0.0,
"r_x2": 612.0,
"r_y2": 792.0,
"r_x3": 0.0,
"r_y3": 792.0,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -182669,7 +182959,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [
@ -193709,7 +194004,61 @@
"width": 612.0,
"height": 792.0
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 612.0,
"r_y1": 0.0,
"r_x2": 612.0,
"r_y2": 792.0,
"r_x3": 0.0,
"r_y3": 792.0,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -198736,7 +199085,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [
@ -236872,7 +237226,61 @@
"width": 612.0,
"height": 792.0
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 612.0,
"r_y1": 0.0,
"r_x2": 612.0,
"r_y2": 792.0,
"r_x3": 0.0,
"r_y3": 792.0,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -242249,7 +242657,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [
@ -289112,7 +289525,61 @@
"width": 612.0,
"height": 792.0
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 612.0,
"r_y1": 0.0,
"r_x2": 612.0,
"r_y2": 792.0,
"r_x3": 0.0,
"r_y3": 792.0,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -294464,7 +294931,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [
@ -327043,7 +327515,61 @@
"width": 612.0,
"height": 792.0
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 612.0,
"r_y1": 0.0,
"r_x2": 612.0,
"r_y2": 792.0,
"r_x3": 0.0,
"r_y3": 792.0,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -329120,7 +329646,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@ -5,7 +5,61 @@
"width": 612.0,
"height": 792.0
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 612.0,
"r_y1": 0.0,
"r_x2": 612.0,
"r_y2": 792.0,
"r_x3": 0.0,
"r_y3": 792.0,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -2632,7 +2686,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

File diff suppressed because it is too large Load Diff

View File

@ -5,7 +5,61 @@
"width": 594.0,
"height": 774.0
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 594.0,
"r_y1": 0.0,
"r_x2": 594.0,
"r_y2": 774.0,
"r_x3": 0.0,
"r_y3": 774.0,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 774.0,
"r": 594.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 774.0,
"r": 594.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 774.0,
"r": 594.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 774.0,
"r": 594.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 774.0,
"r": 594.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -2457,7 +2511,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@ -5,7 +5,61 @@
"width": 612.0,
"height": 792.0
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 612.0,
"r_y1": 0.0,
"r_x2": 612.0,
"r_y2": 792.0,
"r_x3": 0.0,
"r_y3": 792.0,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -1032,7 +1086,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [
@ -4591,7 +4650,61 @@
"width": 595.2760009765625,
"height": 841.8900146484375
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.2760009765625,
"r_y1": 0.0,
"r_x2": 595.2760009765625,
"r_y2": 841.8900146484375,
"r_x3": 0.0,
"r_y3": 841.8900146484375,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.8900146484375,
"r": 595.2760009765625,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.8900146484375,
"r": 595.2760009765625,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.8900146484375,
"r": 595.2760009765625,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.8900146484375,
"r": 595.2760009765625,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.8900146484375,
"r": 595.2760009765625,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -5768,7 +5881,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@ -5,7 +5,61 @@
"width": 595.2000122070312,
"height": 841.9199829101562
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.2000122070312,
"r_y1": 0.0,
"r_x2": 595.2000122070312,
"r_y2": 841.9199829101562,
"r_x3": 0.0,
"r_y3": 841.9199829101562,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -1057,7 +1111,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [
@ -5032,7 +5091,61 @@
"width": 595.2000122070312,
"height": 841.9199829101562
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.2000122070312,
"r_y1": 0.0,
"r_x2": 595.2000122070312,
"r_y2": 841.9199829101562,
"r_x3": 0.0,
"r_y3": 841.9199829101562,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -5734,7 +5847,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [
@ -8374,7 +8492,61 @@
"width": 595.2000122070312,
"height": 841.9199829101562
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.2000122070312,
"r_y1": 0.0,
"r_x2": 595.2000122070312,
"r_y2": 841.9199829101562,
"r_x3": 0.0,
"r_y3": 841.9199829101562,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -9676,7 +9848,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [
@ -14401,7 +14578,61 @@
"width": 595.2000122070312,
"height": 841.9199829101562
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.2000122070312,
"r_y1": 0.0,
"r_x2": 595.2000122070312,
"r_y2": 841.9199829101562,
"r_x3": 0.0,
"r_y3": 841.9199829101562,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -15928,7 +16159,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [
@ -21385,7 +21621,61 @@
"width": 595.2000122070312,
"height": 841.9199829101562
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.2000122070312,
"r_y1": 0.0,
"r_x2": 595.2000122070312,
"r_y2": 841.9199829101562,
"r_x3": 0.0,
"r_y3": 841.9199829101562,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -21512,7 +21802,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@ -5,7 +5,61 @@
"width": 612.0,
"height": 792.0
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 612.0,
"r_y1": 0.0,
"r_x2": 612.0,
"r_y2": 792.0,
"r_x3": 0.0,
"r_y3": 792.0,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -657,7 +711,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [
@ -2982,7 +3041,61 @@
"width": 612.0,
"height": 792.0
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 612.0,
"r_y1": 0.0,
"r_x2": 612.0,
"r_y2": 792.0,
"r_x3": 0.0,
"r_y3": 792.0,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -3609,7 +3722,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

File diff suppressed because it is too large Load Diff

View File

@ -5,7 +5,61 @@
"width": 612.0,
"height": 792.0
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 612.0,
"r_y1": 0.0,
"r_x2": 612.0,
"r_y2": 792.0,
"r_x3": 0.0,
"r_y3": 792.0,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -657,7 +711,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@ -5,7 +5,61 @@
"width": 595.2000122070312,
"height": 841.9199829101562
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.2000122070312,
"r_y1": 0.0,
"r_x2": 595.2000122070312,
"r_y2": 841.9199829101562,
"r_x3": 0.0,
"r_y3": 841.9199829101562,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -3982,7 +4036,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@ -5,7 +5,61 @@
"width": 595.3200073242188,
"height": 842.0399780273438
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.3200073242188,
"r_y1": 0.0,
"r_x2": 595.3200073242188,
"r_y2": 842.0399780273438,
"r_x3": 0.0,
"r_y3": 842.0399780273438,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 842.0399780273438,
"r": 595.3200073242188,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 842.0399780273438,
"r": 595.3200073242188,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 842.0399780273438,
"r": 595.3200073242188,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 842.0399780273438,
"r": 595.3200073242188,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 842.0399780273438,
"r": 595.3200073242188,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -1382,7 +1436,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

File diff suppressed because it is too large Load Diff

View File

@ -5,7 +5,61 @@
"width": 612.0,
"height": 792.0
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 612.0,
"r_y1": 0.0,
"r_x2": 612.0,
"r_y2": 792.0,
"r_x3": 0.0,
"r_y3": 792.0,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -10607,7 +10661,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [
@ -83405,7 +83464,61 @@
"width": 612.0,
"height": 792.0
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 612.0,
"r_y1": 0.0,
"r_x2": 612.0,
"r_y2": 792.0,
"r_x3": 0.0,
"r_y3": 792.0,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -87282,7 +87395,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [
@ -100075,7 +100193,61 @@
"width": 612.0,
"height": 792.0
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 612.0,
"r_y1": 0.0,
"r_x2": 612.0,
"r_y2": 792.0,
"r_x3": 0.0,
"r_y3": 792.0,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -103502,7 +103674,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [
@ -116054,7 +116231,61 @@
"width": 612.0,
"height": 792.0
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 612.0,
"r_y1": 0.0,
"r_x2": 612.0,
"r_y2": 792.0,
"r_x3": 0.0,
"r_y3": 792.0,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -122106,7 +122337,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [
@ -179742,7 +179978,61 @@
"width": 612.0,
"height": 792.0
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 612.0,
"r_y1": 0.0,
"r_x2": 612.0,
"r_y2": 792.0,
"r_x3": 0.0,
"r_y3": 792.0,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -182669,7 +182959,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [
@ -193709,7 +194004,61 @@
"width": 612.0,
"height": 792.0
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 612.0,
"r_y1": 0.0,
"r_x2": 612.0,
"r_y2": 792.0,
"r_x3": 0.0,
"r_y3": 792.0,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -198736,7 +199085,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [
@ -236872,7 +237226,61 @@
"width": 612.0,
"height": 792.0
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 612.0,
"r_y1": 0.0,
"r_x2": 612.0,
"r_y2": 792.0,
"r_x3": 0.0,
"r_y3": 792.0,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -242249,7 +242657,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [
@ -289112,7 +289525,61 @@
"width": 612.0,
"height": 792.0
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 612.0,
"r_y1": 0.0,
"r_x2": 612.0,
"r_y2": 792.0,
"r_x3": 0.0,
"r_y3": 792.0,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -294464,7 +294931,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [
@ -327043,7 +327515,61 @@
"width": 612.0,
"height": 792.0
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 612.0,
"r_y1": 0.0,
"r_x2": 612.0,
"r_y2": 792.0,
"r_x3": 0.0,
"r_y3": 792.0,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -329120,7 +329646,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@ -5,7 +5,61 @@
"width": 612.0,
"height": 792.0
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 612.0,
"r_y1": 0.0,
"r_x2": 612.0,
"r_y2": 792.0,
"r_x3": 0.0,
"r_y3": 792.0,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -2632,7 +2686,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

File diff suppressed because it is too large Load Diff

View File

@ -5,7 +5,61 @@
"width": 594.0,
"height": 774.0
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 594.0,
"r_y1": 0.0,
"r_x2": 594.0,
"r_y2": 774.0,
"r_x3": 0.0,
"r_y3": 774.0,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 774.0,
"r": 594.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 774.0,
"r": 594.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 774.0,
"r": 594.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 774.0,
"r": 594.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 774.0,
"r": 594.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -2457,7 +2511,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@ -5,7 +5,61 @@
"width": 612.0,
"height": 792.0
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 612.0,
"r_y1": 0.0,
"r_x2": 612.0,
"r_y2": 792.0,
"r_x3": 0.0,
"r_y3": 792.0,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -1032,7 +1086,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [
@ -4591,7 +4650,61 @@
"width": 595.2760009765625,
"height": 841.8900146484375
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.2760009765625,
"r_y1": 0.0,
"r_x2": 595.2760009765625,
"r_y2": 841.8900146484375,
"r_x3": 0.0,
"r_y3": 841.8900146484375,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.8900146484375,
"r": 595.2760009765625,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.8900146484375,
"r": 595.2760009765625,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.8900146484375,
"r": 595.2760009765625,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.8900146484375,
"r": 595.2760009765625,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.8900146484375,
"r": 595.2760009765625,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -5768,7 +5881,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@ -0,0 +1,20 @@
# Contribution guideline example
This is simple.
Foo *emphasis* **strong emphasis** ***both*** .
Create your feature branch: `git checkout -b feature/AmazingFeature` .
1. Pull the [**repository**](https://github.com/docling-project/docling) .
2. Create your feature branch ( `git checkout -b feature/AmazingFeature` )
3. Commit your changes ( `git commit -m 'Add some AmazingFeature'` )
4. Push to the branch ( `git push origin feature/AmazingFeature` )
5. Open a Pull Request
##
*Second* section
- **First** : Lorem ipsum.
- **Second** : Dolor `sit` amet.

View File

@ -0,0 +1,565 @@
body:
children:
- $ref: '#/texts/0'
- $ref: '#/texts/1'
- $ref: '#/groups/0'
- $ref: '#/groups/1'
- $ref: '#/groups/2'
- $ref: '#/texts/27'
- $ref: '#/groups/8'
content_layer: body
label: unspecified
name: _root_
self_ref: '#/body'
form_items: []
furniture:
children: []
content_layer: furniture
label: unspecified
name: _root_
self_ref: '#/furniture'
groups:
- children:
- $ref: '#/texts/2'
- $ref: '#/texts/3'
- $ref: '#/texts/4'
- $ref: '#/texts/5'
- $ref: '#/texts/6'
content_layer: body
label: inline
name: group
parent:
$ref: '#/body'
self_ref: '#/groups/0'
- children:
- $ref: '#/texts/7'
- $ref: '#/texts/8'
- $ref: '#/texts/9'
content_layer: body
label: inline
name: group
parent:
$ref: '#/body'
self_ref: '#/groups/1'
- children:
- $ref: '#/texts/10'
- $ref: '#/texts/14'
- $ref: '#/texts/18'
- $ref: '#/texts/22'
- $ref: '#/texts/26'
content_layer: body
label: ordered_list
name: list
parent:
$ref: '#/body'
self_ref: '#/groups/2'
- children:
- $ref: '#/texts/11'
- $ref: '#/texts/12'
- $ref: '#/texts/13'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/10'
self_ref: '#/groups/3'
- children:
- $ref: '#/texts/15'
- $ref: '#/texts/16'
- $ref: '#/texts/17'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/14'
self_ref: '#/groups/4'
- children:
- $ref: '#/texts/19'
- $ref: '#/texts/20'
- $ref: '#/texts/21'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/18'
self_ref: '#/groups/5'
- children:
- $ref: '#/texts/23'
- $ref: '#/texts/24'
- $ref: '#/texts/25'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/22'
self_ref: '#/groups/6'
- children:
- $ref: '#/texts/28'
- $ref: '#/texts/29'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/27'
self_ref: '#/groups/7'
- children:
- $ref: '#/texts/30'
- $ref: '#/texts/33'
content_layer: body
label: list
name: list
parent:
$ref: '#/body'
self_ref: '#/groups/8'
- children:
- $ref: '#/texts/31'
- $ref: '#/texts/32'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/30'
self_ref: '#/groups/9'
- children:
- $ref: '#/texts/34'
- $ref: '#/texts/35'
- $ref: '#/texts/36'
- $ref: '#/texts/37'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/33'
self_ref: '#/groups/10'
key_value_items: []
name: inline_and_formatting
origin:
binary_hash: 9342273634728023910
filename: inline_and_formatting.md
mimetype: text/markdown
pages: {}
pictures: []
schema_name: DoclingDocument
tables: []
texts:
- children: []
content_layer: body
label: title
orig: Contribution guideline example
parent:
$ref: '#/body'
prov: []
self_ref: '#/texts/0'
text: Contribution guideline example
- children: []
content_layer: body
label: text
orig: This is simple.
parent:
$ref: '#/body'
prov: []
self_ref: '#/texts/1'
text: This is simple.
- children: []
content_layer: body
label: text
orig: Foo
parent:
$ref: '#/groups/0'
prov: []
self_ref: '#/texts/2'
text: Foo
- children: []
content_layer: body
formatting:
bold: false
italic: true
strikethrough: false
underline: false
label: text
orig: emphasis
parent:
$ref: '#/groups/0'
prov: []
self_ref: '#/texts/3'
text: emphasis
- children: []
content_layer: body
formatting:
bold: true
italic: false
strikethrough: false
underline: false
label: text
orig: strong emphasis
parent:
$ref: '#/groups/0'
prov: []
self_ref: '#/texts/4'
text: strong emphasis
- children: []
content_layer: body
formatting:
bold: true
italic: true
strikethrough: false
underline: false
label: text
orig: both
parent:
$ref: '#/groups/0'
prov: []
self_ref: '#/texts/5'
text: both
- children: []
content_layer: body
label: text
orig: .
parent:
$ref: '#/groups/0'
prov: []
self_ref: '#/texts/6'
text: .
- children: []
content_layer: body
label: text
orig: 'Create your feature branch:'
parent:
$ref: '#/groups/1'
prov: []
self_ref: '#/texts/7'
text: 'Create your feature branch:'
- captions: []
children: []
code_language: unknown
content_layer: body
footnotes: []
label: code
orig: git checkout -b feature/AmazingFeature
parent:
$ref: '#/groups/1'
prov: []
references: []
self_ref: '#/texts/8'
text: git checkout -b feature/AmazingFeature
- children: []
content_layer: body
label: text
orig: .
parent:
$ref: '#/groups/1'
prov: []
self_ref: '#/texts/9'
text: .
- children:
- $ref: '#/groups/3'
content_layer: body
enumerated: true
label: list_item
marker: '-'
orig: ''
parent:
$ref: '#/groups/2'
prov: []
self_ref: '#/texts/10'
text: ''
- children: []
content_layer: body
label: text
orig: Pull the
parent:
$ref: '#/groups/3'
prov: []
self_ref: '#/texts/11'
text: Pull the
- children: []
content_layer: body
formatting:
bold: true
italic: false
strikethrough: false
underline: false
hyperlink: https://github.com/docling-project/docling
label: text
orig: repository
parent:
$ref: '#/groups/3'
prov: []
self_ref: '#/texts/12'
text: repository
- children: []
content_layer: body
label: text
orig: .
parent:
$ref: '#/groups/3'
prov: []
self_ref: '#/texts/13'
text: .
- children:
- $ref: '#/groups/4'
content_layer: body
enumerated: true
label: list_item
marker: '-'
orig: ''
parent:
$ref: '#/groups/2'
prov: []
self_ref: '#/texts/14'
text: ''
- children: []
content_layer: body
label: text
orig: Create your feature branch (
parent:
$ref: '#/groups/4'
prov: []
self_ref: '#/texts/15'
text: Create your feature branch (
- captions: []
children: []
code_language: unknown
content_layer: body
footnotes: []
label: code
orig: git checkout -b feature/AmazingFeature
parent:
$ref: '#/groups/4'
prov: []
references: []
self_ref: '#/texts/16'
text: git checkout -b feature/AmazingFeature
- children: []
content_layer: body
label: text
orig: )
parent:
$ref: '#/groups/4'
prov: []
self_ref: '#/texts/17'
text: )
- children:
- $ref: '#/groups/5'
content_layer: body
enumerated: true
label: list_item
marker: '-'
orig: ''
parent:
$ref: '#/groups/2'
prov: []
self_ref: '#/texts/18'
text: ''
- children: []
content_layer: body
label: text
orig: Commit your changes (
parent:
$ref: '#/groups/5'
prov: []
self_ref: '#/texts/19'
text: Commit your changes (
- captions: []
children: []
code_language: unknown
content_layer: body
footnotes: []
label: code
orig: git commit -m 'Add some AmazingFeature'
parent:
$ref: '#/groups/5'
prov: []
references: []
self_ref: '#/texts/20'
text: git commit -m 'Add some AmazingFeature'
- children: []
content_layer: body
label: text
orig: )
parent:
$ref: '#/groups/5'
prov: []
self_ref: '#/texts/21'
text: )
- children:
- $ref: '#/groups/6'
content_layer: body
enumerated: true
label: list_item
marker: '-'
orig: ''
parent:
$ref: '#/groups/2'
prov: []
self_ref: '#/texts/22'
text: ''
- children: []
content_layer: body
label: text
orig: Push to the branch (
parent:
$ref: '#/groups/6'
prov: []
self_ref: '#/texts/23'
text: Push to the branch (
- captions: []
children: []
code_language: unknown
content_layer: body
footnotes: []
label: code
orig: git push origin feature/AmazingFeature
parent:
$ref: '#/groups/6'
prov: []
references: []
self_ref: '#/texts/24'
text: git push origin feature/AmazingFeature
- children: []
content_layer: body
label: text
orig: )
parent:
$ref: '#/groups/6'
prov: []
self_ref: '#/texts/25'
text: )
- children: []
content_layer: body
enumerated: true
label: list_item
marker: '-'
orig: Open a Pull Request
parent:
$ref: '#/groups/2'
prov: []
self_ref: '#/texts/26'
text: Open a Pull Request
- children:
- $ref: '#/groups/7'
content_layer: body
label: section_header
level: 1
orig: ''
parent:
$ref: '#/body'
prov: []
self_ref: '#/texts/27'
text: ''
- children: []
content_layer: body
formatting:
bold: false
italic: true
strikethrough: false
underline: false
label: text
orig: Second
parent:
$ref: '#/groups/7'
prov: []
self_ref: '#/texts/28'
text: Second
- children: []
content_layer: body
label: text
orig: section
parent:
$ref: '#/groups/7'
prov: []
self_ref: '#/texts/29'
text: section
- children:
- $ref: '#/groups/9'
content_layer: body
enumerated: false
label: list_item
marker: '-'
orig: ''
parent:
$ref: '#/groups/8'
prov: []
self_ref: '#/texts/30'
text: ''
- children: []
content_layer: body
formatting:
bold: true
italic: false
strikethrough: false
underline: false
label: text
orig: First
parent:
$ref: '#/groups/9'
prov: []
self_ref: '#/texts/31'
text: First
- children: []
content_layer: body
label: text
orig: ': Lorem ipsum.'
parent:
$ref: '#/groups/9'
prov: []
self_ref: '#/texts/32'
text: ': Lorem ipsum.'
- children:
- $ref: '#/groups/10'
content_layer: body
enumerated: false
label: list_item
marker: '-'
orig: ''
parent:
$ref: '#/groups/8'
prov: []
self_ref: '#/texts/33'
text: ''
- children: []
content_layer: body
formatting:
bold: true
italic: false
strikethrough: false
underline: false
label: text
orig: Second
parent:
$ref: '#/groups/10'
prov: []
self_ref: '#/texts/34'
text: Second
- children: []
content_layer: body
label: text
orig: ': Dolor'
parent:
$ref: '#/groups/10'
prov: []
self_ref: '#/texts/35'
text: ': Dolor'
- captions: []
children: []
code_language: unknown
content_layer: body
footnotes: []
label: code
orig: sit
parent:
$ref: '#/groups/10'
prov: []
references: []
self_ref: '#/texts/36'
text: sit
- children: []
content_layer: body
label: text
orig: amet.
parent:
$ref: '#/groups/10'
prov: []
self_ref: '#/texts/37'
text: amet.
version: 1.3.0

View File

@ -5,7 +5,61 @@
"width": 595.2000122070312,
"height": 841.9199829101562
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.2000122070312,
"r_y1": 0.0,
"r_x2": 595.2000122070312,
"r_y2": 841.9199829101562,
"r_x3": 0.0,
"r_y3": 841.9199829101562,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -1057,7 +1111,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [
@ -5032,7 +5091,61 @@
"width": 595.2000122070312,
"height": 841.9199829101562
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.2000122070312,
"r_y1": 0.0,
"r_x2": 595.2000122070312,
"r_y2": 841.9199829101562,
"r_x3": 0.0,
"r_y3": 841.9199829101562,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -5734,7 +5847,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [
@ -8374,7 +8492,61 @@
"width": 595.2000122070312,
"height": 841.9199829101562
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.2000122070312,
"r_y1": 0.0,
"r_x2": 595.2000122070312,
"r_y2": 841.9199829101562,
"r_x3": 0.0,
"r_y3": 841.9199829101562,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -9676,7 +9848,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [
@ -14401,7 +14578,61 @@
"width": 595.2000122070312,
"height": 841.9199829101562
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.2000122070312,
"r_y1": 0.0,
"r_x2": 595.2000122070312,
"r_y2": 841.9199829101562,
"r_x3": 0.0,
"r_y3": 841.9199829101562,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -15928,7 +16159,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [
@ -21385,7 +21621,61 @@
"width": 595.2000122070312,
"height": 841.9199829101562
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.2000122070312,
"r_y1": 0.0,
"r_x2": 595.2000122070312,
"r_y2": 841.9199829101562,
"r_x3": 0.0,
"r_y3": 841.9199829101562,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -21512,7 +21802,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@ -5,7 +5,61 @@
"width": 612.0,
"height": 792.0
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 612.0,
"r_y1": 0.0,
"r_x2": 612.0,
"r_y2": 792.0,
"r_x3": 0.0,
"r_y3": 792.0,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -657,7 +711,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [
@ -2982,7 +3041,61 @@
"width": 612.0,
"height": 792.0
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 612.0,
"r_y1": 0.0,
"r_x2": 612.0,
"r_y2": 792.0,
"r_x3": 0.0,
"r_y3": 792.0,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -3609,7 +3722,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@ -0,0 +1,3 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: chapter: group slide-0
item-2 at level 2: title: X-Library The fully customisable ... llection exclusively for our customers

View File

@ -0,0 +1,86 @@
{
"schema_name": "DoclingDocument",
"version": "1.3.0",
"name": "powerpoint_bad_text",
"origin": {
"mimetype": "application/vnd.ms-powerpoint",
"binary_hash": 1443005848482130016,
"filename": "powerpoint_bad_text.pptx"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/groups/0"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/0"
}
],
"content_layer": "body",
"name": "slide-0",
"label": "chapter"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "title",
"prov": [
{
"page_no": 1,
"bbox": {
"l": 1041400.0,
"t": 4582390.0,
"r": 8083550.0,
"b": 1689099.0,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
118
]
}
],
"orig": "X-Library The fully customisable and copyright-free standard content template collection exclusively for our customers",
"text": "X-Library The fully customisable and copyright-free standard content template collection exclusively for our customers"
}
],
"pictures": [],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {
"1": {
"size": {
"width": 12190413.0,
"height": 6858000.0
},
"page_no": 1
}
}
}

View File

@ -0,0 +1 @@
# X-Library The fully customisable and copyright-free standard content template collection exclusively for our customers

File diff suppressed because it is too large Load Diff

View File

@ -5,7 +5,61 @@
"width": 612.0,
"height": 792.0
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 612.0,
"r_y1": 0.0,
"r_x2": 612.0,
"r_y2": 792.0,
"r_x3": 0.0,
"r_y3": 792.0,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 792.0,
"r": 612.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -657,7 +711,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@ -5,7 +5,61 @@
"width": 595.2000122070312,
"height": 841.9199829101562
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.2000122070312,
"r_y1": 0.0,
"r_x2": 595.2000122070312,
"r_y2": 841.9199829101562,
"r_x3": 0.0,
"r_y3": 841.9199829101562,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9199829101562,
"r": 595.2000122070312,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -3982,7 +4036,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@ -5,7 +5,61 @@
"width": 595.3200073242188,
"height": 842.0399780273438
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.3200073242188,
"r_y1": 0.0,
"r_x2": 595.3200073242188,
"r_y2": 842.0399780273438,
"r_x3": 0.0,
"r_y3": 842.0399780273438,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 842.0399780273438,
"r": 595.3200073242188,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 842.0399780273438,
"r": 595.3200073242188,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 842.0399780273438,
"r": 595.3200073242188,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 842.0399780273438,
"r": 595.3200073242188,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 842.0399780273438,
"r": 595.3200073242188,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -1382,7 +1436,12 @@
"from_ocr": false
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@ -0,0 +1,23 @@
:\_mod-docs-content-type: PROCEDURE :experimental:
# Renaming a bookmark
[id="renaming-a-bookmark\_{context}"]
You can rename a bookmark to distinguish it from other bookmarks. If you have bookmarks to several folders that all share the same name, you can tell the bookmarks apart if you rename them.
Renaming the bookmark does not rename the folder.
- Check that the side bar lists the bookmark under the new name.
Procedure . Right-click the bookmark in the side bar. . Select *Rename…*. +
<!-- image -->
In the *Name* field, enter the new name for the bookmark. +
<!-- image -->
Click btn:[Rename]. .Verification
<!-- image -->

View File

@ -17,14 +17,16 @@ item-0 at level 0: unspecified: group _root_
item-16 at level 2: list_item: Italic bullet 1
item-17 at level 2: list_item: Bold bullet 2
item-18 at level 2: list_item: Underline bullet 3
item-19 at level 2: inline: group group
item-20 at level 3: list_item: Some
item-21 at level 3: list_item: italic
item-22 at level 3: list_item: bold
item-23 at level 3: list_item: underline
item-24 at level 2: list: group list
item-25 at level 3: inline: group group
item-26 at level 4: list_item: Nested
item-27 at level 4: list_item: italic
item-28 at level 4: list_item: bold
item-29 at level 1: paragraph:
item-19 at level 2: list_item:
item-20 at level 3: inline: group group
item-21 at level 4: text: Some
item-22 at level 4: text: italic
item-23 at level 4: text: bold
item-24 at level 4: text: underline
item-25 at level 2: list: group list
item-26 at level 3: list_item:
item-27 at level 4: inline: group group
item-28 at level 5: text: Nested
item-29 at level 5: text: italic
item-30 at level 5: text: bold
item-31 at level 1: paragraph:

View File

@ -42,7 +42,7 @@
"$ref": "#/groups/1"
},
{
"$ref": "#/texts/23"
"$ref": "#/texts/25"
}
],
"content_layer": "body",
@ -98,7 +98,7 @@
"$ref": "#/texts/15"
},
{
"$ref": "#/groups/2"
"$ref": "#/texts/16"
},
{
"$ref": "#/groups/3"
@ -111,12 +111,9 @@
{
"self_ref": "#/groups/2",
"parent": {
"$ref": "#/groups/1"
},
"children": [
{
"$ref": "#/texts/16"
},
"children": [
{
"$ref": "#/texts/17"
},
@ -125,6 +122,9 @@
},
{
"$ref": "#/texts/19"
},
{
"$ref": "#/texts/20"
}
],
"content_layer": "body",
@ -138,7 +138,7 @@
},
"children": [
{
"$ref": "#/groups/4"
"$ref": "#/texts/21"
}
],
"content_layer": "body",
@ -148,17 +148,17 @@
{
"self_ref": "#/groups/4",
"parent": {
"$ref": "#/groups/3"
"$ref": "#/texts/21"
},
"children": [
{
"$ref": "#/texts/20"
},
{
"$ref": "#/texts/21"
},
{
"$ref": "#/texts/22"
},
{
"$ref": "#/texts/23"
},
{
"$ref": "#/texts/24"
}
],
"content_layer": "body",
@ -461,20 +461,18 @@
{
"self_ref": "#/texts/16",
"parent": {
"$ref": "#/groups/2"
"$ref": "#/groups/1"
},
"children": [],
"children": [
{
"$ref": "#/groups/2"
}
],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Some",
"text": "Some",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
},
"orig": "",
"text": "",
"enumerated": false,
"marker": "-"
},
@ -485,18 +483,16 @@
},
"children": [],
"content_layer": "body",
"label": "list_item",
"label": "text",
"prov": [],
"orig": "italic",
"text": "italic",
"orig": "Some",
"text": "Some",
"formatting": {
"bold": false,
"italic": true,
"italic": false,
"underline": false,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
}
},
{
"self_ref": "#/texts/18",
@ -505,67 +501,7 @@
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "bold",
"text": "bold",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/19",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "underline",
"text": "underline",
"formatting": {
"bold": false,
"italic": false,
"underline": true,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/20",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Nested",
"text": "Nested",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/21",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"label": "text",
"prov": [],
"orig": "italic",
"text": "italic",
@ -574,7 +510,59 @@
"italic": true,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/19",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "bold",
"text": "bold",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/20",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "underline",
"text": "underline",
"formatting": {
"bold": false,
"italic": false,
"underline": true,
"strikethrough": false
}
},
{
"self_ref": "#/texts/21",
"parent": {
"$ref": "#/groups/3"
},
"children": [
{
"$ref": "#/groups/4"
}
],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "",
"text": "",
"enumerated": false,
"marker": "-"
},
@ -585,7 +573,43 @@
},
"children": [],
"content_layer": "body",
"label": "list_item",
"label": "text",
"prov": [],
"orig": "Nested",
"text": "Nested",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/23",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "italic",
"text": "italic",
"formatting": {
"bold": false,
"italic": true,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/24",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "bold",
"text": "bold",
@ -594,12 +618,10 @@
"italic": false,
"underline": false,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
}
},
{
"self_ref": "#/texts/23",
"self_ref": "#/texts/25",
"parent": {
"$ref": "#/body"
},

View File

@ -0,0 +1,16 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: paragraph: Transcript
item-2 at level 1: paragraph: February 20, 2025, 8:32PM
item-3 at level 1: picture
item-4 at level 1: inline: group group
item-5 at level 2: paragraph: This is test 1
item-6 at level 2: paragraph: 0:08
Correct, he is not.
item-7 at level 1: paragraph:
item-8 at level 1: picture
item-9 at level 1: inline: group group
item-10 at level 2: paragraph: This is test 2
item-11 at level 2: paragraph: 0:16
Yeah, exactly.
item-12 at level 1: paragraph:
item-13 at level 1: paragraph:

View File

@ -0,0 +1,286 @@
{
"schema_name": "DoclingDocument",
"version": "1.3.0",
"name": "word_image_anchors",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"binary_hash": 2428692234257307633,
"filename": "word_image_anchors.docx"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
},
{
"$ref": "#/texts/1"
},
{
"$ref": "#/pictures/0"
},
{
"$ref": "#/groups/0"
},
{
"$ref": "#/texts/4"
},
{
"$ref": "#/pictures/1"
},
{
"$ref": "#/groups/1"
},
{
"$ref": "#/texts/7"
},
{
"$ref": "#/texts/8"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/2"
},
{
"$ref": "#/texts/3"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/1",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/5"
},
{
"$ref": "#/texts/6"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "Transcript",
"text": "Transcript",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "February 20, 2025, 8:32PM",
"text": "February 20, 2025, 8:32PM",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "This is test 1",
"text": "This is test 1",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/3",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "0:08\nCorrect, he is not.",
"text": "0:08\nCorrect, he is not.",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/4",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/5",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "This is test 2",
"text": "This is test 2",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/6",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "0:16\nYeah, exactly.",
"text": "0:16\nYeah, exactly.",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/7",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/8",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
}
],
"pictures": [
{
"self_ref": "#/pictures/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "picture",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"image": {
"mimetype": "image/png",
"dpi": 72,
"size": {
"width": 100.0,
"height": 100.0
},
"uri": ""
},
"annotations": []
},
{
"self_ref": "#/pictures/1",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "picture",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"image": {
"mimetype": "image/png",
"dpi": 72,
"size": {
"width": 100.0,
"height": 100.0
},
"uri": ""
},
"annotations": []
}
],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {}
}

View File

@ -0,0 +1,13 @@
**Transcript**
February 20, 2025, 8:32PM
<!-- image -->
**This is test 1** 0:08
Correct, he is not.
<!-- image -->
**This is test 2** 0:16
Yeah, exactly.

18
tests/data/md/inline_and_formatting.md vendored Normal file
View File

@ -0,0 +1,18 @@
# Contribution guideline example
This is simple.
Foo *emphasis* **strong emphasis** ***both***.
Create your feature branch: `git checkout -b feature/AmazingFeature`.
1. Pull the [**repository**](https://github.com/docling-project/docling).
2. Create your feature branch (`git checkout -b feature/AmazingFeature`)
3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
4. Push to the branch (`git push origin feature/AmazingFeature`)
5. Open a Pull Request
## *Second* section <!-- inline groups in headings not yet supported by serializers -->
- **First**: Lorem ipsum.
- **Second**: Dolor `sit` amet.

BIN
tests/data/pptx/powerpoint_bad_text.pptx vendored Normal file

Binary file not shown.

View File

@ -5,7 +5,77 @@
"width": 2000.0,
"height": 2829.0
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 2000.0,
"r_y1": 0.0,
"r_x2": 2000.0,
"r_y2": 2829.0,
"r_x3": 0.0,
"r_y3": 2829.0,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 2829.0,
"r": 2000.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 2829.0,
"r": 2000.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 2829.0,
"r": 2000.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 2829.0,
"r": 2000.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 2829.0,
"r": 2000.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [
{
"index": 0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 2000.0,
"r_y1": 0.0,
"r_x2": 2000.0,
"r_y2": 2829.0,
"r_x3": 0.0,
"r_y3": 2829.0,
"coord_origin": "BOTTOMLEFT"
},
"uri": null
}
],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -82,7 +152,12 @@
"from_ocr": true
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@ -5,7 +5,61 @@
"width": 595.201171875,
"height": 841.9216918945312
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.201171875,
"r_y1": 0.0,
"r_x2": 595.201171875,
"r_y2": 841.9216918945312,
"r_x3": 0.0,
"r_y3": 841.9216918945312,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -82,7 +136,12 @@
"from_ocr": true
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@ -5,7 +5,61 @@
"width": 595.201171875,
"height": 841.9216918945312
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.201171875,
"r_y1": 0.0,
"r_x2": 595.201171875,
"r_y2": 841.9216918945312,
"r_x3": 0.0,
"r_y3": 841.9216918945312,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -82,7 +136,12 @@
"from_ocr": true
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@ -5,7 +5,61 @@
"width": 841.9216918945312,
"height": 595.201171875
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.201171875,
"r_y1": 0.0,
"r_x2": 595.201171875,
"r_y2": 841.9216918945312,
"r_x3": 0.0,
"r_y3": 841.9216918945312,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -82,7 +136,12 @@
"from_ocr": true
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@ -5,7 +5,61 @@
"width": 841.9216918945312,
"height": 595.201171875
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.201171875,
"r_y1": 0.0,
"r_x2": 595.201171875,
"r_y2": 841.9216918945312,
"r_x3": 0.0,
"r_y3": 841.9216918945312,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -82,7 +136,12 @@
"from_ocr": true
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@ -5,7 +5,61 @@
"width": 595.201171875,
"height": 841.9216918945312
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.201171875,
"r_y1": 0.0,
"r_x2": 595.201171875,
"r_y2": 841.9216918945312,
"r_x3": 0.0,
"r_y3": 841.9216918945312,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -82,7 +136,12 @@
"from_ocr": true
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@ -5,7 +5,61 @@
"width": 595.201171875,
"height": 841.9216918945312
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.201171875,
"r_y1": 0.0,
"r_x2": 595.201171875,
"r_y2": 841.9216918945312,
"r_x3": 0.0,
"r_y3": 841.9216918945312,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -82,7 +136,12 @@
"from_ocr": true
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@ -5,7 +5,61 @@
"width": 841.9216918945312,
"height": 595.201171875
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.201171875,
"r_y1": 0.0,
"r_x2": 595.201171875,
"r_y2": 841.9216918945312,
"r_x3": 0.0,
"r_y3": 841.9216918945312,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -82,7 +136,12 @@
"from_ocr": true
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@ -5,7 +5,61 @@
"width": 841.9216918945312,
"height": 595.201171875
},
"cells": [
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.201171875,
"r_y1": 0.0,
"r_x2": 595.201171875,
"r_y2": 841.9216918945312,
"r_x3": 0.0,
"r_y3": 841.9216918945312,
"coord_origin": "BOTTOMLEFT"
},
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
@ -82,7 +136,12 @@
"from_ocr": true
}
],
"parsed_page": null,
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@ -2,7 +2,11 @@ import glob
import os
from pathlib import Path
from docling.backend.asciidoc_backend import AsciiDocBackend
from docling.backend.asciidoc_backend import (
DEFAULT_IMAGE_HEIGHT,
DEFAULT_IMAGE_WIDTH,
AsciiDocBackend,
)
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
@ -18,6 +22,24 @@ def _get_backend(fname):
return doc_backend
def test_parse_picture():
line = (
"image::images/example1.png[Example Image, width=200, height=150, align=center]"
)
res = AsciiDocBackend._parse_picture(line)
assert res
assert res.get("width", 0) == "200"
assert res.get("height", 0) == "150"
assert res.get("uri", "") == "images/example1.png"
line = "image::renamed-bookmark.png[Renamed bookmark]"
res = AsciiDocBackend._parse_picture(line)
assert res
assert "width" not in res
assert "height" not in res
assert res.get("uri", "") == "renamed-bookmark.png"
def test_asciidocs_examples():
fnames = sorted(glob.glob("./tests/data/asciidoc/*.asciidoc"))

View File

@ -2,7 +2,7 @@ from pathlib import Path
from docling.backend.md_backend import MarkdownDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
from docling.datamodel.document import DoclingDocument, InputDocument
from .test_data_gen_flag import GEN_TEST_DATA
@ -11,12 +11,15 @@ def test_convert_valid():
fmt = InputFormat.MD
cls = MarkdownDocumentBackend
test_data_path = Path("tests") / "data"
relevant_paths = sorted((test_data_path / "md").rglob("*.md"))
root_path = Path("tests") / "data"
relevant_paths = sorted((root_path / "md").rglob("*.md"))
assert len(relevant_paths) > 0
yaml_filter = ["inline_and_formatting"]
for in_path in relevant_paths:
gt_path = test_data_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md"
md_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md"
yaml_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.yaml"
in_doc = InputDocument(
path_or_stream=in_path,
@ -33,9 +36,17 @@ def test_convert_valid():
act_data = act_doc.export_to_markdown()
if GEN_TEST_DATA:
with open(gt_path, mode="w", encoding="utf-8") as f:
with open(md_gt_path, mode="w", encoding="utf-8") as f:
f.write(f"{act_data}\n")
if in_path.stem in yaml_filter:
with open(yaml_gt_path, mode="w", encoding="utf-8") as f:
act_doc.save_as_yaml(yaml_gt_path)
else:
with open(gt_path, encoding="utf-8") as f:
with open(md_gt_path, encoding="utf-8") as f:
exp_data = f.read().rstrip()
assert exp_data == act_data
assert act_data == exp_data
if in_path.stem in yaml_filter:
exp_doc = DoclingDocument.load_from_yaml(yaml_gt_path)
assert act_doc == exp_doc

View File

@ -9,6 +9,7 @@ from docling.datamodel.document import (
DoclingDocument,
InputDocument,
SectionHeaderItem,
TextItem,
)
from docling.document_converter import DocumentConverter
@ -96,18 +97,18 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
pred_md: str = doc.export_to_markdown()
assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), (
"export to md"
f"export to markdown failed on {docx_path}"
)
pred_itxt: str = doc._export_to_indented_text(
max_text_len=70, explicit_tables=False
)
assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), (
"export to indented-text"
f"export to indented-text failed on {docx_path}"
)
assert verify_document(doc, str(gt_path) + ".json", generate=GENERATE), (
"document document"
f"DoclingDocument verification failed on {docx_path}"
)
if docx_path.name == "word_tables.docx":
@ -116,7 +117,7 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
pred_text=pred_html,
gtfile=str(gt_path) + ".html",
generate=GENERATE,
), "export to html"
), f"export to html failed on {docx_path}"
flaky_path = Path("tests/data/docx/textbox.docx")
@ -131,3 +132,42 @@ def test_e2e_docx_conversions():
@pytest.mark.xfail(strict=False)
def test_textbox_conversion():
_test_e2e_docx_conversions_impl(docx_paths=[flaky_path])
def test_text_after_image_anchors():
"""
Test to analyse whether text gets parsed after image anchors.
"""
in_path = Path("tests/data/docx/word_image_anchors.docx")
in_doc = InputDocument(
path_or_stream=in_path,
format=InputFormat.DOCX,
backend=MsWordDocumentBackend,
)
backend = MsWordDocumentBackend(
in_doc=in_doc,
path_or_stream=in_path,
)
doc = backend.convert()
found_text_after_anchor_1 = found_text_after_anchor_2 = (
found_text_after_anchor_3
) = found_text_after_anchor_4 = False
for item, _ in doc.iterate_items():
if isinstance(item, TextItem):
if item.text == "This is test 1":
found_text_after_anchor_1 = True
elif item.text == "0:08\nCorrect, he is not.":
found_text_after_anchor_2 = True
elif item.text == "This is test 2":
found_text_after_anchor_3 = True
elif item.text == "0:16\nYeah, exactly.":
found_text_after_anchor_4 = True
assert (
found_text_after_anchor_1
and found_text_after_anchor_2
and found_text_after_anchor_3
and found_text_after_anchor_4
)

View File

@ -60,3 +60,25 @@ def test_code_and_formula_conversion():
gt = "a ^ { 2 } + 8 = 1 2"
predicted = formula_blocks[0].text
assert predicted == gt, f"mismatch in text {predicted=}, {gt=}"
def test_formula_conversion_with_page_range():
pdf_path = Path("tests/data/pdf/code_and_formula.pdf")
converter = get_converter()
print(f"converting {pdf_path} with page range")
doc_result: ConversionResult = converter.convert(pdf_path, page_range=(2, 2))
results = doc_result.document.texts
formula_blocks = [
el
for el in results
if isinstance(el, TextItem) and el.label == DocItemLabel.FORMULA
]
assert len(formula_blocks) == 1
gt = "a ^ { 2 } + 8 = 1 2"
predicted = formula_blocks[0].text
assert predicted == gt, f"mismatch in text {predicted=}, {gt=}"

View File

@ -57,14 +57,14 @@ def test_e2e_conversions():
pdf_paths = get_pdf_paths()
engines: List[Tuple[OcrOptions, bool]] = [
(EasyOcrOptions(), False),
(TesseractOcrOptions(), True),
(TesseractCliOcrOptions(), True),
(EasyOcrOptions(force_full_page_ocr=True), False),
(EasyOcrOptions(), False),
(TesseractOcrOptions(force_full_page_ocr=True), True),
(TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]), True),
(TesseractCliOcrOptions(force_full_page_ocr=True), True),
(TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]), True),
(EasyOcrOptions(force_full_page_ocr=True), False),
]
# rapidocr is only available for Python >=3.6,<3.13

2
uv.lock generated
View File

@ -818,7 +818,7 @@ wheels = [
[[package]]
name = "docling"
version = "2.36.1"
version = "2.37.0"
source = { editable = "." }
dependencies = [
{ name = "beautifulsoup4" },