mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
Merge branch 'main' of github.com:DS4SD/docling into dev/add-asr-pipeline-v2
This commit is contained in:
commit
caf18e634b
2
.github/dco.yml
vendored
Normal file
2
.github/dco.yml
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
allowRemediationCommits:
|
||||
individual: true
|
192
.github/workflows/dco-advisor.yml
vendored
Normal file
192
.github/workflows/dco-advisor.yml
vendored
Normal file
@ -0,0 +1,192 @@
|
||||
# DCO Advisor Bot
#
# On every opened / reopened / synchronized pull request this workflow waits
# for the DCO check run on the PR head commit to reach a conclusion and then
# creates (or updates) a single PR comment, identified by a hidden HTML marker:
#   - on "failure" / "action_required": instructions for adding a remediation
#     commit or signing off the existing commits;
#   - on any other conclusion: a short "passed" note.
name: DCO Advisor Bot

on:
  pull_request_target:
    types: [opened, reopened, synchronize]

permissions:
  pull-requests: write
  issues: write

jobs:
  dco_advisor:
    runs-on: ubuntu-latest
    steps:
      - name: Handle DCO check result
        uses: actions/github-script@v7
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          script: |
            const pr = context.payload.pull_request || context.payload.check_run?.pull_requests?.[0];
            if (!pr) return;

            const prNumber = pr.number;
            const baseRef = pr.base.ref;
            const headSha =
              context.payload.check_run?.head_sha ||
              pr.head?.sha;
            const username = pr.user.login;

            console.log("HEAD SHA:", headSha);

            const sleep = ms => new Promise(resolve => setTimeout(resolve, ms));

            // Poll until DCO check has a conclusion (max 6 attempts, 30s)
            let dcoCheck = null;
            for (let attempt = 0; attempt < 6; attempt++) {
              // Ask for the largest page so the DCO run is not missed on
              // commits that have more check runs than the default page size.
              const { data: checks } = await github.rest.checks.listForRef({
                owner: context.repo.owner,
                repo: context.repo.repo,
                ref: headSha,
                per_page: 100
              });


              console.log("All check runs:");
              checks.check_runs.forEach(run => {
                console.log(`- ${run.name} (${run.status}/${run.conclusion}) @ ${run.head_sha}`);
              });

              // Any check whose name mentions "dco", except this job itself.
              dcoCheck = checks.check_runs.find(run =>
                run.name.toLowerCase().includes("dco") &&
                !run.name.toLowerCase().includes("dco_advisor") &&
                run.head_sha === headSha
              );


              if (dcoCheck?.conclusion) break;
              console.log(`Waiting for DCO check... (${attempt + 1})`);
              await sleep(5000); // wait 5 seconds
            }

            if (!dcoCheck || !dcoCheck.conclusion) {
              console.log("DCO check did not complete in time.");
              return;
            }

            const isFailure = ["failure", "action_required"].includes(dcoCheck.conclusion);
            console.log(`DCO check conclusion for ${headSha}: ${dcoCheck.conclusion} (treated as ${isFailure ? "failure" : "success"})`);

            // Parse DCO output for commit SHAs and author
            let badCommits = [];
            let authorName = "";
            let authorEmail = "";
            let moreInfo = `More info: [DCO check report](${dcoCheck?.html_url})`;

            if (isFailure) {
              // Paginate: a single page would silently drop unsigned commits
              // on larger PRs from the remediation message.
              const commits = await github.paginate(github.rest.pulls.listCommits, {
                owner: context.repo.owner,
                repo: context.repo.repo,
                pull_number: prNumber,
                per_page: 100,
              });

              for (const commit of commits) {
                const commitMessage = commit.commit.message;
                const signoffMatch = commitMessage.match(/^Signed-off-by:\s+.+<.+>$/m);
                if (!signoffMatch) {
                  console.log(`Bad commit found ${commit.sha}`)
                  badCommits.push({
                    sha: commit.sha,
                    authorName: commit.commit.author.name,
                    authorEmail: commit.commit.author.email,
                  });
                }
              }
            }

            // If multiple authors are present, you could adapt the message accordingly
            // For now, we'll just use the first one
            if (badCommits.length > 0) {
              authorName = badCommits[0].authorName;
              authorEmail = badCommits[0].authorEmail;
            }

            // Generate remediation commit message if needed
            let remediationSnippet = "";
            if (badCommits.length && authorEmail) {
              remediationSnippet = `git commit --allow-empty -s -m "DCO Remediation Commit for ${authorName} <${authorEmail}>\n\n` +
                badCommits.map(c => `I, ${c.authorName} <${c.authorEmail}>, hereby add my Signed-off-by to this commit: ${c.sha}`).join('\n') +
                `"`;
            } else {
              remediationSnippet = "# Unable to auto-generate remediation message. Please check the DCO check details.";
            }

            // Build comment
            const commentHeader = '<!-- dco-advice-bot -->';
            let body = "";

            if (isFailure) {
              body = [
                commentHeader,
                '❌ **DCO Check Failed**',
                '',
                `Hi @${username}, your pull request has failed the Developer Certificate of Origin (DCO) check.`,
                '',
                'This repository supports **remediation commits**, so you can fix this without rewriting history — but you must follow the required message format.',
                '',
                '---',
                '',
                '### 🛠 Quick Fix: Add a remediation commit',
                'Run this command:',
                '',
                '```bash',
                remediationSnippet,
                'git push',
                '```',
                '',
                '---',
                '',
                '<details>',
                '<summary>🔧 Advanced: Sign off each commit directly</summary>',
                '',
                '**For the latest commit:**',
                '```bash',
                'git commit --amend --signoff',
                'git push --force-with-lease',
                '```',
                '',
                '**For multiple commits:**',
                '```bash',
                `git rebase --signoff origin/${baseRef}`,
                'git push --force-with-lease',
                '```',
                '',
                '</details>',
                '',
                moreInfo
              ].join('\n');
            } else {
              body = [
                commentHeader,
                '✅ **DCO Check Passed**',
                '',
                `Thanks @${username}, all your commits are properly signed off. 🎉`
              ].join('\n');
            }

            // Get existing comments on the PR (all pages, so an earlier bot
            // comment is still found on long threads and no duplicate is posted)
            const comments = await github.paginate(github.rest.issues.listComments, {
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: prNumber,
              per_page: 100
            });

            // Look for a previous bot comment (body may be absent on a comment)
            const existingComment = comments.find(c =>
              c.body?.includes("<!-- dco-advice-bot -->")
            );

            if (existingComment) {
              await github.rest.issues.updateComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                comment_id: existingComment.id,
                body: body
              });
            } else {
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: prNumber,
                body: body
              });
            }
|
23
CHANGELOG.md
23
CHANGELOG.md
@ -1,3 +1,26 @@
|
||||
## [v2.37.0](https://github.com/docling-project/docling/releases/tag/v2.37.0) - 2025-06-16
|
||||
|
||||
### Feature
|
||||
|
||||
* Make Page.parsed_page the only source of truth for text cells, add OCR cells to it ([#1745](https://github.com/docling-project/docling/issues/1745)) ([`7d3302c`](https://github.com/docling-project/docling/commit/7d3302cb48dd91cd29673d7c4eaf7326736d0685))
|
||||
* Support xlsm files ([#1520](https://github.com/docling-project/docling/issues/1520)) ([`df14022`](https://github.com/docling-project/docling/commit/df140227c3b8bcad0c68bf3d129930cccd96a07e))
|
||||
|
||||
### Fix
|
||||
|
||||
* Pptx line break and space handling ([#1664](https://github.com/docling-project/docling/issues/1664)) ([`f28d23c`](https://github.com/docling-project/docling/commit/f28d23cf03d059619d1d3482594596ab7c87d197))
|
||||
* **asciidoc:** Set default size when missing in image directive ([#1769](https://github.com/docling-project/docling/issues/1769)) ([`b886e4d`](https://github.com/docling-project/docling/commit/b886e4df312447d39f58cf6e3c45b0f863940321))
|
||||
* Handle NoneType error in MsPowerpointDocumentBackend ([#1747](https://github.com/docling-project/docling/issues/1747)) ([`7a275c7`](https://github.com/docling-project/docling/commit/7a275c763731d9c96b7cf32f2e27b8dc8bebacd7))
|
||||
* Prov for merged-elems ([#1728](https://github.com/docling-project/docling/issues/1728)) ([`6613b9e`](https://github.com/docling-project/docling/commit/6613b9e98bc8b89791dc0334de8970ff243aba82))
|
||||
* **tesseract:** Initialize df_osd to avoid uninitialized variable error ([#1718](https://github.com/docling-project/docling/issues/1718)) ([`e979750`](https://github.com/docling-project/docling/commit/e979750ce93b2fae89dbb60ff06333f80c1c2908))
|
||||
* Allow custom torch_dtype in vlm models ([#1735](https://github.com/docling-project/docling/issues/1735)) ([`f7f3113`](https://github.com/docling-project/docling/commit/f7f31137f10999fefdb70da7e5ef56536f650400))
|
||||
* Improve extraction from textboxes in Word docs ([#1701](https://github.com/docling-project/docling/issues/1701)) ([`9dbcb3d`](https://github.com/docling-project/docling/commit/9dbcb3d7d4f27d1c935c8681c57ed59524452d53))
|
||||
* Add WEBP to the list of image file extensions ([#1711](https://github.com/docling-project/docling/issues/1711)) ([`a2b83fe`](https://github.com/docling-project/docling/commit/a2b83fe4aea66c273a83bf17177e87d45d3f18d1))
|
||||
|
||||
### Documentation
|
||||
|
||||
* Update vlm models api examples with LM Studio ([#1759](https://github.com/docling-project/docling/issues/1759)) ([`0432a31`](https://github.com/docling-project/docling/commit/0432a31b2f7c9fe944c3a1d4b608ef938b4f2299))
|
||||
* Add open webui ([#1734](https://github.com/docling-project/docling/issues/1734)) ([`49b10e7`](https://github.com/docling-project/docling/commit/49b10e74191d4d580c9305ac08d9898a79346d7d))
|
||||
|
||||
## [v2.36.1](https://github.com/docling-project/docling/releases/tag/v2.36.1) - 2025-06-04
|
||||
|
||||
### Fix
|
||||
|
@ -2,7 +2,7 @@ import logging
|
||||
import re
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Set, Union
|
||||
from typing import Final, Set, Union
|
||||
|
||||
from docling_core.types.doc import (
|
||||
DocItemLabel,
|
||||
@ -22,6 +22,9 @@ from docling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
DEFAULT_IMAGE_WIDTH: Final = 128
|
||||
DEFAULT_IMAGE_HEIGHT: Final = 128
|
||||
|
||||
|
||||
class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
|
||||
@ -200,9 +203,11 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
|
||||
item = self._parse_picture(line)
|
||||
|
||||
size = None
|
||||
size: Size
|
||||
if "width" in item and "height" in item:
|
||||
size = Size(width=int(item["width"]), height=int(item["height"]))
|
||||
else:
|
||||
size = Size(width=DEFAULT_IMAGE_WIDTH, height=DEFAULT_IMAGE_HEIGHT)
|
||||
|
||||
uri = None
|
||||
if (
|
||||
@ -264,14 +269,16 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
|
||||
return doc
|
||||
|
||||
def _get_current_level(self, parents):
|
||||
@staticmethod
|
||||
def _get_current_level(parents):
|
||||
for k, v in parents.items():
|
||||
if v is None and k > 0:
|
||||
return k - 1
|
||||
|
||||
return 0
|
||||
|
||||
def _get_current_parent(self, parents):
|
||||
@staticmethod
|
||||
def _get_current_parent(parents):
|
||||
for k, v in parents.items():
|
||||
if v is None and k > 0:
|
||||
return parents[k - 1]
|
||||
@ -279,17 +286,21 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
return None
|
||||
|
||||
# ========= Title
|
||||
def _is_title(self, line):
|
||||
@staticmethod
|
||||
def _is_title(line):
|
||||
return re.match(r"^= ", line)
|
||||
|
||||
def _parse_title(self, line):
|
||||
@staticmethod
|
||||
def _parse_title(line):
|
||||
return {"type": "title", "text": line[2:].strip(), "level": 0}
|
||||
|
||||
# ========= Section headers
|
||||
def _is_section_header(self, line):
|
||||
@staticmethod
|
||||
def _is_section_header(line):
|
||||
return re.match(r"^==+\s+", line)
|
||||
|
||||
def _parse_section_header(self, line):
|
||||
@staticmethod
|
||||
def _parse_section_header(line):
|
||||
match = re.match(r"^(=+)\s+(.*)", line)
|
||||
|
||||
marker = match.group(1) # The list marker (e.g., "*", "-", "1.")
|
||||
@ -303,10 +314,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
}
|
||||
|
||||
# ========= Lists
|
||||
def _is_list_item(self, line):
|
||||
@staticmethod
|
||||
def _is_list_item(line):
|
||||
return re.match(r"^(\s)*(\*|-|\d+\.|\w+\.) ", line)
|
||||
|
||||
def _parse_list_item(self, line):
|
||||
@staticmethod
|
||||
def _parse_list_item(line):
|
||||
"""Extract the item marker (number or bullet symbol) and the text of the item."""
|
||||
|
||||
match = re.match(r"^(\s*)(\*|-|\d+\.)\s+(.*)", line)
|
||||
@ -342,14 +355,17 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
}
|
||||
|
||||
# ========= Tables
|
||||
def _is_table_line(self, line):
|
||||
@staticmethod
|
||||
def _is_table_line(line):
|
||||
return re.match(r"^\|.*\|", line)
|
||||
|
||||
def _parse_table_line(self, line):
|
||||
@staticmethod
|
||||
def _parse_table_line(line):
|
||||
# Split table cells and trim extra spaces
|
||||
return [cell.strip() for cell in line.split("|") if cell.strip()]
|
||||
|
||||
def _populate_table_as_grid(self, table_data):
|
||||
@staticmethod
|
||||
def _populate_table_as_grid(table_data):
|
||||
num_rows = len(table_data)
|
||||
|
||||
# Adjust the table data into a grid format
|
||||
@ -380,10 +396,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
return data
|
||||
|
||||
# ========= Pictures
|
||||
def _is_picture(self, line):
|
||||
@staticmethod
|
||||
def _is_picture(line):
|
||||
return re.match(r"^image::", line)
|
||||
|
||||
def _parse_picture(self, line):
|
||||
@staticmethod
|
||||
def _parse_picture(line):
|
||||
"""
|
||||
Parse an image macro, extracting its path and attributes.
|
||||
Syntax: image::path/to/image.png[Alt Text, width=200, height=150, align=center]
|
||||
@ -406,10 +424,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
return {"type": "picture", "uri": line}
|
||||
|
||||
# ========= Captions
|
||||
def _is_caption(self, line):
|
||||
@staticmethod
|
||||
def _is_caption(line):
|
||||
return re.match(r"^\.(.+)", line)
|
||||
|
||||
def _parse_caption(self, line):
|
||||
@staticmethod
|
||||
def _parse_caption(line):
|
||||
mtch = re.match(r"^\.(.+)", line)
|
||||
if mtch:
|
||||
text = mtch.group(1)
|
||||
@ -418,5 +438,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
return {"type": "caption", "text": ""}
|
||||
|
||||
# ========= Plain text
|
||||
def _parse_text(self, line):
|
||||
@staticmethod
|
||||
def _parse_text(line):
|
||||
return {"type": "text", "text": line.strip()}
|
||||
|
@ -7,12 +7,17 @@ from typing import List, Optional, Union
|
||||
|
||||
import pypdfium2 as pdfium
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
||||
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
|
||||
from docling_core.types.doc.page import (
|
||||
BoundingRectangle,
|
||||
SegmentedPdfPage,
|
||||
TextCell,
|
||||
)
|
||||
from docling_parse.pdf_parsers import pdf_parser_v1
|
||||
from PIL import Image, ImageDraw
|
||||
from pypdfium2 import PdfPage
|
||||
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||
from docling.backend.pypdfium2_backend import get_pdf_page_geometry
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
@ -36,6 +41,51 @@ class DoclingParsePageBackend(PdfPageBackend):
|
||||
def is_valid(self) -> bool:
|
||||
return self.valid
|
||||
|
||||
def _compute_text_cells(self) -> List[TextCell]:
    """Compute text cells from docling-parse data.

    Each entry of ``self._dpage["cells"]`` contributes one ``TextCell``: its
    ``box.device`` rectangle is rescaled from the parser's page dimensions
    (``self._dpage["width"]`` / ``["height"]``) to the size returned by
    ``self.get_size()``, built as a bottom-left-origin bounding box and then
    converted to top-left origin.

    Returns:
        The cells, indexed consecutively from 0 in parser order. An empty
        list is returned when the page is not valid.
    """
    cells: List[TextCell] = []

    if not self.valid:
        return cells

    page_size = self.get_size()

    parser_width = self._dpage["width"]
    parser_height = self._dpage["height"]

    # enumerate() supplies the running cell index, so no separate counter or
    # repeated positional lookup into the cell list is needed.
    for cell_index, parser_cell in enumerate(self._dpage["cells"]):
        x0, y0, x1, y1 = parser_cell["box"]["device"]

        # Normalise the rectangle so that x0 <= x1 and y0 <= y1.
        if x1 < x0:
            x0, x1 = x1, x0
        if y1 < y0:
            y0, y1 = y1, y0

        text_piece = parser_cell["content"]["rnormalized"]
        cells.append(
            TextCell(
                index=cell_index,
                text=text_piece,
                orig=text_piece,
                from_ocr=False,
                rect=BoundingRectangle.from_bounding_box(
                    BoundingBox(
                        l=x0 * page_size.width / parser_width,
                        b=y0 * page_size.height / parser_height,
                        r=x1 * page_size.width / parser_width,
                        t=y1 * page_size.height / parser_height,
                        coord_origin=CoordOrigin.BOTTOMLEFT,
                    )
                ).to_top_left_origin(page_size.height),
            )
        )

    return cells
|
||||
|
||||
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
||||
if not self.valid:
|
||||
return ""
|
||||
@ -70,75 +120,27 @@ class DoclingParsePageBackend(PdfPageBackend):
|
||||
return text_piece
|
||||
|
||||
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
    """Build a ``SegmentedPdfPage`` holding this page's text-line cells.

    Returns:
        ``None`` when the page is not valid. Otherwise a page whose
        ``textline_cells`` come from ``_compute_text_cells()`` and whose
        geometry comes from ``get_pdf_page_geometry(self._ppage)``; character
        and word cells are left empty.
    """
    if self.valid:
        line_cells = self._compute_text_cells()

        # Page geometry is taken from the pypdfium2 page object.
        geometry = get_pdf_page_geometry(self._ppage)

        # NOTE(review): DoclingParseV2PageBackend.get_segmented_page passes
        # `has_textlines=` for this same flag -- confirm which keyword
        # SegmentedPdfPage actually declares.
        return SegmentedPdfPage(
            dimension=geometry,
            textline_cells=line_cells,
            char_cells=[],
            word_cells=[],
            has_lines=bool(line_cells),
            has_words=False,
            has_chars=False,
        )

    return None
|
||||
|
||||
def get_text_cells(self) -> Iterable[TextCell]:
|
||||
cells: List[TextCell] = []
|
||||
cell_counter = 0
|
||||
|
||||
if not self.valid:
|
||||
return cells
|
||||
|
||||
page_size = self.get_size()
|
||||
|
||||
parser_width = self._dpage["width"]
|
||||
parser_height = self._dpage["height"]
|
||||
|
||||
for i in range(len(self._dpage["cells"])):
|
||||
rect = self._dpage["cells"][i]["box"]["device"]
|
||||
x0, y0, x1, y1 = rect
|
||||
|
||||
if x1 < x0:
|
||||
x0, x1 = x1, x0
|
||||
if y1 < y0:
|
||||
y0, y1 = y1, y0
|
||||
|
||||
text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
|
||||
cells.append(
|
||||
TextCell(
|
||||
index=cell_counter,
|
||||
text=text_piece,
|
||||
orig=text_piece,
|
||||
from_ocr=False,
|
||||
rect=BoundingRectangle.from_bounding_box(
|
||||
BoundingBox(
|
||||
# l=x0, b=y0, r=x1, t=y1,
|
||||
l=x0 * page_size.width / parser_width,
|
||||
b=y0 * page_size.height / parser_height,
|
||||
r=x1 * page_size.width / parser_width,
|
||||
t=y1 * page_size.height / parser_height,
|
||||
coord_origin=CoordOrigin.BOTTOMLEFT,
|
||||
)
|
||||
).to_top_left_origin(page_size.height),
|
||||
)
|
||||
)
|
||||
|
||||
cell_counter += 1
|
||||
|
||||
def draw_clusters_and_cells():
|
||||
image = (
|
||||
self.get_page_image()
|
||||
) # make new image to avoid drawing on the saved ones
|
||||
draw = ImageDraw.Draw(image)
|
||||
for c in cells:
|
||||
x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
|
||||
cell_color = (
|
||||
random.randint(30, 140),
|
||||
random.randint(30, 140),
|
||||
random.randint(30, 140),
|
||||
)
|
||||
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
||||
image.show()
|
||||
|
||||
# before merge:
|
||||
# draw_clusters_and_cells()
|
||||
|
||||
# cells = merge_horizontal_cells(cells)
|
||||
|
||||
# after merge:
|
||||
# draw_clusters_and_cells()
|
||||
|
||||
return cells
|
||||
return self._compute_text_cells()
|
||||
|
||||
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
||||
AREA_THRESHOLD = 0 # 32 * 32
|
||||
|
@ -7,12 +7,19 @@ from typing import TYPE_CHECKING, List, Optional, Union
|
||||
|
||||
import pypdfium2 as pdfium
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
|
||||
from docling_core.types.doc.page import (
|
||||
BoundingRectangle,
|
||||
PdfPageBoundaryType,
|
||||
PdfPageGeometry,
|
||||
SegmentedPdfPage,
|
||||
TextCell,
|
||||
)
|
||||
from docling_parse.pdf_parsers import pdf_parser_v2
|
||||
from PIL import Image, ImageDraw
|
||||
from pypdfium2 import PdfPage
|
||||
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||
from docling.backend.pypdfium2_backend import get_pdf_page_geometry
|
||||
from docling.datamodel.base_models import Size
|
||||
from docling.utils.locks import pypdfium2_lock
|
||||
|
||||
@ -40,6 +47,55 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
||||
def is_valid(self) -> bool:
|
||||
return self.valid
|
||||
|
||||
def _compute_text_cells(self) -> List[TextCell]:
    """Compute text cells from docling-parse v2 data.

    The parser output under ``self._dpage["sanitized"]["cells"]`` is a table:
    ``header`` lists the column names and every row of ``data`` holds one
    cell. The ``x0``/``y0``/``x1``/``y1`` and ``text`` columns are read,
    the rectangle is rescaled from the parser's page dimensions to the size
    returned by ``self.get_size()``, built as a bottom-left-origin bounding
    box and then converted to top-left origin.

    Returns:
        One ``TextCell`` per data row, indexed consecutively from 0. An empty
        list is returned when the page is not valid.
    """
    cells: List[TextCell] = []

    if not self.valid:
        return cells

    page_size = self.get_size()

    parser_width = self._dpage["sanitized"]["dimension"]["width"]
    parser_height = self._dpage["sanitized"]["dimension"]["height"]

    cells_data = self._dpage["sanitized"]["cells"]["data"]
    cells_header = self._dpage["sanitized"]["cells"]["header"]

    # With no rows there is nothing to look up; returning here keeps the
    # header untouched in that case, exactly as before.
    if not cells_data:
        return cells

    # Resolve the column positions once instead of scanning the header five
    # times for every cell.
    x0_col = cells_header.index("x0")
    y0_col = cells_header.index("y0")
    x1_col = cells_header.index("x1")
    y1_col = cells_header.index("y1")
    text_col = cells_header.index("text")

    for cell_index, cell_data in enumerate(cells_data):
        x0 = cell_data[x0_col]
        y0 = cell_data[y0_col]
        x1 = cell_data[x1_col]
        y1 = cell_data[y1_col]

        # Normalise the rectangle so that x0 <= x1 and y0 <= y1.
        if x1 < x0:
            x0, x1 = x1, x0
        if y1 < y0:
            y0, y1 = y1, y0

        text_piece = cell_data[text_col]
        cells.append(
            TextCell(
                index=cell_index,
                text=text_piece,
                orig=text_piece,
                from_ocr=False,
                rect=BoundingRectangle.from_bounding_box(
                    BoundingBox(
                        l=x0 * page_size.width / parser_width,
                        b=y0 * page_size.height / parser_height,
                        r=x1 * page_size.width / parser_width,
                        t=y1 * page_size.height / parser_height,
                        coord_origin=CoordOrigin.BOTTOMLEFT,
                    )
                ).to_top_left_origin(page_size.height),
            )
        )

    return cells
|
||||
|
||||
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
||||
if not self.valid:
|
||||
return ""
|
||||
@ -81,73 +137,27 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
||||
return text_piece
|
||||
|
||||
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
    """Build a ``SegmentedPdfPage`` holding this page's text-line cells.

    Returns:
        ``None`` when the page is not valid. Otherwise a page whose
        ``textline_cells`` come from ``_compute_text_cells()`` and whose
        geometry comes from ``get_pdf_page_geometry(self._ppage)``; character
        and word cells are left empty.
    """
    if self.valid:
        line_cells = self._compute_text_cells()

        # Page geometry is taken from the pypdfium2 page object.
        geometry = get_pdf_page_geometry(self._ppage)

        # NOTE(review): DoclingParsePageBackend.get_segmented_page passes
        # `has_lines=` for this same flag -- confirm which keyword
        # SegmentedPdfPage actually declares.
        return SegmentedPdfPage(
            dimension=geometry,
            textline_cells=line_cells,
            char_cells=[],
            word_cells=[],
            has_textlines=bool(line_cells),
            has_words=False,
            has_chars=False,
        )

    return None
|
||||
|
||||
def get_text_cells(self) -> Iterable[TextCell]:
|
||||
cells: List[TextCell] = []
|
||||
cell_counter = 0
|
||||
|
||||
if not self.valid:
|
||||
return cells
|
||||
|
||||
page_size = self.get_size()
|
||||
|
||||
parser_width = self._dpage["sanitized"]["dimension"]["width"]
|
||||
parser_height = self._dpage["sanitized"]["dimension"]["height"]
|
||||
|
||||
cells_data = self._dpage["sanitized"]["cells"]["data"]
|
||||
cells_header = self._dpage["sanitized"]["cells"]["header"]
|
||||
|
||||
for i, cell_data in enumerate(cells_data):
|
||||
x0 = cell_data[cells_header.index("x0")]
|
||||
y0 = cell_data[cells_header.index("y0")]
|
||||
x1 = cell_data[cells_header.index("x1")]
|
||||
y1 = cell_data[cells_header.index("y1")]
|
||||
|
||||
if x1 < x0:
|
||||
x0, x1 = x1, x0
|
||||
if y1 < y0:
|
||||
y0, y1 = y1, y0
|
||||
|
||||
text_piece = cell_data[cells_header.index("text")]
|
||||
cells.append(
|
||||
TextCell(
|
||||
index=cell_counter,
|
||||
text=text_piece,
|
||||
orig=text_piece,
|
||||
from_ocr=False,
|
||||
rect=BoundingRectangle.from_bounding_box(
|
||||
BoundingBox(
|
||||
# l=x0, b=y0, r=x1, t=y1,
|
||||
l=x0 * page_size.width / parser_width,
|
||||
b=y0 * page_size.height / parser_height,
|
||||
r=x1 * page_size.width / parser_width,
|
||||
t=y1 * page_size.height / parser_height,
|
||||
coord_origin=CoordOrigin.BOTTOMLEFT,
|
||||
)
|
||||
).to_top_left_origin(page_size.height),
|
||||
)
|
||||
)
|
||||
cell_counter += 1
|
||||
|
||||
def draw_clusters_and_cells():
|
||||
image = (
|
||||
self.get_page_image()
|
||||
) # make new image to avoid drawing on the saved ones
|
||||
draw = ImageDraw.Draw(image)
|
||||
for c in cells:
|
||||
x0, y0, x1, y1 = c.bbox.as_tuple()
|
||||
cell_color = (
|
||||
random.randint(30, 140),
|
||||
random.randint(30, 140),
|
||||
random.randint(30, 140),
|
||||
)
|
||||
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
||||
image.show()
|
||||
|
||||
# draw_clusters_and_cells()
|
||||
|
||||
return cells
|
||||
return self._compute_text_cells()
|
||||
|
||||
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
||||
AREA_THRESHOLD = 0 # 32 * 32
|
||||
|
@ -59,20 +59,6 @@ class DoclingParseV4PageBackend(PdfPageBackend):
|
||||
return self._dpage
|
||||
|
||||
def get_text_cells(self) -> Iterable[TextCell]:
|
||||
page_size = self.get_size()
|
||||
|
||||
[tc.to_top_left_origin(page_size.height) for tc in self._dpage.textline_cells]
|
||||
|
||||
# for cell in self._dpage.textline_cells:
|
||||
# rect = cell.rect
|
||||
#
|
||||
# assert (
|
||||
# rect.to_bounding_box().l <= rect.to_bounding_box().r
|
||||
# ), f"left is > right on bounding box {rect.to_bounding_box()} of rect {rect}"
|
||||
# assert (
|
||||
# rect.to_bounding_box().t <= rect.to_bounding_box().b
|
||||
# ), f"top is > bottom on bounding box {rect.to_bounding_box()} of rect {rect}"
|
||||
|
||||
return self._dpage.textline_cells
|
||||
|
||||
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
||||
@ -171,12 +157,28 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
|
||||
self, page_no: int, create_words: bool = True, create_textlines: bool = True
|
||||
) -> DoclingParseV4PageBackend:
|
||||
with pypdfium2_lock:
|
||||
return DoclingParseV4PageBackend(
|
||||
self.dp_doc.get_page(
|
||||
seg_page = self.dp_doc.get_page(
|
||||
page_no + 1,
|
||||
create_words=create_words,
|
||||
create_textlines=create_textlines,
|
||||
),
|
||||
)
|
||||
|
||||
# In Docling, all TextCell instances are expected with top-left origin.
|
||||
[
|
||||
tc.to_top_left_origin(seg_page.dimension.height)
|
||||
for tc in seg_page.textline_cells
|
||||
]
|
||||
[
|
||||
tc.to_top_left_origin(seg_page.dimension.height)
|
||||
for tc in seg_page.char_cells
|
||||
]
|
||||
[
|
||||
tc.to_top_left_origin(seg_page.dimension.height)
|
||||
for tc in seg_page.word_cells
|
||||
]
|
||||
|
||||
return DoclingParseV4PageBackend(
|
||||
seg_page,
|
||||
self._pdoc[page_no],
|
||||
)
|
||||
|
||||
|
@ -1,17 +1,15 @@
|
||||
import logging
|
||||
import re
|
||||
import warnings
|
||||
from copy import deepcopy
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Set, Union
|
||||
|
||||
import marko
|
||||
import marko.element
|
||||
import marko.ext
|
||||
import marko.ext.gfm
|
||||
import marko.inline
|
||||
from docling_core.types.doc import (
|
||||
DocItem,
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
DocumentOrigin,
|
||||
@ -21,7 +19,9 @@ from docling_core.types.doc import (
|
||||
TableData,
|
||||
TextItem,
|
||||
)
|
||||
from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
|
||||
from marko import Markdown
|
||||
from pydantic import AnyUrl, TypeAdapter
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.backend.html_backend import HTMLDocumentBackend
|
||||
@ -71,7 +71,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
self.in_table = False
|
||||
self.md_table_buffer: list[str] = []
|
||||
self.inline_texts: list[str] = []
|
||||
self._html_blocks: int = 0
|
||||
|
||||
try:
|
||||
@ -156,25 +155,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
doc.add_table(data=table_data)
|
||||
return
|
||||
|
||||
def _process_inline_text(
|
||||
self, parent_item: Optional[NodeItem], doc: DoclingDocument
|
||||
):
|
||||
txt = " ".join(self.inline_texts)
|
||||
if len(txt) > 0:
|
||||
doc.add_text(
|
||||
label=DocItemLabel.PARAGRAPH,
|
||||
parent=parent_item,
|
||||
text=txt,
|
||||
)
|
||||
self.inline_texts = []
|
||||
|
||||
def _iterate_elements( # noqa: C901
|
||||
self,
|
||||
*,
|
||||
element: marko.element.Element,
|
||||
depth: int,
|
||||
doc: DoclingDocument,
|
||||
visited: Set[marko.element.Element],
|
||||
parent_item: Optional[NodeItem] = None,
|
||||
formatting: Optional[Formatting] = None,
|
||||
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
||||
):
|
||||
if element in visited:
|
||||
return
|
||||
@ -183,43 +173,31 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
# Check for different element types and process relevant details
|
||||
if isinstance(element, marko.block.Heading) and len(element.children) > 0:
|
||||
self._close_table(doc)
|
||||
self._process_inline_text(parent_item, doc)
|
||||
_log.debug(
|
||||
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
|
||||
)
|
||||
if element.level == 1:
|
||||
doc_label = DocItemLabel.TITLE
|
||||
|
||||
if len(element.children) == 1:
|
||||
child = element.children[0]
|
||||
snippet_text = str(child.children) # type: ignore
|
||||
visited.add(child)
|
||||
else:
|
||||
doc_label = DocItemLabel.SECTION_HEADER
|
||||
snippet_text = "" # inline group will be created
|
||||
|
||||
# Header could have arbitrary inclusion of bold, italic or emphasis,
|
||||
# hence we need to traverse the tree to get full text of a header
|
||||
strings: List[str] = []
|
||||
|
||||
# Define a recursive function to traverse the tree
|
||||
def traverse(node: marko.block.BlockElement):
|
||||
# Check if the node has a "children" attribute
|
||||
if hasattr(node, "children"):
|
||||
# If "children" is a list, continue traversal
|
||||
if isinstance(node.children, list):
|
||||
for child in node.children:
|
||||
traverse(child)
|
||||
# If "children" is text, add it to header text
|
||||
elif isinstance(node.children, str):
|
||||
strings.append(node.children)
|
||||
|
||||
traverse(element)
|
||||
snippet_text = "".join(strings)
|
||||
if len(snippet_text) > 0:
|
||||
if doc_label == DocItemLabel.SECTION_HEADER:
|
||||
if element.level == 1:
|
||||
parent_item = doc.add_title(
|
||||
text=snippet_text,
|
||||
parent=parent_item,
|
||||
formatting=formatting,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
else:
|
||||
parent_item = doc.add_heading(
|
||||
text=snippet_text,
|
||||
level=element.level - 1,
|
||||
parent=parent_item,
|
||||
)
|
||||
else:
|
||||
parent_item = doc.add_text(
|
||||
label=doc_label, parent=parent_item, text=snippet_text
|
||||
formatting=formatting,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
|
||||
elif isinstance(element, marko.block.List):
|
||||
@ -230,7 +208,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
break
|
||||
|
||||
self._close_table(doc)
|
||||
self._process_inline_text(parent_item, doc)
|
||||
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
|
||||
if has_non_empty_list_items:
|
||||
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
|
||||
@ -240,41 +217,60 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
elif (
|
||||
isinstance(element, marko.block.ListItem)
|
||||
and len(element.children) > 0
|
||||
and isinstance((first_child := element.children[0]), marko.block.Paragraph)
|
||||
and len(element.children) == 1
|
||||
and isinstance((child := element.children[0]), marko.block.Paragraph)
|
||||
and len(child.children) > 0
|
||||
):
|
||||
self._close_table(doc)
|
||||
self._process_inline_text(parent_item, doc)
|
||||
_log.debug(" - List item")
|
||||
|
||||
snippet_text = str(first_child.children[0].children) # type: ignore
|
||||
is_numbered = False
|
||||
if (
|
||||
parent_item is not None
|
||||
and isinstance(parent_item, DocItem)
|
||||
and parent_item.label == GroupLabel.ORDERED_LIST
|
||||
):
|
||||
is_numbered = True
|
||||
doc.add_list_item(
|
||||
enumerated=is_numbered, parent=parent_item, text=snippet_text
|
||||
if len(child.children) == 1:
|
||||
snippet_text = str(child.children[0].children) # type: ignore
|
||||
visited.add(child)
|
||||
else:
|
||||
snippet_text = "" # inline group will be created
|
||||
is_numbered = isinstance(parent_item, OrderedList)
|
||||
if not isinstance(parent_item, (OrderedList, UnorderedList)):
|
||||
_log.warning("ListItem would have not had a list parent, adding one.")
|
||||
parent_item = doc.add_unordered_list(parent=parent_item)
|
||||
parent_item = doc.add_list_item(
|
||||
enumerated=is_numbered,
|
||||
parent=parent_item,
|
||||
text=snippet_text,
|
||||
formatting=formatting,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
visited.add(first_child)
|
||||
|
||||
elif isinstance(element, marko.inline.Image):
|
||||
self._close_table(doc)
|
||||
self._process_inline_text(parent_item, doc)
|
||||
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
|
||||
|
||||
fig_caption: Optional[TextItem] = None
|
||||
if element.title is not None and element.title != "":
|
||||
fig_caption = doc.add_text(
|
||||
label=DocItemLabel.CAPTION, text=element.title
|
||||
label=DocItemLabel.CAPTION,
|
||||
text=element.title,
|
||||
formatting=formatting,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
|
||||
doc.add_picture(parent=parent_item, caption=fig_caption)
|
||||
|
||||
elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
|
||||
self._process_inline_text(parent_item, doc)
|
||||
elif isinstance(element, marko.inline.Emphasis):
|
||||
_log.debug(f" - Emphasis: {element.children}")
|
||||
formatting = deepcopy(formatting) if formatting else Formatting()
|
||||
formatting.italic = True
|
||||
|
||||
elif isinstance(element, marko.inline.StrongEmphasis):
|
||||
_log.debug(f" - StrongEmphasis: {element.children}")
|
||||
formatting = deepcopy(formatting) if formatting else Formatting()
|
||||
formatting.bold = True
|
||||
|
||||
elif isinstance(element, marko.inline.Link):
|
||||
_log.debug(f" - Link: {element.children}")
|
||||
hyperlink = TypeAdapter(Optional[Union[AnyUrl, Path]]).validate_python(
|
||||
element.dest
|
||||
)
|
||||
|
||||
elif isinstance(element, marko.inline.RawText):
|
||||
_log.debug(f" - Paragraph (raw text): {element.children}")
|
||||
@ -287,28 +283,41 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
|
||||
else:
|
||||
self.md_table_buffer.append(snippet_text)
|
||||
else:
|
||||
elif snippet_text:
|
||||
self._close_table(doc)
|
||||
# most likely just inline text
|
||||
self.inline_texts.append(str(element.children))
|
||||
doc.add_text(
|
||||
label=DocItemLabel.TEXT,
|
||||
parent=parent_item,
|
||||
text=snippet_text,
|
||||
formatting=formatting,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
|
||||
elif isinstance(element, marko.inline.CodeSpan):
|
||||
self._close_table(doc)
|
||||
self._process_inline_text(parent_item, doc)
|
||||
_log.debug(f" - Code Span: {element.children}")
|
||||
snippet_text = str(element.children).strip()
|
||||
doc.add_code(parent=parent_item, text=snippet_text)
|
||||
doc.add_code(
|
||||
parent=parent_item,
|
||||
text=snippet_text,
|
||||
formatting=formatting,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
|
||||
elif (
|
||||
isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
|
||||
and len(element.children) > 0
|
||||
and isinstance((first_child := element.children[0]), marko.inline.RawText)
|
||||
and len(snippet_text := (first_child.children.strip())) > 0
|
||||
and isinstance((child := element.children[0]), marko.inline.RawText)
|
||||
and len(snippet_text := (child.children.strip())) > 0
|
||||
):
|
||||
self._close_table(doc)
|
||||
self._process_inline_text(parent_item, doc)
|
||||
_log.debug(f" - Code Block: {element.children}")
|
||||
doc.add_code(parent=parent_item, text=snippet_text)
|
||||
doc.add_code(
|
||||
parent=parent_item,
|
||||
text=snippet_text,
|
||||
formatting=formatting,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
|
||||
elif isinstance(element, marko.inline.LineBreak):
|
||||
if self.in_table:
|
||||
@ -317,7 +326,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
elif isinstance(element, marko.block.HTMLBlock):
|
||||
self._html_blocks += 1
|
||||
self._process_inline_text(parent_item, doc)
|
||||
self._close_table(doc)
|
||||
_log.debug(f"HTML Block: {element}")
|
||||
if (
|
||||
@ -327,14 +335,25 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
# wrap in markers to enable post-processing in convert()
|
||||
text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
|
||||
doc.add_code(parent=parent_item, text=text_to_add)
|
||||
doc.add_code(
|
||||
parent=parent_item,
|
||||
text=text_to_add,
|
||||
formatting=formatting,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
else:
|
||||
if not isinstance(element, str):
|
||||
self._close_table(doc)
|
||||
_log.debug(f"Some other element: {element}")
|
||||
|
||||
if (
|
||||
isinstance(element, (marko.block.Paragraph, marko.block.Heading))
|
||||
and len(element.children) > 1
|
||||
):
|
||||
parent_item = doc.add_inline_group(parent=parent_item)
|
||||
|
||||
processed_block_types = (
|
||||
marko.block.Heading,
|
||||
# marko.block.Heading,
|
||||
marko.block.CodeBlock,
|
||||
marko.block.FencedCode,
|
||||
marko.inline.RawText,
|
||||
@ -351,6 +370,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
doc=doc,
|
||||
visited=visited,
|
||||
parent_item=parent_item,
|
||||
formatting=formatting,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
|
||||
def is_valid(self) -> bool:
|
||||
@ -392,7 +413,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
parent_item=None,
|
||||
visited=set(),
|
||||
)
|
||||
self._process_inline_text(None, doc) # handle last hanging inline text
|
||||
self._close_table(doc=doc) # handle any last hanging table
|
||||
|
||||
# if HTML blocks were detected, export to HTML and delegate to HTML backend
|
||||
|
@ -20,6 +20,7 @@ from docling_core.types.doc.document import ContentLayer
|
||||
from PIL import Image, UnidentifiedImageError
|
||||
from pptx import Presentation
|
||||
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
|
||||
from pptx.oxml.text import CT_TextLineBreak
|
||||
|
||||
from docling.backend.abstract_backend import (
|
||||
DeclarativeDocumentBackend,
|
||||
@ -120,97 +121,72 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
|
||||
return prov
|
||||
|
||||
def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size): # noqa: C901
|
||||
is_a_list = False
|
||||
def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
|
||||
is_list_group_created = False
|
||||
enum_list_item_value = 0
|
||||
new_list = None
|
||||
bullet_type = "None"
|
||||
list_label = GroupLabel.LIST
|
||||
doc_label = DocItemLabel.LIST_ITEM
|
||||
prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
|
||||
|
||||
# Identify if shape contains lists
|
||||
for paragraph in shape.text_frame.paragraphs:
|
||||
# Check if paragraph is a bullet point using the `element` XML
|
||||
def is_list_item(paragraph):
|
||||
"""Check if the paragraph is a list item."""
|
||||
p = paragraph._element
|
||||
if (
|
||||
p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
|
||||
is not None
|
||||
):
|
||||
bullet_type = "Bullet"
|
||||
is_a_list = True
|
||||
return (True, "Bullet")
|
||||
elif (
|
||||
p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
|
||||
is not None
|
||||
):
|
||||
bullet_type = "Numbered"
|
||||
is_a_list = True
|
||||
else:
|
||||
is_a_list = False
|
||||
|
||||
if paragraph.level > 0:
|
||||
return (True, "Numbered")
|
||||
elif paragraph.level > 0:
|
||||
# Most likely a sub-list
|
||||
is_a_list = True
|
||||
|
||||
if is_a_list:
|
||||
# Determine if this is an unordered list or an ordered list.
|
||||
# Set GroupLabel.ORDERED_LIST when it fits.
|
||||
if bullet_type == "Numbered":
|
||||
list_label = GroupLabel.ORDERED_LIST
|
||||
|
||||
if is_a_list:
|
||||
_log.debug("LIST DETECTED!")
|
||||
return (True, "None")
|
||||
else:
|
||||
_log.debug("No List")
|
||||
|
||||
# If there is a list inside of the shape, create a new docling list to assign list items to
|
||||
# if is_a_list:
|
||||
# new_list = doc.add_group(
|
||||
# label=list_label, name=f"list", parent=parent_slide
|
||||
# )
|
||||
return (False, "None")
|
||||
|
||||
# Iterate through paragraphs to build up text
|
||||
for paragraph in shape.text_frame.paragraphs:
|
||||
# p_text = paragraph.text.strip()
|
||||
is_a_list, bullet_type = is_list_item(paragraph)
|
||||
p = paragraph._element
|
||||
enum_list_item_value += 1
|
||||
inline_paragraph_text = ""
|
||||
inline_list_item_text = ""
|
||||
|
||||
for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}):
|
||||
if len(e.text.strip()) > 0:
|
||||
e_is_a_list_item = False
|
||||
is_numbered = False
|
||||
if (
|
||||
p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
|
||||
is not None
|
||||
):
|
||||
bullet_type = "Bullet"
|
||||
e_is_a_list_item = True
|
||||
elif (
|
||||
p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
|
||||
is not None
|
||||
):
|
||||
bullet_type = "Numbered"
|
||||
is_numbered = True
|
||||
e_is_a_list_item = True
|
||||
# Convert line breaks to spaces and accumulate text
|
||||
p_text = ""
|
||||
for e in p.content_children:
|
||||
if isinstance(e, CT_TextLineBreak):
|
||||
p_text += " "
|
||||
else:
|
||||
e_is_a_list_item = False
|
||||
p_text += e.text
|
||||
|
||||
if e_is_a_list_item:
|
||||
if len(inline_paragraph_text) > 0:
|
||||
# output accumulated inline text:
|
||||
doc.add_text(
|
||||
label=doc_label,
|
||||
if is_a_list:
|
||||
enum_marker = ""
|
||||
enumerated = bullet_type == "Numbered"
|
||||
|
||||
if not is_list_group_created:
|
||||
new_list = doc.add_group(
|
||||
label=GroupLabel.ORDERED_LIST
|
||||
if enumerated
|
||||
else GroupLabel.LIST,
|
||||
name="list",
|
||||
parent=parent_slide,
|
||||
text=inline_paragraph_text,
|
||||
)
|
||||
is_list_group_created = True
|
||||
enum_list_item_value = 0
|
||||
|
||||
if enumerated:
|
||||
enum_list_item_value += 1
|
||||
enum_marker = str(enum_list_item_value) + "."
|
||||
|
||||
doc.add_list_item(
|
||||
marker=enum_marker,
|
||||
enumerated=enumerated,
|
||||
parent=new_list,
|
||||
text=p_text,
|
||||
prov=prov,
|
||||
)
|
||||
# Set marker and enumerated arguments if this is an enumeration element.
|
||||
inline_list_item_text += e.text
|
||||
# print(e.text)
|
||||
else:
|
||||
else: # is paragraph not a list item
|
||||
# Assign proper label to the text, depending if it's a Title or Section Header
|
||||
# For other types of text, assign - PARAGRAPH
|
||||
doc_label = DocItemLabel.PARAGRAPH
|
||||
@ -224,32 +200,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
doc_label = DocItemLabel.TITLE
|
||||
elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
|
||||
DocItemLabel.SECTION_HEADER
|
||||
enum_list_item_value = 0
|
||||
inline_paragraph_text += e.text
|
||||
|
||||
if len(inline_paragraph_text) > 0:
|
||||
# output accumulated inline text:
|
||||
doc.add_text(
|
||||
label=doc_label,
|
||||
parent=parent_slide,
|
||||
text=inline_paragraph_text,
|
||||
prov=prov,
|
||||
)
|
||||
|
||||
if len(inline_list_item_text) > 0:
|
||||
enum_marker = ""
|
||||
if is_numbered:
|
||||
enum_marker = str(enum_list_item_value) + "."
|
||||
if not is_list_group_created:
|
||||
new_list = doc.add_group(
|
||||
label=list_label, name="list", parent=parent_slide
|
||||
)
|
||||
is_list_group_created = True
|
||||
doc.add_list_item(
|
||||
marker=enum_marker,
|
||||
enumerated=is_numbered,
|
||||
parent=new_list,
|
||||
text=inline_list_item_text,
|
||||
text=p_text,
|
||||
prov=prov,
|
||||
)
|
||||
return
|
||||
|
@ -14,7 +14,7 @@ from docling_core.types.doc import (
|
||||
TableCell,
|
||||
TableData,
|
||||
)
|
||||
from docling_core.types.doc.document import Formatting
|
||||
from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
|
||||
from docx import Document
|
||||
from docx.document import Document as DocxDocument
|
||||
from docx.oxml.table import CT_Tc
|
||||
@ -84,7 +84,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.valid = True
|
||||
except Exception as e:
|
||||
raise RuntimeError(
|
||||
f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
|
||||
f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
|
||||
) from e
|
||||
|
||||
@override
|
||||
@ -251,9 +251,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
self._handle_tables(element, docx_obj, doc)
|
||||
except Exception:
|
||||
_log.debug("could not parse a table, broken docx table")
|
||||
|
||||
# Check for Image
|
||||
elif drawing_blip:
|
||||
self._handle_pictures(docx_obj, drawing_blip, doc)
|
||||
# Check for Text after the Image
|
||||
if (
|
||||
tag_name in ["p"]
|
||||
and element.find(".//w:t", namespaces=namespaces) is not None
|
||||
):
|
||||
self._handle_text_elements(element, docx_obj, doc)
|
||||
# Check for the sdt containers, like table of contents
|
||||
elif tag_name in ["sdt"]:
|
||||
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
|
||||
@ -268,6 +274,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
self._handle_text_elements(element, docx_obj, doc)
|
||||
else:
|
||||
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
||||
|
||||
return doc
|
||||
|
||||
def _str_to_int(
|
||||
@ -578,7 +585,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
all_paragraphs = []
|
||||
|
||||
# Sort paragraphs within each container, then process containers
|
||||
for container_id, paragraphs in container_paragraphs.items():
|
||||
for paragraphs in container_paragraphs.values():
|
||||
# Sort by vertical position within each container
|
||||
sorted_container_paragraphs = sorted(
|
||||
paragraphs,
|
||||
@ -689,14 +696,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
doc: DoclingDocument,
|
||||
) -> None:
|
||||
paragraph = Paragraph(element, docx_obj)
|
||||
|
||||
paragraph_elements = self._get_paragraph_elements(paragraph)
|
||||
text, equations = self._handle_equations_in_text(
|
||||
element=element, text=paragraph.text
|
||||
)
|
||||
|
||||
if text is None:
|
||||
return
|
||||
paragraph_elements = self._get_paragraph_elements(paragraph)
|
||||
text = text.strip()
|
||||
|
||||
# Common styles for bullet and numbered lists.
|
||||
@ -912,6 +918,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
return
|
||||
|
||||
def _add_formatted_list_item(
|
||||
self,
|
||||
doc: DoclingDocument,
|
||||
elements: list,
|
||||
marker: str,
|
||||
enumerated: bool,
|
||||
level: int,
|
||||
) -> None:
|
||||
# This should not happen by construction
|
||||
if not isinstance(self.parents[level], (OrderedList, UnorderedList)):
|
||||
return
|
||||
if len(elements) == 1:
|
||||
text, format, hyperlink = elements[0]
|
||||
doc.add_list_item(
|
||||
marker=marker,
|
||||
enumerated=enumerated,
|
||||
parent=self.parents[level],
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
else:
|
||||
new_item = doc.add_list_item(
|
||||
marker=marker,
|
||||
enumerated=enumerated,
|
||||
parent=self.parents[level],
|
||||
text="",
|
||||
)
|
||||
new_parent = doc.add_group(label=GroupLabel.INLINE, parent=new_item)
|
||||
for text, format, hyperlink in elements:
|
||||
doc.add_text(
|
||||
label=DocItemLabel.TEXT,
|
||||
parent=new_parent,
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
|
||||
def _add_list_item(
|
||||
self,
|
||||
*,
|
||||
@ -921,6 +965,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
elements: list,
|
||||
is_numbered: bool = False,
|
||||
) -> None:
|
||||
# TODO: this method is always called with is_numbered. Numbered lists should be properly addressed.
|
||||
if not elements:
|
||||
return None
|
||||
enum_marker = ""
|
||||
|
||||
level = self._get_level()
|
||||
@ -937,21 +984,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
if is_numbered:
|
||||
enum_marker = str(self.listIter) + "."
|
||||
is_numbered = True
|
||||
new_parent = self._create_or_reuse_parent(
|
||||
doc=doc,
|
||||
prev_parent=self.parents[level],
|
||||
paragraph_elements=elements,
|
||||
self._add_formatted_list_item(
|
||||
doc, elements, enum_marker, is_numbered, level
|
||||
)
|
||||
for text, format, hyperlink in elements:
|
||||
doc.add_list_item(
|
||||
marker=enum_marker,
|
||||
enumerated=is_numbered,
|
||||
parent=new_parent,
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
|
||||
elif (
|
||||
self._prev_numid() == numid
|
||||
and self.level_at_new_list is not None
|
||||
@ -981,20 +1016,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
if is_numbered:
|
||||
enum_marker = str(self.listIter) + "."
|
||||
is_numbered = True
|
||||
|
||||
new_parent = self._create_or_reuse_parent(
|
||||
doc=doc,
|
||||
prev_parent=self.parents[self.level_at_new_list + ilevel],
|
||||
paragraph_elements=elements,
|
||||
)
|
||||
for text, format, hyperlink in elements:
|
||||
doc.add_list_item(
|
||||
marker=enum_marker,
|
||||
enumerated=is_numbered,
|
||||
parent=new_parent,
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
self._add_formatted_list_item(
|
||||
doc,
|
||||
elements,
|
||||
enum_marker,
|
||||
is_numbered,
|
||||
self.level_at_new_list + ilevel,
|
||||
)
|
||||
elif (
|
||||
self._prev_numid() == numid
|
||||
@ -1002,7 +1029,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
and prev_indent is not None
|
||||
and ilevel < prev_indent
|
||||
): # Close list
|
||||
for k, v in self.parents.items():
|
||||
for k in self.parents:
|
||||
if k > self.level_at_new_list + ilevel:
|
||||
self.parents[k] = None
|
||||
|
||||
@ -1011,19 +1038,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
if is_numbered:
|
||||
enum_marker = str(self.listIter) + "."
|
||||
is_numbered = True
|
||||
new_parent = self._create_or_reuse_parent(
|
||||
doc=doc,
|
||||
prev_parent=self.parents[self.level_at_new_list + ilevel],
|
||||
paragraph_elements=elements,
|
||||
)
|
||||
for text, format, hyperlink in elements:
|
||||
doc.add_list_item(
|
||||
marker=enum_marker,
|
||||
enumerated=is_numbered,
|
||||
parent=new_parent,
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
self._add_formatted_list_item(
|
||||
doc,
|
||||
elements,
|
||||
enum_marker,
|
||||
is_numbered,
|
||||
self.level_at_new_list + ilevel,
|
||||
)
|
||||
self.listIter = 0
|
||||
|
||||
@ -1033,21 +1053,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
if is_numbered:
|
||||
enum_marker = str(self.listIter) + "."
|
||||
is_numbered = True
|
||||
new_parent = self._create_or_reuse_parent(
|
||||
doc=doc,
|
||||
prev_parent=self.parents[level - 1],
|
||||
paragraph_elements=elements,
|
||||
)
|
||||
for text, format, hyperlink in elements:
|
||||
# Add the list item to the parent group
|
||||
doc.add_list_item(
|
||||
marker=enum_marker,
|
||||
enumerated=is_numbered,
|
||||
parent=new_parent,
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
self._add_formatted_list_item(
|
||||
doc, elements, enum_marker, is_numbered, level - 1
|
||||
)
|
||||
|
||||
return
|
||||
|
||||
def _handle_tables(
|
||||
|
@ -8,7 +8,13 @@ from typing import TYPE_CHECKING, List, Optional, Union
|
||||
import pypdfium2 as pdfium
|
||||
import pypdfium2.raw as pdfium_c
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
||||
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
|
||||
from docling_core.types.doc.page import (
|
||||
BoundingRectangle,
|
||||
PdfPageBoundaryType,
|
||||
PdfPageGeometry,
|
||||
SegmentedPdfPage,
|
||||
TextCell,
|
||||
)
|
||||
from PIL import Image, ImageDraw
|
||||
from pypdfium2 import PdfTextPage
|
||||
from pypdfium2._helpers.misc import PdfiumError
|
||||
@ -16,6 +22,76 @@ from pypdfium2._helpers.misc import PdfiumError
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||
from docling.utils.locks import pypdfium2_lock
|
||||
|
||||
|
||||
def get_pdf_page_geometry(
|
||||
ppage: pdfium.PdfPage,
|
||||
angle: float = 0.0,
|
||||
boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX,
|
||||
) -> PdfPageGeometry:
|
||||
"""
|
||||
Create PdfPageGeometry from a pypdfium2 PdfPage object.
|
||||
|
||||
Args:
|
||||
ppage: pypdfium2 PdfPage object
|
||||
angle: Page rotation angle in degrees (default: 0.0)
|
||||
boundary_type: The boundary type for the page (default: CROP_BOX)
|
||||
|
||||
Returns:
|
||||
PdfPageGeometry with all the different bounding boxes properly set
|
||||
"""
|
||||
with pypdfium2_lock:
|
||||
# Get the main bounding box (intersection of crop_box and media_box)
|
||||
bbox_tuple = ppage.get_bbox()
|
||||
bbox = BoundingBox.from_tuple(bbox_tuple, CoordOrigin.BOTTOMLEFT)
|
||||
|
||||
# Get all the different page boxes from pypdfium2
|
||||
media_box_tuple = ppage.get_mediabox()
|
||||
crop_box_tuple = ppage.get_cropbox()
|
||||
art_box_tuple = ppage.get_artbox()
|
||||
bleed_box_tuple = ppage.get_bleedbox()
|
||||
trim_box_tuple = ppage.get_trimbox()
|
||||
|
||||
# Convert to BoundingBox objects using existing from_tuple method
|
||||
# pypdfium2 returns (x0, y0, x1, y1) in PDF coordinate system (bottom-left origin)
|
||||
# Use bbox as fallback when specific box types are not defined
|
||||
media_bbox = (
|
||||
BoundingBox.from_tuple(media_box_tuple, CoordOrigin.BOTTOMLEFT)
|
||||
if media_box_tuple
|
||||
else bbox
|
||||
)
|
||||
crop_bbox = (
|
||||
BoundingBox.from_tuple(crop_box_tuple, CoordOrigin.BOTTOMLEFT)
|
||||
if crop_box_tuple
|
||||
else bbox
|
||||
)
|
||||
art_bbox = (
|
||||
BoundingBox.from_tuple(art_box_tuple, CoordOrigin.BOTTOMLEFT)
|
||||
if art_box_tuple
|
||||
else bbox
|
||||
)
|
||||
bleed_bbox = (
|
||||
BoundingBox.from_tuple(bleed_box_tuple, CoordOrigin.BOTTOMLEFT)
|
||||
if bleed_box_tuple
|
||||
else bbox
|
||||
)
|
||||
trim_bbox = (
|
||||
BoundingBox.from_tuple(trim_box_tuple, CoordOrigin.BOTTOMLEFT)
|
||||
if trim_box_tuple
|
||||
else bbox
|
||||
)
|
||||
|
||||
return PdfPageGeometry(
|
||||
angle=angle,
|
||||
rect=BoundingRectangle.from_bounding_box(bbox),
|
||||
boundary_type=boundary_type,
|
||||
art_bbox=art_bbox,
|
||||
bleed_bbox=bleed_bbox,
|
||||
crop_bbox=crop_bbox,
|
||||
media_bbox=media_bbox,
|
||||
trim_bbox=trim_bbox,
|
||||
)
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
@ -41,38 +117,8 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
||||
def is_valid(self) -> bool:
|
||||
return self.valid
|
||||
|
||||
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
||||
AREA_THRESHOLD = 0 # 32 * 32
|
||||
page_size = self.get_size()
|
||||
with pypdfium2_lock:
|
||||
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
|
||||
pos = obj.get_pos()
|
||||
cropbox = BoundingBox.from_tuple(
|
||||
pos, origin=CoordOrigin.BOTTOMLEFT
|
||||
).to_top_left_origin(page_height=page_size.height)
|
||||
|
||||
if cropbox.area() > AREA_THRESHOLD:
|
||||
cropbox = cropbox.scaled(scale=scale)
|
||||
|
||||
yield cropbox
|
||||
|
||||
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
||||
with pypdfium2_lock:
|
||||
if not self.text_page:
|
||||
self.text_page = self._ppage.get_textpage()
|
||||
|
||||
if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
|
||||
bbox = bbox.to_bottom_left_origin(self.get_size().height)
|
||||
|
||||
with pypdfium2_lock:
|
||||
text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
|
||||
|
||||
return text_piece
|
||||
|
||||
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
|
||||
return None
|
||||
|
||||
def get_text_cells(self) -> Iterable[TextCell]:
|
||||
def _compute_text_cells(self) -> List[TextCell]:
|
||||
"""Compute text cells from pypdfium."""
|
||||
with pypdfium2_lock:
|
||||
if not self.text_page:
|
||||
self.text_page = self._ppage.get_textpage()
|
||||
@ -203,30 +249,58 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
||||
|
||||
return merged_cells
|
||||
|
||||
def draw_clusters_and_cells():
|
||||
image = (
|
||||
self.get_page_image()
|
||||
) # make new image to avoid drawing on the saved ones
|
||||
draw = ImageDraw.Draw(image)
|
||||
for c in cells:
|
||||
x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
|
||||
cell_color = (
|
||||
random.randint(30, 140),
|
||||
random.randint(30, 140),
|
||||
random.randint(30, 140),
|
||||
return merge_horizontal_cells(cells)
|
||||
|
||||
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
||||
AREA_THRESHOLD = 0 # 32 * 32
|
||||
page_size = self.get_size()
|
||||
with pypdfium2_lock:
|
||||
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
|
||||
pos = obj.get_pos()
|
||||
cropbox = BoundingBox.from_tuple(
|
||||
pos, origin=CoordOrigin.BOTTOMLEFT
|
||||
).to_top_left_origin(page_height=page_size.height)
|
||||
|
||||
if cropbox.area() > AREA_THRESHOLD:
|
||||
cropbox = cropbox.scaled(scale=scale)
|
||||
|
||||
yield cropbox
|
||||
|
||||
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
||||
with pypdfium2_lock:
|
||||
if not self.text_page:
|
||||
self.text_page = self._ppage.get_textpage()
|
||||
|
||||
if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
|
||||
bbox = bbox.to_bottom_left_origin(self.get_size().height)
|
||||
|
||||
with pypdfium2_lock:
|
||||
text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
|
||||
|
||||
return text_piece
|
||||
|
||||
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
|
||||
if not self.valid:
|
||||
return None
|
||||
|
||||
text_cells = self._compute_text_cells()
|
||||
|
||||
# Get the PDF page geometry from pypdfium2
|
||||
dimension = get_pdf_page_geometry(self._ppage)
|
||||
|
||||
# Create SegmentedPdfPage
|
||||
return SegmentedPdfPage(
|
||||
dimension=dimension,
|
||||
textline_cells=text_cells,
|
||||
char_cells=[],
|
||||
word_cells=[],
|
||||
has_textlines=len(text_cells) > 0,
|
||||
has_words=False,
|
||||
has_chars=False,
|
||||
)
|
||||
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
||||
image.show()
|
||||
|
||||
# before merge:
|
||||
# draw_clusters_and_cells()
|
||||
|
||||
cells = merge_horizontal_cells(cells)
|
||||
|
||||
# after merge:
|
||||
# draw_clusters_and_cells()
|
||||
|
||||
return cells
|
||||
def get_text_cells(self) -> Iterable[TextCell]:
|
||||
return self._compute_text_cells()
|
||||
|
||||
def get_page_image(
|
||||
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
||||
|
@ -235,7 +235,6 @@ class Page(BaseModel):
|
||||
page_no: int
|
||||
# page_hash: Optional[str] = None
|
||||
size: Optional[Size] = None
|
||||
cells: List[TextCell] = []
|
||||
parsed_page: Optional[SegmentedPdfPage] = None
|
||||
predictions: PagePredictions = PagePredictions()
|
||||
assembled: Optional[AssembledUnit] = None
|
||||
@ -248,12 +247,27 @@ class Page(BaseModel):
|
||||
float, Image
|
||||
] = {} # Cache of images in different scales. By default it is cleared during assembling.
|
||||
|
||||
@property
|
||||
def cells(self) -> List[TextCell]:
|
||||
"""Return text cells as a read-only view of parsed_page.textline_cells."""
|
||||
if self.parsed_page is not None:
|
||||
return self.parsed_page.textline_cells
|
||||
else:
|
||||
return []
|
||||
|
||||
def get_image(
|
||||
self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
|
||||
self,
|
||||
scale: float = 1.0,
|
||||
max_size: Optional[int] = None,
|
||||
cropbox: Optional[BoundingBox] = None,
|
||||
) -> Optional[Image]:
|
||||
if self._backend is None:
|
||||
return self._image_cache.get(scale, None)
|
||||
|
||||
if max_size:
|
||||
assert self.size is not None
|
||||
scale = min(scale, max_size / max(self.size.as_tuple()))
|
||||
|
||||
if scale not in self._image_cache:
|
||||
if cropbox is None:
|
||||
self._image_cache[scale] = self._backend.get_page_image(scale=scale)
|
||||
|
@ -302,7 +302,9 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
|
||||
),
|
||||
)
|
||||
|
||||
generate_parsed_pages: bool = False
|
||||
generate_parsed_pages: Literal[True] = (
|
||||
True # Always True since parsed_page is now mandatory
|
||||
)
|
||||
|
||||
|
||||
class ProcessingPipeline(str, Enum):
|
||||
|
@ -10,6 +10,8 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
|
||||
class BaseVlmOptions(BaseModel):
|
||||
kind: str
|
||||
prompt: str
|
||||
scale: float = 2.0
|
||||
max_size: Optional[int] = None
|
||||
|
||||
|
||||
class ResponseFormat(str, Enum):
|
||||
@ -49,8 +51,6 @@ class InlineVlmOptions(BaseVlmOptions):
|
||||
AcceleratorDevice.MPS,
|
||||
]
|
||||
|
||||
scale: float = 2.0
|
||||
|
||||
temperature: float = 0.0
|
||||
stop_strings: List[str] = []
|
||||
extra_generation_config: Dict[str, Any] = {}
|
||||
@ -76,7 +76,6 @@ class ApiVlmOptions(BaseVlmOptions):
|
||||
) # Default to ollama
|
||||
headers: Dict[str, str] = {}
|
||||
params: Dict[str, Any] = {}
|
||||
scale: float = 2.0
|
||||
timeout: float = 60
|
||||
concurrency: int = 1
|
||||
response_format: ResponseFormat
|
||||
|
@ -48,7 +48,9 @@ class ApiVlmModel(BasePageModel):
|
||||
with TimeRecorder(conv_res, "vlm"):
|
||||
assert page.size is not None
|
||||
|
||||
hi_res_image = page.get_image(scale=self.vlm_options.scale)
|
||||
hi_res_image = page.get_image(
|
||||
scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
|
||||
)
|
||||
assert hi_res_image is not None
|
||||
if hi_res_image:
|
||||
if hi_res_image.mode != "RGB":
|
||||
|
@ -86,7 +86,7 @@ class BaseItemAndImageEnrichmentModel(
|
||||
coord_origin=bbox.coord_origin,
|
||||
)
|
||||
|
||||
page_ix = element_prov.page_no - 1
|
||||
page_ix = element_prov.page_no - conv_res.pages[0].page_no - 1
|
||||
cropped_image = conv_res.pages[page_ix].get_image(
|
||||
scale=self.images_scale, cropbox=expanded_bbox
|
||||
)
|
||||
|
@ -7,6 +7,7 @@ from typing import List, Optional, Type
|
||||
|
||||
import numpy as np
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import TextCell
|
||||
from PIL import Image, ImageDraw
|
||||
from rtree import index
|
||||
from scipy.ndimage import binary_dilation, find_objects, label
|
||||
@ -107,7 +108,9 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
|
||||
return []
|
||||
|
||||
# Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
|
||||
def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
|
||||
def _filter_ocr_cells(
|
||||
self, ocr_cells: List[TextCell], programmatic_cells: List[TextCell]
|
||||
) -> List[TextCell]:
|
||||
# Create R-tree index for programmatic cells
|
||||
p = index.Property()
|
||||
p.dimension = 2
|
||||
@ -130,19 +133,38 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
|
||||
]
|
||||
return filtered_ocr_cells
|
||||
|
||||
def post_process_cells(self, ocr_cells, programmatic_cells):
|
||||
def post_process_cells(self, ocr_cells: List[TextCell], page: Page) -> None:
|
||||
r"""
|
||||
Post-process the ocr and programmatic cells and return the final list of of cells
|
||||
Post-process the OCR cells and update the page object.
|
||||
Updates parsed_page.textline_cells directly since page.cells is now read-only.
|
||||
"""
|
||||
if self.options.force_full_page_ocr:
|
||||
# If a full page OCR is forced, use only the OCR cells
|
||||
cells = ocr_cells
|
||||
return cells
|
||||
# Get existing cells from the read-only property
|
||||
existing_cells = page.cells
|
||||
|
||||
## Remove OCR cells which overlap with programmatic cells.
|
||||
filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, programmatic_cells)
|
||||
programmatic_cells.extend(filtered_ocr_cells)
|
||||
return programmatic_cells
|
||||
# Combine existing and OCR cells with overlap filtering
|
||||
final_cells = self._combine_cells(existing_cells, ocr_cells)
|
||||
|
||||
assert page.parsed_page is not None
|
||||
|
||||
# Update parsed_page.textline_cells directly
|
||||
page.parsed_page.textline_cells = final_cells
|
||||
page.parsed_page.has_lines = len(final_cells) > 0
|
||||
|
||||
def _combine_cells(
|
||||
self, existing_cells: List[TextCell], ocr_cells: List[TextCell]
|
||||
) -> List[TextCell]:
|
||||
"""Combine existing and OCR cells with filtering and re-indexing."""
|
||||
if self.options.force_full_page_ocr:
|
||||
combined = ocr_cells
|
||||
else:
|
||||
filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, existing_cells)
|
||||
combined = list(existing_cells) + filtered_ocr_cells
|
||||
|
||||
# Re-index in-place
|
||||
for i, cell in enumerate(combined):
|
||||
cell.index = i
|
||||
|
||||
return combined
|
||||
|
||||
def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
|
||||
image = copy.deepcopy(page.image)
|
||||
|
@ -177,7 +177,7 @@ class EasyOcrModel(BaseOcrModel):
|
||||
all_ocr_cells.extend(cells)
|
||||
|
||||
# Post-process the cells
|
||||
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
|
||||
self.post_process_cells(all_ocr_cells, page)
|
||||
|
||||
# DEBUG code:
|
||||
if settings.debug.visualize_ocr:
|
||||
|
@ -176,9 +176,9 @@ class LayoutModel(BasePageModel):
|
||||
# Apply postprocessing
|
||||
|
||||
processed_clusters, processed_cells = LayoutPostprocessor(
|
||||
page.cells, clusters, page.size
|
||||
page, clusters
|
||||
).postprocess()
|
||||
# processed_clusters, processed_cells = clusters, page.cells
|
||||
# Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.filterwarnings(
|
||||
@ -198,7 +198,6 @@ class LayoutModel(BasePageModel):
|
||||
)
|
||||
)
|
||||
|
||||
page.cells = processed_cells
|
||||
page.predictions.layout = LayoutPrediction(
|
||||
clusters=processed_clusters
|
||||
)
|
||||
|
@ -132,7 +132,7 @@ class OcrMacModel(BaseOcrModel):
|
||||
all_ocr_cells.extend(cells)
|
||||
|
||||
# Post-process the cells
|
||||
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
|
||||
self.post_process_cells(all_ocr_cells, page)
|
||||
|
||||
# DEBUG code:
|
||||
if settings.debug.visualize_ocr:
|
||||
|
@ -2,7 +2,7 @@ import re
|
||||
import warnings
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from typing import Literal, Optional
|
||||
|
||||
import numpy as np
|
||||
from PIL import ImageDraw
|
||||
@ -17,7 +17,6 @@ from docling.utils.profiling import TimeRecorder
|
||||
|
||||
class PagePreprocessingOptions(BaseModel):
|
||||
images_scale: Optional[float]
|
||||
create_parsed_page: bool
|
||||
|
||||
|
||||
class PagePreprocessingModel(BasePageModel):
|
||||
@ -66,10 +65,8 @@ class PagePreprocessingModel(BasePageModel):
|
||||
def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
|
||||
assert page._backend is not None
|
||||
|
||||
page.cells = list(page._backend.get_text_cells())
|
||||
|
||||
if self.options.create_parsed_page:
|
||||
page.parsed_page = page._backend.get_segmented_page()
|
||||
assert page.parsed_page is not None
|
||||
|
||||
# Rate the text quality from the PDF parser, and aggregate on page
|
||||
text_scores = []
|
||||
|
@ -134,7 +134,7 @@ class RapidOcrModel(BaseOcrModel):
|
||||
all_ocr_cells.extend(cells)
|
||||
|
||||
# Post-process the cells
|
||||
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
|
||||
self.post_process_cells(all_ocr_cells, page)
|
||||
|
||||
# DEBUG code:
|
||||
if settings.debug.visualize_ocr:
|
||||
|
@ -124,7 +124,7 @@ class ReadingOrderModel:
|
||||
page_no = page.page_no + 1
|
||||
size = page.size
|
||||
|
||||
assert size is not None
|
||||
assert size is not None, "Page size is not initialized."
|
||||
|
||||
out_doc.add_page(page_no=page_no, size=size)
|
||||
|
||||
|
@ -306,7 +306,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
all_ocr_cells.append(cell)
|
||||
|
||||
# Post-process the cells
|
||||
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
|
||||
self.post_process_cells(all_ocr_cells, page)
|
||||
|
||||
# DEBUG code:
|
||||
if settings.debug.visualize_ocr:
|
||||
|
@ -235,7 +235,7 @@ class TesseractOcrModel(BaseOcrModel):
|
||||
all_ocr_cells.extend(cells)
|
||||
|
||||
# Post-process the cells
|
||||
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
|
||||
self.post_process_cells(all_ocr_cells, page)
|
||||
|
||||
# DEBUG code:
|
||||
if settings.debug.visualize_ocr:
|
||||
|
@ -123,7 +123,9 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
|
||||
with TimeRecorder(conv_res, "vlm"):
|
||||
assert page.size is not None
|
||||
|
||||
hi_res_image = page.get_image(scale=self.vlm_options.scale)
|
||||
hi_res_image = page.get_image(
|
||||
scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
|
||||
)
|
||||
|
||||
# Define prompt structure
|
||||
prompt = self.formulate_prompt()
|
||||
|
@ -73,7 +73,9 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
|
||||
with TimeRecorder(conv_res, f"vlm-mlx-{self.vlm_options.repo_id}"):
|
||||
assert page.size is not None
|
||||
|
||||
hi_res_image = page.get_image(scale=self.vlm_options.scale)
|
||||
hi_res_image = page.get_image(
|
||||
scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
|
||||
)
|
||||
if hi_res_image is not None:
|
||||
im_width, im_height = hi_res_image.size
|
||||
|
||||
|
@ -193,6 +193,17 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
||||
)
|
||||
raise e
|
||||
|
||||
# Filter out uninitialized pages (those with size=None) that may remain
|
||||
# after timeout or processing failures to prevent assertion errors downstream
|
||||
initial_page_count = len(conv_res.pages)
|
||||
conv_res.pages = [page for page in conv_res.pages if page.size is not None]
|
||||
|
||||
if len(conv_res.pages) < initial_page_count:
|
||||
_log.info(
|
||||
f"Filtered out {initial_page_count - len(conv_res.pages)} uninitialized pages "
|
||||
f"due to timeout or processing failures"
|
||||
)
|
||||
|
||||
return conv_res
|
||||
|
||||
def _unload(self, conv_res: ConversionResult) -> ConversionResult:
|
||||
|
@ -72,7 +72,6 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
PagePreprocessingModel(
|
||||
options=PagePreprocessingOptions(
|
||||
images_scale=pipeline_options.images_scale,
|
||||
create_parsed_page=pipeline_options.generate_parsed_pages,
|
||||
)
|
||||
),
|
||||
# OCR
|
||||
|
@ -8,7 +8,7 @@ from docling_core.types.doc import DocItemLabel, Size
|
||||
from docling_core.types.doc.page import TextCell
|
||||
from rtree import index
|
||||
|
||||
from docling.datamodel.base_models import BoundingBox, Cluster
|
||||
from docling.datamodel.base_models import BoundingBox, Cluster, Page
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@ -194,11 +194,11 @@ class LayoutPostprocessor:
|
||||
DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
|
||||
}
|
||||
|
||||
def __init__(self, cells: List[TextCell], clusters: List[Cluster], page_size: Size):
|
||||
"""Initialize processor with cells and clusters."""
|
||||
"""Initialize processor with cells and spatial indices."""
|
||||
self.cells = cells
|
||||
self.page_size = page_size
|
||||
def __init__(self, page: Page, clusters: List[Cluster]) -> None:
|
||||
"""Initialize processor with page and clusters."""
|
||||
self.cells = page.cells
|
||||
self.page = page
|
||||
self.page_size = page.size
|
||||
self.all_clusters = clusters
|
||||
self.regular_clusters = [
|
||||
c for c in clusters if c.label not in self.SPECIAL_TYPES
|
||||
@ -240,6 +240,10 @@ class LayoutPostprocessor:
|
||||
for child in cluster.children:
|
||||
child.cells = self._sort_cells(child.cells)
|
||||
|
||||
assert self.page.parsed_page is not None
|
||||
self.page.parsed_page.textline_cells = self.cells
|
||||
self.page.parsed_page.has_lines = len(self.cells) > 0
|
||||
|
||||
return final_clusters, self.cells
|
||||
|
||||
def _process_regular_clusters(self) -> List[Cluster]:
|
||||
@ -301,6 +305,7 @@ class LayoutPostprocessor:
|
||||
special_clusters = self._handle_cross_type_overlaps(special_clusters)
|
||||
|
||||
# Calculate page area from known page size
|
||||
assert self.page_size is not None
|
||||
page_area = self.page_size.width * self.page_size.height
|
||||
if page_area > 0:
|
||||
# Filter out full-page pictures
|
||||
|
11
docs/examples/batch_convert.py
vendored
11
docs/examples/batch_convert.py
vendored
@ -121,14 +121,15 @@ def export_documents(
|
||||
def main():
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
data_folder = Path(__file__).parent / "../../tests/data"
|
||||
input_doc_paths = [
|
||||
Path("./tests/data/pdf/2206.01062.pdf"),
|
||||
Path("./tests/data/pdf/2203.01017v2.pdf"),
|
||||
Path("./tests/data/pdf/2305.03393v1.pdf"),
|
||||
Path("./tests/data/pdf/redp5110_sampled.pdf"),
|
||||
data_folder / "pdf/2206.01062.pdf",
|
||||
data_folder / "pdf/2203.01017v2.pdf",
|
||||
data_folder / "pdf/2305.03393v1.pdf",
|
||||
data_folder / "pdf/redp5110_sampled.pdf",
|
||||
]
|
||||
|
||||
# buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
|
||||
# buf = BytesIO((data_folder / "pdf/2206.01062.pdf").open("rb").read())
|
||||
# docs = [DocumentStream(name="my_doc.pdf", stream=buf)]
|
||||
# input = DocumentConversionInput.from_streams(docs)
|
||||
|
||||
|
3
docs/examples/custom_convert.py
vendored
3
docs/examples/custom_convert.py
vendored
@ -16,7 +16,8 @@ _log = logging.getLogger(__name__)
|
||||
def main():
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
|
||||
data_folder = Path(__file__).parent / "../../tests/data"
|
||||
input_doc_path = data_folder / "pdf/2206.01062.pdf"
|
||||
|
||||
###########################################################################
|
||||
|
||||
|
@ -71,7 +71,8 @@ class ExampleFormulaUnderstandingPipeline(StandardPdfPipeline):
|
||||
def main():
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
input_doc_path = Path("./tests/data/pdf/2203.01017v2.pdf")
|
||||
data_folder = Path(__file__).parent / "../../tests/data"
|
||||
input_doc_path = data_folder / "pdf/2203.01017v2.pdf"
|
||||
|
||||
pipeline_options = ExampleFormulaUnderstandingPipelineOptions()
|
||||
pipeline_options.do_formula_understanding = True
|
||||
|
3
docs/examples/develop_picture_enrichment.py
vendored
3
docs/examples/develop_picture_enrichment.py
vendored
@ -76,7 +76,8 @@ class ExamplePictureClassifierPipeline(StandardPdfPipeline):
|
||||
def main():
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
|
||||
data_folder = Path(__file__).parent / "../../tests/data"
|
||||
input_doc_path = data_folder / "pdf/2206.01062.pdf"
|
||||
|
||||
pipeline_options = ExamplePictureClassifierPipelineOptions()
|
||||
pipeline_options.images_scale = 2.0
|
||||
|
3
docs/examples/export_figures.py
vendored
3
docs/examples/export_figures.py
vendored
@ -16,7 +16,8 @@ IMAGE_RESOLUTION_SCALE = 2.0
|
||||
def main():
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
|
||||
data_folder = Path(__file__).parent / "../../tests/data"
|
||||
input_doc_path = data_folder / "pdf/2206.01062.pdf"
|
||||
output_dir = Path("scratch")
|
||||
|
||||
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
|
||||
|
3
docs/examples/export_multimodal.py
vendored
3
docs/examples/export_multimodal.py
vendored
@ -19,7 +19,8 @@ IMAGE_RESOLUTION_SCALE = 2.0
|
||||
def main():
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
|
||||
data_folder = Path(__file__).parent / "../../tests/data"
|
||||
input_doc_path = data_folder / "pdf/2206.01062.pdf"
|
||||
output_dir = Path("scratch")
|
||||
|
||||
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
|
||||
|
3
docs/examples/export_tables.py
vendored
3
docs/examples/export_tables.py
vendored
@ -12,7 +12,8 @@ _log = logging.getLogger(__name__)
|
||||
def main():
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
|
||||
data_folder = Path(__file__).parent / "../../tests/data"
|
||||
input_doc_path = data_folder / "pdf/2206.01062.pdf"
|
||||
output_dir = Path("scratch")
|
||||
|
||||
doc_converter = DocumentConverter()
|
||||
|
5
docs/examples/full_page_ocr.py
vendored
5
docs/examples/full_page_ocr.py
vendored
@ -9,7 +9,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
|
||||
def main():
|
||||
input_doc = Path("./tests/data/pdf/2206.01062.pdf")
|
||||
data_folder = Path(__file__).parent / "../../tests/data"
|
||||
input_doc_path = data_folder / "pdf/2206.01062.pdf"
|
||||
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
pipeline_options.do_ocr = True
|
||||
@ -32,7 +33,7 @@ def main():
|
||||
}
|
||||
)
|
||||
|
||||
doc = converter.convert(input_doc).document
|
||||
doc = converter.convert(input_doc_path).document
|
||||
md = doc.export_to_markdown()
|
||||
print(md)
|
||||
|
||||
|
3
docs/examples/pictures_description_api.py
vendored
3
docs/examples/pictures_description_api.py
vendored
@ -96,7 +96,8 @@ def watsonx_vlm_options():
|
||||
def main():
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
|
||||
data_folder = Path(__file__).parent / "../../tests/data"
|
||||
input_doc_path = data_folder / "pdf/2206.01062.pdf"
|
||||
|
||||
pipeline_options = PdfPipelineOptions(
|
||||
enable_remote_services=True # <-- this is required!
|
||||
|
5
docs/examples/run_with_accelerator.py
vendored
5
docs/examples/run_with_accelerator.py
vendored
@ -10,7 +10,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
|
||||
def main():
|
||||
input_doc = Path("./tests/data/pdf/2206.01062.pdf")
|
||||
data_folder = Path(__file__).parent / "../../tests/data"
|
||||
input_doc_path = data_folder / "pdf/2206.01062.pdf"
|
||||
|
||||
# Explicitly set the accelerator
|
||||
# accelerator_options = AcceleratorOptions(
|
||||
@ -47,7 +48,7 @@ def main():
|
||||
settings.debug.profile_pipeline_timings = True
|
||||
|
||||
# Convert the document
|
||||
conversion_result = converter.convert(input_doc)
|
||||
conversion_result = converter.convert(input_doc_path)
|
||||
doc = conversion_result.document
|
||||
|
||||
# List with total time per document
|
||||
|
5
docs/examples/tesseract_lang_detection.py
vendored
5
docs/examples/tesseract_lang_detection.py
vendored
@ -9,7 +9,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
|
||||
def main():
|
||||
input_doc = Path("./tests/data/pdf/2206.01062.pdf")
|
||||
data_folder = Path(__file__).parent / "../../tests/data"
|
||||
input_doc_path = data_folder / "pdf/2206.01062.pdf"
|
||||
|
||||
# Set lang=["auto"] with a tesseract OCR engine: TesseractOcrOptions, TesseractCliOcrOptions
|
||||
# ocr_options = TesseractOcrOptions(lang=["auto"])
|
||||
@ -27,7 +28,7 @@ def main():
|
||||
}
|
||||
)
|
||||
|
||||
doc = converter.convert(input_doc).document
|
||||
doc = converter.convert(input_doc_path).document
|
||||
md = doc.export_to_markdown()
|
||||
print(md)
|
||||
|
||||
|
3
docs/examples/translate.py
vendored
3
docs/examples/translate.py
vendored
@ -30,7 +30,8 @@ def translate(text: str, src: str = "en", dest: str = "de"):
|
||||
def main():
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
|
||||
data_folder = Path(__file__).parent / "../../tests/data"
|
||||
input_doc_path = data_folder / "pdf/2206.01062.pdf"
|
||||
output_dir = Path("scratch")
|
||||
|
||||
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
|
||||
|
4
docs/examples/vlm_pipeline_api_model.py
vendored
4
docs/examples/vlm_pipeline_api_model.py
vendored
@ -95,8 +95,8 @@ def watsonx_vlm_options(model: str, prompt: str):
|
||||
def main():
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
# input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
|
||||
input_doc_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf")
|
||||
data_folder = Path(__file__).parent / "../../tests/data"
|
||||
input_doc_path = data_folder / "pdf/2305.03393v1-pg9.pdf"
|
||||
|
||||
pipeline_options = VlmPipelineOptions(
|
||||
enable_remote_services=True # <-- this is required!
|
||||
|
@ -1,6 +1,6 @@
|
||||
[project]
|
||||
name = "docling"
|
||||
version = "2.36.1" # DO NOT EDIT, updated automatically
|
||||
version = "2.37.0" # DO NOT EDIT, updated automatically
|
||||
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
||||
license = "MIT"
|
||||
keywords = [
|
||||
|
29
tests/data/asciidoc/test_03.asciidoc
vendored
Normal file
29
tests/data/asciidoc/test_03.asciidoc
vendored
Normal file
@ -0,0 +1,29 @@
|
||||
:_mod-docs-content-type: PROCEDURE
|
||||
:experimental:
|
||||
|
||||
[id="renaming-a-bookmark_{context}"]
|
||||
= Renaming a bookmark
|
||||
|
||||
You can rename a bookmark to distinguish it from other bookmarks. If you have bookmarks to several folders that all share the same name, you can tell the bookmarks apart if you rename them.
|
||||
|
||||
Renaming the bookmark does not rename the folder.
|
||||
|
||||
.Procedure
|
||||
|
||||
. Right-click the bookmark in the side bar.
|
||||
|
||||
. Select *Rename…*.
|
||||
+
|
||||
image::rename-bookmark-menu.png[Rename bookmark menu]
|
||||
|
||||
. In the *Name* field, enter the new name for the bookmark.
|
||||
+
|
||||
image::rename-bookmark-text.png[Bookmark name field]
|
||||
|
||||
. Click btn:[Rename].
|
||||
|
||||
.Verification
|
||||
|
||||
* Check that the side bar lists the bookmark under the new name.
|
||||
+
|
||||
image::renamed-bookmark.png[Renamed bookmark]
|
BIN
tests/data/docx/word_image_anchors.docx
vendored
Normal file
BIN
tests/data/docx/word_image_anchors.docx
vendored
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@ -5,7 +5,61 @@
|
||||
"width": 612.0,
|
||||
"height": 792.0
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 612.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 612.0,
|
||||
"r_y2": 792.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 792.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -10607,7 +10661,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
@ -83405,7 +83464,61 @@
|
||||
"width": 612.0,
|
||||
"height": 792.0
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 612.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 612.0,
|
||||
"r_y2": 792.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 792.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -87282,7 +87395,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
@ -100075,7 +100193,61 @@
|
||||
"width": 612.0,
|
||||
"height": 792.0
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 612.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 612.0,
|
||||
"r_y2": 792.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 792.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -103502,7 +103674,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
@ -116054,7 +116231,61 @@
|
||||
"width": 612.0,
|
||||
"height": 792.0
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 612.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 612.0,
|
||||
"r_y2": 792.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 792.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -122106,7 +122337,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
@ -179742,7 +179978,61 @@
|
||||
"width": 612.0,
|
||||
"height": 792.0
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 612.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 612.0,
|
||||
"r_y2": 792.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 792.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -182669,7 +182959,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
@ -193709,7 +194004,61 @@
|
||||
"width": 612.0,
|
||||
"height": 792.0
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 612.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 612.0,
|
||||
"r_y2": 792.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 792.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -198736,7 +199085,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
@ -236872,7 +237226,61 @@
|
||||
"width": 612.0,
|
||||
"height": 792.0
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 612.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 612.0,
|
||||
"r_y2": 792.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 792.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -242249,7 +242657,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
@ -289112,7 +289525,61 @@
|
||||
"width": 612.0,
|
||||
"height": 792.0
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 612.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 612.0,
|
||||
"r_y2": 792.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 792.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -294464,7 +294931,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
@ -327043,7 +327515,61 @@
|
||||
"width": 612.0,
|
||||
"height": 792.0
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 612.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 612.0,
|
||||
"r_y2": 792.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 792.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -329120,7 +329646,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
@ -5,7 +5,61 @@
|
||||
"width": 612.0,
|
||||
"height": 792.0
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 612.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 612.0,
|
||||
"r_y2": 792.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 792.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -2632,7 +2686,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -5,7 +5,61 @@
|
||||
"width": 594.0,
|
||||
"height": 774.0
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 594.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 594.0,
|
||||
"r_y2": 774.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 774.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 774.0,
|
||||
"r": 594.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 774.0,
|
||||
"r": 594.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 774.0,
|
||||
"r": 594.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 774.0,
|
||||
"r": 594.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 774.0,
|
||||
"r": 594.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -2457,7 +2511,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
@ -5,7 +5,61 @@
|
||||
"width": 612.0,
|
||||
"height": 792.0
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 612.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 612.0,
|
||||
"r_y2": 792.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 792.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -1032,7 +1086,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
@ -4591,7 +4650,61 @@
|
||||
"width": 595.2760009765625,
|
||||
"height": 841.8900146484375
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 595.2760009765625,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 595.2760009765625,
|
||||
"r_y2": 841.8900146484375,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 841.8900146484375,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.8900146484375,
|
||||
"r": 595.2760009765625,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.8900146484375,
|
||||
"r": 595.2760009765625,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.8900146484375,
|
||||
"r": 595.2760009765625,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.8900146484375,
|
||||
"r": 595.2760009765625,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.8900146484375,
|
||||
"r": 595.2760009765625,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -5768,7 +5881,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
@ -5,7 +5,61 @@
|
||||
"width": 595.2000122070312,
|
||||
"height": 841.9199829101562
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 595.2000122070312,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 595.2000122070312,
|
||||
"r_y2": 841.9199829101562,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 841.9199829101562,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -1057,7 +1111,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
@ -5032,7 +5091,61 @@
|
||||
"width": 595.2000122070312,
|
||||
"height": 841.9199829101562
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 595.2000122070312,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 595.2000122070312,
|
||||
"r_y2": 841.9199829101562,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 841.9199829101562,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -5734,7 +5847,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
@ -8374,7 +8492,61 @@
|
||||
"width": 595.2000122070312,
|
||||
"height": 841.9199829101562
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 595.2000122070312,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 595.2000122070312,
|
||||
"r_y2": 841.9199829101562,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 841.9199829101562,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -9676,7 +9848,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
@ -14401,7 +14578,61 @@
|
||||
"width": 595.2000122070312,
|
||||
"height": 841.9199829101562
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 595.2000122070312,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 595.2000122070312,
|
||||
"r_y2": 841.9199829101562,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 841.9199829101562,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -15928,7 +16159,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
@ -21385,7 +21621,61 @@
|
||||
"width": 595.2000122070312,
|
||||
"height": 841.9199829101562
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 595.2000122070312,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 595.2000122070312,
|
||||
"r_y2": 841.9199829101562,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 841.9199829101562,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -21512,7 +21802,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
@ -5,7 +5,61 @@
|
||||
"width": 612.0,
|
||||
"height": 792.0
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 612.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 612.0,
|
||||
"r_y2": 792.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 792.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -657,7 +711,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
@ -2982,7 +3041,61 @@
|
||||
"width": 612.0,
|
||||
"height": 792.0
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 612.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 612.0,
|
||||
"r_y2": 792.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 792.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -3609,7 +3722,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -5,7 +5,61 @@
|
||||
"width": 612.0,
|
||||
"height": 792.0
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 612.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 612.0,
|
||||
"r_y2": 792.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 792.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -657,7 +711,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
@ -5,7 +5,61 @@
|
||||
"width": 595.2000122070312,
|
||||
"height": 841.9199829101562
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 595.2000122070312,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 595.2000122070312,
|
||||
"r_y2": 841.9199829101562,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 841.9199829101562,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -3982,7 +4036,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
@ -5,7 +5,61 @@
|
||||
"width": 595.3200073242188,
|
||||
"height": 842.0399780273438
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 595.3200073242188,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 595.3200073242188,
|
||||
"r_y2": 842.0399780273438,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 842.0399780273438,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 842.0399780273438,
|
||||
"r": 595.3200073242188,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 842.0399780273438,
|
||||
"r": 595.3200073242188,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 842.0399780273438,
|
||||
"r": 595.3200073242188,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 842.0399780273438,
|
||||
"r": 595.3200073242188,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 842.0399780273438,
|
||||
"r": 595.3200073242188,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -1382,7 +1436,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -5,7 +5,61 @@
|
||||
"width": 612.0,
|
||||
"height": 792.0
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 612.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 612.0,
|
||||
"r_y2": 792.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 792.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -10607,7 +10661,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
@ -83405,7 +83464,61 @@
|
||||
"width": 612.0,
|
||||
"height": 792.0
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 612.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 612.0,
|
||||
"r_y2": 792.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 792.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -87282,7 +87395,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
@ -100075,7 +100193,61 @@
|
||||
"width": 612.0,
|
||||
"height": 792.0
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 612.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 612.0,
|
||||
"r_y2": 792.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 792.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -103502,7 +103674,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
@ -116054,7 +116231,61 @@
|
||||
"width": 612.0,
|
||||
"height": 792.0
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 612.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 612.0,
|
||||
"r_y2": 792.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 792.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -122106,7 +122337,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
@ -179742,7 +179978,61 @@
|
||||
"width": 612.0,
|
||||
"height": 792.0
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 612.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 612.0,
|
||||
"r_y2": 792.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 792.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -182669,7 +182959,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
@ -193709,7 +194004,61 @@
|
||||
"width": 612.0,
|
||||
"height": 792.0
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 612.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 612.0,
|
||||
"r_y2": 792.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 792.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -198736,7 +199085,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
@ -236872,7 +237226,61 @@
|
||||
"width": 612.0,
|
||||
"height": 792.0
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 612.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 612.0,
|
||||
"r_y2": 792.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 792.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -242249,7 +242657,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
@ -289112,7 +289525,61 @@
|
||||
"width": 612.0,
|
||||
"height": 792.0
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 612.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 612.0,
|
||||
"r_y2": 792.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 792.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -294464,7 +294931,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
@ -327043,7 +327515,61 @@
|
||||
"width": 612.0,
|
||||
"height": 792.0
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 612.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 612.0,
|
||||
"r_y2": 792.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 792.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -329120,7 +329646,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
@ -5,7 +5,61 @@
|
||||
"width": 612.0,
|
||||
"height": 792.0
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 612.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 612.0,
|
||||
"r_y2": 792.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 792.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -2632,7 +2686,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -5,7 +5,61 @@
|
||||
"width": 594.0,
|
||||
"height": 774.0
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 594.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 594.0,
|
||||
"r_y2": 774.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 774.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 774.0,
|
||||
"r": 594.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 774.0,
|
||||
"r": 594.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 774.0,
|
||||
"r": 594.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 774.0,
|
||||
"r": 594.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 774.0,
|
||||
"r": 594.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -2457,7 +2511,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
@ -5,7 +5,61 @@
|
||||
"width": 612.0,
|
||||
"height": 792.0
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 612.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 612.0,
|
||||
"r_y2": 792.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 792.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -1032,7 +1086,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
@ -4591,7 +4650,61 @@
|
||||
"width": 595.2760009765625,
|
||||
"height": 841.8900146484375
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 595.2760009765625,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 595.2760009765625,
|
||||
"r_y2": 841.8900146484375,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 841.8900146484375,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.8900146484375,
|
||||
"r": 595.2760009765625,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.8900146484375,
|
||||
"r": 595.2760009765625,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.8900146484375,
|
||||
"r": 595.2760009765625,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.8900146484375,
|
||||
"r": 595.2760009765625,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.8900146484375,
|
||||
"r": 595.2760009765625,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -5768,7 +5881,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
20
tests/data/groundtruth/docling_v2/inline_and_formatting.md.md
vendored
Normal file
20
tests/data/groundtruth/docling_v2/inline_and_formatting.md.md
vendored
Normal file
@ -0,0 +1,20 @@
|
||||
# Contribution guideline example
|
||||
|
||||
This is simple.
|
||||
|
||||
Foo *emphasis* **strong emphasis** ***both*** .
|
||||
|
||||
Create your feature branch: `git checkout -b feature/AmazingFeature` .
|
||||
|
||||
1. Pull the [**repository**](https://github.com/docling-project/docling) .
|
||||
2. Create your feature branch ( `git checkout -b feature/AmazingFeature` )
|
||||
3. Commit your changes ( `git commit -m 'Add some AmazingFeature'` )
|
||||
4. Push to the branch ( `git push origin feature/AmazingFeature` )
|
||||
5. Open a Pull Request
|
||||
|
||||
##
|
||||
|
||||
*Second* section
|
||||
|
||||
- **First** : Lorem ipsum.
|
||||
- **Second** : Dolor `sit` amet.
|
565
tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml
vendored
Normal file
565
tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml
vendored
Normal file
@ -0,0 +1,565 @@
|
||||
body:
|
||||
children:
|
||||
- $ref: '#/texts/0'
|
||||
- $ref: '#/texts/1'
|
||||
- $ref: '#/groups/0'
|
||||
- $ref: '#/groups/1'
|
||||
- $ref: '#/groups/2'
|
||||
- $ref: '#/texts/27'
|
||||
- $ref: '#/groups/8'
|
||||
content_layer: body
|
||||
label: unspecified
|
||||
name: _root_
|
||||
self_ref: '#/body'
|
||||
form_items: []
|
||||
furniture:
|
||||
children: []
|
||||
content_layer: furniture
|
||||
label: unspecified
|
||||
name: _root_
|
||||
self_ref: '#/furniture'
|
||||
groups:
|
||||
- children:
|
||||
- $ref: '#/texts/2'
|
||||
- $ref: '#/texts/3'
|
||||
- $ref: '#/texts/4'
|
||||
- $ref: '#/texts/5'
|
||||
- $ref: '#/texts/6'
|
||||
content_layer: body
|
||||
label: inline
|
||||
name: group
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
self_ref: '#/groups/0'
|
||||
- children:
|
||||
- $ref: '#/texts/7'
|
||||
- $ref: '#/texts/8'
|
||||
- $ref: '#/texts/9'
|
||||
content_layer: body
|
||||
label: inline
|
||||
name: group
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
self_ref: '#/groups/1'
|
||||
- children:
|
||||
- $ref: '#/texts/10'
|
||||
- $ref: '#/texts/14'
|
||||
- $ref: '#/texts/18'
|
||||
- $ref: '#/texts/22'
|
||||
- $ref: '#/texts/26'
|
||||
content_layer: body
|
||||
label: ordered_list
|
||||
name: list
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
self_ref: '#/groups/2'
|
||||
- children:
|
||||
- $ref: '#/texts/11'
|
||||
- $ref: '#/texts/12'
|
||||
- $ref: '#/texts/13'
|
||||
content_layer: body
|
||||
label: inline
|
||||
name: group
|
||||
parent:
|
||||
$ref: '#/texts/10'
|
||||
self_ref: '#/groups/3'
|
||||
- children:
|
||||
- $ref: '#/texts/15'
|
||||
- $ref: '#/texts/16'
|
||||
- $ref: '#/texts/17'
|
||||
content_layer: body
|
||||
label: inline
|
||||
name: group
|
||||
parent:
|
||||
$ref: '#/texts/14'
|
||||
self_ref: '#/groups/4'
|
||||
- children:
|
||||
- $ref: '#/texts/19'
|
||||
- $ref: '#/texts/20'
|
||||
- $ref: '#/texts/21'
|
||||
content_layer: body
|
||||
label: inline
|
||||
name: group
|
||||
parent:
|
||||
$ref: '#/texts/18'
|
||||
self_ref: '#/groups/5'
|
||||
- children:
|
||||
- $ref: '#/texts/23'
|
||||
- $ref: '#/texts/24'
|
||||
- $ref: '#/texts/25'
|
||||
content_layer: body
|
||||
label: inline
|
||||
name: group
|
||||
parent:
|
||||
$ref: '#/texts/22'
|
||||
self_ref: '#/groups/6'
|
||||
- children:
|
||||
- $ref: '#/texts/28'
|
||||
- $ref: '#/texts/29'
|
||||
content_layer: body
|
||||
label: inline
|
||||
name: group
|
||||
parent:
|
||||
$ref: '#/texts/27'
|
||||
self_ref: '#/groups/7'
|
||||
- children:
|
||||
- $ref: '#/texts/30'
|
||||
- $ref: '#/texts/33'
|
||||
content_layer: body
|
||||
label: list
|
||||
name: list
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
self_ref: '#/groups/8'
|
||||
- children:
|
||||
- $ref: '#/texts/31'
|
||||
- $ref: '#/texts/32'
|
||||
content_layer: body
|
||||
label: inline
|
||||
name: group
|
||||
parent:
|
||||
$ref: '#/texts/30'
|
||||
self_ref: '#/groups/9'
|
||||
- children:
|
||||
- $ref: '#/texts/34'
|
||||
- $ref: '#/texts/35'
|
||||
- $ref: '#/texts/36'
|
||||
- $ref: '#/texts/37'
|
||||
content_layer: body
|
||||
label: inline
|
||||
name: group
|
||||
parent:
|
||||
$ref: '#/texts/33'
|
||||
self_ref: '#/groups/10'
|
||||
key_value_items: []
|
||||
name: inline_and_formatting
|
||||
origin:
|
||||
binary_hash: 9342273634728023910
|
||||
filename: inline_and_formatting.md
|
||||
mimetype: text/markdown
|
||||
pages: {}
|
||||
pictures: []
|
||||
schema_name: DoclingDocument
|
||||
tables: []
|
||||
texts:
|
||||
- children: []
|
||||
content_layer: body
|
||||
label: title
|
||||
orig: Contribution guideline example
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
prov: []
|
||||
self_ref: '#/texts/0'
|
||||
text: Contribution guideline example
|
||||
- children: []
|
||||
content_layer: body
|
||||
label: text
|
||||
orig: This is simple.
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
prov: []
|
||||
self_ref: '#/texts/1'
|
||||
text: This is simple.
|
||||
- children: []
|
||||
content_layer: body
|
||||
label: text
|
||||
orig: Foo
|
||||
parent:
|
||||
$ref: '#/groups/0'
|
||||
prov: []
|
||||
self_ref: '#/texts/2'
|
||||
text: Foo
|
||||
- children: []
|
||||
content_layer: body
|
||||
formatting:
|
||||
bold: false
|
||||
italic: true
|
||||
strikethrough: false
|
||||
underline: false
|
||||
label: text
|
||||
orig: emphasis
|
||||
parent:
|
||||
$ref: '#/groups/0'
|
||||
prov: []
|
||||
self_ref: '#/texts/3'
|
||||
text: emphasis
|
||||
- children: []
|
||||
content_layer: body
|
||||
formatting:
|
||||
bold: true
|
||||
italic: false
|
||||
strikethrough: false
|
||||
underline: false
|
||||
label: text
|
||||
orig: strong emphasis
|
||||
parent:
|
||||
$ref: '#/groups/0'
|
||||
prov: []
|
||||
self_ref: '#/texts/4'
|
||||
text: strong emphasis
|
||||
- children: []
|
||||
content_layer: body
|
||||
formatting:
|
||||
bold: true
|
||||
italic: true
|
||||
strikethrough: false
|
||||
underline: false
|
||||
label: text
|
||||
orig: both
|
||||
parent:
|
||||
$ref: '#/groups/0'
|
||||
prov: []
|
||||
self_ref: '#/texts/5'
|
||||
text: both
|
||||
- children: []
|
||||
content_layer: body
|
||||
label: text
|
||||
orig: .
|
||||
parent:
|
||||
$ref: '#/groups/0'
|
||||
prov: []
|
||||
self_ref: '#/texts/6'
|
||||
text: .
|
||||
- children: []
|
||||
content_layer: body
|
||||
label: text
|
||||
orig: 'Create your feature branch:'
|
||||
parent:
|
||||
$ref: '#/groups/1'
|
||||
prov: []
|
||||
self_ref: '#/texts/7'
|
||||
text: 'Create your feature branch:'
|
||||
- captions: []
|
||||
children: []
|
||||
code_language: unknown
|
||||
content_layer: body
|
||||
footnotes: []
|
||||
label: code
|
||||
orig: git checkout -b feature/AmazingFeature
|
||||
parent:
|
||||
$ref: '#/groups/1'
|
||||
prov: []
|
||||
references: []
|
||||
self_ref: '#/texts/8'
|
||||
text: git checkout -b feature/AmazingFeature
|
||||
- children: []
|
||||
content_layer: body
|
||||
label: text
|
||||
orig: .
|
||||
parent:
|
||||
$ref: '#/groups/1'
|
||||
prov: []
|
||||
self_ref: '#/texts/9'
|
||||
text: .
|
||||
- children:
|
||||
- $ref: '#/groups/3'
|
||||
content_layer: body
|
||||
enumerated: true
|
||||
label: list_item
|
||||
marker: '-'
|
||||
orig: ''
|
||||
parent:
|
||||
$ref: '#/groups/2'
|
||||
prov: []
|
||||
self_ref: '#/texts/10'
|
||||
text: ''
|
||||
- children: []
|
||||
content_layer: body
|
||||
label: text
|
||||
orig: Pull the
|
||||
parent:
|
||||
$ref: '#/groups/3'
|
||||
prov: []
|
||||
self_ref: '#/texts/11'
|
||||
text: Pull the
|
||||
- children: []
|
||||
content_layer: body
|
||||
formatting:
|
||||
bold: true
|
||||
italic: false
|
||||
strikethrough: false
|
||||
underline: false
|
||||
hyperlink: https://github.com/docling-project/docling
|
||||
label: text
|
||||
orig: repository
|
||||
parent:
|
||||
$ref: '#/groups/3'
|
||||
prov: []
|
||||
self_ref: '#/texts/12'
|
||||
text: repository
|
||||
- children: []
|
||||
content_layer: body
|
||||
label: text
|
||||
orig: .
|
||||
parent:
|
||||
$ref: '#/groups/3'
|
||||
prov: []
|
||||
self_ref: '#/texts/13'
|
||||
text: .
|
||||
- children:
|
||||
- $ref: '#/groups/4'
|
||||
content_layer: body
|
||||
enumerated: true
|
||||
label: list_item
|
||||
marker: '-'
|
||||
orig: ''
|
||||
parent:
|
||||
$ref: '#/groups/2'
|
||||
prov: []
|
||||
self_ref: '#/texts/14'
|
||||
text: ''
|
||||
- children: []
|
||||
content_layer: body
|
||||
label: text
|
||||
orig: Create your feature branch (
|
||||
parent:
|
||||
$ref: '#/groups/4'
|
||||
prov: []
|
||||
self_ref: '#/texts/15'
|
||||
text: Create your feature branch (
|
||||
- captions: []
|
||||
children: []
|
||||
code_language: unknown
|
||||
content_layer: body
|
||||
footnotes: []
|
||||
label: code
|
||||
orig: git checkout -b feature/AmazingFeature
|
||||
parent:
|
||||
$ref: '#/groups/4'
|
||||
prov: []
|
||||
references: []
|
||||
self_ref: '#/texts/16'
|
||||
text: git checkout -b feature/AmazingFeature
|
||||
- children: []
|
||||
content_layer: body
|
||||
label: text
|
||||
orig: )
|
||||
parent:
|
||||
$ref: '#/groups/4'
|
||||
prov: []
|
||||
self_ref: '#/texts/17'
|
||||
text: )
|
||||
- children:
|
||||
- $ref: '#/groups/5'
|
||||
content_layer: body
|
||||
enumerated: true
|
||||
label: list_item
|
||||
marker: '-'
|
||||
orig: ''
|
||||
parent:
|
||||
$ref: '#/groups/2'
|
||||
prov: []
|
||||
self_ref: '#/texts/18'
|
||||
text: ''
|
||||
- children: []
|
||||
content_layer: body
|
||||
label: text
|
||||
orig: Commit your changes (
|
||||
parent:
|
||||
$ref: '#/groups/5'
|
||||
prov: []
|
||||
self_ref: '#/texts/19'
|
||||
text: Commit your changes (
|
||||
- captions: []
|
||||
children: []
|
||||
code_language: unknown
|
||||
content_layer: body
|
||||
footnotes: []
|
||||
label: code
|
||||
orig: git commit -m 'Add some AmazingFeature'
|
||||
parent:
|
||||
$ref: '#/groups/5'
|
||||
prov: []
|
||||
references: []
|
||||
self_ref: '#/texts/20'
|
||||
text: git commit -m 'Add some AmazingFeature'
|
||||
- children: []
|
||||
content_layer: body
|
||||
label: text
|
||||
orig: )
|
||||
parent:
|
||||
$ref: '#/groups/5'
|
||||
prov: []
|
||||
self_ref: '#/texts/21'
|
||||
text: )
|
||||
- children:
|
||||
- $ref: '#/groups/6'
|
||||
content_layer: body
|
||||
enumerated: true
|
||||
label: list_item
|
||||
marker: '-'
|
||||
orig: ''
|
||||
parent:
|
||||
$ref: '#/groups/2'
|
||||
prov: []
|
||||
self_ref: '#/texts/22'
|
||||
text: ''
|
||||
- children: []
|
||||
content_layer: body
|
||||
label: text
|
||||
orig: Push to the branch (
|
||||
parent:
|
||||
$ref: '#/groups/6'
|
||||
prov: []
|
||||
self_ref: '#/texts/23'
|
||||
text: Push to the branch (
|
||||
- captions: []
|
||||
children: []
|
||||
code_language: unknown
|
||||
content_layer: body
|
||||
footnotes: []
|
||||
label: code
|
||||
orig: git push origin feature/AmazingFeature
|
||||
parent:
|
||||
$ref: '#/groups/6'
|
||||
prov: []
|
||||
references: []
|
||||
self_ref: '#/texts/24'
|
||||
text: git push origin feature/AmazingFeature
|
||||
- children: []
|
||||
content_layer: body
|
||||
label: text
|
||||
orig: )
|
||||
parent:
|
||||
$ref: '#/groups/6'
|
||||
prov: []
|
||||
self_ref: '#/texts/25'
|
||||
text: )
|
||||
- children: []
|
||||
content_layer: body
|
||||
enumerated: true
|
||||
label: list_item
|
||||
marker: '-'
|
||||
orig: Open a Pull Request
|
||||
parent:
|
||||
$ref: '#/groups/2'
|
||||
prov: []
|
||||
self_ref: '#/texts/26'
|
||||
text: Open a Pull Request
|
||||
- children:
|
||||
- $ref: '#/groups/7'
|
||||
content_layer: body
|
||||
label: section_header
|
||||
level: 1
|
||||
orig: ''
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
prov: []
|
||||
self_ref: '#/texts/27'
|
||||
text: ''
|
||||
- children: []
|
||||
content_layer: body
|
||||
formatting:
|
||||
bold: false
|
||||
italic: true
|
||||
strikethrough: false
|
||||
underline: false
|
||||
label: text
|
||||
orig: Second
|
||||
parent:
|
||||
$ref: '#/groups/7'
|
||||
prov: []
|
||||
self_ref: '#/texts/28'
|
||||
text: Second
|
||||
- children: []
|
||||
content_layer: body
|
||||
label: text
|
||||
orig: section
|
||||
parent:
|
||||
$ref: '#/groups/7'
|
||||
prov: []
|
||||
self_ref: '#/texts/29'
|
||||
text: section
|
||||
- children:
|
||||
- $ref: '#/groups/9'
|
||||
content_layer: body
|
||||
enumerated: false
|
||||
label: list_item
|
||||
marker: '-'
|
||||
orig: ''
|
||||
parent:
|
||||
$ref: '#/groups/8'
|
||||
prov: []
|
||||
self_ref: '#/texts/30'
|
||||
text: ''
|
||||
- children: []
|
||||
content_layer: body
|
||||
formatting:
|
||||
bold: true
|
||||
italic: false
|
||||
strikethrough: false
|
||||
underline: false
|
||||
label: text
|
||||
orig: First
|
||||
parent:
|
||||
$ref: '#/groups/9'
|
||||
prov: []
|
||||
self_ref: '#/texts/31'
|
||||
text: First
|
||||
- children: []
|
||||
content_layer: body
|
||||
label: text
|
||||
orig: ': Lorem ipsum.'
|
||||
parent:
|
||||
$ref: '#/groups/9'
|
||||
prov: []
|
||||
self_ref: '#/texts/32'
|
||||
text: ': Lorem ipsum.'
|
||||
- children:
|
||||
- $ref: '#/groups/10'
|
||||
content_layer: body
|
||||
enumerated: false
|
||||
label: list_item
|
||||
marker: '-'
|
||||
orig: ''
|
||||
parent:
|
||||
$ref: '#/groups/8'
|
||||
prov: []
|
||||
self_ref: '#/texts/33'
|
||||
text: ''
|
||||
- children: []
|
||||
content_layer: body
|
||||
formatting:
|
||||
bold: true
|
||||
italic: false
|
||||
strikethrough: false
|
||||
underline: false
|
||||
label: text
|
||||
orig: Second
|
||||
parent:
|
||||
$ref: '#/groups/10'
|
||||
prov: []
|
||||
self_ref: '#/texts/34'
|
||||
text: Second
|
||||
- children: []
|
||||
content_layer: body
|
||||
label: text
|
||||
orig: ': Dolor'
|
||||
parent:
|
||||
$ref: '#/groups/10'
|
||||
prov: []
|
||||
self_ref: '#/texts/35'
|
||||
text: ': Dolor'
|
||||
- captions: []
|
||||
children: []
|
||||
code_language: unknown
|
||||
content_layer: body
|
||||
footnotes: []
|
||||
label: code
|
||||
orig: sit
|
||||
parent:
|
||||
$ref: '#/groups/10'
|
||||
prov: []
|
||||
references: []
|
||||
self_ref: '#/texts/36'
|
||||
text: sit
|
||||
- children: []
|
||||
content_layer: body
|
||||
label: text
|
||||
orig: amet.
|
||||
parent:
|
||||
$ref: '#/groups/10'
|
||||
prov: []
|
||||
self_ref: '#/texts/37'
|
||||
text: amet.
|
||||
version: 1.3.0
|
@ -5,7 +5,61 @@
|
||||
"width": 595.2000122070312,
|
||||
"height": 841.9199829101562
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 595.2000122070312,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 595.2000122070312,
|
||||
"r_y2": 841.9199829101562,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 841.9199829101562,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -1057,7 +1111,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
@ -5032,7 +5091,61 @@
|
||||
"width": 595.2000122070312,
|
||||
"height": 841.9199829101562
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 595.2000122070312,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 595.2000122070312,
|
||||
"r_y2": 841.9199829101562,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 841.9199829101562,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -5734,7 +5847,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
@ -8374,7 +8492,61 @@
|
||||
"width": 595.2000122070312,
|
||||
"height": 841.9199829101562
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 595.2000122070312,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 595.2000122070312,
|
||||
"r_y2": 841.9199829101562,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 841.9199829101562,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -9676,7 +9848,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
@ -14401,7 +14578,61 @@
|
||||
"width": 595.2000122070312,
|
||||
"height": 841.9199829101562
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 595.2000122070312,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 595.2000122070312,
|
||||
"r_y2": 841.9199829101562,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 841.9199829101562,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -15928,7 +16159,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
@ -21385,7 +21621,61 @@
|
||||
"width": 595.2000122070312,
|
||||
"height": 841.9199829101562
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 595.2000122070312,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 595.2000122070312,
|
||||
"r_y2": 841.9199829101562,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 841.9199829101562,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -21512,7 +21802,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
@ -5,7 +5,61 @@
|
||||
"width": 612.0,
|
||||
"height": 792.0
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 612.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 612.0,
|
||||
"r_y2": 792.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 792.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -657,7 +711,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
@ -2982,7 +3041,61 @@
|
||||
"width": 612.0,
|
||||
"height": 792.0
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 612.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 612.0,
|
||||
"r_y2": 792.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 792.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -3609,7 +3722,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
3
tests/data/groundtruth/docling_v2/powerpoint_bad_text.pptx.itxt
vendored
Normal file
3
tests/data/groundtruth/docling_v2/powerpoint_bad_text.pptx.itxt
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: chapter: group slide-0
|
||||
item-2 at level 2: title: X-Library The fully customisable ... llection exclusively for our customers
|
86
tests/data/groundtruth/docling_v2/powerpoint_bad_text.pptx.json
vendored
Normal file
86
tests/data/groundtruth/docling_v2/powerpoint_bad_text.pptx.json
vendored
Normal file
@ -0,0 +1,86 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.3.0",
|
||||
"name": "powerpoint_bad_text",
|
||||
"origin": {
|
||||
"mimetype": "application/vnd.ms-powerpoint",
|
||||
"binary_hash": 1443005848482130016,
|
||||
"filename": "powerpoint_bad_text.pptx"
|
||||
},
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"body": {
|
||||
"self_ref": "#/body",
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/0"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"groups": [
|
||||
{
|
||||
"self_ref": "#/groups/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/0"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "slide-0",
|
||||
"label": "chapter"
|
||||
}
|
||||
],
|
||||
"texts": [
|
||||
{
|
||||
"self_ref": "#/texts/0",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "title",
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 1041400.0,
|
||||
"t": 4582390.0,
|
||||
"r": 8083550.0,
|
||||
"b": 1689099.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
118
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "X-Library The fully customisable and copyright-free standard content template collection exclusively for our customers",
|
||||
"text": "X-Library The fully customisable and copyright-free standard content template collection exclusively for our customers"
|
||||
}
|
||||
],
|
||||
"pictures": [],
|
||||
"tables": [],
|
||||
"key_value_items": [],
|
||||
"form_items": [],
|
||||
"pages": {
|
||||
"1": {
|
||||
"size": {
|
||||
"width": 12190413.0,
|
||||
"height": 6858000.0
|
||||
},
|
||||
"page_no": 1
|
||||
}
|
||||
}
|
||||
}
|
1
tests/data/groundtruth/docling_v2/powerpoint_bad_text.pptx.md
vendored
Normal file
1
tests/data/groundtruth/docling_v2/powerpoint_bad_text.pptx.md
vendored
Normal file
@ -0,0 +1 @@
|
||||
# X-Library The fully customisable and copyright-free standard content template collection exclusively for our customers
|
File diff suppressed because it is too large
Load Diff
@ -5,7 +5,61 @@
|
||||
"width": 612.0,
|
||||
"height": 792.0
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 612.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 612.0,
|
||||
"r_y2": 792.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 792.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 792.0,
|
||||
"r": 612.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -657,7 +711,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
@ -5,7 +5,61 @@
|
||||
"width": 595.2000122070312,
|
||||
"height": 841.9199829101562
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 595.2000122070312,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 595.2000122070312,
|
||||
"r_y2": 841.9199829101562,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 841.9199829101562,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9199829101562,
|
||||
"r": 595.2000122070312,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -3982,7 +4036,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
@ -5,7 +5,61 @@
|
||||
"width": 595.3200073242188,
|
||||
"height": 842.0399780273438
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 595.3200073242188,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 595.3200073242188,
|
||||
"r_y2": 842.0399780273438,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 842.0399780273438,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 842.0399780273438,
|
||||
"r": 595.3200073242188,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 842.0399780273438,
|
||||
"r": 595.3200073242188,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 842.0399780273438,
|
||||
"r": 595.3200073242188,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 842.0399780273438,
|
||||
"r": 595.3200073242188,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 842.0399780273438,
|
||||
"r": 595.3200073242188,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -1382,7 +1436,12 @@
|
||||
"from_ocr": false
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
23
tests/data/groundtruth/docling_v2/test_03.asciidoc.md
vendored
Normal file
23
tests/data/groundtruth/docling_v2/test_03.asciidoc.md
vendored
Normal file
@ -0,0 +1,23 @@
|
||||
:\_mod-docs-content-type: PROCEDURE :experimental:
|
||||
|
||||
# Renaming a bookmark
|
||||
|
||||
[id="renaming-a-bookmark\_{context}"]
|
||||
|
||||
You can rename a bookmark to distinguish it from other bookmarks. If you have bookmarks to several folders that all share the same name, you can tell the bookmarks apart if you rename them.
|
||||
|
||||
Renaming the bookmark does not rename the folder.
|
||||
|
||||
- Check that the side bar lists the bookmark under the new name.
|
||||
|
||||
Procedure . Right-click the bookmark in the side bar. . Select *Rename…*. +
|
||||
|
||||
<!-- image -->
|
||||
|
||||
In the *Name* field, enter the new name for the bookmark. +
|
||||
|
||||
<!-- image -->
|
||||
|
||||
Click btn:[Rename]. .Verification
|
||||
|
||||
<!-- image -->
|
@ -17,14 +17,16 @@ item-0 at level 0: unspecified: group _root_
|
||||
item-16 at level 2: list_item: Italic bullet 1
|
||||
item-17 at level 2: list_item: Bold bullet 2
|
||||
item-18 at level 2: list_item: Underline bullet 3
|
||||
item-19 at level 2: inline: group group
|
||||
item-20 at level 3: list_item: Some
|
||||
item-21 at level 3: list_item: italic
|
||||
item-22 at level 3: list_item: bold
|
||||
item-23 at level 3: list_item: underline
|
||||
item-24 at level 2: list: group list
|
||||
item-25 at level 3: inline: group group
|
||||
item-26 at level 4: list_item: Nested
|
||||
item-27 at level 4: list_item: italic
|
||||
item-28 at level 4: list_item: bold
|
||||
item-29 at level 1: paragraph:
|
||||
item-19 at level 2: list_item:
|
||||
item-20 at level 3: inline: group group
|
||||
item-21 at level 4: text: Some
|
||||
item-22 at level 4: text: italic
|
||||
item-23 at level 4: text: bold
|
||||
item-24 at level 4: text: underline
|
||||
item-25 at level 2: list: group list
|
||||
item-26 at level 3: list_item:
|
||||
item-27 at level 4: inline: group group
|
||||
item-28 at level 5: text: Nested
|
||||
item-29 at level 5: text: italic
|
||||
item-30 at level 5: text: bold
|
||||
item-31 at level 1: paragraph:
|
@ -42,7 +42,7 @@
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/23"
|
||||
"$ref": "#/texts/25"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
@ -98,7 +98,7 @@
|
||||
"$ref": "#/texts/15"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/2"
|
||||
"$ref": "#/texts/16"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/3"
|
||||
@ -111,12 +111,9 @@
|
||||
{
|
||||
"self_ref": "#/groups/2",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/16"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/17"
|
||||
},
|
||||
@ -125,6 +122,9 @@
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/19"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/20"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
@ -138,7 +138,7 @@
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/4"
|
||||
"$ref": "#/texts/21"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
@ -148,17 +148,17 @@
|
||||
{
|
||||
"self_ref": "#/groups/4",
|
||||
"parent": {
|
||||
"$ref": "#/groups/3"
|
||||
"$ref": "#/texts/21"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/20"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/21"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/22"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/23"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/24"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
@ -461,20 +461,18 @@
|
||||
{
|
||||
"self_ref": "#/texts/16",
|
||||
"parent": {
|
||||
"$ref": "#/groups/2"
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
"children": [],
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/2"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "Some",
|
||||
"text": "Some",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
},
|
||||
"orig": "",
|
||||
"text": "",
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
@ -485,18 +483,16 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "italic",
|
||||
"text": "italic",
|
||||
"orig": "Some",
|
||||
"text": "Some",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": true,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
},
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/18",
|
||||
@ -505,67 +501,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "bold",
|
||||
"text": "bold",
|
||||
"formatting": {
|
||||
"bold": true,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
},
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/19",
|
||||
"parent": {
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "underline",
|
||||
"text": "underline",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": true,
|
||||
"strikethrough": false
|
||||
},
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/20",
|
||||
"parent": {
|
||||
"$ref": "#/groups/4"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "Nested",
|
||||
"text": "Nested",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
},
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/21",
|
||||
"parent": {
|
||||
"$ref": "#/groups/4"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "italic",
|
||||
"text": "italic",
|
||||
@ -574,7 +510,59 @@
|
||||
"italic": true,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/19",
|
||||
"parent": {
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "bold",
|
||||
"text": "bold",
|
||||
"formatting": {
|
||||
"bold": true,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/20",
|
||||
"parent": {
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "underline",
|
||||
"text": "underline",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": true,
|
||||
"strikethrough": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/21",
|
||||
"parent": {
|
||||
"$ref": "#/groups/3"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/4"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": "",
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
@ -585,7 +573,43 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Nested",
|
||||
"text": "Nested",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/23",
|
||||
"parent": {
|
||||
"$ref": "#/groups/4"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "italic",
|
||||
"text": "italic",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": true,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/24",
|
||||
"parent": {
|
||||
"$ref": "#/groups/4"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "bold",
|
||||
"text": "bold",
|
||||
@ -594,12 +618,10 @@
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
},
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/23",
|
||||
"self_ref": "#/texts/25",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
|
16
tests/data/groundtruth/docling_v2/word_image_anchors.docx.itxt
vendored
Normal file
16
tests/data/groundtruth/docling_v2/word_image_anchors.docx.itxt
vendored
Normal file
@ -0,0 +1,16 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: paragraph: Transcript
|
||||
item-2 at level 1: paragraph: February 20, 2025, 8:32PM
|
||||
item-3 at level 1: picture
|
||||
item-4 at level 1: inline: group group
|
||||
item-5 at level 2: paragraph: This is test 1
|
||||
item-6 at level 2: paragraph: 0:08
|
||||
Correct, he is not.
|
||||
item-7 at level 1: paragraph:
|
||||
item-8 at level 1: picture
|
||||
item-9 at level 1: inline: group group
|
||||
item-10 at level 2: paragraph: This is test 2
|
||||
item-11 at level 2: paragraph: 0:16
|
||||
Yeah, exactly.
|
||||
item-12 at level 1: paragraph:
|
||||
item-13 at level 1: paragraph:
|
286
tests/data/groundtruth/docling_v2/word_image_anchors.docx.json
vendored
Normal file
286
tests/data/groundtruth/docling_v2/word_image_anchors.docx.json
vendored
Normal file
@ -0,0 +1,286 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.3.0",
|
||||
"name": "word_image_anchors",
|
||||
"origin": {
|
||||
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"binary_hash": 2428692234257307633,
|
||||
"filename": "word_image_anchors.docx"
|
||||
},
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"body": {
|
||||
"self_ref": "#/body",
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/1"
|
||||
},
|
||||
{
|
||||
"$ref": "#/pictures/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/4"
|
||||
},
|
||||
{
|
||||
"$ref": "#/pictures/1"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/7"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/8"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"groups": [
|
||||
{
|
||||
"self_ref": "#/groups/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/2"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/3"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "group",
|
||||
"label": "inline"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/1",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/5"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/6"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "group",
|
||||
"label": "inline"
|
||||
}
|
||||
],
|
||||
"texts": [
|
||||
{
|
||||
"self_ref": "#/texts/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "Transcript",
|
||||
"text": "Transcript",
|
||||
"formatting": {
|
||||
"bold": true,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/1",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "February 20, 2025, 8:32PM",
|
||||
"text": "February 20, 2025, 8:32PM",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/2",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "This is test 1",
|
||||
"text": "This is test 1",
|
||||
"formatting": {
|
||||
"bold": true,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/3",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "0:08\nCorrect, he is not.",
|
||||
"text": "0:08\nCorrect, he is not.",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/4",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/5",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "This is test 2",
|
||||
"text": "This is test 2",
|
||||
"formatting": {
|
||||
"bold": true,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/6",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "0:16\nYeah, exactly.",
|
||||
"text": "0:16\nYeah, exactly.",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/7",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/8",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
}
|
||||
],
|
||||
"pictures": [
|
||||
{
|
||||
"self_ref": "#/pictures/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "picture",
|
||||
"prov": [],
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"image": {
|
||||
"mimetype": "image/png",
|
||||
"dpi": 72,
|
||||
"size": {
|
||||
"width": 100.0,
|
||||
"height": 100.0
|
||||
},
|
||||
"uri": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAGQAAABkCAYAAABw4pVUAAAAz0lEQVR4nO3bUW0CURRF0TukQvDSauBr0mACE1VBAzYQg5Lpdw0wO2EtA+cl+/6+GQAAAAAAAAAAAADe1DIR53X9mcNcdhnf5nm93Y8T8DElyzyuv/evlx/CMqeJOOz9AP4TJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiWp8+t/k8f6/bDrvPl28CAAAAAAAAAAAAAAAAzLv5A5bTEG2TIIlOAAAAAElFTkSuQmCC"
|
||||
},
|
||||
"annotations": []
|
||||
},
|
||||
{
|
||||
"self_ref": "#/pictures/1",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "picture",
|
||||
"prov": [],
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"image": {
|
||||
"mimetype": "image/png",
|
||||
"dpi": 72,
|
||||
"size": {
|
||||
"width": 100.0,
|
||||
"height": 100.0
|
||||
},
|
||||
"uri": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAGQAAABkCAYAAABw4pVUAAAJIElEQVR4nO2dbWxb1RnH/8+1c5O4bITEwJrRF1ZAI6gtL9oK29oxihAdTQOVoGhbKyS0MDWZJk1CQ+q0aR/4xLYvJNGabdK07MukrSUNaxEvg7aUlteuLUoHrUTbseylSRSgpLGd3Ac9596kSWzHvva1fXzv/UmW4jaxj5+/z73nPOec50/QnM3t5xdbUWOlZeBGgK8jNpYC3AxQHOAGEMXAXKN+mSgF5nGAxgAeBmiIyToH0GnDwklj0jqxq/fK/0BjCJrR2jn8ZcPCXSBaC9DtAC/39h3oDMBHwHzQMvD3ga74P6ERWgjS1jG8BjAeALgVQEuZ334QoAHA2t3fHX8dQRWktX0obpi1jzDjewSshgYwcIwIf7KSiT8M9DYPB0KQts7RlWDuANCuSw/NAAPoBVF3f1fjCZQRKq8QeBzgragqqA+Ep8olDJXj0kSm+XNi6kQVw8RdnEz+otSXspIK0rZ9eDuIngTQAH8wBuYd/T3xnqoSRIauERi/ZuYN8CFEtG8K1o9LMWT2XJBN20e+TwZ1gdmEnyFKssWde3qafuvpy3r5Ym0dI78B8BiCxc7+7qYfaCXIxvbRpZEa7gOwDsHkwFSKtj7b23iu4oLYs2z6M4BlCDZnAd5S7Gy/KEHu3z5yDxN2AVhUzOv4iE+JsfmZnqbnyy7Iph+O3kcWD2g8264UzAa17nm68W+F/DEV0TOeC8XIChPj3kJ6ChV4z3gpvEzl5FOA17u9pxhuR1PODTwUIzeLJFZ2zEokiDO0Dfpoyg3LnJh5L4gz6QvqPKMY1jmx804QSYcEcAbuJY85MSz+pq7WuGEc831uqtQQJS1Yq3MlJHP2EMnahmJ4ALOpYpkDI9d6hl9T6JVAYqnWiAq5ZKlNCDW1p3y0uKQLY1YqcX22lcesPUSWXUMxSkKDE9v8e4izM+R4adoToiBalWnjROYeonaHhJSULDFO6yFh76hsL0nvIfYmtpBykCHWlGFk9X8d0uqrbqjBj7YtQlODq3QbLAtIphgffcL44N+TeO1oEgfeSkJT2Eolrpo94orO/l/ZawuuvBjFYBhAXS2px9VxE2tWmdjWZmHvgQnsemECmkEq5sAvp/9hztdPNj7DZxAB8SsMfLc1hscfvQz1dXp93+bH3Ji98KTLLvRSEDGAO1abaH8wBp2QmNuLfmk9RM5n+BvDAL6y0sTa23RLzV2K/ax7iDosUzUcOprEmydS6udoBFixJIovLYng2msiMGuyX5YW1RNuXBHFwbd1utGr2D8xI4ik2MFlP7lUFBcnGK+8kZh5/uJh+2e5ibc/FMs6OpN7yjVXR6AZLaKBpOZVq9WZPp/w+vEkXjqcwOQUqoppDeyvkTpg6R+GzltIpeQQVBXhaOD0azntGgyYgQ//p2P3sTUw5By490ePK8u1X4zANDPf2D+6YOGtd+3BgF7wctHCkEP58BG3ttTgG7eZat6RKa0iYrwzqKMggGgRlQoJVGWX20xcvyyKDetqcfsqE7F6yiiGCPH7v45DV0SLqJSr0CCX6Jq776hVj3yQZKMMi/v2XFTDZX3h66JO7RD4kQvjjKMnU3j2lQm898EkdEe0kB7SDJ9yWYzwtVtMLGuOqEnkvoMJ3XtIs2FX1fEvEQNYujiCrZti+NVPPq9m8vpCcUOVOAoAREDzlRGVVpGRmJ5wQ9SpN4Vq49Cs5KJQZwI3LJcEYxRLFkcyDnsFyXFta4vh/OgF/Ou/mk0QiWLRmeJfVcbFeclF4blX7ecy+vrOxno0Xp5ZlSVfiOBba2rxx37NhsDMNe4WrKuEFw8nsHd/Qg13MxGJAC0r5qxea4OhyuL5kGPvpTD2cfZLsfQemUxqBVHKsGsU+o9TZyeRWCDjK72kvlazCTHzuAx7x+BDWlZEEVtgQ8PUFHAxodtghsZk2F
uRUnal5tabanD557LfIkUM6UV6wcPSQ4bgMzbeWYcNa+vUWnsmZJR/bkizIa+ChqJS15ZYs2tpHsj+qju/eim5KMGXeYg8FpqHCOMTjBOn9BvLiBZRKTKMKuTrt5jq4RbpHYOnJ/H8oblzGD2g04ZUfEaAODs0pd+E0EG0iEr57Sl/zg/Tesbpc5P43V/G9UuZOIgWhl0LXcpv+5ePLzAGXp7Az57+RON1ETojWjhTVT4CwDcbHZIpVjfuMx9O4cjxJPa/mdR8HWRGA2crKfNBED0MjTj+fgqP/tSXc9bMiAbT+7LEJaDS7Qk6lqOBEsQp9zBY6UYFmMHpkhuzhldi2RBSGS7FfpYg1u4KtSYEl2I/J2eyqWPkH34+RaUj4lmyp7vp5unnc2aEYmZSkVYFGJoX8zmCiLOMY2YSUh7YiXlmQZzz0r1lakwI0Du/KlB6Eouou5wtCjSUHus0QezaG+SqkmZIIVBf/tWACE8V9B4h+ZMlxhkFEeXEc8nFy4e4QGKbzWQs60KIGGApz6UQrxlzYgtXgqi7P/MOz5sTdJh3LOT0lnN3w/2do3vDyqTemYk909X47YV+J+farbiRSRFgj9oUXIiSKpY5yCmIpIXFjcyzhgUUtrgzH5u9vHY3ONZwOz1pWTDZma+9nqsdcm0dI/tDhwTXHOjvbvpmvr/sav+PWMPZbmQheXLWiRlKIojt08dbbDufkDwsj7a49TZ0vUNOPJXEGi5M0+c0BdtciKdhQVsWxX1MrOFCUbLb5hXqZVjwHlLx6RNruPDylWYseW+hHoZCUZt67W8Brw9v9DPWq+uLcfkUQnNiv5kTzya07y4eT88hSMOY0R6I3BdRUj6rl2IInh8MkRSBuJFJZhM+hYj2yWfMNx3i6rVRQpQBFtGTPrJOGpP1jP6eeE+p3qCkR6ek4WKA5YflYCbuks9SSjGEsh2/tZ17xOaHXeV2Kg/1yYaEbGvgnr8byoxjqSTOMu06GMdkQTIQvbJvqlxCTFOxgCg3H7P2EfHP0GWDNwPHZK+tbO9caN27lGjxDbX9M8SyQbkElNsUYNA+n2HtLiQZ6EtB0syQLdxl10KX8tteV92WE8d8RM70yTGyfJZVAy0I5iHlt6XisxQZlrq2TlnbZrt4Jzc4JQrtqnhS+0uVm5IKR1JUh4akXIWqkGDhpJwDt4+B68tnvr6L5zB8YjIAAAAASUVORK5CYII="
|
||||
},
|
||||
"annotations": []
|
||||
}
|
||||
],
|
||||
"tables": [],
|
||||
"key_value_items": [],
|
||||
"form_items": [],
|
||||
"pages": {}
|
||||
}
|
13
tests/data/groundtruth/docling_v2/word_image_anchors.docx.md
vendored
Normal file
13
tests/data/groundtruth/docling_v2/word_image_anchors.docx.md
vendored
Normal file
@ -0,0 +1,13 @@
|
||||
**Transcript**
|
||||
|
||||
February 20, 2025, 8:32PM
|
||||
|
||||
<!-- image -->
|
||||
|
||||
**This is test 1** 0:08
|
||||
Correct, he is not.
|
||||
|
||||
<!-- image -->
|
||||
|
||||
**This is test 2** 0:16
|
||||
Yeah, exactly.
|
18
tests/data/md/inline_and_formatting.md
vendored
Normal file
18
tests/data/md/inline_and_formatting.md
vendored
Normal file
@ -0,0 +1,18 @@
|
||||
# Contribution guideline example
|
||||
|
||||
This is simple.
|
||||
|
||||
Foo *emphasis* **strong emphasis** ***both***.
|
||||
|
||||
Create your feature branch: `git checkout -b feature/AmazingFeature`.
|
||||
|
||||
1. Pull the [**repository**](https://github.com/docling-project/docling).
|
||||
2. Create your feature branch (`git checkout -b feature/AmazingFeature`)
|
||||
3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
|
||||
4. Push to the branch (`git push origin feature/AmazingFeature`)
|
||||
5. Open a Pull Request
|
||||
|
||||
## *Second* section <!-- inline groups in headings not yet supported by serializers -->
|
||||
|
||||
- **First**: Lorem ipsum.
|
||||
- **Second**: Dolor `sit` amet.
|
BIN
tests/data/pptx/powerpoint_bad_text.pptx
vendored
Normal file
BIN
tests/data/pptx/powerpoint_bad_text.pptx
vendored
Normal file
Binary file not shown.
@ -5,7 +5,77 @@
|
||||
"width": 2000.0,
|
||||
"height": 2829.0
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 2000.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 2000.0,
|
||||
"r_y2": 2829.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 2829.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 2829.0,
|
||||
"r": 2000.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 2829.0,
|
||||
"r": 2000.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 2829.0,
|
||||
"r": 2000.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 2829.0,
|
||||
"r": 2000.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 2829.0,
|
||||
"r": 2000.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [
|
||||
{
|
||||
"index": 0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 2000.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 2000.0,
|
||||
"r_y2": 2829.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 2829.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"uri": null
|
||||
}
|
||||
],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -82,7 +152,12 @@
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
@ -5,7 +5,61 @@
|
||||
"width": 595.201171875,
|
||||
"height": 841.9216918945312
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 595.201171875,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 595.201171875,
|
||||
"r_y2": 841.9216918945312,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 841.9216918945312,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -82,7 +136,12 @@
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
@ -5,7 +5,61 @@
|
||||
"width": 595.201171875,
|
||||
"height": 841.9216918945312
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 595.201171875,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 595.201171875,
|
||||
"r_y2": 841.9216918945312,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 841.9216918945312,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -82,7 +136,12 @@
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
@ -5,7 +5,61 @@
|
||||
"width": 841.9216918945312,
|
||||
"height": 595.201171875
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 595.201171875,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 595.201171875,
|
||||
"r_y2": 841.9216918945312,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 841.9216918945312,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -82,7 +136,12 @@
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
@ -5,7 +5,61 @@
|
||||
"width": 841.9216918945312,
|
||||
"height": 595.201171875
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 595.201171875,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 595.201171875,
|
||||
"r_y2": 841.9216918945312,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 841.9216918945312,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -82,7 +136,12 @@
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
@ -5,7 +5,61 @@
|
||||
"width": 595.201171875,
|
||||
"height": 841.9216918945312
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 595.201171875,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 595.201171875,
|
||||
"r_y2": 841.9216918945312,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 841.9216918945312,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -82,7 +136,12 @@
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
@ -5,7 +5,61 @@
|
||||
"width": 595.201171875,
|
||||
"height": 841.9216918945312
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 595.201171875,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 595.201171875,
|
||||
"r_y2": 841.9216918945312,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 841.9216918945312,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -82,7 +136,12 @@
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
@ -5,7 +5,61 @@
|
||||
"width": 841.9216918945312,
|
||||
"height": 595.201171875
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 595.201171875,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 595.201171875,
|
||||
"r_y2": 841.9216918945312,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 841.9216918945312,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -82,7 +136,12 @@
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
@ -5,7 +5,61 @@
|
||||
"width": 841.9216918945312,
|
||||
"height": 595.201171875
|
||||
},
|
||||
"cells": [
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 595.201171875,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 595.201171875,
|
||||
"r_y2": 841.9216918945312,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 841.9216918945312,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
@ -82,7 +136,12 @@
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
@ -2,7 +2,11 @@ import glob
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from docling.backend.asciidoc_backend import AsciiDocBackend
|
||||
from docling.backend.asciidoc_backend import (
|
||||
DEFAULT_IMAGE_HEIGHT,
|
||||
DEFAULT_IMAGE_WIDTH,
|
||||
AsciiDocBackend,
|
||||
)
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
@ -18,6 +22,24 @@ def _get_backend(fname):
|
||||
return doc_backend
|
||||
|
||||
|
||||
def test_parse_picture():
|
||||
line = (
|
||||
"image::images/example1.png[Example Image, width=200, height=150, align=center]"
|
||||
)
|
||||
res = AsciiDocBackend._parse_picture(line)
|
||||
assert res
|
||||
assert res.get("width", 0) == "200"
|
||||
assert res.get("height", 0) == "150"
|
||||
assert res.get("uri", "") == "images/example1.png"
|
||||
|
||||
line = "image::renamed-bookmark.png[Renamed bookmark]"
|
||||
res = AsciiDocBackend._parse_picture(line)
|
||||
assert res
|
||||
assert "width" not in res
|
||||
assert "height" not in res
|
||||
assert res.get("uri", "") == "renamed-bookmark.png"
|
||||
|
||||
|
||||
def test_asciidocs_examples():
|
||||
fnames = sorted(glob.glob("./tests/data/asciidoc/*.asciidoc"))
|
||||
|
||||
|
@ -2,7 +2,7 @@ from pathlib import Path
|
||||
|
||||
from docling.backend.md_backend import MarkdownDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
from docling.datamodel.document import DoclingDocument, InputDocument
|
||||
|
||||
from .test_data_gen_flag import GEN_TEST_DATA
|
||||
|
||||
@ -11,12 +11,15 @@ def test_convert_valid():
|
||||
fmt = InputFormat.MD
|
||||
cls = MarkdownDocumentBackend
|
||||
|
||||
test_data_path = Path("tests") / "data"
|
||||
relevant_paths = sorted((test_data_path / "md").rglob("*.md"))
|
||||
root_path = Path("tests") / "data"
|
||||
relevant_paths = sorted((root_path / "md").rglob("*.md"))
|
||||
assert len(relevant_paths) > 0
|
||||
|
||||
yaml_filter = ["inline_and_formatting"]
|
||||
|
||||
for in_path in relevant_paths:
|
||||
gt_path = test_data_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md"
|
||||
md_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md"
|
||||
yaml_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.yaml"
|
||||
|
||||
in_doc = InputDocument(
|
||||
path_or_stream=in_path,
|
||||
@ -33,9 +36,17 @@ def test_convert_valid():
|
||||
act_data = act_doc.export_to_markdown()
|
||||
|
||||
if GEN_TEST_DATA:
|
||||
with open(gt_path, mode="w", encoding="utf-8") as f:
|
||||
with open(md_gt_path, mode="w", encoding="utf-8") as f:
|
||||
f.write(f"{act_data}\n")
|
||||
|
||||
if in_path.stem in yaml_filter:
|
||||
with open(yaml_gt_path, mode="w", encoding="utf-8") as f:
|
||||
act_doc.save_as_yaml(yaml_gt_path)
|
||||
else:
|
||||
with open(gt_path, encoding="utf-8") as f:
|
||||
with open(md_gt_path, encoding="utf-8") as f:
|
||||
exp_data = f.read().rstrip()
|
||||
assert exp_data == act_data
|
||||
assert act_data == exp_data
|
||||
|
||||
if in_path.stem in yaml_filter:
|
||||
exp_doc = DoclingDocument.load_from_yaml(yaml_gt_path)
|
||||
assert act_doc == exp_doc
|
||||
|
@ -9,6 +9,7 @@ from docling.datamodel.document import (
|
||||
DoclingDocument,
|
||||
InputDocument,
|
||||
SectionHeaderItem,
|
||||
TextItem,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
@ -96,18 +97,18 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
|
||||
|
||||
pred_md: str = doc.export_to_markdown()
|
||||
assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), (
|
||||
"export to md"
|
||||
f"export to markdown failed on {docx_path}"
|
||||
)
|
||||
|
||||
pred_itxt: str = doc._export_to_indented_text(
|
||||
max_text_len=70, explicit_tables=False
|
||||
)
|
||||
assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), (
|
||||
"export to indented-text"
|
||||
f"export to indented-text failed on {docx_path}"
|
||||
)
|
||||
|
||||
assert verify_document(doc, str(gt_path) + ".json", generate=GENERATE), (
|
||||
"document document"
|
||||
f"DoclingDocument verification failed on {docx_path}"
|
||||
)
|
||||
|
||||
if docx_path.name == "word_tables.docx":
|
||||
@ -116,7 +117,7 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
|
||||
pred_text=pred_html,
|
||||
gtfile=str(gt_path) + ".html",
|
||||
generate=GENERATE,
|
||||
), "export to html"
|
||||
), f"export to html failed on {docx_path}"
|
||||
|
||||
|
||||
flaky_path = Path("tests/data/docx/textbox.docx")
|
||||
@ -131,3 +132,42 @@ def test_e2e_docx_conversions():
|
||||
@pytest.mark.xfail(strict=False)
|
||||
def test_textbox_conversion():
|
||||
_test_e2e_docx_conversions_impl(docx_paths=[flaky_path])
|
||||
|
||||
|
||||
def test_text_after_image_anchors():
|
||||
"""
|
||||
Test to analyse whether text gets parsed after image anchors.
|
||||
"""
|
||||
|
||||
in_path = Path("tests/data/docx/word_image_anchors.docx")
|
||||
in_doc = InputDocument(
|
||||
path_or_stream=in_path,
|
||||
format=InputFormat.DOCX,
|
||||
backend=MsWordDocumentBackend,
|
||||
)
|
||||
backend = MsWordDocumentBackend(
|
||||
in_doc=in_doc,
|
||||
path_or_stream=in_path,
|
||||
)
|
||||
doc = backend.convert()
|
||||
|
||||
found_text_after_anchor_1 = found_text_after_anchor_2 = (
|
||||
found_text_after_anchor_3
|
||||
) = found_text_after_anchor_4 = False
|
||||
for item, _ in doc.iterate_items():
|
||||
if isinstance(item, TextItem):
|
||||
if item.text == "This is test 1":
|
||||
found_text_after_anchor_1 = True
|
||||
elif item.text == "0:08\nCorrect, he is not.":
|
||||
found_text_after_anchor_2 = True
|
||||
elif item.text == "This is test 2":
|
||||
found_text_after_anchor_3 = True
|
||||
elif item.text == "0:16\nYeah, exactly.":
|
||||
found_text_after_anchor_4 = True
|
||||
|
||||
assert (
|
||||
found_text_after_anchor_1
|
||||
and found_text_after_anchor_2
|
||||
and found_text_after_anchor_3
|
||||
and found_text_after_anchor_4
|
||||
)
|
||||
|
@ -60,3 +60,25 @@ def test_code_and_formula_conversion():
|
||||
gt = "a ^ { 2 } + 8 = 1 2"
|
||||
predicted = formula_blocks[0].text
|
||||
assert predicted == gt, f"mismatch in text {predicted=}, {gt=}"
|
||||
|
||||
|
||||
def test_formula_conversion_with_page_range():
|
||||
pdf_path = Path("tests/data/pdf/code_and_formula.pdf")
|
||||
converter = get_converter()
|
||||
|
||||
print(f"converting {pdf_path} with page range")
|
||||
|
||||
doc_result: ConversionResult = converter.convert(pdf_path, page_range=(2, 2))
|
||||
|
||||
results = doc_result.document.texts
|
||||
|
||||
formula_blocks = [
|
||||
el
|
||||
for el in results
|
||||
if isinstance(el, TextItem) and el.label == DocItemLabel.FORMULA
|
||||
]
|
||||
assert len(formula_blocks) == 1
|
||||
|
||||
gt = "a ^ { 2 } + 8 = 1 2"
|
||||
predicted = formula_blocks[0].text
|
||||
assert predicted == gt, f"mismatch in text {predicted=}, {gt=}"
|
||||
|
@ -57,14 +57,14 @@ def test_e2e_conversions():
|
||||
pdf_paths = get_pdf_paths()
|
||||
|
||||
engines: List[Tuple[OcrOptions, bool]] = [
|
||||
(EasyOcrOptions(), False),
|
||||
(TesseractOcrOptions(), True),
|
||||
(TesseractCliOcrOptions(), True),
|
||||
(EasyOcrOptions(force_full_page_ocr=True), False),
|
||||
(EasyOcrOptions(), False),
|
||||
(TesseractOcrOptions(force_full_page_ocr=True), True),
|
||||
(TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]), True),
|
||||
(TesseractCliOcrOptions(force_full_page_ocr=True), True),
|
||||
(TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]), True),
|
||||
(EasyOcrOptions(force_full_page_ocr=True), False),
|
||||
]
|
||||
|
||||
# rapidocr is only available for Python >=3.6,<3.13
|
||||
|
Loading…
Reference in New Issue
Block a user