mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Merge branch 'main' into nli/layout_dfine
Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
commit
7aa2be93d6
@ -64,7 +64,6 @@ ENV EASYOCR_MODULE_PATH=/tmp/easyocr-models
|
|||||||
COPY --chown=1000:1000 .actor/actor.sh .actor/actor.sh
|
COPY --chown=1000:1000 .actor/actor.sh .actor/actor.sh
|
||||||
COPY --chown=1000:1000 .actor/actor.json .actor/actor.json
|
COPY --chown=1000:1000 .actor/actor.json .actor/actor.json
|
||||||
COPY --chown=1000:1000 .actor/input_schema.json .actor/input_schema.json
|
COPY --chown=1000:1000 .actor/input_schema.json .actor/input_schema.json
|
||||||
COPY --chown=1000:1000 .actor/docling_processor.py .actor/docling_processor.py
|
|
||||||
RUN chmod +x .actor/actor.sh
|
RUN chmod +x .actor/actor.sh
|
||||||
|
|
||||||
# Copy the build files from builder
|
# Copy the build files from builder
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
[](https://apify.com/vancura/docling)
|
[](https://apify.com/vancura/docling)
|
||||||
|
|
||||||
This Actor (specification v1) wraps the [Docling project](https://ds4sd.github.io/docling/) to provide serverless document processing in the cloud. It can process complex documents (PDF, DOCX, images) and convert them into structured formats (Markdown, JSON, HTML, Text, or DocTags) with optional OCR support.
|
This Actor (specification v1) wraps the [Docling project](https://github.com/docling-project/docling) to provide serverless document processing in the cloud. It can process complex documents (PDF, DOCX, images) and convert them into structured formats (Markdown, JSON, HTML, Text, or DocTags) with optional OCR support.
|
||||||
|
|
||||||
## What are Actors?
|
## What are Actors?
|
||||||
|
|
||||||
@ -14,7 +14,7 @@ This Actor (specification v1) wraps the [Docling project](https://ds4sd.github.i
|
|||||||
2. [Usage](#usage)
|
2. [Usage](#usage)
|
||||||
3. [Input Parameters](#input-parameters)
|
3. [Input Parameters](#input-parameters)
|
||||||
4. [Output](#output)
|
4. [Output](#output)
|
||||||
5. [Performance & Resources](#performance--resources)
|
5. [Performance and Resources](#performance-and-resources)
|
||||||
6. [Troubleshooting](#troubleshooting)
|
6. [Troubleshooting](#troubleshooting)
|
||||||
7. [Local Development](#local-development)
|
7. [Local Development](#local-development)
|
||||||
8. [Architecture](#architecture)
|
8. [Architecture](#architecture)
|
||||||
@ -190,7 +190,7 @@ Access logs via:
|
|||||||
apify key-value-stores get-record DOCLING_LOG
|
apify key-value-stores get-record DOCLING_LOG
|
||||||
```
|
```
|
||||||
|
|
||||||
## Performance & Resources
|
## Performance and Resources
|
||||||
|
|
||||||
- **Docker Image Size**: ~4GB
|
- **Docker Image Size**: ~4GB
|
||||||
- **Memory Requirements**:
|
- **Memory Requirements**:
|
||||||
|
@ -1,10 +1,10 @@
|
|||||||
{
|
{
|
||||||
"actorSpecification": 1,
|
"actorSpecification": 1,
|
||||||
"name": "docling",
|
"name": "docling",
|
||||||
"version": "0.0",
|
"version": "1.0",
|
||||||
"environmentVariables": {},
|
"environmentVariables": {},
|
||||||
"dockerFile": "./Dockerfile",
|
"dockerFile": "./Dockerfile",
|
||||||
"input": "./input_schema.json",
|
"inputSchema": "./input_schema.json",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"run": "./actor.sh"
|
"run": "./actor.sh"
|
||||||
}
|
}
|
||||||
|
@ -154,17 +154,6 @@ else
|
|||||||
echo "Warning: No build files directory found. Some tools may be unavailable."
|
echo "Warning: No build files directory found. Some tools may be unavailable."
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Copy Python processor script to tools directory
|
|
||||||
PYTHON_SCRIPT_PATH="$(dirname "$0")/docling_processor.py"
|
|
||||||
if [ -f "$PYTHON_SCRIPT_PATH" ]; then
|
|
||||||
echo "Copying Python processor script to tools directory..."
|
|
||||||
cp "$PYTHON_SCRIPT_PATH" "$TOOLS_DIR/"
|
|
||||||
chmod +x "$TOOLS_DIR/docling_processor.py"
|
|
||||||
else
|
|
||||||
echo "ERROR: Python processor script not found at $PYTHON_SCRIPT_PATH"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Check OCR directories and ensure they're writable
|
# Check OCR directories and ensure they're writable
|
||||||
echo "Checking OCR directory permissions..."
|
echo "Checking OCR directory permissions..."
|
||||||
OCR_DIR="/opt/app-root/src/.EasyOCR"
|
OCR_DIR="/opt/app-root/src/.EasyOCR"
|
||||||
|
3
.gitattributes
vendored
Normal file
3
.gitattributes
vendored
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
tests/data/** linguist-vendored
|
||||||
|
tests/data_scanned/** linguist-vendored
|
||||||
|
docs/** linguist-vendored
|
67
CHANGELOG.md
67
CHANGELOG.md
@ -1,3 +1,70 @@
|
|||||||
|
## [v2.34.0](https://github.com/docling-project/docling/releases/tag/v2.34.0) - 2025-05-22
|
||||||
|
|
||||||
|
### Feature
|
||||||
|
|
||||||
|
* **ocr:** Auto-detect rotated pages in Tesseract ([#1167](https://github.com/docling-project/docling/issues/1167)) ([`45265bf`](https://github.com/docling-project/docling/commit/45265bf8b1a6d6ad5367bb3f17fb3fa9d4366a05))
|
||||||
|
* Establish confidence estimation for document and pages ([#1313](https://github.com/docling-project/docling/issues/1313)) ([`9087524`](https://github.com/docling-project/docling/commit/90875247e5813da1de17f3cd4475937e8bd45571))
|
||||||
|
|
||||||
|
### Fix
|
||||||
|
|
||||||
|
* Fix ZeroDivisionError for cell_bbox.area() ([#1636](https://github.com/docling-project/docling/issues/1636)) ([`c2f595d`](https://github.com/docling-project/docling/commit/c2f595d2830ca2e28e68c5da606e89541264f156))
|
||||||
|
* **integration:** Update the Apify Actor integration ([#1619](https://github.com/docling-project/docling/issues/1619)) ([`14d4f5b`](https://github.com/docling-project/docling/commit/14d4f5b109fa65d777ab147b3ce9b5174d020a5d))
|
||||||
|
|
||||||
|
## [v2.33.0](https://github.com/docling-project/docling/releases/tag/v2.33.0) - 2025-05-20
|
||||||
|
|
||||||
|
### Feature
|
||||||
|
|
||||||
|
* Add textbox content extraction in msword_backend ([#1538](https://github.com/docling-project/docling/issues/1538)) ([`12a0e64`](https://github.com/docling-project/docling/commit/12a0e648929ce75da73617904792a50f5145fe4a))
|
||||||
|
|
||||||
|
### Fix
|
||||||
|
|
||||||
|
* Fix issue with detecting docx files, and files with upper case extensions ([#1609](https://github.com/docling-project/docling/issues/1609)) ([`f4d9d41`](https://github.com/docling-project/docling/commit/f4d9d4111b0a6eb87fc1c05a56618fc430d1e7a2))
|
||||||
|
* Load_from_doctags static usage ([#1617](https://github.com/docling-project/docling/issues/1617)) ([`0e00a26`](https://github.com/docling-project/docling/commit/0e00a263fa0c45f6cf2ae0bd94f9387c28e51ed0))
|
||||||
|
* Incorrect force_backend_text behaviour for VLM DocTag pipelines ([#1371](https://github.com/docling-project/docling/issues/1371)) ([`f2e9c07`](https://github.com/docling-project/docling/commit/f2e9c0784c842612641171754ce51362e298088d))
|
||||||
|
* **pypdfium:** Resolve overlapping text when merging bounding boxes ([#1549](https://github.com/docling-project/docling/issues/1549)) ([`98b5eeb`](https://github.com/docling-project/docling/commit/98b5eeb8440d34ac84f58271c8b8eea88881260a))
|
||||||
|
|
||||||
|
## [v2.32.0](https://github.com/docling-project/docling/releases/tag/v2.32.0) - 2025-05-14
|
||||||
|
|
||||||
|
### Feature
|
||||||
|
|
||||||
|
* Improve parallelization for remote services API calls ([#1548](https://github.com/docling-project/docling/issues/1548)) ([`3a04f2a`](https://github.com/docling-project/docling/commit/3a04f2a367e32913f91faa2325f928b85112e632))
|
||||||
|
* Support image/webp file type ([#1415](https://github.com/docling-project/docling/issues/1415)) ([`12dab0a`](https://github.com/docling-project/docling/commit/12dab0a1e8d181d99e4711ffdbbc33d158234fb4))
|
||||||
|
|
||||||
|
### Fix
|
||||||
|
|
||||||
|
* **ocr:** Orig field in TesseractOcrCliModel as str ([#1553](https://github.com/docling-project/docling/issues/1553)) ([`9f8b479`](https://github.com/docling-project/docling/commit/9f8b479f17bbfaf79c3c897980ad15742ec86568))
|
||||||
|
* **settings:** Fix nested settings load via environment variables ([#1551](https://github.com/docling-project/docling/issues/1551)) ([`2efb7a7`](https://github.com/docling-project/docling/commit/2efb7a7c06a8e51516cc9b93e5dbcdea69f562fa))
|
||||||
|
|
||||||
|
### Documentation
|
||||||
|
|
||||||
|
* Add advanced chunking & serialization example ([#1589](https://github.com/docling-project/docling/issues/1589)) ([`9f28abf`](https://github.com/docling-project/docling/commit/9f28abf0610560645b40352dfdfc3525fa86c28d))
|
||||||
|
|
||||||
|
## [v2.31.2](https://github.com/docling-project/docling/releases/tag/v2.31.2) - 2025-05-13
|
||||||
|
|
||||||
|
### Fix
|
||||||
|
|
||||||
|
* AsciiDoc header identification (#1562) ([#1563](https://github.com/docling-project/docling/issues/1563)) ([`4046d0b`](https://github.com/docling-project/docling/commit/4046d0b2f38254679de5fc78aaf2fe630d6bb61c))
|
||||||
|
* Restrict click version and update lock file ([#1582](https://github.com/docling-project/docling/issues/1582)) ([`8baa85a`](https://github.com/docling-project/docling/commit/8baa85a49d3a456d198c52aac8e0b4ac70c92e72))
|
||||||
|
|
||||||
|
## [v2.31.1](https://github.com/docling-project/docling/releases/tag/v2.31.1) - 2025-05-12
|
||||||
|
|
||||||
|
### Fix
|
||||||
|
|
||||||
|
* Add smoldocling in download utils ([#1577](https://github.com/docling-project/docling/issues/1577)) ([`127e386`](https://github.com/docling-project/docling/commit/127e38646fd7f23fcda0e392e756fe27f123bd78))
|
||||||
|
* **HTML:** Handle row spans in header rows ([#1536](https://github.com/docling-project/docling/issues/1536)) ([`776e7ec`](https://github.com/docling-project/docling/commit/776e7ecf9ac93d62c66b03f33e5c8560e81b6fb3))
|
||||||
|
* Mime error in document streams ([#1523](https://github.com/docling-project/docling/issues/1523)) ([`f1658ed`](https://github.com/docling-project/docling/commit/f1658edbad5c7205bb457322d2c89f7f4d8a4659))
|
||||||
|
* Usage of hashlib for FIPS ([#1512](https://github.com/docling-project/docling/issues/1512)) ([`7c70573`](https://github.com/docling-project/docling/commit/7c705739f9db1cfc6c0a502fd5ba8b2093376d7f))
|
||||||
|
* Guard against attribute errors in TesseractOcrModel __del__ ([#1494](https://github.com/docling-project/docling/issues/1494)) ([`4ab7e9d`](https://github.com/docling-project/docling/commit/4ab7e9ddfb9d8fd0abc483efb70e701447a602c5))
|
||||||
|
* Enable cuda_use_flash_attention2 for PictureDescriptionVlmModel ([#1496](https://github.com/docling-project/docling/issues/1496)) ([`cc45396`](https://github.com/docling-project/docling/commit/cc453961a9196c79f6428305b9007402e448f300))
|
||||||
|
* Updated the time-recorder label for reading order ([#1490](https://github.com/docling-project/docling/issues/1490)) ([`976e92e`](https://github.com/docling-project/docling/commit/976e92e289a414b6b70c3e3ca37a60c85fa12535))
|
||||||
|
* Incorrect scaling of TableModel bboxes when do_cell_matching is False ([#1459](https://github.com/docling-project/docling/issues/1459)) ([`94d66a0`](https://github.com/docling-project/docling/commit/94d66a076559c4e48017bd619508cfeef104079b))
|
||||||
|
|
||||||
|
### Documentation
|
||||||
|
|
||||||
|
* Update links in data_prep_kit ([#1559](https://github.com/docling-project/docling/issues/1559)) ([`844babb`](https://github.com/docling-project/docling/commit/844babb39034b39d9c4edcc3f145684991cda174))
|
||||||
|
* Add serialization docs, update chunking docs ([#1556](https://github.com/docling-project/docling/issues/1556)) ([`3220a59`](https://github.com/docling-project/docling/commit/3220a592e720174940a3b958555f90352d7320d8))
|
||||||
|
* Update supported formats guide ([#1463](https://github.com/docling-project/docling/issues/1463)) ([`3afbe6c`](https://github.com/docling-project/docling/commit/3afbe6c9695d52cf6ed8b48b2f403df7d53342e5))
|
||||||
|
|
||||||
## [v2.31.0](https://github.com/docling-project/docling/releases/tag/v2.31.0) - 2025-04-25
|
## [v2.31.0](https://github.com/docling-project/docling/releases/tag/v2.31.0) - 2025-04-25
|
||||||
|
|
||||||
### Feature
|
### Feature
|
||||||
|
@ -287,7 +287,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
# ========= Section headers
|
# ========= Section headers
|
||||||
def _is_section_header(self, line):
|
def _is_section_header(self, line):
|
||||||
return re.match(r"^==+", line)
|
return re.match(r"^==+\s+", line)
|
||||||
|
|
||||||
def _parse_section_header(self, line):
|
def _parse_section_header(self, line):
|
||||||
match = re.match(r"^(=+)\s+(.*)", line)
|
match = re.match(r"^(=+)\s+(.*)", line)
|
||||||
|
@ -60,7 +60,7 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|||||||
coord_origin=CoordOrigin.BOTTOMLEFT,
|
coord_origin=CoordOrigin.BOTTOMLEFT,
|
||||||
).to_top_left_origin(page_height=page_size.height * scale)
|
).to_top_left_origin(page_height=page_size.height * scale)
|
||||||
|
|
||||||
overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
|
overlap_frac = cell_bbox.intersection_over_self(bbox)
|
||||||
|
|
||||||
if overlap_frac > 0.5:
|
if overlap_frac > 0.5:
|
||||||
if len(text_piece) > 0:
|
if len(text_piece) > 0:
|
||||||
|
@ -71,7 +71,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
|||||||
coord_origin=CoordOrigin.BOTTOMLEFT,
|
coord_origin=CoordOrigin.BOTTOMLEFT,
|
||||||
).to_top_left_origin(page_height=page_size.height * scale)
|
).to_top_left_origin(page_height=page_size.height * scale)
|
||||||
|
|
||||||
overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
|
overlap_frac = cell_bbox.intersection_over_self(bbox)
|
||||||
|
|
||||||
if overlap_frac > 0.5:
|
if overlap_frac > 0.5:
|
||||||
if len(text_piece) > 0:
|
if len(text_piece) > 0:
|
||||||
|
@ -46,7 +46,7 @@ class DoclingParseV4PageBackend(PdfPageBackend):
|
|||||||
.scaled(scale)
|
.scaled(scale)
|
||||||
)
|
)
|
||||||
|
|
||||||
overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
|
overlap_frac = cell_bbox.intersection_over_self(bbox)
|
||||||
|
|
||||||
if overlap_frac > 0.5:
|
if overlap_frac > 0.5:
|
||||||
if len(text_piece) > 0:
|
if len(text_piece) > 0:
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
import logging
|
import logging
|
||||||
|
import traceback
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Final, Optional, Union, cast
|
from typing import Final, Optional, Union, cast
|
||||||
@ -137,7 +138,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.analyze_tag(cast(Tag, element), doc)
|
self.analyze_tag(cast(Tag, element), doc)
|
||||||
except Exception as exc_child:
|
except Exception as exc_child:
|
||||||
_log.error(
|
_log.error(
|
||||||
f"Error processing child from tag {tag.name}: {exc_child!r}"
|
f"Error processing child from tag {tag.name}:\n{traceback.format_exc()}"
|
||||||
)
|
)
|
||||||
raise exc_child
|
raise exc_child
|
||||||
elif isinstance(element, NavigableString) and not isinstance(
|
elif isinstance(element, NavigableString) and not isinstance(
|
||||||
@ -390,46 +391,64 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
_log.debug(f"list-item has no text: {element}")
|
_log.debug(f"list-item has no text: {element}")
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def parse_table_data(element: Tag) -> Optional[TableData]:
|
def parse_table_data(element: Tag) -> Optional[TableData]: # noqa: C901
|
||||||
nested_tables = element.find("table")
|
nested_tables = element.find("table")
|
||||||
if nested_tables is not None:
|
if nested_tables is not None:
|
||||||
_log.debug("Skipping nested table.")
|
_log.debug("Skipping nested table.")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Count the number of rows (number of <tr> elements)
|
# Find the number of rows and columns (taking into account spans)
|
||||||
num_rows = len(element("tr"))
|
num_rows = 0
|
||||||
|
|
||||||
# Find the number of columns (taking into account colspan)
|
|
||||||
num_cols = 0
|
num_cols = 0
|
||||||
for row in element("tr"):
|
for row in element("tr"):
|
||||||
col_count = 0
|
col_count = 0
|
||||||
|
is_row_header = True
|
||||||
if not isinstance(row, Tag):
|
if not isinstance(row, Tag):
|
||||||
continue
|
continue
|
||||||
for cell in row(["td", "th"]):
|
for cell in row(["td", "th"]):
|
||||||
if not isinstance(row, Tag):
|
if not isinstance(row, Tag):
|
||||||
continue
|
continue
|
||||||
val = cast(Tag, cell).get("colspan", "1")
|
cell_tag = cast(Tag, cell)
|
||||||
|
val = cell_tag.get("colspan", "1")
|
||||||
colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1
|
colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1
|
||||||
col_count += colspan
|
col_count += colspan
|
||||||
|
if cell_tag.name == "td" or cell_tag.get("rowspan") is None:
|
||||||
|
is_row_header = False
|
||||||
num_cols = max(num_cols, col_count)
|
num_cols = max(num_cols, col_count)
|
||||||
|
if not is_row_header:
|
||||||
|
num_rows += 1
|
||||||
|
|
||||||
|
_log.debug(f"The table has {num_rows} rows and {num_cols} cols.")
|
||||||
|
|
||||||
grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]
|
grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]
|
||||||
|
|
||||||
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
|
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
|
||||||
|
|
||||||
# Iterate over the rows in the table
|
# Iterate over the rows in the table
|
||||||
for row_idx, row in enumerate(element("tr")):
|
start_row_span = 0
|
||||||
|
row_idx = -1
|
||||||
|
for row in element("tr"):
|
||||||
if not isinstance(row, Tag):
|
if not isinstance(row, Tag):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# For each row, find all the column cells (both <td> and <th>)
|
# For each row, find all the column cells (both <td> and <th>)
|
||||||
cells = row(["td", "th"])
|
cells = row(["td", "th"])
|
||||||
|
|
||||||
# Check if each cell in the row is a header -> means it is a column header
|
# Check if cell is in a column header or row header
|
||||||
col_header = True
|
col_header = True
|
||||||
|
row_header = True
|
||||||
for html_cell in cells:
|
for html_cell in cells:
|
||||||
if isinstance(html_cell, Tag) and html_cell.name == "td":
|
if isinstance(html_cell, Tag):
|
||||||
|
if html_cell.name == "td":
|
||||||
col_header = False
|
col_header = False
|
||||||
|
row_header = False
|
||||||
|
elif html_cell.get("rowspan") is None:
|
||||||
|
row_header = False
|
||||||
|
if not row_header:
|
||||||
|
row_idx += 1
|
||||||
|
start_row_span = 0
|
||||||
|
else:
|
||||||
|
start_row_span += 1
|
||||||
|
|
||||||
# Extract the text content of each cell
|
# Extract the text content of each cell
|
||||||
col_idx = 0
|
col_idx = 0
|
||||||
@ -460,19 +479,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if isinstance(row_val, str) and row_val.isnumeric()
|
if isinstance(row_val, str) and row_val.isnumeric()
|
||||||
else 1
|
else 1
|
||||||
)
|
)
|
||||||
|
if row_header:
|
||||||
while grid[row_idx][col_idx] is not None:
|
row_span -= 1
|
||||||
|
while (
|
||||||
|
col_idx < num_cols
|
||||||
|
and grid[row_idx + start_row_span][col_idx] is not None
|
||||||
|
):
|
||||||
col_idx += 1
|
col_idx += 1
|
||||||
for r in range(row_span):
|
for r in range(start_row_span, start_row_span + row_span):
|
||||||
for c in range(col_span):
|
for c in range(col_span):
|
||||||
|
if row_idx + r < num_rows and col_idx + c < num_cols:
|
||||||
grid[row_idx + r][col_idx + c] = text
|
grid[row_idx + r][col_idx + c] = text
|
||||||
|
|
||||||
table_cell = TableCell(
|
table_cell = TableCell(
|
||||||
text=text,
|
text=text,
|
||||||
row_span=row_span,
|
row_span=row_span,
|
||||||
col_span=col_span,
|
col_span=col_span,
|
||||||
start_row_offset_idx=row_idx,
|
start_row_offset_idx=start_row_span + row_idx,
|
||||||
end_row_offset_idx=row_idx + row_span,
|
end_row_offset_idx=start_row_span + row_idx + row_span,
|
||||||
start_col_offset_idx=col_idx,
|
start_col_offset_idx=col_idx,
|
||||||
end_col_offset_idx=col_idx + col_span,
|
end_col_offset_idx=col_idx + col_span,
|
||||||
column_header=col_header,
|
column_header=col_header,
|
||||||
|
@ -2,7 +2,7 @@ import logging
|
|||||||
import re
|
import re
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Optional, Union
|
from typing import Any, List, Optional, Union
|
||||||
|
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
DocItemLabel,
|
DocItemLabel,
|
||||||
@ -24,7 +24,6 @@ from docx.text.hyperlink import Hyperlink
|
|||||||
from docx.text.paragraph import Paragraph
|
from docx.text.paragraph import Paragraph
|
||||||
from docx.text.run import Run
|
from docx.text.run import Run
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
from lxml.etree import XPath
|
|
||||||
from PIL import Image, UnidentifiedImageError
|
from PIL import Image, UnidentifiedImageError
|
||||||
from pydantic import AnyUrl
|
from pydantic import AnyUrl
|
||||||
from typing_extensions import override
|
from typing_extensions import override
|
||||||
@ -59,6 +58,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.parents: dict[int, Optional[NodeItem]] = {}
|
self.parents: dict[int, Optional[NodeItem]] = {}
|
||||||
self.numbered_headers: dict[int, int] = {}
|
self.numbered_headers: dict[int, int] = {}
|
||||||
self.equation_bookends: str = "<eq>{EQ}</eq>"
|
self.equation_bookends: str = "<eq>{EQ}</eq>"
|
||||||
|
# Track processed textbox elements to avoid duplication
|
||||||
|
self.processed_textbox_elements: List[int] = []
|
||||||
|
# Track content hash of processed paragraphs to avoid duplicate content
|
||||||
|
self.processed_paragraph_content: List[str] = []
|
||||||
|
|
||||||
for i in range(-1, self.max_levels):
|
for i in range(-1, self.max_levels):
|
||||||
self.parents[i] = None
|
self.parents[i] = None
|
||||||
|
|
||||||
@ -175,10 +179,74 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
|
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
|
||||||
"r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
|
"r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
|
||||||
"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
|
"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
|
||||||
|
"wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
|
||||||
|
"mc": "http://schemas.openxmlformats.org/markup-compatibility/2006",
|
||||||
|
"v": "urn:schemas-microsoft-com:vml",
|
||||||
|
"wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape",
|
||||||
|
"w10": "urn:schemas-microsoft-com:office:word",
|
||||||
|
"a14": "http://schemas.microsoft.com/office/drawing/2010/main",
|
||||||
}
|
}
|
||||||
xpath_expr = XPath(".//a:blip", namespaces=namespaces)
|
xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces)
|
||||||
drawing_blip = xpath_expr(element)
|
drawing_blip = xpath_expr(element)
|
||||||
|
|
||||||
|
# Check for textbox content - check multiple textbox formats
|
||||||
|
# Only process if the element hasn't been processed before
|
||||||
|
element_id = id(element)
|
||||||
|
if element_id not in self.processed_textbox_elements:
|
||||||
|
# Modern Word textboxes
|
||||||
|
txbx_xpath = etree.XPath(
|
||||||
|
".//w:txbxContent|.//v:textbox//w:p", namespaces=namespaces
|
||||||
|
)
|
||||||
|
textbox_elements = txbx_xpath(element)
|
||||||
|
|
||||||
|
# No modern textboxes found, check for alternate/legacy textbox formats
|
||||||
|
if not textbox_elements and tag_name in ["drawing", "pict"]:
|
||||||
|
# Additional checks for textboxes in DrawingML and VML formats
|
||||||
|
alt_txbx_xpath = etree.XPath(
|
||||||
|
".//wps:txbx//w:p|.//w10:wrap//w:p|.//a:p//a:t",
|
||||||
|
namespaces=namespaces,
|
||||||
|
)
|
||||||
|
textbox_elements = alt_txbx_xpath(element)
|
||||||
|
|
||||||
|
# Check for shape text that's not in a standard textbox
|
||||||
|
if not textbox_elements:
|
||||||
|
shape_text_xpath = etree.XPath(
|
||||||
|
".//a:bodyPr/ancestor::*//a:t|.//a:txBody//a:t",
|
||||||
|
namespaces=namespaces,
|
||||||
|
)
|
||||||
|
shape_text_elements = shape_text_xpath(element)
|
||||||
|
if shape_text_elements:
|
||||||
|
# Create custom text elements from shape text
|
||||||
|
text_content = " ".join(
|
||||||
|
[t.text for t in shape_text_elements if t.text]
|
||||||
|
)
|
||||||
|
if text_content.strip():
|
||||||
|
_log.debug(f"Found shape text: {text_content[:50]}...")
|
||||||
|
# Create a paragraph-like element to process with standard handler
|
||||||
|
level = self._get_level()
|
||||||
|
shape_group = doc.add_group(
|
||||||
|
label=GroupLabel.SECTION,
|
||||||
|
parent=self.parents[level - 1],
|
||||||
|
name="shape-text",
|
||||||
|
)
|
||||||
|
doc.add_text(
|
||||||
|
label=DocItemLabel.PARAGRAPH,
|
||||||
|
parent=shape_group,
|
||||||
|
text=text_content,
|
||||||
|
)
|
||||||
|
|
||||||
|
if textbox_elements:
|
||||||
|
# Mark the parent element as processed
|
||||||
|
self.processed_textbox_elements.append(element_id)
|
||||||
|
# Also mark all found textbox elements as processed
|
||||||
|
for tb_element in textbox_elements:
|
||||||
|
self.processed_textbox_elements.append(id(tb_element))
|
||||||
|
|
||||||
|
_log.debug(
|
||||||
|
f"Found textbox content with {len(textbox_elements)} elements"
|
||||||
|
)
|
||||||
|
self._handle_textbox_content(textbox_elements, docx_obj, doc)
|
||||||
|
|
||||||
# Check for Tables
|
# Check for Tables
|
||||||
if element.tag.endswith("tbl"):
|
if element.tag.endswith("tbl"):
|
||||||
try:
|
try:
|
||||||
@ -291,15 +359,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _get_format_from_run(cls, run: Run) -> Optional[Formatting]:
|
def _get_format_from_run(cls, run: Run) -> Optional[Formatting]:
|
||||||
has_any_formatting = run.bold or run.italic or run.underline
|
# The .bold and .italic properties are booleans, but .underline can be an enum
|
||||||
return (
|
# like WD_UNDERLINE.THICK (value 6), so we need to convert it to a boolean
|
||||||
Formatting(
|
has_bold = run.bold or False
|
||||||
bold=run.bold or False,
|
has_italic = run.italic or False
|
||||||
italic=run.italic or False,
|
# Convert any non-None underline value to True
|
||||||
underline=run.underline or False,
|
has_underline = bool(run.underline is not None and run.underline)
|
||||||
)
|
|
||||||
if has_any_formatting
|
return Formatting(
|
||||||
else None
|
bold=has_bold,
|
||||||
|
italic=has_italic,
|
||||||
|
underline=has_underline,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _get_paragraph_elements(self, paragraph: Paragraph):
|
def _get_paragraph_elements(self, paragraph: Paragraph):
|
||||||
@ -355,6 +425,182 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
return paragraph_elements
|
return paragraph_elements
|
||||||
|
|
||||||
|
def _get_paragraph_position(self, paragraph_element):
|
||||||
|
"""Extract vertical position information from paragraph element."""
|
||||||
|
# First try to directly get the index from w:p element that has an order-related attribute
|
||||||
|
if (
|
||||||
|
hasattr(paragraph_element, "getparent")
|
||||||
|
and paragraph_element.getparent() is not None
|
||||||
|
):
|
||||||
|
parent = paragraph_element.getparent()
|
||||||
|
# Get all paragraph siblings
|
||||||
|
paragraphs = [
|
||||||
|
p for p in parent.getchildren() if etree.QName(p).localname == "p"
|
||||||
|
]
|
||||||
|
# Find index of current paragraph within its siblings
|
||||||
|
try:
|
||||||
|
paragraph_index = paragraphs.index(paragraph_element)
|
||||||
|
return paragraph_index # Use index as position for consistent ordering
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Look for position hints in element attributes and ancestor elements
|
||||||
|
for elem in (*[paragraph_element], *paragraph_element.iterancestors()):
|
||||||
|
# Check for direct position attributes
|
||||||
|
for attr_name in ["y", "top", "positionY", "y-position", "position"]:
|
||||||
|
value = elem.get(attr_name)
|
||||||
|
if value:
|
||||||
|
try:
|
||||||
|
# Remove any non-numeric characters (like 'pt', 'px', etc.)
|
||||||
|
clean_value = re.sub(r"[^0-9.]", "", value)
|
||||||
|
if clean_value:
|
||||||
|
return float(clean_value)
|
||||||
|
except (ValueError, TypeError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Check for position in transform attribute
|
||||||
|
transform = elem.get("transform")
|
||||||
|
if transform:
|
||||||
|
# Extract translation component from transform matrix
|
||||||
|
match = re.search(r"translate\([^,]+,\s*([0-9.]+)", transform)
|
||||||
|
if match:
|
||||||
|
try:
|
||||||
|
return float(match.group(1))
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Check for anchors or relative position indicators in Word format
|
||||||
|
# 'dist' attributes can indicate relative positioning
|
||||||
|
for attr_name in ["distT", "distB", "anchor", "relativeFrom"]:
|
||||||
|
if elem.get(attr_name) is not None:
|
||||||
|
return elem.sourceline # Use the XML source line number as fallback
|
||||||
|
|
||||||
|
# For VML shapes, look for specific attributes
|
||||||
|
for ns_uri in paragraph_element.nsmap.values():
|
||||||
|
if "vml" in ns_uri:
|
||||||
|
# Try to extract position from style attribute
|
||||||
|
style = paragraph_element.get("style")
|
||||||
|
if style:
|
||||||
|
match = re.search(r"top:([0-9.]+)pt", style)
|
||||||
|
if match:
|
||||||
|
try:
|
||||||
|
return float(match.group(1))
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# If no better position indicator found, use XML source line number as proxy for order
|
||||||
|
return (
|
||||||
|
paragraph_element.sourceline
|
||||||
|
if hasattr(paragraph_element, "sourceline")
|
||||||
|
else None
|
||||||
|
)
|
||||||
|
|
||||||
|
def _collect_textbox_paragraphs(self, textbox_elements):
|
||||||
|
"""Collect and organize paragraphs from textbox elements."""
|
||||||
|
processed_paragraphs = []
|
||||||
|
container_paragraphs = {}
|
||||||
|
|
||||||
|
for element in textbox_elements:
|
||||||
|
element_id = id(element)
|
||||||
|
# Skip if we've already processed this exact element
|
||||||
|
if element_id in processed_paragraphs:
|
||||||
|
continue
|
||||||
|
|
||||||
|
tag_name = etree.QName(element).localname
|
||||||
|
processed_paragraphs.append(element_id)
|
||||||
|
|
||||||
|
# Handle paragraphs directly found (VML textboxes)
|
||||||
|
if tag_name == "p":
|
||||||
|
# Find the containing textbox or shape element
|
||||||
|
container_id = None
|
||||||
|
for ancestor in element.iterancestors():
|
||||||
|
if any(ns in ancestor.tag for ns in ["textbox", "shape", "txbx"]):
|
||||||
|
container_id = id(ancestor)
|
||||||
|
break
|
||||||
|
|
||||||
|
if container_id not in container_paragraphs:
|
||||||
|
container_paragraphs[container_id] = []
|
||||||
|
container_paragraphs[container_id].append(
|
||||||
|
(element, self._get_paragraph_position(element))
|
||||||
|
)
|
||||||
|
|
||||||
|
# Handle txbxContent elements (Word DrawingML textboxes)
|
||||||
|
elif tag_name == "txbxContent":
|
||||||
|
paragraphs = element.findall(".//w:p", namespaces=element.nsmap)
|
||||||
|
container_id = id(element)
|
||||||
|
if container_id not in container_paragraphs:
|
||||||
|
container_paragraphs[container_id] = []
|
||||||
|
|
||||||
|
for p in paragraphs:
|
||||||
|
p_id = id(p)
|
||||||
|
if p_id not in processed_paragraphs:
|
||||||
|
processed_paragraphs.append(p_id)
|
||||||
|
container_paragraphs[container_id].append(
|
||||||
|
(p, self._get_paragraph_position(p))
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Try to extract any paragraphs from unknown elements
|
||||||
|
paragraphs = element.findall(".//w:p", namespaces=element.nsmap)
|
||||||
|
container_id = id(element)
|
||||||
|
if container_id not in container_paragraphs:
|
||||||
|
container_paragraphs[container_id] = []
|
||||||
|
|
||||||
|
for p in paragraphs:
|
||||||
|
p_id = id(p)
|
||||||
|
if p_id not in processed_paragraphs:
|
||||||
|
processed_paragraphs.append(p_id)
|
||||||
|
container_paragraphs[container_id].append(
|
||||||
|
(p, self._get_paragraph_position(p))
|
||||||
|
)
|
||||||
|
|
||||||
|
return container_paragraphs
|
||||||
|
|
||||||
|
def _handle_textbox_content(
    self,
    textbox_elements: list,
    docx_obj: DocxDocument,
    doc: DoclingDocument,
) -> None:
    """Process textbox content and add it to the document structure.

    A "textbox" section group is created under the parent of the current
    level, temporarily installed as the parent for the current level, and
    every paragraph collected from ``textbox_elements`` is handed to
    ``_handle_text_elements`` with ``is_from_textbox=True``.

    Args:
        textbox_elements: XML elements to scan for textbox paragraphs
            (passed unchanged to ``_collect_textbox_paragraphs``).
        docx_obj: The python-docx document the elements belong to.
        doc: The DoclingDocument being built.
    """
    level = self._get_level()

    # Create a textbox group to contain all text from the textbox.
    textbox_group = doc.add_group(
        label=GroupLabel.SECTION, parent=self.parents[level - 1], name="textbox"
    )

    # Make the group the current parent so textbox content is nested under it.
    original_parent = self.parents[level]
    self.parents[level] = textbox_group
    try:
        # Mapping of container id -> list of (paragraph, position) pairs.
        container_paragraphs = self._collect_textbox_paragraphs(textbox_elements)

        all_paragraphs = []
        for paragraphs in container_paragraphs.values():
            # Order paragraphs by position within their container; entries
            # whose position is None are placed last.
            all_paragraphs.extend(
                sorted(
                    paragraphs,
                    key=lambda x: (
                        x[1] is None,
                        x[1] if x[1] is not None else float("inf"),
                    ),
                )
            )

        for p, _ in all_paragraphs:
            self._handle_text_elements(p, docx_obj, doc, is_from_textbox=True)
    finally:
        # Always restore the original parent, even if a paragraph fails to
        # process, so later elements are not nested under the textbox group.
        self.parents[level] = original_parent
|
||||||
|
|
||||||
def _handle_equations_in_text(self, element, text):
|
def _handle_equations_in_text(self, element, text):
|
||||||
only_texts = []
|
only_texts = []
|
||||||
only_equations = []
|
only_equations = []
|
||||||
@ -423,10 +669,21 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
element: BaseOxmlElement,
|
element: BaseOxmlElement,
|
||||||
docx_obj: DocxDocument,
|
docx_obj: DocxDocument,
|
||||||
doc: DoclingDocument,
|
doc: DoclingDocument,
|
||||||
|
is_from_textbox: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
paragraph = Paragraph(element, docx_obj)
|
paragraph = Paragraph(element, docx_obj)
|
||||||
|
|
||||||
|
# Skip if from a textbox and this exact paragraph content was already processed
|
||||||
|
# Skip if from a textbox and this exact paragraph content was already processed
|
||||||
raw_text = paragraph.text
|
raw_text = paragraph.text
|
||||||
|
if is_from_textbox and raw_text:
|
||||||
|
# Create a simple hash of content to detect duplicates
|
||||||
|
content_hash = f"{len(raw_text)}:{raw_text[:50]}"
|
||||||
|
if content_hash in self.processed_paragraph_content:
|
||||||
|
_log.debug(f"Skipping duplicate paragraph content: {content_hash}")
|
||||||
|
return
|
||||||
|
self.processed_paragraph_content.append(content_hash)
|
||||||
|
|
||||||
text, equations = self._handle_equations_in_text(element=element, text=raw_text)
|
text, equations = self._handle_equations_in_text(element=element, text=raw_text)
|
||||||
|
|
||||||
if text is None:
|
if text is None:
|
||||||
|
@ -175,13 +175,18 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|||||||
if len(group) == 1:
|
if len(group) == 1:
|
||||||
return group[0]
|
return group[0]
|
||||||
|
|
||||||
merged_text = "".join(cell.text for cell in group)
|
|
||||||
merged_bbox = BoundingBox(
|
merged_bbox = BoundingBox(
|
||||||
l=min(cell.rect.to_bounding_box().l for cell in group),
|
l=min(cell.rect.to_bounding_box().l for cell in group),
|
||||||
t=min(cell.rect.to_bounding_box().t for cell in group),
|
t=min(cell.rect.to_bounding_box().t for cell in group),
|
||||||
r=max(cell.rect.to_bounding_box().r for cell in group),
|
r=max(cell.rect.to_bounding_box().r for cell in group),
|
||||||
b=max(cell.rect.to_bounding_box().b for cell in group),
|
b=max(cell.rect.to_bounding_box().b for cell in group),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
assert self._ppage is not None
|
||||||
|
self.text_page = self._ppage.get_textpage()
|
||||||
|
bbox = merged_bbox.to_bottom_left_origin(page_size.height)
|
||||||
|
merged_text = self.text_page.get_text_bounded(*bbox.as_tuple())
|
||||||
|
|
||||||
return TextCell(
|
return TextCell(
|
||||||
index=group[0].index,
|
index=group[0].index,
|
||||||
text=merged_text,
|
text=merged_text,
|
||||||
|
@ -12,6 +12,12 @@ from typing import Annotated, Dict, List, Optional, Type
|
|||||||
|
|
||||||
import rich.table
|
import rich.table
|
||||||
import typer
|
import typer
|
||||||
|
from docling_core.transforms.serializer.html import (
|
||||||
|
HTMLDocSerializer,
|
||||||
|
HTMLOutputStyle,
|
||||||
|
HTMLParams,
|
||||||
|
)
|
||||||
|
from docling_core.transforms.visualizer.layout_visualizer import LayoutVisualizer
|
||||||
from docling_core.types.doc import ImageRefMode
|
from docling_core.types.doc import ImageRefMode
|
||||||
from docling_core.utils.file import resolve_source_to_path
|
from docling_core.utils.file import resolve_source_to_path
|
||||||
from pydantic import TypeAdapter
|
from pydantic import TypeAdapter
|
||||||
@ -156,6 +162,7 @@ def export_documents(
|
|||||||
export_json: bool,
|
export_json: bool,
|
||||||
export_html: bool,
|
export_html: bool,
|
||||||
export_html_split_page: bool,
|
export_html_split_page: bool,
|
||||||
|
show_layout: bool,
|
||||||
export_md: bool,
|
export_md: bool,
|
||||||
export_txt: bool,
|
export_txt: bool,
|
||||||
export_doctags: bool,
|
export_doctags: bool,
|
||||||
@ -189,8 +196,26 @@ def export_documents(
|
|||||||
if export_html_split_page:
|
if export_html_split_page:
|
||||||
fname = output_dir / f"{doc_filename}.html"
|
fname = output_dir / f"{doc_filename}.html"
|
||||||
_log.info(f"writing HTML output to {fname}")
|
_log.info(f"writing HTML output to {fname}")
|
||||||
|
if show_layout:
|
||||||
|
ser = HTMLDocSerializer(
|
||||||
|
doc=conv_res.document,
|
||||||
|
params=HTMLParams(
|
||||||
|
image_mode=image_export_mode,
|
||||||
|
output_style=HTMLOutputStyle.SPLIT_PAGE,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
visualizer = LayoutVisualizer()
|
||||||
|
visualizer.params.show_label = False
|
||||||
|
ser_res = ser.serialize(
|
||||||
|
visualizer=visualizer,
|
||||||
|
)
|
||||||
|
with open(fname, "w") as fw:
|
||||||
|
fw.write(ser_res.text)
|
||||||
|
else:
|
||||||
conv_res.document.save_as_html(
|
conv_res.document.save_as_html(
|
||||||
filename=fname, image_mode=image_export_mode, split_page_view=True
|
filename=fname,
|
||||||
|
image_mode=image_export_mode,
|
||||||
|
split_page_view=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Export Text format:
|
# Export Text format:
|
||||||
@ -250,6 +275,13 @@ def convert( # noqa: C901
|
|||||||
to_formats: List[OutputFormat] = typer.Option(
|
to_formats: List[OutputFormat] = typer.Option(
|
||||||
None, "--to", help="Specify output formats. Defaults to Markdown."
|
None, "--to", help="Specify output formats. Defaults to Markdown."
|
||||||
),
|
),
|
||||||
|
show_layout: Annotated[
|
||||||
|
bool,
|
||||||
|
typer.Option(
|
||||||
|
...,
|
||||||
|
help="If enabled, the page images will show the bounding-boxes of the items.",
|
||||||
|
),
|
||||||
|
] = False,
|
||||||
headers: str = typer.Option(
|
headers: str = typer.Option(
|
||||||
None,
|
None,
|
||||||
"--headers",
|
"--headers",
|
||||||
@ -596,6 +628,7 @@ def convert( # noqa: C901
|
|||||||
export_json=export_json,
|
export_json=export_json,
|
||||||
export_html=export_html,
|
export_html=export_html,
|
||||||
export_html_split_page=export_html_split_page,
|
export_html_split_page=export_html_split_page,
|
||||||
|
show_layout=show_layout,
|
||||||
export_md=export_md,
|
export_md=export_md,
|
||||||
export_txt=export_txt,
|
export_txt=export_txt,
|
||||||
export_doctags=export_doctags,
|
export_doctags=export_doctags,
|
||||||
|
@ -32,6 +32,8 @@ class _AvailableModels(str, Enum):
|
|||||||
CODE_FORMULA = "code_formula"
|
CODE_FORMULA = "code_formula"
|
||||||
PICTURE_CLASSIFIER = "picture_classifier"
|
PICTURE_CLASSIFIER = "picture_classifier"
|
||||||
SMOLVLM = "smolvlm"
|
SMOLVLM = "smolvlm"
|
||||||
|
SMOLDOCLING = "smoldocling"
|
||||||
|
SMOLDOCLING_MLX = "smoldocling_mlx"
|
||||||
GRANITE_VISION = "granite_vision"
|
GRANITE_VISION = "granite_vision"
|
||||||
EASYOCR = "easyocr"
|
EASYOCR = "easyocr"
|
||||||
|
|
||||||
@ -105,6 +107,8 @@ def download(
|
|||||||
with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
|
with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
|
||||||
with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
|
with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
|
||||||
with_smolvlm=_AvailableModels.SMOLVLM in to_download,
|
with_smolvlm=_AvailableModels.SMOLVLM in to_download,
|
||||||
|
with_smoldocling=_AvailableModels.SMOLDOCLING in to_download,
|
||||||
|
with_smoldocling_mlx=_AvailableModels.SMOLDOCLING_MLX in to_download,
|
||||||
with_granite_vision=_AvailableModels.GRANITE_VISION in to_download,
|
with_granite_vision=_AvailableModels.GRANITE_VISION in to_download,
|
||||||
with_easyocr=_AvailableModels.EASYOCR in to_download,
|
with_easyocr=_AvailableModels.EASYOCR in to_download,
|
||||||
)
|
)
|
||||||
|
@ -1,6 +1,9 @@
|
|||||||
|
import math
|
||||||
|
from collections import defaultdict
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
from typing import TYPE_CHECKING, Annotated, Dict, List, Literal, Optional, Union
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
BoundingBox,
|
BoundingBox,
|
||||||
DocItemLabel,
|
DocItemLabel,
|
||||||
@ -16,7 +19,7 @@ from docling_core.types.io import (
|
|||||||
DocumentStream,
|
DocumentStream,
|
||||||
)
|
)
|
||||||
from PIL.Image import Image
|
from PIL.Image import Image
|
||||||
from pydantic import BaseModel, ConfigDict
|
from pydantic import BaseModel, ConfigDict, Field, computed_field
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from docling.backend.pdf_backend import PdfPageBackend
|
from docling.backend.pdf_backend import PdfPageBackend
|
||||||
@ -90,6 +93,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
|
|||||||
"image/tiff",
|
"image/tiff",
|
||||||
"image/gif",
|
"image/gif",
|
||||||
"image/bmp",
|
"image/bmp",
|
||||||
|
"image/webp",
|
||||||
],
|
],
|
||||||
InputFormat.PDF: ["application/pdf"],
|
InputFormat.PDF: ["application/pdf"],
|
||||||
InputFormat.ASCIIDOC: ["text/asciidoc"],
|
InputFormat.ASCIIDOC: ["text/asciidoc"],
|
||||||
@ -297,3 +301,97 @@ class OpenAiApiResponse(BaseModel):
|
|||||||
choices: List[OpenAiResponseChoice]
|
choices: List[OpenAiResponseChoice]
|
||||||
created: int
|
created: int
|
||||||
usage: OpenAiResponseUsage
|
usage: OpenAiResponseUsage
|
||||||
|
|
||||||
|
|
||||||
|
# Create a type alias for score values
|
||||||
|
ScoreValue = float
|
||||||
|
|
||||||
|
|
||||||
|
class QualityGrade(str, Enum):
    """Coarse, human-readable quality label.

    Members are also ``str`` instances, so each one compares equal to its
    lowercase value.
    """

    # Graded labels, listed from worst to best.
    POOR = "poor"
    FAIR = "fair"
    GOOD = "good"
    EXCELLENT = "excellent"

    # No grade could be assigned.
    UNSPECIFIED = "unspecified"
|
||||||
|
|
||||||
|
|
||||||
|
class PageConfidenceScores(BaseModel):
    """Confidence scores for a single page, plus aggregates derived from them.

    Each score defaults to NaN, meaning "not set". The aggregates ignore NaN
    entries and are themselves NaN when no score is set.
    """

    parse_score: ScoreValue = np.nan
    layout_score: ScoreValue = np.nan
    table_score: ScoreValue = np.nan
    ocr_score: ScoreValue = np.nan

    def _score_to_grade(self, score: ScoreValue) -> QualityGrade:
        """Map a numeric score onto a QualityGrade.

        Thresholds: < 0.5 POOR, < 0.8 FAIR, < 0.9 GOOD, otherwise EXCELLENT.
        A NaN score yields UNSPECIFIED.
        """
        # NaN fails every ordering comparison, so handle it explicitly
        # instead of relying on falling through all branches.
        if np.isnan(score):
            return QualityGrade.UNSPECIFIED
        if score < 0.5:
            return QualityGrade.POOR
        if score < 0.8:
            return QualityGrade.FAIR
        if score < 0.9:
            return QualityGrade.GOOD
        return QualityGrade.EXCELLENT

    def _known_scores(self) -> list:
        """Return the individual scores that are set (i.e. not NaN)."""
        candidates = (
            self.ocr_score,
            self.table_score,
            self.layout_score,
            self.parse_score,
        )
        return [s for s in candidates if not np.isnan(s)]

    @computed_field  # type: ignore
    @property
    def mean_grade(self) -> QualityGrade:
        """Grade corresponding to ``mean_score``."""
        return self._score_to_grade(self.mean_score)

    @computed_field  # type: ignore
    @property
    def low_grade(self) -> QualityGrade:
        """Grade corresponding to ``low_score``."""
        return self._score_to_grade(self.low_score)

    @computed_field  # type: ignore
    @property
    def mean_score(self) -> ScoreValue:
        """Mean of the scores that are set; NaN when none is set."""
        known = self._known_scores()
        if not known:
            # np.nanmean would return NaN here too, but only after emitting
            # a "Mean of empty slice" RuntimeWarning.
            return ScoreValue(np.nan)
        return ScoreValue(np.mean(known))

    @computed_field  # type: ignore
    @property
    def low_score(self) -> ScoreValue:
        """5% quantile of the scores that are set; NaN when none is set."""
        known = self._known_scores()
        if not known:
            # Avoids the "All-NaN slice encountered" RuntimeWarning that
            # np.nanquantile raises for an all-NaN input.
            return ScoreValue(np.nan)
        return ScoreValue(np.quantile(known, q=0.05))
|
||||||
|
|
||||||
|
|
||||||
|
class ConfidenceReport(PageConfidenceScores):
    """Document-level confidence report aggregating per-page scores.

    ``pages`` maps a page number to that page's scores. Its default is a
    ``defaultdict``, so indexing an unseen page number creates an empty
    ``PageConfidenceScores`` entry on the fly.
    """

    pages: Dict[int, PageConfidenceScores] = Field(
        default_factory=lambda: defaultdict(PageConfidenceScores)
    )

    @computed_field  # type: ignore
    @property
    def mean_score(self) -> ScoreValue:
        """Mean of the per-page ``mean_score`` values, ignoring NaN pages.

        Returns NaN when there are no pages or no page has a usable score.
        """
        known = [
            s for s in (c.mean_score for c in self.pages.values()) if not np.isnan(s)
        ]
        if not known:
            # np.nanmean would give NaN as well, but with a
            # "Mean of empty slice" RuntimeWarning.
            return ScoreValue(np.nan)
        return ScoreValue(np.mean(known))

    @computed_field  # type: ignore
    @property
    def low_score(self) -> ScoreValue:
        """Mean of the per-page ``low_score`` values, ignoring NaN pages.

        Returns NaN when there are no pages or no page has a usable score.
        """
        # NOTE(review): this averages the per-page low scores rather than
        # taking a low quantile across pages -- confirm that is intended.
        known = [
            s for s in (c.low_score for c in self.pages.values()) if not np.isnan(s)
        ]
        if not known:
            return ScoreValue(np.nan)
        return ScoreValue(np.mean(known))
|
||||||
|
@ -47,7 +47,7 @@ from docling_core.types.legacy_doc.document import (
|
|||||||
)
|
)
|
||||||
from docling_core.utils.file import resolve_source_to_stream
|
from docling_core.utils.file import resolve_source_to_stream
|
||||||
from docling_core.utils.legacy import docling_document_to_legacy
|
from docling_core.utils.legacy import docling_document_to_legacy
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel, Field
|
||||||
from typing_extensions import deprecated
|
from typing_extensions import deprecated
|
||||||
|
|
||||||
from docling.backend.abstract_backend import (
|
from docling.backend.abstract_backend import (
|
||||||
@ -56,6 +56,7 @@ from docling.backend.abstract_backend import (
|
|||||||
)
|
)
|
||||||
from docling.datamodel.base_models import (
|
from docling.datamodel.base_models import (
|
||||||
AssembledUnit,
|
AssembledUnit,
|
||||||
|
ConfidenceReport,
|
||||||
ConversionStatus,
|
ConversionStatus,
|
||||||
DocumentStream,
|
DocumentStream,
|
||||||
ErrorItem,
|
ErrorItem,
|
||||||
@ -201,6 +202,7 @@ class ConversionResult(BaseModel):
|
|||||||
pages: List[Page] = []
|
pages: List[Page] = []
|
||||||
assembled: AssembledUnit = AssembledUnit()
|
assembled: AssembledUnit = AssembledUnit()
|
||||||
timings: Dict[str, ProfilingItem] = {}
|
timings: Dict[str, ProfilingItem] = {}
|
||||||
|
confidence: ConfidenceReport = Field(default_factory=ConfidenceReport)
|
||||||
|
|
||||||
document: DoclingDocument = _EMPTY_DOCLING_DOC
|
document: DoclingDocument = _EMPTY_DOCLING_DOC
|
||||||
|
|
||||||
@ -302,7 +304,15 @@ class _DocumentConversionInput(BaseModel):
|
|||||||
if ("." in obj.name and not obj.name.startswith("."))
|
if ("." in obj.name and not obj.name.startswith("."))
|
||||||
else ""
|
else ""
|
||||||
)
|
)
|
||||||
mime = _DocumentConversionInput._mime_from_extension(ext)
|
mime = _DocumentConversionInput._mime_from_extension(ext.lower())
|
||||||
|
if mime is not None and mime.lower() == "application/zip":
|
||||||
|
objname = obj.name.lower()
|
||||||
|
if objname.endswith(".xlsx"):
|
||||||
|
mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||||
|
elif objname.endswith(".docx"):
|
||||||
|
mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||||
|
elif objname.endswith(".pptx"):
|
||||||
|
mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
||||||
|
|
||||||
mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
|
mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
|
||||||
mime = mime or _DocumentConversionInput._detect_csv(content)
|
mime = mime or _DocumentConversionInput._detect_csv(content)
|
||||||
@ -324,9 +334,9 @@ class _DocumentConversionInput(BaseModel):
|
|||||||
) -> Optional[InputFormat]:
|
) -> Optional[InputFormat]:
|
||||||
"""Guess the input format of a document by checking part of its content."""
|
"""Guess the input format of a document by checking part of its content."""
|
||||||
input_format: Optional[InputFormat] = None
|
input_format: Optional[InputFormat] = None
|
||||||
content_str = content.decode("utf-8")
|
|
||||||
|
|
||||||
if mime == "application/xml":
|
if mime == "application/xml":
|
||||||
|
content_str = content.decode("utf-8")
|
||||||
match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
|
match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
|
||||||
if match_doctype:
|
if match_doctype:
|
||||||
xml_doctype = match_doctype.group()
|
xml_doctype = match_doctype.group()
|
||||||
@ -348,6 +358,7 @@ class _DocumentConversionInput(BaseModel):
|
|||||||
input_format = InputFormat.XML_JATS
|
input_format = InputFormat.XML_JATS
|
||||||
|
|
||||||
elif mime == "text/plain":
|
elif mime == "text/plain":
|
||||||
|
content_str = content.decode("utf-8")
|
||||||
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
|
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
|
||||||
input_format = InputFormat.XML_USPTO
|
input_format = InputFormat.XML_USPTO
|
||||||
|
|
||||||
@ -368,6 +379,13 @@ class _DocumentConversionInput(BaseModel):
|
|||||||
mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]
|
mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]
|
||||||
elif ext in FormatToExtensions[InputFormat.PDF]:
|
elif ext in FormatToExtensions[InputFormat.PDF]:
|
||||||
mime = FormatToMimeType[InputFormat.PDF][0]
|
mime = FormatToMimeType[InputFormat.PDF][0]
|
||||||
|
elif ext in FormatToExtensions[InputFormat.DOCX]:
|
||||||
|
mime = FormatToMimeType[InputFormat.DOCX][0]
|
||||||
|
elif ext in FormatToExtensions[InputFormat.PPTX]:
|
||||||
|
mime = FormatToMimeType[InputFormat.PPTX][0]
|
||||||
|
elif ext in FormatToExtensions[InputFormat.XLSX]:
|
||||||
|
mime = FormatToMimeType[InputFormat.XLSX][0]
|
||||||
|
|
||||||
return mime
|
return mime
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@ -225,6 +225,7 @@ class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
|
|||||||
headers: Dict[str, str] = {}
|
headers: Dict[str, str] = {}
|
||||||
params: Dict[str, Any] = {}
|
params: Dict[str, Any] = {}
|
||||||
timeout: float = 20
|
timeout: float = 20
|
||||||
|
concurrency: int = 1
|
||||||
|
|
||||||
prompt: str = "Describe this image in a few sentences."
|
prompt: str = "Describe this image in a few sentences."
|
||||||
provenance: str = ""
|
provenance: str = ""
|
||||||
@ -295,6 +296,7 @@ class ApiVlmOptions(BaseVlmOptions):
|
|||||||
params: Dict[str, Any] = {}
|
params: Dict[str, Any] = {}
|
||||||
scale: float = 2.0
|
scale: float = 2.0
|
||||||
timeout: float = 60
|
timeout: float = 60
|
||||||
|
concurrency: int = 1
|
||||||
response_format: ResponseFormat
|
response_format: ResponseFormat
|
||||||
|
|
||||||
|
|
||||||
|
@ -56,13 +56,15 @@ class DebugSettings(BaseModel):
|
|||||||
|
|
||||||
|
|
||||||
class AppSettings(BaseSettings):
|
class AppSettings(BaseSettings):
|
||||||
model_config = SettingsConfigDict(env_prefix="DOCLING_", env_nested_delimiter="_")
|
model_config = SettingsConfigDict(
|
||||||
|
env_prefix="DOCLING_", env_nested_delimiter="_", env_nested_max_split=1
|
||||||
|
)
|
||||||
|
|
||||||
perf: BatchConcurrencySettings
|
perf: BatchConcurrencySettings = BatchConcurrencySettings()
|
||||||
debug: DebugSettings
|
debug: DebugSettings = DebugSettings()
|
||||||
|
|
||||||
cache_dir: Path = Path.home() / ".cache" / "docling"
|
cache_dir: Path = Path.home() / ".cache" / "docling"
|
||||||
artifacts_path: Optional[Path] = None
|
artifacts_path: Optional[Path] = None
|
||||||
|
|
||||||
|
|
||||||
settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())
|
settings = AppSettings()
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
from collections.abc import Iterable
|
from collections.abc import Iterable
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
|
||||||
from docling.datamodel.base_models import Page, VlmPrediction
|
from docling.datamodel.base_models import Page, VlmPrediction
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
@ -27,6 +28,7 @@ class ApiVlmModel(BasePageModel):
|
|||||||
)
|
)
|
||||||
|
|
||||||
self.timeout = self.vlm_options.timeout
|
self.timeout = self.vlm_options.timeout
|
||||||
|
self.concurrency = self.vlm_options.concurrency
|
||||||
self.prompt_content = (
|
self.prompt_content = (
|
||||||
f"This is a page from a document.\n{self.vlm_options.prompt}"
|
f"This is a page from a document.\n{self.vlm_options.prompt}"
|
||||||
)
|
)
|
||||||
@ -38,10 +40,10 @@ class ApiVlmModel(BasePageModel):
|
|||||||
def __call__(
|
def __call__(
|
||||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||||
) -> Iterable[Page]:
|
) -> Iterable[Page]:
|
||||||
for page in page_batch:
|
def _vlm_request(page):
|
||||||
assert page._backend is not None
|
assert page._backend is not None
|
||||||
if not page._backend.is_valid():
|
if not page._backend.is_valid():
|
||||||
yield page
|
return page
|
||||||
else:
|
else:
|
||||||
with TimeRecorder(conv_res, "vlm"):
|
with TimeRecorder(conv_res, "vlm"):
|
||||||
assert page.size is not None
|
assert page.size is not None
|
||||||
@ -63,4 +65,7 @@ class ApiVlmModel(BasePageModel):
|
|||||||
|
|
||||||
page.predictions.vlm_response = VlmPrediction(text=page_tags)
|
page.predictions.vlm_response = VlmPrediction(text=page_tags)
|
||||||
|
|
||||||
yield page
|
return page
|
||||||
|
|
||||||
|
with ThreadPoolExecutor(max_workers=self.concurrency) as executor:
|
||||||
|
yield from executor.map(_vlm_request, page_batch)
|
||||||
|
@ -5,6 +5,7 @@ from collections.abc import Iterable
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
from docling_core.types.doc import DocItemLabel
|
from docling_core.types.doc import DocItemLabel
|
||||||
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
@ -186,6 +187,24 @@ class LayoutModel(BasePageModel):
|
|||||||
).postprocess()
|
).postprocess()
|
||||||
# processed_clusters, processed_cells = clusters, page.cells
|
# processed_clusters, processed_cells = clusters, page.cells
|
||||||
|
|
||||||
|
with warnings.catch_warnings():
|
||||||
|
warnings.filterwarnings(
|
||||||
|
"ignore",
|
||||||
|
"Mean of empty slice|invalid value encountered in scalar divide",
|
||||||
|
RuntimeWarning,
|
||||||
|
"numpy",
|
||||||
|
)
|
||||||
|
|
||||||
|
conv_res.confidence.pages[page.page_no].layout_score = float(
|
||||||
|
np.mean([c.confidence for c in processed_clusters])
|
||||||
|
)
|
||||||
|
|
||||||
|
conv_res.confidence.pages[page.page_no].ocr_score = float(
|
||||||
|
np.mean(
|
||||||
|
[c.confidence for c in processed_cells if c.from_ocr]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
page.cells = processed_cells
|
page.cells = processed_cells
|
||||||
page.predictions.layout = LayoutPrediction(
|
page.predictions.layout = LayoutPrediction(
|
||||||
clusters=processed_clusters
|
clusters=processed_clusters
|
||||||
|
@ -3,6 +3,7 @@ import re
|
|||||||
from collections.abc import Iterable
|
from collections.abc import Iterable
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from docling.datamodel.base_models import (
|
from docling.datamodel.base_models import (
|
||||||
|
@ -1,7 +1,10 @@
|
|||||||
|
import re
|
||||||
|
import warnings
|
||||||
from collections.abc import Iterable
|
from collections.abc import Iterable
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
from PIL import ImageDraw
|
from PIL import ImageDraw
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
@ -21,6 +24,14 @@ class PagePreprocessingModel(BasePageModel):
|
|||||||
def __init__(self, options: PagePreprocessingOptions):
|
def __init__(self, options: PagePreprocessingOptions):
|
||||||
self.options = options
|
self.options = options
|
||||||
|
|
||||||
|
# Pre-compiled regex patterns for efficiency
|
||||||
|
self.GLYPH_RE = re.compile(r"GLYPH<[0-9A-Fa-f]+>")
|
||||||
|
self.SLASH_G_RE = re.compile(r"(?:/G\d+){2,}")
|
||||||
|
self.FRAG_RE = re.compile(r"\b[A-Za-z](?:/[a-z]{1,3}\.[a-z]{1,3}){2,}\b")
|
||||||
|
self.SLASH_NUMBER_GARBAGE_RE = re.compile(
|
||||||
|
r"(?:/\w+\s*){2,}"
|
||||||
|
) # Two or more "/token " sequences
|
||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||||
) -> Iterable[Page]:
|
) -> Iterable[Page]:
|
||||||
@ -60,6 +71,22 @@ class PagePreprocessingModel(BasePageModel):
|
|||||||
if self.options.create_parsed_page:
|
if self.options.create_parsed_page:
|
||||||
page.parsed_page = page._backend.get_segmented_page()
|
page.parsed_page = page._backend.get_segmented_page()
|
||||||
|
|
||||||
|
# Rate the text quality from the PDF parser, and aggregate on page
|
||||||
|
text_scores = []
|
||||||
|
for c in page.cells:
|
||||||
|
score = self.rate_text_quality(c.text)
|
||||||
|
text_scores.append(score)
|
||||||
|
|
||||||
|
with warnings.catch_warnings():
|
||||||
|
warnings.filterwarnings(
|
||||||
|
"ignore", "Mean of empty slice", RuntimeWarning, "numpy"
|
||||||
|
)
|
||||||
|
conv_res.confidence.pages[page.page_no].parse_score = float(
|
||||||
|
np.nanquantile(
|
||||||
|
text_scores, q=0.10
|
||||||
|
) # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells.
|
||||||
|
)
|
||||||
|
|
||||||
# DEBUG code:
|
# DEBUG code:
|
||||||
def draw_text_boxes(image, cells, show: bool = False):
|
def draw_text_boxes(image, cells, show: bool = False):
|
||||||
draw = ImageDraw.Draw(image)
|
draw = ImageDraw.Draw(image)
|
||||||
@ -88,3 +115,30 @@ class PagePreprocessingModel(BasePageModel):
|
|||||||
draw_text_boxes(page.get_image(scale=1.0), page.cells)
|
draw_text_boxes(page.get_image(scale=1.0), page.cells)
|
||||||
|
|
||||||
return page
|
return page
|
||||||
|
|
||||||
|
def rate_text_quality(self, text: str) -> float:
    """Rate the quality of a parsed text snippet on a 0.0-1.0 scale.

    Args:
        text: Text of a single cell as produced by the PDF parser.

    Returns:
        0.0 if a hard-error pattern is present; otherwise 1.0 minus a
        penalty for fragmented-word patterns, never below 0.0.
    """
    # Hard errors: if any of these patterns are found, return 0.0 immediately.
    # U+FFFD is the Unicode replacement character left behind by failed
    # decoding. It is written as an escape so the check does not depend on
    # how this source file itself is encoded or rendered.
    blacklist_chars = ["\ufffd"]
    if (
        any(c in text for c in blacklist_chars)
        or self.GLYPH_RE.search(text)
        or self.SLASH_G_RE.search(text)
        # .match() anchors at the start: only text that begins with the
        # slash-token garbage pattern is rejected here.
        or self.SLASH_NUMBER_GARBAGE_RE.match(text)
    ):
        return 0.0

    penalty = 0.0

    # Apply a penalty only if the fragmented words pattern occurs at least
    # three times.
    frag_matches = self.FRAG_RE.findall(text)
    if len(frag_matches) >= 3:
        penalty += 0.1 * len(frag_matches)

    # Additional heuristic (currently disabled): if the average token length
    # is below 2, add a penalty.
    # tokens = text.split()
    # if tokens and (sum(map(len, tokens)) / len(tokens)) < 2:
    #     penalty += 0.2

    return max(1.0 - penalty, 0.0)
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
from collections.abc import Iterable
|
from collections.abc import Iterable
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional, Type, Union
|
from typing import Optional, Type, Union
|
||||||
|
|
||||||
@ -37,6 +38,7 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
|
|||||||
accelerator_options=accelerator_options,
|
accelerator_options=accelerator_options,
|
||||||
)
|
)
|
||||||
self.options: PictureDescriptionApiOptions
|
self.options: PictureDescriptionApiOptions
|
||||||
|
self.concurrency = self.options.concurrency
|
||||||
|
|
||||||
if self.enabled:
|
if self.enabled:
|
||||||
if not enable_remote_services:
|
if not enable_remote_services:
|
||||||
@ -48,8 +50,8 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
|
|||||||
def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
|
def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
|
||||||
# Note: technically we could make a batch request here,
|
# Note: technically we could make a batch request here,
|
||||||
# but not all APIs will allow for it. For example, vllm won't allow more than 1.
|
# but not all APIs will allow for it. For example, vllm won't allow more than 1.
|
||||||
for image in images:
|
def _api_request(image):
|
||||||
yield api_image_request(
|
return api_image_request(
|
||||||
image=image,
|
image=image,
|
||||||
prompt=self.options.prompt,
|
prompt=self.options.prompt,
|
||||||
url=self.options.url,
|
url=self.options.url,
|
||||||
@ -57,3 +59,6 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
|
|||||||
headers=self.options.headers,
|
headers=self.options.headers,
|
||||||
**self.options.params,
|
**self.options.params,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
with ThreadPoolExecutor(max_workers=self.concurrency) as executor:
|
||||||
|
yield from executor.map(_api_request, images)
|
||||||
|
@ -2,6 +2,7 @@ import csv
|
|||||||
import io
|
import io
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import subprocess
|
||||||
import tempfile
|
import tempfile
|
||||||
from collections.abc import Iterable
|
from collections.abc import Iterable
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@ -10,7 +11,7 @@ from typing import List, Optional, Tuple, Type
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
from docling_core.types.doc.page import TextCell
|
||||||
|
|
||||||
from docling.datamodel.base_models import Page
|
from docling.datamodel.base_models import Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
@ -21,7 +22,11 @@ from docling.datamodel.pipeline_options import (
|
|||||||
)
|
)
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
from docling.models.base_ocr_model import BaseOcrModel
|
||||||
from docling.utils.ocr_utils import map_tesseract_script
|
from docling.utils.ocr_utils import (
|
||||||
|
map_tesseract_script,
|
||||||
|
parse_tesseract_orientation,
|
||||||
|
tesseract_box_to_bounding_rectangle,
|
||||||
|
)
|
||||||
from docling.utils.profiling import TimeRecorder
|
from docling.utils.profiling import TimeRecorder
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
@ -49,6 +54,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
self._version: Optional[str] = None
|
self._version: Optional[str] = None
|
||||||
self._tesseract_languages: Optional[List[str]] = None
|
self._tesseract_languages: Optional[List[str]] = None
|
||||||
self._script_prefix: Optional[str] = None
|
self._script_prefix: Optional[str] = None
|
||||||
|
self._is_auto: bool = "auto" in self.options.lang
|
||||||
|
|
||||||
if self.enabled:
|
if self.enabled:
|
||||||
try:
|
try:
|
||||||
@ -93,14 +99,13 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
|
|
||||||
return name, version
|
return name, version
|
||||||
|
|
||||||
def _run_tesseract(self, ifilename: str):
|
def _run_tesseract(self, ifilename: str, osd: pd.DataFrame):
|
||||||
r"""
|
r"""
|
||||||
Run tesseract CLI
|
Run tesseract CLI
|
||||||
"""
|
"""
|
||||||
cmd = [self.options.tesseract_cmd]
|
cmd = [self.options.tesseract_cmd]
|
||||||
|
if self._is_auto:
|
||||||
if "auto" in self.options.lang:
|
lang = self._parse_language(osd)
|
||||||
lang = self._detect_language(ifilename)
|
|
||||||
if lang is not None:
|
if lang is not None:
|
||||||
cmd.append("-l")
|
cmd.append("-l")
|
||||||
cmd.append(lang)
|
cmd.append(lang)
|
||||||
@ -115,13 +120,12 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
cmd += [ifilename, "stdout", "tsv"]
|
cmd += [ifilename, "stdout", "tsv"]
|
||||||
_log.info("command: {}".format(" ".join(cmd)))
|
_log.info("command: {}".format(" ".join(cmd)))
|
||||||
|
|
||||||
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
|
output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True)
|
||||||
output, _ = proc.communicate()
|
|
||||||
|
|
||||||
# _log.info(output)
|
# _log.info(output)
|
||||||
|
|
||||||
# Decode the byte string to a regular string
|
# Decode the byte string to a regular string
|
||||||
decoded_data = output.decode("utf-8")
|
decoded_data = output.stdout.decode("utf-8")
|
||||||
# _log.info(decoded_data)
|
# _log.info(decoded_data)
|
||||||
|
|
||||||
# Read the TSV file generated by Tesseract
|
# Read the TSV file generated by Tesseract
|
||||||
@ -139,22 +143,24 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
|
|
||||||
return df_filtered
|
return df_filtered
|
||||||
|
|
||||||
def _detect_language(self, ifilename: str):
|
def _perform_osd(self, ifilename: str) -> pd.DataFrame:
|
||||||
r"""
|
r"""
|
||||||
Run tesseract in PSM 0 mode to detect the language
|
Run tesseract in PSM 0 mode to detect the language
|
||||||
"""
|
"""
|
||||||
assert self._tesseract_languages is not None
|
|
||||||
|
|
||||||
cmd = [self.options.tesseract_cmd]
|
cmd = [self.options.tesseract_cmd]
|
||||||
cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
|
cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
|
||||||
_log.info("command: {}".format(" ".join(cmd)))
|
_log.info("command: {}".format(" ".join(cmd)))
|
||||||
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
|
output = subprocess.run(cmd, capture_output=True, check=True)
|
||||||
output, _ = proc.communicate()
|
decoded_data = output.stdout.decode("utf-8")
|
||||||
decoded_data = output.decode("utf-8")
|
|
||||||
df_detected = pd.read_csv(
|
df_detected = pd.read_csv(
|
||||||
io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
|
io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
|
||||||
)
|
)
|
||||||
scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
|
return df_detected
|
||||||
|
|
||||||
|
def _parse_language(self, df_osd: pd.DataFrame) -> Optional[str]:
|
||||||
|
assert self._tesseract_languages is not None
|
||||||
|
scripts = df_osd.loc[df_osd["key"] == "Script"].value.tolist()
|
||||||
if len(scripts) == 0:
|
if len(scripts) == 0:
|
||||||
_log.warning("Tesseract cannot detect the script of the page")
|
_log.warning("Tesseract cannot detect the script of the page")
|
||||||
return None
|
return None
|
||||||
@ -182,9 +188,8 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
cmd = [self.options.tesseract_cmd]
|
cmd = [self.options.tesseract_cmd]
|
||||||
cmd.append("--list-langs")
|
cmd.append("--list-langs")
|
||||||
_log.info("command: {}".format(" ".join(cmd)))
|
_log.info("command: {}".format(" ".join(cmd)))
|
||||||
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
|
output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True)
|
||||||
output, _ = proc.communicate()
|
decoded_data = output.stdout.decode("utf-8")
|
||||||
decoded_data = output.decode("utf-8")
|
|
||||||
df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
|
df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
|
||||||
self._tesseract_languages = df_list[0].tolist()[1:]
|
self._tesseract_languages = df_list[0].tolist()[1:]
|
||||||
|
|
||||||
@ -203,7 +208,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
yield from page_batch
|
yield from page_batch
|
||||||
return
|
return
|
||||||
|
|
||||||
for page in page_batch:
|
for page_i, page in enumerate(page_batch):
|
||||||
assert page._backend is not None
|
assert page._backend is not None
|
||||||
if not page._backend.is_valid():
|
if not page._backend.is_valid():
|
||||||
yield page
|
yield page
|
||||||
@ -212,7 +217,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
ocr_rects = self.get_ocr_rects(page)
|
ocr_rects = self.get_ocr_rects(page)
|
||||||
|
|
||||||
all_ocr_cells = []
|
all_ocr_cells = []
|
||||||
for ocr_rect in ocr_rects:
|
for ocr_rect_i, ocr_rect in enumerate(ocr_rects):
|
||||||
# Skip zero area boxes
|
# Skip zero area boxes
|
||||||
if ocr_rect.area() == 0:
|
if ocr_rect.area() == 0:
|
||||||
continue
|
continue
|
||||||
@ -225,8 +230,42 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
) as image_file:
|
) as image_file:
|
||||||
fname = image_file.name
|
fname = image_file.name
|
||||||
high_res_image.save(image_file)
|
high_res_image.save(image_file)
|
||||||
|
doc_orientation = 0
|
||||||
df_result = self._run_tesseract(fname)
|
try:
|
||||||
|
df_osd = self._perform_osd(fname)
|
||||||
|
doc_orientation = _parse_orientation(df_osd)
|
||||||
|
except subprocess.CalledProcessError as exc:
|
||||||
|
_log.error(
|
||||||
|
"OSD failed (doc %s, page: %s, "
|
||||||
|
"OCR rectangle: %s, processed image file %s):\n %s",
|
||||||
|
conv_res.input.file,
|
||||||
|
page_i,
|
||||||
|
ocr_rect_i,
|
||||||
|
image_file,
|
||||||
|
exc.stderr,
|
||||||
|
)
|
||||||
|
# Skipping if OSD fail when in auto mode, otherwise proceed
|
||||||
|
# to OCR in the hope OCR will succeed while OSD failed
|
||||||
|
if self._is_auto:
|
||||||
|
continue
|
||||||
|
if doc_orientation != 0:
|
||||||
|
high_res_image = high_res_image.rotate(
|
||||||
|
-doc_orientation, expand=True
|
||||||
|
)
|
||||||
|
high_res_image.save(fname)
|
||||||
|
try:
|
||||||
|
df_result = self._run_tesseract(fname, df_osd)
|
||||||
|
except subprocess.CalledProcessError as exc:
|
||||||
|
_log.error(
|
||||||
|
"tesseract OCR failed (doc %s, page: %s, "
|
||||||
|
"OCR rectangle: %s, processed image file %s):\n %s",
|
||||||
|
conv_res.input.file,
|
||||||
|
page_i,
|
||||||
|
ocr_rect_i,
|
||||||
|
image_file,
|
||||||
|
exc.stderr,
|
||||||
|
)
|
||||||
|
continue
|
||||||
finally:
|
finally:
|
||||||
if os.path.exists(fname):
|
if os.path.exists(fname):
|
||||||
os.remove(fname)
|
os.remove(fname)
|
||||||
@ -238,31 +277,30 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
text = row["text"]
|
text = row["text"]
|
||||||
conf = row["conf"]
|
conf = row["conf"]
|
||||||
|
|
||||||
l = float(row["left"]) # noqa: E741
|
left, top = float(row["left"]), float(row["top"])
|
||||||
b = float(row["top"])
|
right = left + float(row["width"])
|
||||||
w = float(row["width"])
|
bottom = top + row["height"]
|
||||||
h = float(row["height"])
|
bbox = BoundingBox(
|
||||||
|
l=left,
|
||||||
t = b + h
|
t=top,
|
||||||
r = l + w
|
r=right,
|
||||||
|
b=bottom,
|
||||||
|
coord_origin=CoordOrigin.TOPLEFT,
|
||||||
|
)
|
||||||
|
rect = tesseract_box_to_bounding_rectangle(
|
||||||
|
bbox,
|
||||||
|
original_offset=ocr_rect,
|
||||||
|
scale=self.scale,
|
||||||
|
orientation=doc_orientation,
|
||||||
|
im_size=high_res_image.size,
|
||||||
|
)
|
||||||
cell = TextCell(
|
cell = TextCell(
|
||||||
index=ix,
|
index=ix,
|
||||||
text=str(text),
|
text=str(text),
|
||||||
orig=text,
|
orig=str(text),
|
||||||
from_ocr=True,
|
from_ocr=True,
|
||||||
confidence=conf / 100.0,
|
confidence=conf / 100.0,
|
||||||
rect=BoundingRectangle.from_bounding_box(
|
rect=rect,
|
||||||
BoundingBox.from_tuple(
|
|
||||||
coord=(
|
|
||||||
(l / self.scale) + ocr_rect.l,
|
|
||||||
(b / self.scale) + ocr_rect.t,
|
|
||||||
(r / self.scale) + ocr_rect.l,
|
|
||||||
(t / self.scale) + ocr_rect.t,
|
|
||||||
),
|
|
||||||
origin=CoordOrigin.TOPLEFT,
|
|
||||||
)
|
|
||||||
),
|
|
||||||
)
|
)
|
||||||
all_ocr_cells.append(cell)
|
all_ocr_cells.append(cell)
|
||||||
|
|
||||||
@ -278,3 +316,9 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
@classmethod
|
@classmethod
|
||||||
def get_options_type(cls) -> Type[OcrOptions]:
|
def get_options_type(cls) -> Type[OcrOptions]:
|
||||||
return TesseractCliOcrOptions
|
return TesseractCliOcrOptions
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_orientation(df_osd: pd.DataFrame) -> int:
|
||||||
|
orientations = df_osd.loc[df_osd["key"] == "Orientation in degrees"].value.tolist()
|
||||||
|
orientation = parse_tesseract_orientation(orientations[0].strip())
|
||||||
|
return orientation
|
||||||
|
@ -1,12 +1,11 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
from collections.abc import Iterable
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional, Type
|
from typing import Iterable, Optional, Type
|
||||||
|
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
from docling_core.types.doc.page import TextCell
|
||||||
|
|
||||||
from docling.datamodel.base_models import Page
|
from docling.datamodel.base_models import Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
@ -17,7 +16,11 @@ from docling.datamodel.pipeline_options import (
|
|||||||
)
|
)
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
from docling.models.base_ocr_model import BaseOcrModel
|
||||||
from docling.utils.ocr_utils import map_tesseract_script
|
from docling.utils.ocr_utils import (
|
||||||
|
map_tesseract_script,
|
||||||
|
parse_tesseract_orientation,
|
||||||
|
tesseract_box_to_bounding_rectangle,
|
||||||
|
)
|
||||||
from docling.utils.profiling import TimeRecorder
|
from docling.utils.profiling import TimeRecorder
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
@ -38,7 +41,7 @@ class TesseractOcrModel(BaseOcrModel):
|
|||||||
accelerator_options=accelerator_options,
|
accelerator_options=accelerator_options,
|
||||||
)
|
)
|
||||||
self.options: TesseractOcrOptions
|
self.options: TesseractOcrOptions
|
||||||
|
self._is_auto: bool = "auto" in self.options.lang
|
||||||
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
||||||
self.reader = None
|
self.reader = None
|
||||||
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
|
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
|
||||||
@ -95,13 +98,13 @@ class TesseractOcrModel(BaseOcrModel):
|
|||||||
|
|
||||||
if lang == "auto":
|
if lang == "auto":
|
||||||
self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs)
|
self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs)
|
||||||
self.osd_reader = tesserocr.PyTessBaseAPI(
|
|
||||||
**{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
self.reader = tesserocr.PyTessBaseAPI(
|
self.reader = tesserocr.PyTessBaseAPI(
|
||||||
**{"lang": lang} | tesserocr_kwargs,
|
**{"lang": lang} | tesserocr_kwargs,
|
||||||
)
|
)
|
||||||
|
self.osd_reader = tesserocr.PyTessBaseAPI(
|
||||||
|
**{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
|
||||||
|
)
|
||||||
self.reader_RIL = tesserocr.RIL
|
self.reader_RIL = tesserocr.RIL
|
||||||
|
|
||||||
def __del__(self):
|
def __del__(self):
|
||||||
@ -118,19 +121,20 @@ class TesseractOcrModel(BaseOcrModel):
|
|||||||
yield from page_batch
|
yield from page_batch
|
||||||
return
|
return
|
||||||
|
|
||||||
for page in page_batch:
|
for page_i, page in enumerate(page_batch):
|
||||||
assert page._backend is not None
|
assert page._backend is not None
|
||||||
if not page._backend.is_valid():
|
if not page._backend.is_valid():
|
||||||
yield page
|
yield page
|
||||||
else:
|
else:
|
||||||
with TimeRecorder(conv_res, "ocr"):
|
with TimeRecorder(conv_res, "ocr"):
|
||||||
assert self.reader is not None
|
assert self.reader is not None
|
||||||
|
assert self.osd_reader is not None
|
||||||
assert self._tesserocr_languages is not None
|
assert self._tesserocr_languages is not None
|
||||||
|
|
||||||
ocr_rects = self.get_ocr_rects(page)
|
ocr_rects = self.get_ocr_rects(page)
|
||||||
|
|
||||||
all_ocr_cells = []
|
all_ocr_cells = []
|
||||||
for ocr_rect in ocr_rects:
|
for ocr_rect_i, ocr_rect in enumerate(ocr_rects):
|
||||||
# Skip zero area boxes
|
# Skip zero area boxes
|
||||||
if ocr_rect.area() == 0:
|
if ocr_rect.area() == 0:
|
||||||
continue
|
continue
|
||||||
@ -139,16 +143,27 @@ class TesseractOcrModel(BaseOcrModel):
|
|||||||
)
|
)
|
||||||
|
|
||||||
local_reader = self.reader
|
local_reader = self.reader
|
||||||
if "auto" in self.options.lang:
|
|
||||||
assert self.osd_reader is not None
|
|
||||||
|
|
||||||
self.osd_reader.SetImage(high_res_image)
|
self.osd_reader.SetImage(high_res_image)
|
||||||
osd = self.osd_reader.DetectOrientationScript()
|
osd = self.osd_reader.DetectOrientationScript()
|
||||||
|
# No text, or Orientation and Script detection failure
|
||||||
# No text, probably
|
|
||||||
if osd is None:
|
if osd is None:
|
||||||
|
_log.error(
|
||||||
|
"OSD failed for doc (doc %s, page: %s, "
|
||||||
|
"OCR rectangle: %s)",
|
||||||
|
conv_res.input.file,
|
||||||
|
page_i,
|
||||||
|
ocr_rect_i,
|
||||||
|
)
|
||||||
|
# Skipping if OSD fail when in auto mode, otherwise proceed
|
||||||
|
# to OCR in the hope OCR will succeed while OSD failed
|
||||||
|
if self._is_auto:
|
||||||
continue
|
continue
|
||||||
|
doc_orientation = parse_tesseract_orientation(osd["orient_deg"])
|
||||||
|
if doc_orientation != 0:
|
||||||
|
high_res_image = high_res_image.rotate(
|
||||||
|
-doc_orientation, expand=True
|
||||||
|
)
|
||||||
|
if self._is_auto:
|
||||||
script = osd["script_name"]
|
script = osd["script_name"]
|
||||||
script = map_tesseract_script(script)
|
script = map_tesseract_script(script)
|
||||||
lang = f"{self.script_prefix}{script}"
|
lang = f"{self.script_prefix}{script}"
|
||||||
@ -188,11 +203,23 @@ class TesseractOcrModel(BaseOcrModel):
|
|||||||
# Extract text within the bounding box
|
# Extract text within the bounding box
|
||||||
text = local_reader.GetUTF8Text().strip()
|
text = local_reader.GetUTF8Text().strip()
|
||||||
confidence = local_reader.MeanTextConf()
|
confidence = local_reader.MeanTextConf()
|
||||||
left = box["x"] / self.scale
|
left, top = box["x"], box["y"]
|
||||||
bottom = box["y"] / self.scale
|
right = left + box["w"]
|
||||||
right = (box["x"] + box["w"]) / self.scale
|
bottom = top + box["h"]
|
||||||
top = (box["y"] + box["h"]) / self.scale
|
bbox = BoundingBox(
|
||||||
|
l=left,
|
||||||
|
t=top,
|
||||||
|
r=right,
|
||||||
|
b=bottom,
|
||||||
|
coord_origin=CoordOrigin.TOPLEFT,
|
||||||
|
)
|
||||||
|
rect = tesseract_box_to_bounding_rectangle(
|
||||||
|
bbox,
|
||||||
|
original_offset=ocr_rect,
|
||||||
|
scale=self.scale,
|
||||||
|
orientation=doc_orientation,
|
||||||
|
im_size=high_res_image.size,
|
||||||
|
)
|
||||||
cells.append(
|
cells.append(
|
||||||
TextCell(
|
TextCell(
|
||||||
index=ix,
|
index=ix,
|
||||||
@ -200,12 +227,7 @@ class TesseractOcrModel(BaseOcrModel):
|
|||||||
orig=text,
|
orig=text,
|
||||||
from_ocr=True,
|
from_ocr=True,
|
||||||
confidence=confidence,
|
confidence=confidence,
|
||||||
rect=BoundingRectangle.from_bounding_box(
|
rect=rect,
|
||||||
BoundingBox.from_tuple(
|
|
||||||
coord=(left, top, right, bottom),
|
|
||||||
origin=CoordOrigin.TOPLEFT,
|
|
||||||
),
|
|
||||||
),
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -3,6 +3,7 @@ import warnings
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional, cast
|
from typing import Optional, cast
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
|
from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
|
||||||
|
|
||||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||||
@ -54,13 +55,15 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
"When defined, it must point to a folder containing all models required by the pipeline."
|
"When defined, it must point to a folder containing all models required by the pipeline."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
with warnings.catch_warnings(): # deprecated generate_table_images
|
||||||
|
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
||||||
self.keep_images = (
|
self.keep_images = (
|
||||||
self.pipeline_options.generate_page_images
|
self.pipeline_options.generate_page_images
|
||||||
or self.pipeline_options.generate_picture_images
|
or self.pipeline_options.generate_picture_images
|
||||||
or self.pipeline_options.generate_table_images
|
or self.pipeline_options.generate_table_images
|
||||||
)
|
)
|
||||||
|
|
||||||
self.glm_model = ReadingOrderModel(options=ReadingOrderOptions())
|
self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
|
||||||
|
|
||||||
ocr_model = self.get_ocr_model(artifacts_path=artifacts_path)
|
ocr_model = self.get_ocr_model(artifacts_path=artifacts_path)
|
||||||
|
|
||||||
@ -197,7 +200,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
elements=all_elements, headers=all_headers, body=all_body
|
elements=all_elements, headers=all_headers, body=all_body
|
||||||
)
|
)
|
||||||
|
|
||||||
conv_res.document = self.glm_model(conv_res)
|
conv_res.document = self.reading_order_model(conv_res)
|
||||||
|
|
||||||
# Generate page images in the output
|
# Generate page images in the output
|
||||||
if self.pipeline_options.generate_page_images:
|
if self.pipeline_options.generate_page_images:
|
||||||
@ -209,6 +212,8 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Generate images of the requested element types
|
# Generate images of the requested element types
|
||||||
|
with warnings.catch_warnings(): # deprecated generate_table_images
|
||||||
|
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
||||||
if (
|
if (
|
||||||
self.pipeline_options.generate_picture_images
|
self.pipeline_options.generate_picture_images
|
||||||
or self.pipeline_options.generate_table_images
|
or self.pipeline_options.generate_table_images
|
||||||
@ -236,7 +241,9 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
crop_bbox = (
|
crop_bbox = (
|
||||||
element.prov[0]
|
element.prov[0]
|
||||||
.bbox.scaled(scale=scale)
|
.bbox.scaled(scale=scale)
|
||||||
.to_top_left_origin(page_height=page.size.height * scale)
|
.to_top_left_origin(
|
||||||
|
page_height=page.size.height * scale
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
cropped_im = page.image.crop(crop_bbox.as_tuple())
|
cropped_im = page.image.crop(crop_bbox.as_tuple())
|
||||||
@ -244,6 +251,36 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
cropped_im, dpi=int(72 * scale)
|
cropped_im, dpi=int(72 * scale)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Aggregate confidence values for document:
|
||||||
|
if len(conv_res.pages) > 0:
|
||||||
|
with warnings.catch_warnings():
|
||||||
|
warnings.filterwarnings(
|
||||||
|
"ignore",
|
||||||
|
category=RuntimeWarning,
|
||||||
|
message="Mean of empty slice|All-NaN slice encountered",
|
||||||
|
)
|
||||||
|
conv_res.confidence.layout_score = float(
|
||||||
|
np.nanmean(
|
||||||
|
[c.layout_score for c in conv_res.confidence.pages.values()]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
conv_res.confidence.parse_score = float(
|
||||||
|
np.nanquantile(
|
||||||
|
[c.parse_score for c in conv_res.confidence.pages.values()],
|
||||||
|
q=0.1, # parse score should relate to worst 10% of pages.
|
||||||
|
)
|
||||||
|
)
|
||||||
|
conv_res.confidence.table_score = float(
|
||||||
|
np.nanmean(
|
||||||
|
[c.table_score for c in conv_res.confidence.pages.values()]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
conv_res.confidence.ocr_score = float(
|
||||||
|
np.nanmean(
|
||||||
|
[c.ocr_score for c in conv_res.confidence.pages.values()]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
return conv_res
|
return conv_res
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
@ -3,7 +3,7 @@ from io import BytesIO
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Optional, Union, cast
|
from typing import List, Optional, Union, cast
|
||||||
|
|
||||||
# from docling_core.types import DoclingDocument
|
from docling_core.types import DoclingDocument
|
||||||
from docling_core.types.doc import BoundingBox, DocItem, ImageRef, PictureItem, TextItem
|
from docling_core.types.doc import BoundingBox, DocItem, ImageRef, PictureItem, TextItem
|
||||||
from docling_core.types.doc.document import DocTagsDocument
|
from docling_core.types.doc.document import DocTagsDocument
|
||||||
from PIL import Image as PILImage
|
from PIL import Image as PILImage
|
||||||
@ -133,24 +133,22 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
|
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
|
||||||
doctags_list_c, image_list_c
|
doctags_list_c, image_list_c
|
||||||
)
|
)
|
||||||
conv_res.document.load_from_doctags(doctags_doc)
|
conv_res.document = DoclingDocument.load_from_doctags(doctags_doc)
|
||||||
|
|
||||||
# If forced backend text, replace model predicted text with backend one
|
# If forced backend text, replace model predicted text with backend one
|
||||||
if page.size:
|
|
||||||
if self.force_backend_text:
|
if self.force_backend_text:
|
||||||
scale = self.pipeline_options.images_scale
|
scale = self.pipeline_options.images_scale
|
||||||
for element, _level in conv_res.document.iterate_items():
|
for element, _level in conv_res.document.iterate_items():
|
||||||
if (
|
if not isinstance(element, TextItem) or len(element.prov) == 0:
|
||||||
not isinstance(element, TextItem)
|
continue
|
||||||
or len(element.prov) == 0
|
page_ix = element.prov[0].page_no - 1
|
||||||
):
|
page = conv_res.pages[page_ix]
|
||||||
|
if not page.size:
|
||||||
continue
|
continue
|
||||||
crop_bbox = (
|
crop_bbox = (
|
||||||
element.prov[0]
|
element.prov[0]
|
||||||
.bbox.scaled(scale=scale)
|
.bbox.scaled(scale=scale)
|
||||||
.to_top_left_origin(
|
.to_top_left_origin(page_height=page.size.height * scale)
|
||||||
page_height=page.size.height * scale
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
txt = self.extract_text_from_backend(page, crop_bbox)
|
txt = self.extract_text_from_backend(page, crop_bbox)
|
||||||
element.text = txt
|
element.text = txt
|
||||||
|
@ -90,17 +90,12 @@ class SpatialClusterIndex:
|
|||||||
containment_threshold: float,
|
containment_threshold: float,
|
||||||
) -> bool:
|
) -> bool:
|
||||||
"""Check if two bboxes overlap sufficiently."""
|
"""Check if two bboxes overlap sufficiently."""
|
||||||
area1, area2 = bbox1.area(), bbox2.area()
|
if bbox1.area() <= 0 or bbox2.area() <= 0:
|
||||||
if area1 <= 0 or area2 <= 0:
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
overlap_area = bbox1.intersection_area_with(bbox2)
|
iou = bbox1.intersection_over_union(bbox2)
|
||||||
if overlap_area <= 0:
|
containment1 = bbox1.intersection_over_self(bbox2)
|
||||||
return False
|
containment2 = bbox2.intersection_over_self(bbox1)
|
||||||
|
|
||||||
iou = overlap_area / (area1 + area2 - overlap_area)
|
|
||||||
containment1 = overlap_area / area1
|
|
||||||
containment2 = overlap_area / area2
|
|
||||||
|
|
||||||
return (
|
return (
|
||||||
iou > overlap_threshold
|
iou > overlap_threshold
|
||||||
@ -321,9 +316,7 @@ class LayoutPostprocessor:
|
|||||||
for special in special_clusters:
|
for special in special_clusters:
|
||||||
contained = []
|
contained = []
|
||||||
for cluster in self.regular_clusters:
|
for cluster in self.regular_clusters:
|
||||||
overlap = cluster.bbox.intersection_area_with(special.bbox)
|
containment = cluster.bbox.intersection_over_self(special.bbox)
|
||||||
if overlap > 0:
|
|
||||||
containment = overlap / cluster.bbox.area()
|
|
||||||
if containment > 0.8:
|
if containment > 0.8:
|
||||||
contained.append(cluster)
|
contained.append(cluster)
|
||||||
|
|
||||||
@ -379,9 +372,7 @@ class LayoutPostprocessor:
|
|||||||
for regular in self.regular_clusters:
|
for regular in self.regular_clusters:
|
||||||
if regular.label == DocItemLabel.TABLE:
|
if regular.label == DocItemLabel.TABLE:
|
||||||
# Calculate overlap
|
# Calculate overlap
|
||||||
overlap = regular.bbox.intersection_area_with(wrapper.bbox)
|
overlap_ratio = wrapper.bbox.intersection_over_self(regular.bbox)
|
||||||
wrapper_area = wrapper.bbox.area()
|
|
||||||
overlap_ratio = overlap / wrapper_area
|
|
||||||
|
|
||||||
conf_diff = wrapper.confidence - regular.confidence
|
conf_diff = wrapper.confidence - regular.confidence
|
||||||
|
|
||||||
@ -421,8 +412,7 @@ class LayoutPostprocessor:
|
|||||||
# Rule 2: CODE vs others
|
# Rule 2: CODE vs others
|
||||||
if candidate.label == DocItemLabel.CODE:
|
if candidate.label == DocItemLabel.CODE:
|
||||||
# Calculate how much of the other cluster is contained within the CODE cluster
|
# Calculate how much of the other cluster is contained within the CODE cluster
|
||||||
overlap = other.bbox.intersection_area_with(candidate.bbox)
|
containment = other.bbox.intersection_over_self(candidate.bbox)
|
||||||
containment = overlap / other.bbox.area()
|
|
||||||
if containment > 0.8: # other is 80% contained within CODE
|
if containment > 0.8: # other is 80% contained within CODE
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@ -586,11 +576,9 @@ class LayoutPostprocessor:
|
|||||||
if cell.rect.to_bounding_box().area() <= 0:
|
if cell.rect.to_bounding_box().area() <= 0:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
overlap = cell.rect.to_bounding_box().intersection_area_with(
|
overlap_ratio = cell.rect.to_bounding_box().intersection_over_self(
|
||||||
cluster.bbox
|
cluster.bbox
|
||||||
)
|
)
|
||||||
overlap_ratio = overlap / cell.rect.to_bounding_box().area()
|
|
||||||
|
|
||||||
if overlap_ratio > best_overlap:
|
if overlap_ratio > best_overlap:
|
||||||
best_overlap = overlap_ratio
|
best_overlap = overlap_ratio
|
||||||
best_cluster = cluster
|
best_cluster = cluster
|
||||||
|
@ -4,12 +4,15 @@ from typing import Optional
|
|||||||
|
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
granite_picture_description,
|
granite_picture_description,
|
||||||
|
smoldocling_vlm_conversion_options,
|
||||||
|
smoldocling_vlm_mlx_conversion_options,
|
||||||
smolvlm_picture_description,
|
smolvlm_picture_description,
|
||||||
)
|
)
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.code_formula_model import CodeFormulaModel
|
from docling.models.code_formula_model import CodeFormulaModel
|
||||||
from docling.models.document_picture_classifier import DocumentPictureClassifier
|
from docling.models.document_picture_classifier import DocumentPictureClassifier
|
||||||
from docling.models.easyocr_model import EasyOcrModel
|
from docling.models.easyocr_model import EasyOcrModel
|
||||||
|
from docling.models.hf_vlm_model import HuggingFaceVlmModel
|
||||||
from docling.models.layout_model import LayoutModel
|
from docling.models.layout_model import LayoutModel
|
||||||
from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
|
from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
|
||||||
from docling.models.table_structure_model import TableStructureModel
|
from docling.models.table_structure_model import TableStructureModel
|
||||||
@ -27,6 +30,8 @@ def download_models(
|
|||||||
with_code_formula: bool = True,
|
with_code_formula: bool = True,
|
||||||
with_picture_classifier: bool = True,
|
with_picture_classifier: bool = True,
|
||||||
with_smolvlm: bool = False,
|
with_smolvlm: bool = False,
|
||||||
|
with_smoldocling: bool = False,
|
||||||
|
with_smoldocling_mlx: bool = False,
|
||||||
with_granite_vision: bool = False,
|
with_granite_vision: bool = False,
|
||||||
with_easyocr: bool = True,
|
with_easyocr: bool = True,
|
||||||
):
|
):
|
||||||
@ -77,6 +82,25 @@ def download_models(
|
|||||||
progress=progress,
|
progress=progress,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if with_smoldocling:
|
||||||
|
_log.info("Downloading SmolDocling model...")
|
||||||
|
HuggingFaceVlmModel.download_models(
|
||||||
|
repo_id=smoldocling_vlm_conversion_options.repo_id,
|
||||||
|
local_dir=output_dir / smoldocling_vlm_conversion_options.repo_cache_folder,
|
||||||
|
force=force,
|
||||||
|
progress=progress,
|
||||||
|
)
|
||||||
|
|
||||||
|
if with_smoldocling_mlx:
|
||||||
|
_log.info("Downloading SmolDocling MLX model...")
|
||||||
|
HuggingFaceVlmModel.download_models(
|
||||||
|
repo_id=smoldocling_vlm_mlx_conversion_options.repo_id,
|
||||||
|
local_dir=output_dir
|
||||||
|
/ smoldocling_vlm_mlx_conversion_options.repo_cache_folder,
|
||||||
|
force=force,
|
||||||
|
progress=progress,
|
||||||
|
)
|
||||||
|
|
||||||
if with_granite_vision:
|
if with_granite_vision:
|
||||||
_log.info("Downloading Granite Vision model...")
|
_log.info("Downloading Granite Vision model...")
|
||||||
PictureDescriptionVlmModel.download_models(
|
PictureDescriptionVlmModel.download_models(
|
||||||
|
@ -1,3 +1,11 @@
|
|||||||
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
|
from docling_core.types.doc.page import BoundingRectangle
|
||||||
|
|
||||||
|
from docling.utils.orientation import CLIPPED_ORIENTATIONS, rotate_bounding_box
|
||||||
|
|
||||||
|
|
||||||
def map_tesseract_script(script: str) -> str:
|
def map_tesseract_script(script: str) -> str:
|
||||||
r""" """
|
r""" """
|
||||||
if script == "Katakana" or script == "Hiragana":
|
if script == "Katakana" or script == "Hiragana":
|
||||||
@ -7,3 +15,55 @@ def map_tesseract_script(script: str) -> str:
|
|||||||
elif script == "Korean":
|
elif script == "Korean":
|
||||||
script = "Hangul"
|
script = "Hangul"
|
||||||
return script
|
return script
|
||||||
|
|
||||||
|
|
||||||
|
def parse_tesseract_orientation(orientation: str) -> int:
    """Translate a Tesseract orientation string into a bounding-rectangle angle.

    Tesseract reports the orientation as one of [0, 90, 180, 270] measured
    clockwise, whereas bounding rectangle angles live in [0, 360[ and are
    measured counterclockwise; the value is therefore negated and wrapped.

    Args:
        orientation: Orientation as reported by Tesseract, parsed with ``int()``.

    Returns:
        The equivalent counterclockwise angle in [0, 360[.

    Raises:
        ValueError: If the text is not an integer or the integer is not one of
            ``CLIPPED_ORIENTATIONS``.
    """
    clockwise = int(orientation)
    if clockwise in CLIPPED_ORIENTATIONS:
        # Flip the direction of rotation and wrap back into [0, 360[.
        return (-clockwise) % 360
    raise ValueError(
        f"invalid tesseract document orientation {orientation}, "
        f"expected orientation: {sorted(CLIPPED_ORIENTATIONS)}"
    )
|
||||||
|
|
||||||
|
|
||||||
|
def tesseract_box_to_bounding_rectangle(
    bbox: BoundingBox,
    *,
    original_offset: Optional[BoundingBox] = None,
    scale: float,
    orientation: int,
    im_size: Tuple[int, int],
) -> BoundingRectangle:
    """Turn a Tesseract box into a rescaled (and optionally shifted) rectangle.

    The box is first passed through ``rotate_bounding_box`` with the negated
    ``orientation``, every corner coordinate is divided by ``scale``, and, when
    ``original_offset`` is supplied, its ``l`` / ``t`` values are added to the
    x / y coordinates of each corner.

    Args:
        bbox: Box as reported by Tesseract (top, left, height, width format in
            top-left coordinates, per the original author's note).
        original_offset: Optional box whose ``l`` and ``t`` are added to the
            result; it must use the TOPLEFT coordinate origin.
        scale: Divisor applied to every corner coordinate.
        orientation: Angle whose negation is forwarded to ``rotate_bounding_box``.
        im_size: Image size forwarded unchanged to ``rotate_bounding_box``.

    Returns:
        The resulting rectangle, expressed with a TOPLEFT coordinate origin.

    Raises:
        ValueError: If ``original_offset`` does not use the TOPLEFT origin.
    """
    rotated = rotate_bounding_box(bbox, angle=-orientation, im_size=im_size)

    # Corner attribute names in the order r_x0, r_y0, r_x1, r_y1, ...
    corner_fields = [f"r_{axis}{idx}" for idx in range(4) for axis in ("x", "y")]
    rect = BoundingRectangle(
        coord_origin=CoordOrigin.TOPLEFT,
        **{field: getattr(rotated, field) / scale for field in corner_fields},
    )

    if original_offset is None:
        return rect

    if original_offset.coord_origin is not CoordOrigin.TOPLEFT:
        msg = f"expected coordinate origin to be {CoordOrigin.TOPLEFT.value}"
        raise ValueError(msg)

    # Shift every corner by the left/top of the offset box.
    for idx in range(4):
        x_field, y_field = f"r_x{idx}", f"r_y{idx}"
        setattr(rect, x_field, getattr(rect, x_field) + original_offset.l)
        setattr(rect, y_field, getattr(rect, y_field) + original_offset.t)
    return rect
|
||||||
|
71
docling/utils/orientation.py
Normal file
71
docling/utils/orientation.py
Normal file
@ -0,0 +1,71 @@
|
|||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
|
from docling_core.types.doc.page import BoundingRectangle
|
||||||
|
|
||||||
|
CLIPPED_ORIENTATIONS = [0, 90, 180, 270]
|
||||||
|
|
||||||
|
|
||||||
|
def rotate_bounding_box(
    bbox: BoundingBox, angle: int, im_size: Tuple[int, int]
) -> BoundingRectangle:
    """Compute the four-corner rectangle of ``bbox`` for a quarter-turn angle.

    Args:
        bbox: Axis-aligned box; converted to a top-left origin before use.
        angle: Rotation in degrees. It is reduced modulo 360 and must then be
            one of 0, 90, 180 or 270.
        im_size: Two-element image size used to mirror coordinates.

    Returns:
        A ``BoundingRectangle`` in TOPLEFT coordinates whose corners r_0..r_3
        are four distinct points of the rotated box.

    Raises:
        ValueError: If ``angle`` modulo 360 is not 0, 90, 180 or 270.
    """
    # The box is left top width height in TOPLEFT coordinates
    # Bounding rectangle start with r_0 at the bottom left whatever the
    # coordinate system. Then other corners are found rotating counterclockwise
    bbox = bbox.to_top_left_origin(im_size[1])
    left, top, width, height = bbox.l, bbox.t, bbox.width, bbox.height
    # NOTE(review): im_size[1] is used as the page height just above, yet the
    # unpacking below binds im_size[0] to im_h. The names may be swapped with
    # respect to the real (width, height) order - confirm against callers.
    im_h, im_w = im_size
    angle = angle % 360
    if angle == 0:
        r_x0 = left
        r_y0 = top + height
        r_x1 = r_x0 + width
        r_y1 = r_y0
        r_x2 = r_x0 + width
        r_y2 = r_y0 - height
        r_x3 = r_x0
        r_y3 = r_y0 - height
    elif angle == 90:
        r_x0 = im_w - (top + height)
        r_y0 = left
        r_x1 = r_x0
        r_y1 = r_y0 + width
        r_x2 = r_x0 + height
        r_y2 = r_y0 + width
        # r_3 closes the rectangle: it shares x with r_2 and y with r_0.
        # (It previously duplicated r_1, yielding a degenerate shape.)
        r_x3 = r_x0 + height
        r_y3 = r_y0
    elif angle == 180:
        r_x0 = im_h - left
        r_y0 = im_w - (top + height)
        r_x1 = r_x0 - width
        r_y1 = r_y0
        r_x2 = r_x0 - width
        r_y2 = r_y0 + height
        r_x3 = r_x0
        r_y3 = r_y0 + height
    elif angle == 270:
        r_x0 = top + height
        r_y0 = im_h - left
        r_x1 = r_x0
        r_y1 = r_y0 - width
        r_x2 = r_x0 - height
        r_y2 = r_y0 - width
        r_x3 = r_x0 - height
        r_y3 = r_y0
    else:
        msg = (
            f"invalid orientation {angle}, expected values in:"
            f" {sorted(CLIPPED_ORIENTATIONS)}"
        )
        raise ValueError(msg)
    return BoundingRectangle(
        r_x0=r_x0,
        r_y0=r_y0,
        r_x1=r_x1,
        r_y1=r_y1,
        r_x2=r_x2,
        r_y2=r_y2,
        r_x3=r_x3,
        r_y3=r_y3,
        coord_origin=CoordOrigin.TOPLEFT,
    )
|
3
docs/concepts/architecture.md
vendored
3
docs/concepts/architecture.md
vendored
@ -10,7 +10,8 @@ For each document format, the *document converter* knows which format-specific *
|
|||||||
|
|
||||||
The *conversion result* contains the [*Docling document*](./docling_document.md), Docling's fundamental document representation.
|
The *conversion result* contains the [*Docling document*](./docling_document.md), Docling's fundamental document representation.
|
||||||
|
|
||||||
Some typical scenarios for using a Docling document include directly calling its *export methods*, such as for markdown, dictionary etc., or having it chunked by a [*chunker*](./chunking.md).
|
Some typical scenarios for using a Docling document include directly calling its *export methods*, such as for markdown, dictionary etc., or having it serialized by a
|
||||||
|
[*serializer*](./serialization.md) or chunked by a [*chunker*](./chunking.md).
|
||||||
|
|
||||||
For more details on Docling's architecture, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
|
For more details on Docling's architecture, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
|
||||||
|
|
||||||
|
13
docs/concepts/chunking.md
vendored
13
docs/concepts/chunking.md
vendored
@ -31,7 +31,7 @@ The `BaseChunker` base class API defines that any chunker should provide the fol
|
|||||||
|
|
||||||
- `def chunk(self, dl_doc: DoclingDocument, **kwargs) -> Iterator[BaseChunk]`:
|
- `def chunk(self, dl_doc: DoclingDocument, **kwargs) -> Iterator[BaseChunk]`:
|
||||||
Returning the chunks for the provided document.
|
Returning the chunks for the provided document.
|
||||||
- `def serialize(self, chunk: BaseChunk) -> str`:
|
- `def contextualize(self, chunk: BaseChunk) -> str`:
|
||||||
Returning the potentially metadata-enriched serialization of the chunk, typically
|
Returning the potentially metadata-enriched serialization of the chunk, typically
|
||||||
used to feed an embedding model (or generation model).
|
used to feed an embedding model (or generation model).
|
||||||
|
|
||||||
@ -44,10 +44,14 @@ The `BaseChunker` base class API defines that any chunker should provide the fol
|
|||||||
from docling.chunking import HybridChunker
|
from docling.chunking import HybridChunker
|
||||||
```
|
```
|
||||||
- If you are only using the `docling-core` package, you must ensure to install
|
- If you are only using the `docling-core` package, you must ensure to install
|
||||||
the `chunking` extra, e.g.
|
the `chunking` extra if you want to use HuggingFace tokenizers, e.g.
|
||||||
```shell
|
```shell
|
||||||
pip install 'docling-core[chunking]'
|
pip install 'docling-core[chunking]'
|
||||||
```
|
```
|
||||||
|
or the `chunking-openai` extra if you prefer OpenAI tokenizers (tiktoken), e.g.
|
||||||
|
```shell
|
||||||
|
pip install 'docling-core[chunking-openai]'
|
||||||
|
```
|
||||||
and then you
|
and then you
|
||||||
can import as follows:
|
can import as follows:
|
||||||
```python
|
```python
|
||||||
@ -67,7 +71,10 @@ tokens), &
|
|||||||
chunks with same headings & captions) — users can opt out of this step via param
|
chunks with same headings & captions) — users can opt out of this step via param
|
||||||
`merge_peers` (by default `True`)
|
`merge_peers` (by default `True`)
|
||||||
|
|
||||||
👉 Example: see [here](../examples/hybrid_chunking.ipynb).
|
👉 Usage examples:
|
||||||
|
|
||||||
|
- [Hybrid chunking](../examples/hybrid_chunking.ipynb)
|
||||||
|
- [Advanced chunking & serialization](../examples/advanced_chunking_and_serialization.ipynb)
|
||||||
|
|
||||||
## Hierarchical Chunker
|
## Hierarchical Chunker
|
||||||
|
|
||||||
|
40
docs/concepts/serialization.md
vendored
Normal file
40
docs/concepts/serialization.md
vendored
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
## Introduction
|
||||||
|
|
||||||
|
A *document serializer* (AKA simply *serializer*) is a Docling abstraction that is
|
||||||
|
initialized with a given [`DoclingDocument`](./docling_document.md) and returns a
|
||||||
|
textual representation for that document.
|
||||||
|
|
||||||
|
Besides the document serializer, Docling defines similar abstractions for several
|
||||||
|
document subcomponents, for example: *text serializer*, *table serializer*,
|
||||||
|
*picture serializer*, *list serializer*, *inline serializer*, and more.
|
||||||
|
|
||||||
|
Last but not least, a *serializer provider* is a wrapper that abstracts the
|
||||||
|
document serialization strategy from the document instance.
|
||||||
|
|
||||||
|
## Base classes
|
||||||
|
|
||||||
|
To enable both flexibility for downstream applications and out-of-the-box utility,
|
||||||
|
Docling defines a serialization class hierarchy, providing:
|
||||||
|
|
||||||
|
- base types for the above abstractions: `BaseDocSerializer`, as well as
|
||||||
|
`BaseTextSerializer`, `BaseTableSerializer` etc., and `BaseSerializerProvider`, and
|
||||||
|
- specific subclasses for the above-mentioned base types, e.g. `MarkdownDocSerializer`.
|
||||||
|
|
||||||
|
You can review all methods required to define the above base classes [here](https://github.com/docling-project/docling-core/blob/main/docling_core/transforms/serializer/base.py).
|
||||||
|
|
||||||
|
From a client perspective, the most relevant is `BaseDocSerializer.serialize()`, which
|
||||||
|
returns the textual representation, as well as relevant metadata on which document
|
||||||
|
components contributed to that serialization.
|
||||||
|
|
||||||
|
## Use in `DoclingDocument` export methods
|
||||||
|
|
||||||
|
Docling provides predefined serializers for Markdown, HTML, and DocTags.
|
||||||
|
|
||||||
|
The respective `DoclingDocument` export methods (e.g. `export_to_markdown()`) are
|
||||||
|
provided as user shorthands — internally directly instantiating and delegating to
|
||||||
|
respective serializers.
|
||||||
|
|
||||||
|
## Examples
|
||||||
|
|
||||||
|
For an example showcasing how to use serializers, see
|
||||||
|
[here](../examples/serialization.ipynb).
|
559
docs/examples/advanced_chunking_and_serialization.ipynb
vendored
Normal file
559
docs/examples/advanced_chunking_and_serialization.ipynb
vendored
Normal file
@ -0,0 +1,559 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Advanced chunking & serialization"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Overview"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"In this notebook we show how to customize the serialization strategies that come into\n",
|
||||||
|
"play during chunking."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Setup"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"We will work with a document that contains some [picture annotations](../pictures_description):"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from docling_core.types.doc.document import DoclingDocument\n",
|
||||||
|
"\n",
|
||||||
|
"SOURCE = \"./data/2408.09869v3_enriched.json\"\n",
|
||||||
|
"\n",
|
||||||
|
"doc = DoclingDocument.load_from_json(SOURCE)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Below we define the chunker (for more details check out [Hybrid Chunking](../hybrid_chunking)):"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from docling_core.transforms.chunker.hybrid_chunker import HybridChunker\n",
|
||||||
|
"from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer\n",
|
||||||
|
"from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer\n",
|
||||||
|
"from transformers import AutoTokenizer\n",
|
||||||
|
"\n",
|
||||||
|
"EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
|
||||||
|
"\n",
|
||||||
|
"tokenizer: BaseTokenizer = HuggingFaceTokenizer(\n",
|
||||||
|
" tokenizer=AutoTokenizer.from_pretrained(EMBED_MODEL_ID),\n",
|
||||||
|
")\n",
|
||||||
|
"chunker = HybridChunker(tokenizer=tokenizer)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"tokenizer.get_max_tokens()=512\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(f\"{tokenizer.get_max_tokens()=}\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Defining some helper methods:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from typing import Iterable, Optional\n",
|
||||||
|
"\n",
|
||||||
|
"from docling_core.transforms.chunker.base import BaseChunk\n",
|
||||||
|
"from docling_core.transforms.chunker.hierarchical_chunker import DocChunk\n",
|
||||||
|
"from docling_core.types.doc.labels import DocItemLabel\n",
|
||||||
|
"from rich.console import Console\n",
|
||||||
|
"from rich.panel import Panel\n",
|
||||||
|
"\n",
|
||||||
|
"console = Console(\n",
|
||||||
|
" width=200, # for getting Markdown tables rendered nicely\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"def find_n_th_chunk_with_label(\n",
|
||||||
|
" iter: Iterable[BaseChunk], n: int, label: DocItemLabel\n",
|
||||||
|
") -> Optional[DocChunk]:\n",
|
||||||
|
" num_found = -1\n",
|
||||||
|
" for i, chunk in enumerate(iter):\n",
|
||||||
|
" doc_chunk = DocChunk.model_validate(chunk)\n",
|
||||||
|
" for it in doc_chunk.meta.doc_items:\n",
|
||||||
|
" if it.label == label:\n",
|
||||||
|
" num_found += 1\n",
|
||||||
|
" if num_found == n:\n",
|
||||||
|
" return i, chunk\n",
|
||||||
|
" return None, None\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"def print_chunk(chunks, chunk_pos):\n",
|
||||||
|
" chunk = chunks[chunk_pos]\n",
|
||||||
|
" ctx_text = chunker.contextualize(chunk=chunk)\n",
|
||||||
|
" num_tokens = tokenizer.count_tokens(text=ctx_text)\n",
|
||||||
|
" doc_items_refs = [it.self_ref for it in chunk.meta.doc_items]\n",
|
||||||
|
" title = f\"{chunk_pos=} {num_tokens=} {doc_items_refs=}\"\n",
|
||||||
|
" console.print(Panel(ctx_text, title=title))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Table serialization"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Using the default strategy"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Below we inspect the first chunk containing a table — using the default serialization strategy:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Token indices sequence length is longer than the specified maximum sequence length for this model (652 > 512). Running this sequence through the model will result in indexing errors\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">╭────────────────────────────────────────────────────────────── chunk_pos=13 num_tokens=426 doc_items_refs=['#/texts/72', '#/tables/0'] ───────────────────────────────────────────────────────────────╮\n",
|
||||||
|
"│ Docling Technical Report │\n",
|
||||||
|
"│ 4 Performance │\n",
|
||||||
|
"│ Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution │\n",
|
||||||
|
"│ (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads. │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ Apple M3 Max, Thread budget. = 4. Apple M3 Max, native backend.TTS = 177 s 167 s. Apple M3 Max, native backend.Pages/s = 1.27 1.34. Apple M3 Max, native backend.Mem = 6.20 GB. Apple M3 Max, │\n",
|
||||||
|
"│ pypdfium backend.TTS = 103 s 92 s. Apple M3 Max, pypdfium backend.Pages/s = 2.18 2.45. Apple M3 Max, pypdfium backend.Mem = 2.56 GB. (16 cores) Intel(R) Xeon E5-2690, Thread budget. = 16 4 16. (16 │\n",
|
||||||
|
"│ cores) Intel(R) Xeon E5-2690, native backend.TTS = 375 s 244 s. (16 cores) Intel(R) Xeon E5-2690, native backend.Pages/s = 0.60 0.92. (16 cores) Intel(R) Xeon E5-2690, native backend.Mem = 6.16 │\n",
|
||||||
|
"│ GB. (16 cores) Intel(R) Xeon E5-2690, pypdfium backend.TTS = 239 s 143 s. (16 cores) Intel(R) Xeon E5-2690, pypdfium backend.Pages/s = 0.94 1.57. (16 cores) Intel(R) Xeon E5-2690, pypdfium │\n",
|
||||||
|
"│ backend.Mem = 2.42 GB │\n",
|
||||||
|
"╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
|
||||||
|
"</pre>\n"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"╭────────────────────────────────────────────────────────────── chunk_pos=13 num_tokens=426 doc_items_refs=['#/texts/72', '#/tables/0'] ───────────────────────────────────────────────────────────────╮\n",
|
||||||
|
"│ Docling Technical Report │\n",
|
||||||
|
"│ 4 Performance │\n",
|
||||||
|
"│ Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution │\n",
|
||||||
|
"│ (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads. │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ Apple M3 Max, Thread budget. = 4. Apple M3 Max, native backend.TTS = 177 s 167 s. Apple M3 Max, native backend.Pages/s = 1.27 1.34. Apple M3 Max, native backend.Mem = 6.20 GB. Apple M3 Max, │\n",
|
||||||
|
"│ pypdfium backend.TTS = 103 s 92 s. Apple M3 Max, pypdfium backend.Pages/s = 2.18 2.45. Apple M3 Max, pypdfium backend.Mem = 2.56 GB. (16 cores) Intel(R) Xeon E5-2690, Thread budget. = 16 4 16. (16 │\n",
|
||||||
|
"│ cores) Intel(R) Xeon E5-2690, native backend.TTS = 375 s 244 s. (16 cores) Intel(R) Xeon E5-2690, native backend.Pages/s = 0.60 0.92. (16 cores) Intel(R) Xeon E5-2690, native backend.Mem = 6.16 │\n",
|
||||||
|
"│ GB. (16 cores) Intel(R) Xeon E5-2690, pypdfium backend.TTS = 239 s 143 s. (16 cores) Intel(R) Xeon E5-2690, pypdfium backend.Pages/s = 0.94 1.57. (16 cores) Intel(R) Xeon E5-2690, pypdfium │\n",
|
||||||
|
"│ backend.Mem = 2.42 GB │\n",
|
||||||
|
"╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"chunker = HybridChunker(tokenizer=tokenizer)\n",
|
||||||
|
"\n",
|
||||||
|
"chunk_iter = chunker.chunk(dl_doc=doc)\n",
|
||||||
|
"\n",
|
||||||
|
"chunks = list(chunk_iter)\n",
|
||||||
|
"i, chunk = find_n_th_chunk_with_label(chunks, n=0, label=DocItemLabel.TABLE)\n",
|
||||||
|
"print_chunk(\n",
|
||||||
|
" chunks=chunks,\n",
|
||||||
|
" chunk_pos=i,\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<div class=\"alert alert-info\">\n",
|
||||||
|
" <strong>INFO</strong>: As you see above, using the <code>HybridChunker</code> can sometimes lead to a warning from the transformers library, however this is a \"false alarm\" — for details check <a href=\"https://docling-project.github.io/docling/faq/#hybridchunker-triggers-warning-token-indices-sequence-length-is-longer-than-the-specified-maximum-sequence-length-for-this-model\">here</a>.\n",
|
||||||
|
"</div>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Configuring a different strategy"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"We can configure a different serialization strategy. In the example below, we specify a different table serializer that serializes tables to Markdown instead of the triplet notation used by default:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">╭────────────────────────────────────────────────────────────── chunk_pos=13 num_tokens=431 doc_items_refs=['#/texts/72', '#/tables/0'] ───────────────────────────────────────────────────────────────╮\n",
|
||||||
|
"│ Docling Technical Report │\n",
|
||||||
|
"│ 4 Performance │\n",
|
||||||
|
"│ Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution │\n",
|
||||||
|
"│ (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads. │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ | CPU | Thread budget | native backend | native backend | native backend | pypdfium backend | pypdfium backend | pypdfium backend | │\n",
|
||||||
|
"│ |----------------------------------|-----------------|------------------|------------------|------------------|--------------------|--------------------|--------------------| │\n",
|
||||||
|
"│ | | | TTS | Pages/s | Mem | TTS | Pages/s | Mem | │\n",
|
||||||
|
"│ | Apple M3 Max | 4 | 177 s 167 s | 1.27 1.34 | 6.20 GB | 103 s 92 s | 2.18 2.45 | 2.56 GB | │\n",
|
||||||
|
"│ | (16 cores) Intel(R) Xeon E5-2690 | 16 4 16 | 375 s 244 s | 0.60 0.92 | 6.16 GB | 239 s 143 s | 0.94 1.57 | 2.42 GB | │\n",
|
||||||
|
"╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
|
||||||
|
"</pre>\n"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"╭────────────────────────────────────────────────────────────── chunk_pos=13 num_tokens=431 doc_items_refs=['#/texts/72', '#/tables/0'] ───────────────────────────────────────────────────────────────╮\n",
|
||||||
|
"│ Docling Technical Report │\n",
|
||||||
|
"│ 4 Performance │\n",
|
||||||
|
"│ Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution │\n",
|
||||||
|
"│ (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads. │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ | CPU | Thread budget | native backend | native backend | native backend | pypdfium backend | pypdfium backend | pypdfium backend | │\n",
|
||||||
|
"│ |----------------------------------|-----------------|------------------|------------------|------------------|--------------------|--------------------|--------------------| │\n",
|
||||||
|
"│ | | | TTS | Pages/s | Mem | TTS | Pages/s | Mem | │\n",
|
||||||
|
"│ | Apple M3 Max | 4 | 177 s 167 s | 1.27 1.34 | 6.20 GB | 103 s 92 s | 2.18 2.45 | 2.56 GB | │\n",
|
||||||
|
"│ | (16 cores) Intel(R) Xeon E5-2690 | 16 4 16 | 375 s 244 s | 0.60 0.92 | 6.16 GB | 239 s 143 s | 0.94 1.57 | 2.42 GB | │\n",
|
||||||
|
"╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from docling_core.transforms.chunker.hierarchical_chunker import (\n",
|
||||||
|
" ChunkingDocSerializer,\n",
|
||||||
|
" ChunkingSerializerProvider,\n",
|
||||||
|
")\n",
|
||||||
|
"from docling_core.transforms.serializer.markdown import MarkdownTableSerializer\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"class MDTableSerializerProvider(ChunkingSerializerProvider):\n",
|
||||||
|
" def get_serializer(self, doc):\n",
|
||||||
|
" return ChunkingDocSerializer(\n",
|
||||||
|
" doc=doc,\n",
|
||||||
|
" table_serializer=MarkdownTableSerializer(), # configuring a different table serializer\n",
|
||||||
|
" )\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"chunker = HybridChunker(\n",
|
||||||
|
" tokenizer=tokenizer,\n",
|
||||||
|
" serializer_provider=MDTableSerializerProvider(),\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"chunk_iter = chunker.chunk(dl_doc=doc)\n",
|
||||||
|
"\n",
|
||||||
|
"chunks = list(chunk_iter)\n",
|
||||||
|
"i, chunk = find_n_th_chunk_with_label(chunks, n=0, label=DocItemLabel.TABLE)\n",
|
||||||
|
"print_chunk(\n",
|
||||||
|
" chunks=chunks,\n",
|
||||||
|
" chunk_pos=i,\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Picture serialization"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Using the default strategy"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Below we inspect the first chunk containing a picture.\n",
|
||||||
|
"\n",
|
||||||
|
"Even when using the default strategy, we can modify the relevant parameters, e.g. which placeholder is used for pictures:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">╭───────────────────────────────────────────────── chunk_pos=0 num_tokens=117 doc_items_refs=['#/pictures/0', '#/texts/2', '#/texts/3', '#/texts/4'] ──────────────────────────────────────────────────╮\n",
|
||||||
|
"│ Docling Technical Report │\n",
|
||||||
|
"│ <!-- image --> │\n",
|
||||||
|
"│ Version 1.0 │\n",
|
||||||
|
"│ Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta │\n",
|
||||||
|
"│ Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar │\n",
|
||||||
|
"│ AI4K Group, IBM Research R¨ uschlikon, Switzerland │\n",
|
||||||
|
"╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
|
||||||
|
"</pre>\n"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"╭───────────────────────────────────────────────── chunk_pos=0 num_tokens=117 doc_items_refs=['#/pictures/0', '#/texts/2', '#/texts/3', '#/texts/4'] ──────────────────────────────────────────────────╮\n",
|
||||||
|
"│ Docling Technical Report │\n",
|
||||||
|
"│ <!-- image --> │\n",
|
||||||
|
"│ Version 1.0 │\n",
|
||||||
|
"│ Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta │\n",
|
||||||
|
"│ Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar │\n",
|
||||||
|
"│ AI4K Group, IBM Research R¨ uschlikon, Switzerland │\n",
|
||||||
|
"╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from docling_core.transforms.serializer.markdown import MarkdownParams\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"class ImgPlaceholderSerializerProvider(ChunkingSerializerProvider):\n",
|
||||||
|
" def get_serializer(self, doc):\n",
|
||||||
|
" return ChunkingDocSerializer(\n",
|
||||||
|
" doc=doc,\n",
|
||||||
|
" params=MarkdownParams(\n",
|
||||||
|
" image_placeholder=\"<!-- image -->\",\n",
|
||||||
|
" ),\n",
|
||||||
|
" )\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"chunker = HybridChunker(\n",
|
||||||
|
" tokenizer=tokenizer,\n",
|
||||||
|
" serializer_provider=ImgPlaceholderSerializerProvider(),\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"chunk_iter = chunker.chunk(dl_doc=doc)\n",
|
||||||
|
"\n",
|
||||||
|
"chunks = list(chunk_iter)\n",
|
||||||
|
"i, chunk = find_n_th_chunk_with_label(chunks, n=0, label=DocItemLabel.PICTURE)\n",
|
||||||
|
"print_chunk(\n",
|
||||||
|
" chunks=chunks,\n",
|
||||||
|
" chunk_pos=i,\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Using a custom strategy"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Below we define and use our custom picture serialization strategy which leverages picture annotations:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from typing import Any\n",
|
||||||
|
"\n",
|
||||||
|
"from docling_core.transforms.serializer.base import (\n",
|
||||||
|
" BaseDocSerializer,\n",
|
||||||
|
" SerializationResult,\n",
|
||||||
|
")\n",
|
||||||
|
"from docling_core.transforms.serializer.common import create_ser_result\n",
|
||||||
|
"from docling_core.transforms.serializer.markdown import MarkdownPictureSerializer\n",
|
||||||
|
"from docling_core.types.doc.document import (\n",
|
||||||
|
" PictureClassificationData,\n",
|
||||||
|
" PictureDescriptionData,\n",
|
||||||
|
" PictureItem,\n",
|
||||||
|
" PictureMoleculeData,\n",
|
||||||
|
")\n",
|
||||||
|
"from typing_extensions import override\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"class AnnotationPictureSerializer(MarkdownPictureSerializer):\n",
|
||||||
|
" @override\n",
|
||||||
|
" def serialize(\n",
|
||||||
|
" self,\n",
|
||||||
|
" *,\n",
|
||||||
|
" item: PictureItem,\n",
|
||||||
|
" doc_serializer: BaseDocSerializer,\n",
|
||||||
|
" doc: DoclingDocument,\n",
|
||||||
|
" **kwargs: Any,\n",
|
||||||
|
" ) -> SerializationResult:\n",
|
||||||
|
" text_parts: list[str] = []\n",
|
||||||
|
" for annotation in item.annotations:\n",
|
||||||
|
" if isinstance(annotation, PictureClassificationData):\n",
|
||||||
|
" predicted_class = (\n",
|
||||||
|
" annotation.predicted_classes[0].class_name\n",
|
||||||
|
" if annotation.predicted_classes\n",
|
||||||
|
" else None\n",
|
||||||
|
" )\n",
|
||||||
|
" if predicted_class is not None:\n",
|
||||||
|
" text_parts.append(f\"Picture type: {predicted_class}\")\n",
|
||||||
|
" elif isinstance(annotation, PictureMoleculeData):\n",
|
||||||
|
" text_parts.append(f\"SMILES: {annotation.smi}\")\n",
|
||||||
|
" elif isinstance(annotation, PictureDescriptionData):\n",
|
||||||
|
" text_parts.append(f\"Picture description: {annotation.text}\")\n",
|
||||||
|
"\n",
|
||||||
|
" text_res = \"\\n\".join(text_parts)\n",
|
||||||
|
" text_res = doc_serializer.post_process(text=text_res)\n",
|
||||||
|
" return create_ser_result(text=text_res, span_source=item)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">╭───────────────────────────────────────────────── chunk_pos=0 num_tokens=128 doc_items_refs=['#/pictures/0', '#/texts/2', '#/texts/3', '#/texts/4'] ──────────────────────────────────────────────────╮\n",
|
||||||
|
"│ Docling Technical Report │\n",
|
||||||
|
"│ Picture description: In this image we can see a cartoon image of a duck holding a paper. │\n",
|
||||||
|
"│ Version 1.0 │\n",
|
||||||
|
"│ Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta │\n",
|
||||||
|
"│ Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar │\n",
|
||||||
|
"│ AI4K Group, IBM Research R¨ uschlikon, Switzerland │\n",
|
||||||
|
"╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
|
||||||
|
"</pre>\n"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"╭───────────────────────────────────────────────── chunk_pos=0 num_tokens=128 doc_items_refs=['#/pictures/0', '#/texts/2', '#/texts/3', '#/texts/4'] ──────────────────────────────────────────────────╮\n",
|
||||||
|
"│ Docling Technical Report │\n",
|
||||||
|
"│ Picture description: In this image we can see a cartoon image of a duck holding a paper. │\n",
|
||||||
|
"│ Version 1.0 │\n",
|
||||||
|
"│ Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta │\n",
|
||||||
|
"│ Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar │\n",
|
||||||
|
"│ AI4K Group, IBM Research R¨ uschlikon, Switzerland │\n",
|
||||||
|
"╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"class ImgAnnotationSerializerProvider(ChunkingSerializerProvider):\n",
|
||||||
|
" def get_serializer(self, doc: DoclingDocument):\n",
|
||||||
|
" return ChunkingDocSerializer(\n",
|
||||||
|
" doc=doc,\n",
|
||||||
|
" picture_serializer=AnnotationPictureSerializer(), # configuring a different picture serializer\n",
|
||||||
|
" )\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"chunker = HybridChunker(\n",
|
||||||
|
" tokenizer=tokenizer,\n",
|
||||||
|
" serializer_provider=ImgAnnotationSerializerProvider(),\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"chunk_iter = chunker.chunk(dl_doc=doc)\n",
|
||||||
|
"\n",
|
||||||
|
"chunks = list(chunk_iter)\n",
|
||||||
|
"i, chunk = find_n_th_chunk_with_label(chunks, n=0, label=DocItemLabel.PICTURE)\n",
|
||||||
|
"print_chunk(\n",
|
||||||
|
" chunks=chunks,\n",
|
||||||
|
" chunk_pos=i,\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": ".venv",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.13.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
45197
docs/examples/data/2408.09869v3_enriched.json
vendored
Normal file
45197
docs/examples/data/2408.09869v3_enriched.json
vendored
Normal file
File diff suppressed because one or more lines are too long
173
docs/examples/hybrid_chunking.ipynb
vendored
173
docs/examples/hybrid_chunking.ipynb
vendored
@ -44,14 +44,7 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"%pip install -qU docling transformers"
|
"%pip install -qU pip docling transformers"
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"## Conversion"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -59,11 +52,32 @@
|
|||||||
"execution_count": 2,
|
"execution_count": 2,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"DOC_SOURCE = \"../../tests/data/md/wiki.md\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Basic usage"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"We first convert the document:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from docling.document_converter import DocumentConverter\n",
|
"from docling.document_converter import DocumentConverter\n",
|
||||||
"\n",
|
"\n",
|
||||||
"DOC_SOURCE = \"../../tests/data/md/wiki.md\"\n",
|
|
||||||
"\n",
|
|
||||||
"doc = DocumentConverter().convert(source=DOC_SOURCE).document"
|
"doc = DocumentConverter().convert(source=DOC_SOURCE).document"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@ -71,17 +85,13 @@
|
|||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"## Chunking\n",
|
"For a basic chunking scenario, we can just instantiate a `HybridChunker`, which will use\n",
|
||||||
"\n",
|
|
||||||
"### Basic usage\n",
|
|
||||||
"\n",
|
|
||||||
"For a basic usage scenario, we can just instantiate a `HybridChunker`, which will use\n",
|
|
||||||
"the default parameters."
|
"the default parameters."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 4,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@ -111,12 +121,12 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"Note that the text you would typically want to embed is the context-enriched one as\n",
|
"Note that the text you would typically want to embed is the context-enriched one as\n",
|
||||||
"returned by the `serialize()` method:"
|
"returned by the `contextualize()` method:"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 4,
|
"execution_count": 5,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@ -126,25 +136,25 @@
|
|||||||
"=== 0 ===\n",
|
"=== 0 ===\n",
|
||||||
"chunk.text:\n",
|
"chunk.text:\n",
|
||||||
"'International Business Machines Corporation (using the trademark IBM), nicknamed Big Blue, is an American multinational technology company headquartered in Armonk, New York and present in over 175 countries.\\nIt is a publicly traded company and one of the 30 companies in the Dow Jones Industrial Aver…'\n",
|
"'International Business Machines Corporation (using the trademark IBM), nicknamed Big Blue, is an American multinational technology company headquartered in Armonk, New York and present in over 175 countries.\\nIt is a publicly traded company and one of the 30 companies in the Dow Jones Industrial Aver…'\n",
|
||||||
"chunker.serialize(chunk):\n",
|
"chunker.contextualize(chunk):\n",
|
||||||
"'IBM\\nInternational Business Machines Corporation (using the trademark IBM), nicknamed Big Blue, is an American multinational technology company headquartered in Armonk, New York and present in over 175 countries.\\nIt is a publicly traded company and one of the 30 companies in the Dow Jones Industrial …'\n",
|
"'IBM\\nInternational Business Machines Corporation (using the trademark IBM), nicknamed Big Blue, is an American multinational technology company headquartered in Armonk, New York and present in over 175 countries.\\nIt is a publicly traded company and one of the 30 companies in the Dow Jones Industrial …'\n",
|
||||||
"\n",
|
"\n",
|
||||||
"=== 1 ===\n",
|
"=== 1 ===\n",
|
||||||
"chunk.text:\n",
|
"chunk.text:\n",
|
||||||
"'IBM originated with several technological innovations developed and commercialized in the late 19th century. Julius E. Pitrap patented the computing scale in 1885;[17] Alexander Dey invented the dial recorder (1888);[18] Herman Hollerith patented the Electric Tabulating Machine (1889);[19] and Willa…'\n",
|
"'IBM originated with several technological innovations developed and commercialized in the late 19th century. Julius E. Pitrap patented the computing scale in 1885;[17] Alexander Dey invented the dial recorder (1888);[18] Herman Hollerith patented the Electric Tabulating Machine (1889);[19] and Willa…'\n",
|
||||||
"chunker.serialize(chunk):\n",
|
"chunker.contextualize(chunk):\n",
|
||||||
"'IBM\\n1910s–1950s\\nIBM originated with several technological innovations developed and commercialized in the late 19th century. Julius E. Pitrap patented the computing scale in 1885;[17] Alexander Dey invented the dial recorder (1888);[18] Herman Hollerith patented the Electric Tabulating Machine (1889…'\n",
|
"'IBM\\n1910s–1950s\\nIBM originated with several technological innovations developed and commercialized in the late 19th century. Julius E. Pitrap patented the computing scale in 1885;[17] Alexander Dey invented the dial recorder (1888);[18] Herman Hollerith patented the Electric Tabulating Machine (1889…'\n",
|
||||||
"\n",
|
"\n",
|
||||||
"=== 2 ===\n",
|
"=== 2 ===\n",
|
||||||
"chunk.text:\n",
|
"chunk.text:\n",
|
||||||
"'Collectively, the companies manufactured a wide array of machinery for sale and lease, ranging from commercial scales and industrial time recorders, meat and cheese slicers, to tabulators and punched cards. Thomas J. Watson, Sr., fired from the National Cash Register Company by John Henry Patterson,…'\n",
|
"'Collectively, the companies manufactured a wide array of machinery for sale and lease, ranging from commercial scales and industrial time recorders, meat and cheese slicers, to tabulators and punched cards. Thomas J. Watson, Sr., fired from the National Cash Register Company by John Henry Patterson,…'\n",
|
||||||
"chunker.serialize(chunk):\n",
|
"chunker.contextualize(chunk):\n",
|
||||||
"'IBM\\n1910s–1950s\\nCollectively, the companies manufactured a wide array of machinery for sale and lease, ranging from commercial scales and industrial time recorders, meat and cheese slicers, to tabulators and punched cards. Thomas J. Watson, Sr., fired from the National Cash Register Company by John …'\n",
|
"'IBM\\n1910s–1950s\\nCollectively, the companies manufactured a wide array of machinery for sale and lease, ranging from commercial scales and industrial time recorders, meat and cheese slicers, to tabulators and punched cards. Thomas J. Watson, Sr., fired from the National Cash Register Company by John …'\n",
|
||||||
"\n",
|
"\n",
|
||||||
"=== 3 ===\n",
|
"=== 3 ===\n",
|
||||||
"chunk.text:\n",
|
"chunk.text:\n",
|
||||||
"'In 1961, IBM developed the SABRE reservation system for American Airlines and introduced the highly successful Selectric typewriter.…'\n",
|
"'In 1961, IBM developed the SABRE reservation system for American Airlines and introduced the highly successful Selectric typewriter.…'\n",
|
||||||
"chunker.serialize(chunk):\n",
|
"chunker.contextualize(chunk):\n",
|
||||||
"'IBM\\n1960s–1980s\\nIn 1961, IBM developed the SABRE reservation system for American Airlines and introduced the highly successful Selectric typewriter.…'\n",
|
"'IBM\\n1960s–1980s\\nIn 1961, IBM developed the SABRE reservation system for American Airlines and introduced the highly successful Selectric typewriter.…'\n",
|
||||||
"\n"
|
"\n"
|
||||||
]
|
]
|
||||||
@ -155,8 +165,8 @@
|
|||||||
" print(f\"=== {i} ===\")\n",
|
" print(f\"=== {i} ===\")\n",
|
||||||
" print(f\"chunk.text:\\n{f'{chunk.text[:300]}…'!r}\")\n",
|
" print(f\"chunk.text:\\n{f'{chunk.text[:300]}…'!r}\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
" enriched_text = chunker.serialize(chunk=chunk)\n",
|
" enriched_text = chunker.contextualize(chunk=chunk)\n",
|
||||||
" print(f\"chunker.serialize(chunk):\\n{f'{enriched_text[:300]}…'!r}\")\n",
|
" print(f\"chunker.contextualize(chunk):\\n{f'{enriched_text[:300]}…'!r}\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
" print()"
|
" print()"
|
||||||
]
|
]
|
||||||
@ -165,23 +175,23 @@
|
|||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"### Advanced usage\n",
|
"## Configuring tokenization\n",
|
||||||
"\n",
|
"\n",
|
||||||
"For more control on the chunking, we can parametrize through the `HybridChunker`\n",
|
"For more control over the chunking, we can parametrize tokenization as shown below.\n",
|
||||||
"arguments illustrated below.\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"Notice how `tokenizer` and `embed_model` further below are single-sourced from\n",
|
"In a RAG / retrieval context, it is important to make sure that the chunker and\n",
|
||||||
"`EMBED_MODEL_ID`.\n",
|
"embedding model are using the same tokenizer.\n",
|
||||||
"This is important for making sure the chunker and the embedding model are using the same\n",
|
"\n",
|
||||||
"tokenizer."
|
"👉 HuggingFace transformers tokenizers can be used as shown in the following example:"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 5,
|
"execution_count": 6,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
"from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer\n",
|
||||||
"from transformers import AutoTokenizer\n",
|
"from transformers import AutoTokenizer\n",
|
||||||
"\n",
|
"\n",
|
||||||
"from docling.chunking import HybridChunker\n",
|
"from docling.chunking import HybridChunker\n",
|
||||||
@ -189,11 +199,50 @@
|
|||||||
"EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
|
"EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
|
||||||
"MAX_TOKENS = 64 # set to a small number for illustrative purposes\n",
|
"MAX_TOKENS = 64 # set to a small number for illustrative purposes\n",
|
||||||
"\n",
|
"\n",
|
||||||
"tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_ID)\n",
|
"tokenizer = HuggingFaceTokenizer(\n",
|
||||||
|
" tokenizer=AutoTokenizer.from_pretrained(EMBED_MODEL_ID),\n",
|
||||||
|
" max_tokens=MAX_TOKENS, # optional, by default derived from `tokenizer` for HF case\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"👉 Alternatively, [OpenAI tokenizers](https://github.com/openai/tiktoken) can be used as shown in the example below (uncomment to use — requires installing `docling-core[chunking-openai]`):"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# import tiktoken\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
"# from docling_core.transforms.chunker.tokenizer.openai import OpenAITokenizer\n",
|
||||||
|
"\n",
|
||||||
|
"# tokenizer = OpenAITokenizer(\n",
|
||||||
|
"# tokenizer=tiktoken.encoding_for_model(\"gpt-4o\"),\n",
|
||||||
|
"# max_tokens=128 * 1024, # context window length required for OpenAI tokenizers\n",
|
||||||
|
"# )"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"We can now instantiate our chunker:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
"chunker = HybridChunker(\n",
|
"chunker = HybridChunker(\n",
|
||||||
" tokenizer=tokenizer, # instance or model name, defaults to \"sentence-transformers/all-MiniLM-L6-v2\"\n",
|
" tokenizer=tokenizer,\n",
|
||||||
" max_tokens=MAX_TOKENS, # optional, by default derived from `tokenizer`\n",
|
|
||||||
" merge_peers=True, # optional, defaults to True\n",
|
" merge_peers=True, # optional, defaults to True\n",
|
||||||
")\n",
|
")\n",
|
||||||
"chunk_iter = chunker.chunk(dl_doc=doc)\n",
|
"chunk_iter = chunker.chunk(dl_doc=doc)\n",
|
||||||
@ -213,7 +262,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 6,
|
"execution_count": 9,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@ -223,127 +272,127 @@
|
|||||||
"=== 0 ===\n",
|
"=== 0 ===\n",
|
||||||
"chunk.text (55 tokens):\n",
|
"chunk.text (55 tokens):\n",
|
||||||
"'International Business Machines Corporation (using the trademark IBM), nicknamed Big Blue, is an American multinational technology company headquartered in Armonk, New York and present in over 175 countries.\\nIt is a publicly traded company and one of the 30 companies in the Dow Jones Industrial Average.'\n",
|
"'International Business Machines Corporation (using the trademark IBM), nicknamed Big Blue, is an American multinational technology company headquartered in Armonk, New York and present in over 175 countries.\\nIt is a publicly traded company and one of the 30 companies in the Dow Jones Industrial Average.'\n",
|
||||||
"chunker.serialize(chunk) (56 tokens):\n",
|
"chunker.contextualize(chunk) (56 tokens):\n",
|
||||||
"'IBM\\nInternational Business Machines Corporation (using the trademark IBM), nicknamed Big Blue, is an American multinational technology company headquartered in Armonk, New York and present in over 175 countries.\\nIt is a publicly traded company and one of the 30 companies in the Dow Jones Industrial Average.'\n",
|
"'IBM\\nInternational Business Machines Corporation (using the trademark IBM), nicknamed Big Blue, is an American multinational technology company headquartered in Armonk, New York and present in over 175 countries.\\nIt is a publicly traded company and one of the 30 companies in the Dow Jones Industrial Average.'\n",
|
||||||
"\n",
|
"\n",
|
||||||
"=== 1 ===\n",
|
"=== 1 ===\n",
|
||||||
"chunk.text (45 tokens):\n",
|
"chunk.text (45 tokens):\n",
|
||||||
"'IBM is the largest industrial research organization in the world, with 19 research facilities across a dozen countries, having held the record for most annual U.S. patents generated by a business for 29 consecutive years from 1993 to 2021.'\n",
|
"'IBM is the largest industrial research organization in the world, with 19 research facilities across a dozen countries, having held the record for most annual U.S. patents generated by a business for 29 consecutive years from 1993 to 2021.'\n",
|
||||||
"chunker.serialize(chunk) (46 tokens):\n",
|
"chunker.contextualize(chunk) (46 tokens):\n",
|
||||||
"'IBM\\nIBM is the largest industrial research organization in the world, with 19 research facilities across a dozen countries, having held the record for most annual U.S. patents generated by a business for 29 consecutive years from 1993 to 2021.'\n",
|
"'IBM\\nIBM is the largest industrial research organization in the world, with 19 research facilities across a dozen countries, having held the record for most annual U.S. patents generated by a business for 29 consecutive years from 1993 to 2021.'\n",
|
||||||
"\n",
|
"\n",
|
||||||
"=== 2 ===\n",
|
"=== 2 ===\n",
|
||||||
"chunk.text (63 tokens):\n",
|
"chunk.text (63 tokens):\n",
|
||||||
"'IBM was founded in 1911 as the Computing-Tabulating-Recording Company (CTR), a holding company of manufacturers of record-keeping and measuring systems. It was renamed \"International Business Machines\" in 1924 and soon became the leading manufacturer of punch-card tabulating systems. During the 1960s and 1970s, the'\n",
|
"'IBM was founded in 1911 as the Computing-Tabulating-Recording Company (CTR), a holding company of manufacturers of record-keeping and measuring systems. It was renamed \"International Business Machines\" in 1924 and soon became the leading manufacturer of punch-card tabulating systems. During the 1960s and 1970s, the'\n",
|
||||||
"chunker.serialize(chunk) (64 tokens):\n",
|
"chunker.contextualize(chunk) (64 tokens):\n",
|
||||||
"'IBM\\nIBM was founded in 1911 as the Computing-Tabulating-Recording Company (CTR), a holding company of manufacturers of record-keeping and measuring systems. It was renamed \"International Business Machines\" in 1924 and soon became the leading manufacturer of punch-card tabulating systems. During the 1960s and 1970s, the'\n",
|
"'IBM\\nIBM was founded in 1911 as the Computing-Tabulating-Recording Company (CTR), a holding company of manufacturers of record-keeping and measuring systems. It was renamed \"International Business Machines\" in 1924 and soon became the leading manufacturer of punch-card tabulating systems. During the 1960s and 1970s, the'\n",
|
||||||
"\n",
|
"\n",
|
||||||
"=== 3 ===\n",
|
"=== 3 ===\n",
|
||||||
"chunk.text (44 tokens):\n",
|
"chunk.text (44 tokens):\n",
|
||||||
"\"IBM mainframe, exemplified by the System/360, was the world's dominant computing platform, with the company producing 80 percent of computers in the U.S. and 70 percent of computers worldwide.[11]\"\n",
|
"\"IBM mainframe, exemplified by the System/360, was the world's dominant computing platform, with the company producing 80 percent of computers in the U.S. and 70 percent of computers worldwide.[11]\"\n",
|
||||||
"chunker.serialize(chunk) (45 tokens):\n",
|
"chunker.contextualize(chunk) (45 tokens):\n",
|
||||||
"\"IBM\\nIBM mainframe, exemplified by the System/360, was the world's dominant computing platform, with the company producing 80 percent of computers in the U.S. and 70 percent of computers worldwide.[11]\"\n",
|
"\"IBM\\nIBM mainframe, exemplified by the System/360, was the world's dominant computing platform, with the company producing 80 percent of computers in the U.S. and 70 percent of computers worldwide.[11]\"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"=== 4 ===\n",
|
"=== 4 ===\n",
|
||||||
"chunk.text (63 tokens):\n",
|
"chunk.text (63 tokens):\n",
|
||||||
"'IBM debuted in the microcomputer market in 1981 with the IBM Personal Computer, — its DOS software provided by Microsoft, — which became the basis for the majority of personal computers to the present day.[12] The company later also found success in the portable space with the ThinkPad. Since the 1990s,'\n",
|
"'IBM debuted in the microcomputer market in 1981 with the IBM Personal Computer, — its DOS software provided by Microsoft, — which became the basis for the majority of personal computers to the present day.[12] The company later also found success in the portable space with the ThinkPad. Since the 1990s,'\n",
|
||||||
"chunker.serialize(chunk) (64 tokens):\n",
|
"chunker.contextualize(chunk) (64 tokens):\n",
|
||||||
"'IBM\\nIBM debuted in the microcomputer market in 1981 with the IBM Personal Computer, — its DOS software provided by Microsoft, — which became the basis for the majority of personal computers to the present day.[12] The company later also found success in the portable space with the ThinkPad. Since the 1990s,'\n",
|
"'IBM\\nIBM debuted in the microcomputer market in 1981 with the IBM Personal Computer, — its DOS software provided by Microsoft, — which became the basis for the majority of personal computers to the present day.[12] The company later also found success in the portable space with the ThinkPad. Since the 1990s,'\n",
|
||||||
"\n",
|
"\n",
|
||||||
"=== 5 ===\n",
|
"=== 5 ===\n",
|
||||||
"chunk.text (61 tokens):\n",
|
"chunk.text (61 tokens):\n",
|
||||||
"'IBM has concentrated on computer services, software, supercomputers, and scientific research; it sold its microcomputer division to Lenovo in 2005. IBM continues to develop mainframes, and its supercomputers have consistently ranked among the most powerful in the world in the 21st century.'\n",
|
"'IBM has concentrated on computer services, software, supercomputers, and scientific research; it sold its microcomputer division to Lenovo in 2005. IBM continues to develop mainframes, and its supercomputers have consistently ranked among the most powerful in the world in the 21st century.'\n",
|
||||||
"chunker.serialize(chunk) (62 tokens):\n",
|
"chunker.contextualize(chunk) (62 tokens):\n",
|
||||||
"'IBM\\nIBM has concentrated on computer services, software, supercomputers, and scientific research; it sold its microcomputer division to Lenovo in 2005. IBM continues to develop mainframes, and its supercomputers have consistently ranked among the most powerful in the world in the 21st century.'\n",
|
"'IBM\\nIBM has concentrated on computer services, software, supercomputers, and scientific research; it sold its microcomputer division to Lenovo in 2005. IBM continues to develop mainframes, and its supercomputers have consistently ranked among the most powerful in the world in the 21st century.'\n",
|
||||||
"\n",
|
"\n",
|
||||||
"=== 6 ===\n",
|
"=== 6 ===\n",
|
||||||
"chunk.text (62 tokens):\n",
|
"chunk.text (62 tokens):\n",
|
||||||
"\"As one of the world's oldest and largest technology companies, IBM has been responsible for several technological innovations, including the automated teller machine (ATM), dynamic random-access memory (DRAM), the floppy disk, the hard disk drive, the magnetic stripe card, the relational database, the SQL programming\"\n",
|
"\"As one of the world's oldest and largest technology companies, IBM has been responsible for several technological innovations, including the automated teller machine (ATM), dynamic random-access memory (DRAM), the floppy disk, the hard disk drive, the magnetic stripe card, the relational database, the SQL programming\"\n",
|
||||||
"chunker.serialize(chunk) (63 tokens):\n",
|
"chunker.contextualize(chunk) (63 tokens):\n",
|
||||||
"\"IBM\\nAs one of the world's oldest and largest technology companies, IBM has been responsible for several technological innovations, including the automated teller machine (ATM), dynamic random-access memory (DRAM), the floppy disk, the hard disk drive, the magnetic stripe card, the relational database, the SQL programming\"\n",
|
"\"IBM\\nAs one of the world's oldest and largest technology companies, IBM has been responsible for several technological innovations, including the automated teller machine (ATM), dynamic random-access memory (DRAM), the floppy disk, the hard disk drive, the magnetic stripe card, the relational database, the SQL programming\"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"=== 7 ===\n",
|
"=== 7 ===\n",
|
||||||
"chunk.text (63 tokens):\n",
|
"chunk.text (63 tokens):\n",
|
||||||
"'language, and the UPC barcode. The company has made inroads in advanced computer chips, quantum computing, artificial intelligence, and data infrastructure.[13][14][15] IBM employees and alumni have won various recognitions for their scientific research and inventions, including six Nobel Prizes and six Turing'\n",
|
"'language, and the UPC barcode. The company has made inroads in advanced computer chips, quantum computing, artificial intelligence, and data infrastructure.[13][14][15] IBM employees and alumni have won various recognitions for their scientific research and inventions, including six Nobel Prizes and six Turing'\n",
|
||||||
"chunker.serialize(chunk) (64 tokens):\n",
|
"chunker.contextualize(chunk) (64 tokens):\n",
|
||||||
"'IBM\\nlanguage, and the UPC barcode. The company has made inroads in advanced computer chips, quantum computing, artificial intelligence, and data infrastructure.[13][14][15] IBM employees and alumni have won various recognitions for their scientific research and inventions, including six Nobel Prizes and six Turing'\n",
|
"'IBM\\nlanguage, and the UPC barcode. The company has made inroads in advanced computer chips, quantum computing, artificial intelligence, and data infrastructure.[13][14][15] IBM employees and alumni have won various recognitions for their scientific research and inventions, including six Nobel Prizes and six Turing'\n",
|
||||||
"\n",
|
"\n",
|
||||||
"=== 8 ===\n",
|
"=== 8 ===\n",
|
||||||
"chunk.text (5 tokens):\n",
|
"chunk.text (5 tokens):\n",
|
||||||
"'Awards.[16]'\n",
|
"'Awards.[16]'\n",
|
||||||
"chunker.serialize(chunk) (6 tokens):\n",
|
"chunker.contextualize(chunk) (6 tokens):\n",
|
||||||
"'IBM\\nAwards.[16]'\n",
|
"'IBM\\nAwards.[16]'\n",
|
||||||
"\n",
|
"\n",
|
||||||
"=== 9 ===\n",
|
"=== 9 ===\n",
|
||||||
"chunk.text (56 tokens):\n",
|
"chunk.text (56 tokens):\n",
|
||||||
"'IBM originated with several technological innovations developed and commercialized in the late 19th century. Julius E. Pitrap patented the computing scale in 1885;[17] Alexander Dey invented the dial recorder (1888);[18] Herman Hollerith patented the Electric Tabulating Machine'\n",
|
"'IBM originated with several technological innovations developed and commercialized in the late 19th century. Julius E. Pitrap patented the computing scale in 1885;[17] Alexander Dey invented the dial recorder (1888);[18] Herman Hollerith patented the Electric Tabulating Machine'\n",
|
||||||
"chunker.serialize(chunk) (60 tokens):\n",
|
"chunker.contextualize(chunk) (60 tokens):\n",
|
||||||
"'IBM\\n1910s–1950s\\nIBM originated with several technological innovations developed and commercialized in the late 19th century. Julius E. Pitrap patented the computing scale in 1885;[17] Alexander Dey invented the dial recorder (1888);[18] Herman Hollerith patented the Electric Tabulating Machine'\n",
|
"'IBM\\n1910s–1950s\\nIBM originated with several technological innovations developed and commercialized in the late 19th century. Julius E. Pitrap patented the computing scale in 1885;[17] Alexander Dey invented the dial recorder (1888);[18] Herman Hollerith patented the Electric Tabulating Machine'\n",
|
||||||
"\n",
|
"\n",
|
||||||
"=== 10 ===\n",
|
"=== 10 ===\n",
|
||||||
"chunk.text (60 tokens):\n",
|
"chunk.text (60 tokens):\n",
|
||||||
"\"(1889);[19] and Willard Bundy invented a time clock to record workers' arrival and departure times on a paper tape (1889).[20] On June 16, 1911, their four companies were amalgamated in New York State by Charles Ranlett Flint forming a fifth company, the\"\n",
|
"\"(1889);[19] and Willard Bundy invented a time clock to record workers' arrival and departure times on a paper tape (1889).[20] On June 16, 1911, their four companies were amalgamated in New York State by Charles Ranlett Flint forming a fifth company, the\"\n",
|
||||||
"chunker.serialize(chunk) (64 tokens):\n",
|
"chunker.contextualize(chunk) (64 tokens):\n",
|
||||||
"\"IBM\\n1910s–1950s\\n(1889);[19] and Willard Bundy invented a time clock to record workers' arrival and departure times on a paper tape (1889).[20] On June 16, 1911, their four companies were amalgamated in New York State by Charles Ranlett Flint forming a fifth company, the\"\n",
|
"\"IBM\\n1910s–1950s\\n(1889);[19] and Willard Bundy invented a time clock to record workers' arrival and departure times on a paper tape (1889).[20] On June 16, 1911, their four companies were amalgamated in New York State by Charles Ranlett Flint forming a fifth company, the\"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"=== 11 ===\n",
|
"=== 11 ===\n",
|
||||||
"chunk.text (59 tokens):\n",
|
"chunk.text (59 tokens):\n",
|
||||||
"'Computing-Tabulating-Recording Company (CTR) based in Endicott, New York.[1][21] The five companies had 1,300 employees and offices and plants in Endicott and Binghamton, New York; Dayton, Ohio; Detroit, Michigan; Washington,'\n",
|
"'Computing-Tabulating-Recording Company (CTR) based in Endicott, New York.[1][21] The five companies had 1,300 employees and offices and plants in Endicott and Binghamton, New York; Dayton, Ohio; Detroit, Michigan; Washington,'\n",
|
||||||
"chunker.serialize(chunk) (63 tokens):\n",
|
"chunker.contextualize(chunk) (63 tokens):\n",
|
||||||
"'IBM\\n1910s–1950s\\nComputing-Tabulating-Recording Company (CTR) based in Endicott, New York.[1][21] The five companies had 1,300 employees and offices and plants in Endicott and Binghamton, New York; Dayton, Ohio; Detroit, Michigan; Washington,'\n",
|
"'IBM\\n1910s–1950s\\nComputing-Tabulating-Recording Company (CTR) based in Endicott, New York.[1][21] The five companies had 1,300 employees and offices and plants in Endicott and Binghamton, New York; Dayton, Ohio; Detroit, Michigan; Washington,'\n",
|
||||||
"\n",
|
"\n",
|
||||||
"=== 12 ===\n",
|
"=== 12 ===\n",
|
||||||
"chunk.text (13 tokens):\n",
|
"chunk.text (13 tokens):\n",
|
||||||
"'D.C.; and Toronto, Canada.[22]'\n",
|
"'D.C.; and Toronto, Canada.[22]'\n",
|
||||||
"chunker.serialize(chunk) (17 tokens):\n",
|
"chunker.contextualize(chunk) (17 tokens):\n",
|
||||||
"'IBM\\n1910s–1950s\\nD.C.; and Toronto, Canada.[22]'\n",
|
"'IBM\\n1910s–1950s\\nD.C.; and Toronto, Canada.[22]'\n",
|
||||||
"\n",
|
"\n",
|
||||||
"=== 13 ===\n",
|
"=== 13 ===\n",
|
||||||
"chunk.text (60 tokens):\n",
|
"chunk.text (60 tokens):\n",
|
||||||
"'Collectively, the companies manufactured a wide array of machinery for sale and lease, ranging from commercial scales and industrial time recorders, meat and cheese slicers, to tabulators and punched cards. Thomas J. Watson, Sr., fired from the National Cash Register Company by John Henry Patterson, called'\n",
|
"'Collectively, the companies manufactured a wide array of machinery for sale and lease, ranging from commercial scales and industrial time recorders, meat and cheese slicers, to tabulators and punched cards. Thomas J. Watson, Sr., fired from the National Cash Register Company by John Henry Patterson, called'\n",
|
||||||
"chunker.serialize(chunk) (64 tokens):\n",
|
"chunker.contextualize(chunk) (64 tokens):\n",
|
||||||
"'IBM\\n1910s–1950s\\nCollectively, the companies manufactured a wide array of machinery for sale and lease, ranging from commercial scales and industrial time recorders, meat and cheese slicers, to tabulators and punched cards. Thomas J. Watson, Sr., fired from the National Cash Register Company by John Henry Patterson, called'\n",
|
"'IBM\\n1910s–1950s\\nCollectively, the companies manufactured a wide array of machinery for sale and lease, ranging from commercial scales and industrial time recorders, meat and cheese slicers, to tabulators and punched cards. Thomas J. Watson, Sr., fired from the National Cash Register Company by John Henry Patterson, called'\n",
|
||||||
"\n",
|
"\n",
|
||||||
"=== 14 ===\n",
|
"=== 14 ===\n",
|
||||||
"chunk.text (59 tokens):\n",
|
"chunk.text (59 tokens):\n",
|
||||||
"\"on Flint and, in 1914, was offered a position at CTR.[23] Watson joined CTR as general manager and then, 11 months later, was made President when antitrust cases relating to his time at NCR were resolved.[24] Having learned Patterson's pioneering business\"\n",
|
"\"on Flint and, in 1914, was offered a position at CTR.[23] Watson joined CTR as general manager and then, 11 months later, was made President when antitrust cases relating to his time at NCR were resolved.[24] Having learned Patterson's pioneering business\"\n",
|
||||||
"chunker.serialize(chunk) (63 tokens):\n",
|
"chunker.contextualize(chunk) (63 tokens):\n",
|
||||||
"\"IBM\\n1910s–1950s\\non Flint and, in 1914, was offered a position at CTR.[23] Watson joined CTR as general manager and then, 11 months later, was made President when antitrust cases relating to his time at NCR were resolved.[24] Having learned Patterson's pioneering business\"\n",
|
"\"IBM\\n1910s–1950s\\non Flint and, in 1914, was offered a position at CTR.[23] Watson joined CTR as general manager and then, 11 months later, was made President when antitrust cases relating to his time at NCR were resolved.[24] Having learned Patterson's pioneering business\"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"=== 15 ===\n",
|
"=== 15 ===\n",
|
||||||
"chunk.text (23 tokens):\n",
|
"chunk.text (23 tokens):\n",
|
||||||
"\"practices, Watson proceeded to put the stamp of NCR onto CTR's companies.[23]:\\n105\"\n",
|
"\"practices, Watson proceeded to put the stamp of NCR onto CTR's companies.[23]:\\n105\"\n",
|
||||||
"chunker.serialize(chunk) (27 tokens):\n",
|
"chunker.contextualize(chunk) (27 tokens):\n",
|
||||||
"\"IBM\\n1910s–1950s\\npractices, Watson proceeded to put the stamp of NCR onto CTR's companies.[23]:\\n105\"\n",
|
"\"IBM\\n1910s–1950s\\npractices, Watson proceeded to put the stamp of NCR onto CTR's companies.[23]:\\n105\"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"=== 16 ===\n",
|
"=== 16 ===\n",
|
||||||
"chunk.text (59 tokens):\n",
|
"chunk.text (59 tokens):\n",
|
||||||
"'He implemented sales conventions, \"generous sales incentives, a focus on customer service, an insistence on well-groomed, dark-suited salesmen and had an evangelical fervor for instilling company pride and loyalty in every worker\".[25][26] His favorite slogan,'\n",
|
"'He implemented sales conventions, \"generous sales incentives, a focus on customer service, an insistence on well-groomed, dark-suited salesmen and had an evangelical fervor for instilling company pride and loyalty in every worker\".[25][26] His favorite slogan,'\n",
|
||||||
"chunker.serialize(chunk) (63 tokens):\n",
|
"chunker.contextualize(chunk) (63 tokens):\n",
|
||||||
"'IBM\\n1910s–1950s\\nHe implemented sales conventions, \"generous sales incentives, a focus on customer service, an insistence on well-groomed, dark-suited salesmen and had an evangelical fervor for instilling company pride and loyalty in every worker\".[25][26] His favorite slogan,'\n",
|
"'IBM\\n1910s–1950s\\nHe implemented sales conventions, \"generous sales incentives, a focus on customer service, an insistence on well-groomed, dark-suited salesmen and had an evangelical fervor for instilling company pride and loyalty in every worker\".[25][26] His favorite slogan,'\n",
|
||||||
"\n",
|
"\n",
|
||||||
"=== 17 ===\n",
|
"=== 17 ===\n",
|
||||||
"chunk.text (60 tokens):\n",
|
"chunk.text (60 tokens):\n",
|
||||||
"'\"THINK\", became a mantra for each company\\'s employees.[25] During Watson\\'s first four years, revenues reached $9 million ($158 million today) and the company\\'s operations expanded to Europe, South America, Asia and Australia.[25] Watson never liked the'\n",
|
"'\"THINK\", became a mantra for each company\\'s employees.[25] During Watson\\'s first four years, revenues reached $9 million ($158 million today) and the company\\'s operations expanded to Europe, South America, Asia and Australia.[25] Watson never liked the'\n",
|
||||||
"chunker.serialize(chunk) (64 tokens):\n",
|
"chunker.contextualize(chunk) (64 tokens):\n",
|
||||||
"'IBM\\n1910s–1950s\\n\"THINK\", became a mantra for each company\\'s employees.[25] During Watson\\'s first four years, revenues reached $9 million ($158 million today) and the company\\'s operations expanded to Europe, South America, Asia and Australia.[25] Watson never liked the'\n",
|
"'IBM\\n1910s–1950s\\n\"THINK\", became a mantra for each company\\'s employees.[25] During Watson\\'s first four years, revenues reached $9 million ($158 million today) and the company\\'s operations expanded to Europe, South America, Asia and Australia.[25] Watson never liked the'\n",
|
||||||
"\n",
|
"\n",
|
||||||
"=== 18 ===\n",
|
"=== 18 ===\n",
|
||||||
"chunk.text (57 tokens):\n",
|
"chunk.text (57 tokens):\n",
|
||||||
"'clumsy hyphenated name \"Computing-Tabulating-Recording Company\" and chose to replace it with the more expansive title \"International Business Machines\" which had previously been used as the name of CTR\\'s Canadian Division;[27] the name was changed on February 14,'\n",
|
"'clumsy hyphenated name \"Computing-Tabulating-Recording Company\" and chose to replace it with the more expansive title \"International Business Machines\" which had previously been used as the name of CTR\\'s Canadian Division;[27] the name was changed on February 14,'\n",
|
||||||
"chunker.serialize(chunk) (61 tokens):\n",
|
"chunker.contextualize(chunk) (61 tokens):\n",
|
||||||
"'IBM\\n1910s–1950s\\nclumsy hyphenated name \"Computing-Tabulating-Recording Company\" and chose to replace it with the more expansive title \"International Business Machines\" which had previously been used as the name of CTR\\'s Canadian Division;[27] the name was changed on February 14,'\n",
|
"'IBM\\n1910s–1950s\\nclumsy hyphenated name \"Computing-Tabulating-Recording Company\" and chose to replace it with the more expansive title \"International Business Machines\" which had previously been used as the name of CTR\\'s Canadian Division;[27] the name was changed on February 14,'\n",
|
||||||
"\n",
|
"\n",
|
||||||
"=== 19 ===\n",
|
"=== 19 ===\n",
|
||||||
"chunk.text (21 tokens):\n",
|
"chunk.text (21 tokens):\n",
|
||||||
"'1924.[28] By 1933, most of the subsidiaries had been merged into one company, IBM.'\n",
|
"'1924.[28] By 1933, most of the subsidiaries had been merged into one company, IBM.'\n",
|
||||||
"chunker.serialize(chunk) (25 tokens):\n",
|
"chunker.contextualize(chunk) (25 tokens):\n",
|
||||||
"'IBM\\n1910s–1950s\\n1924.[28] By 1933, most of the subsidiaries had been merged into one company, IBM.'\n",
|
"'IBM\\n1910s–1950s\\n1924.[28] By 1933, most of the subsidiaries had been merged into one company, IBM.'\n",
|
||||||
"\n",
|
"\n",
|
||||||
"=== 20 ===\n",
|
"=== 20 ===\n",
|
||||||
"chunk.text (22 tokens):\n",
|
"chunk.text (22 tokens):\n",
|
||||||
"'In 1961, IBM developed the SABRE reservation system for American Airlines and introduced the highly successful Selectric typewriter.'\n",
|
"'In 1961, IBM developed the SABRE reservation system for American Airlines and introduced the highly successful Selectric typewriter.'\n",
|
||||||
"chunker.serialize(chunk) (26 tokens):\n",
|
"chunker.contextualize(chunk) (26 tokens):\n",
|
||||||
"'IBM\\n1960s–1980s\\nIn 1961, IBM developed the SABRE reservation system for American Airlines and introduced the highly successful Selectric typewriter.'\n",
|
"'IBM\\n1960s–1980s\\nIn 1961, IBM developed the SABRE reservation system for American Airlines and introduced the highly successful Selectric typewriter.'\n",
|
||||||
"\n"
|
"\n"
|
||||||
]
|
]
|
||||||
@ -352,12 +401,12 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"for i, chunk in enumerate(chunks):\n",
|
"for i, chunk in enumerate(chunks):\n",
|
||||||
" print(f\"=== {i} ===\")\n",
|
" print(f\"=== {i} ===\")\n",
|
||||||
" txt_tokens = len(tokenizer.tokenize(chunk.text))\n",
|
" txt_tokens = tokenizer.count_tokens(chunk.text)\n",
|
||||||
" print(f\"chunk.text ({txt_tokens} tokens):\\n{chunk.text!r}\")\n",
|
" print(f\"chunk.text ({txt_tokens} tokens):\\n{chunk.text!r}\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
" ser_txt = chunker.serialize(chunk=chunk)\n",
|
" ser_txt = chunker.contextualize(chunk=chunk)\n",
|
||||||
" ser_tokens = len(tokenizer.tokenize(ser_txt))\n",
|
" ser_tokens = tokenizer.count_tokens(ser_txt)\n",
|
||||||
" print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{ser_txt!r}\")\n",
|
" print(f\"chunker.contextualize(chunk) ({ser_tokens} tokens):\\n{ser_txt!r}\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
" print()"
|
" print()"
|
||||||
]
|
]
|
||||||
@ -379,7 +428,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.12.7"
|
"version": "3.13.2"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
6
docs/examples/minimal.py
vendored
6
docs/examples/minimal.py
vendored
@ -1,7 +1,9 @@
|
|||||||
from docling.document_converter import DocumentConverter
|
from docling.document_converter import DocumentConverter
|
||||||
|
|
||||||
source = "https://arxiv.org/pdf/2408.09869" # document per local path or URL
|
source = "https://arxiv.org/pdf/2408.09869" # document per local path or URL
|
||||||
|
|
||||||
converter = DocumentConverter()
|
converter = DocumentConverter()
|
||||||
result = converter.convert(source)
|
doc = converter.convert(source).document
|
||||||
print(result.document.export_to_markdown())
|
|
||||||
|
print(doc.export_to_markdown())
|
||||||
# output: ## Docling Technical Report [...]"
|
# output: ## Docling Technical Report [...]"
|
||||||
|
665
docs/examples/serialization.ipynb
vendored
Normal file
665
docs/examples/serialization.ipynb
vendored
Normal file
@ -0,0 +1,665 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Serialization"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Overview"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"In this notebook we showcase the usage of Docling [serializers](../../concepts/serialization)."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Setup"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Note: you may need to restart the kernel to use updated packages.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"%pip install -qU pip docling docling-core~=2.29 rich"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"DOC_SOURCE = \"https://arxiv.org/pdf/2311.18481\"\n",
|
||||||
|
"\n",
|
||||||
|
"# we set some start-stop cues for defining an excerpt to print\n",
|
||||||
|
"start_cue = \"Copyright © 2024\"\n",
|
||||||
|
"stop_cue = \"Application of NLP to ESG\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from rich.console import Console\n",
|
||||||
|
"from rich.panel import Panel\n",
|
||||||
|
"\n",
|
||||||
|
"console = Console(width=210) # for preventing Markdown table wrapped rendering\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"def print_in_console(text):\n",
|
||||||
|
" console.print(Panel(text))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Basic usage"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"We first convert the document:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"/Users/pva/work/github.com/DS4SD/docling/.venv/lib/python3.13/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n",
|
||||||
|
" warnings.warn(warn_msg)\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from docling.document_converter import DocumentConverter\n",
|
||||||
|
"\n",
|
||||||
|
"converter = DocumentConverter()\n",
|
||||||
|
"doc = converter.convert(source=DOC_SOURCE).document"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"We can now apply any `BaseDocSerializer` on the produced document.\n",
|
||||||
|
"\n",
|
||||||
|
"👉 Note that, to keep the shown output brief, we only print an excerpt.\n",
|
||||||
|
"\n",
|
||||||
|
"E.g. below we apply an `HTMLDocSerializer`:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">╭────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
|
||||||
|
"│ Copyright © 2024, Association for the Advancement of Artificial Intelligence (www.aaai.org). All rights reserved.</p> │\n",
|
||||||
|
"│ <table><tbody><tr><th>Report</th><th>Question</th><th>Answer</th></tr><tr><td>IBM 2022</td><td>How many hours were spent on employee learning in 2021?</td><td>22.5 million hours</td></tr><tr><td>IBM │\n",
|
||||||
|
"│ 2022</td><td>What was the rate of fatalities in 2021?</td><td>The rate of fatalities in 2021 was 0.0016.</td></tr><tr><td>IBM 2022</td><td>How many full audits were con- ducted in 2022 in │\n",
|
||||||
|
"│ India?</td><td>2</td></tr><tr><td>Starbucks 2022</td><td>What is the percentage of women in the Board of Directors?</td><td>25%</td></tr><tr><td>Starbucks 2022</td><td>What was the total energy con- │\n",
|
||||||
|
"│ sumption in 2021?</td><td>According to the table, the total energy consumption in 2021 was 2,491,543 MWh.</td></tr><tr><td>Starbucks 2022</td><td>How much packaging material was made from renewable mate- │\n",
|
||||||
|
"│ rials?</td><td>According to the given data, 31% of packaging materials were made from recycled or renewable materials in FY22.</td></tr></tbody></table> │\n",
|
||||||
|
"│ <p>Table 1: Example question answers from the ESG reports of IBM and Starbucks using Deep Search DocQA system.</p> │\n",
|
||||||
|
"│ <p>ESG report in our library via our QA conversational assistant. Our assistant generates answers and also presents the information (paragraph or table), in the ESG report, from which it has generated the │\n",
|
||||||
|
"│ response.</p> │\n",
|
||||||
|
"│ <h2>Related Work</h2> │\n",
|
||||||
|
"│ <p>The DocQA integrates multiple AI technologies, namely:</p> │\n",
|
||||||
|
"│ <p>Document Conversion: Converting unstructured documents, such as PDF files, into a machine-readable format is a challenging task in AI. Early strategies for document conversion were based on geometric │\n",
|
||||||
|
"│ layout analysis (Cattoni et al. 2000; Breuel 2002). Thanks to the availability of large annotated datasets (PubLayNet (Zhong et al. 2019), DocBank (Li et al. 2020), DocLayNet (Pfitzmann et al. 2022; Auer et │\n",
|
||||||
|
"│ al. 2023), deep learning-based methods are routinely used. Modern approaches for recovering the structure of a document can be broadly divided into two categories: image-based or PDF representation-based . │\n",
|
||||||
|
"│ Imagebased methods usually employ Transformer or CNN architectures on the images of pages (Zhang et al. 2023; Li et al. 2022; Huang et al. 2022). On the other hand, deep learning-</p> │\n",
|
||||||
|
"│ <figure><figcaption>Figure 1: System architecture: Simplified sketch of document question-answering pipeline.</figcaption></figure> │\n",
|
||||||
|
"│ <p>based language processing methods are applied on the native PDF content (generated by a single PDF printing command) (Auer et al. 2022; Livathinos et al. 2021; Staar et al. 2018).</p> │\n",
|
||||||
|
"│ <p> │\n",
|
||||||
|
"╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
|
||||||
|
"</pre>\n"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"╭────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
|
||||||
|
"│ Copyright © 2024, Association for the Advancement of Artificial Intelligence (www.aaai.org). All rights reserved.</p> │\n",
|
||||||
|
"│ <table><tbody><tr><th>Report</th><th>Question</th><th>Answer</th></tr><tr><td>IBM 2022</td><td>How many hours were spent on employee learning in 2021?</td><td>22.5 million hours</td></tr><tr><td>IBM │\n",
|
||||||
|
"│ 2022</td><td>What was the rate of fatalities in 2021?</td><td>The rate of fatalities in 2021 was 0.0016.</td></tr><tr><td>IBM 2022</td><td>How many full audits were con- ducted in 2022 in │\n",
|
||||||
|
"│ India?</td><td>2</td></tr><tr><td>Starbucks 2022</td><td>What is the percentage of women in the Board of Directors?</td><td>25%</td></tr><tr><td>Starbucks 2022</td><td>What was the total energy con- │\n",
|
||||||
|
"│ sumption in 2021?</td><td>According to the table, the total energy consumption in 2021 was 2,491,543 MWh.</td></tr><tr><td>Starbucks 2022</td><td>How much packaging material was made from renewable mate- │\n",
|
||||||
|
"│ rials?</td><td>According to the given data, 31% of packaging materials were made from recycled or renewable materials in FY22.</td></tr></tbody></table> │\n",
|
||||||
|
"│ <p>Table 1: Example question answers from the ESG reports of IBM and Starbucks using Deep Search DocQA system.</p> │\n",
|
||||||
|
"│ <p>ESG report in our library via our QA conversational assistant. Our assistant generates answers and also presents the information (paragraph or table), in the ESG report, from which it has generated the │\n",
|
||||||
|
"│ response.</p> │\n",
|
||||||
|
"│ <h2>Related Work</h2> │\n",
|
||||||
|
"│ <p>The DocQA integrates multiple AI technologies, namely:</p> │\n",
|
||||||
|
"│ <p>Document Conversion: Converting unstructured documents, such as PDF files, into a machine-readable format is a challenging task in AI. Early strategies for document conversion were based on geometric │\n",
|
||||||
|
"│ layout analysis (Cattoni et al. 2000; Breuel 2002). Thanks to the availability of large annotated datasets (PubLayNet (Zhong et al. 2019), DocBank (Li et al. 2020), DocLayNet (Pfitzmann et al. 2022; Auer et │\n",
|
||||||
|
"│ al. 2023), deep learning-based methods are routinely used. Modern approaches for recovering the structure of a document can be broadly divided into two categories: image-based or PDF representation-based . │\n",
|
||||||
|
"│ Imagebased methods usually employ Transformer or CNN architectures on the images of pages (Zhang et al. 2023; Li et al. 2022; Huang et al. 2022). On the other hand, deep learning-</p> │\n",
|
||||||
|
"│ <figure><figcaption>Figure 1: System architecture: Simplified sketch of document question-answering pipeline.</figcaption></figure> │\n",
|
||||||
|
"│ <p>based language processing methods are applied on the native PDF content (generated by a single PDF printing command) (Auer et al. 2022; Livathinos et al. 2021; Staar et al. 2018).</p> │\n",
|
||||||
|
"│ <p> │\n",
|
||||||
|
"╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from docling_core.transforms.serializer.html import HTMLDocSerializer\n",
|
||||||
|
"\n",
|
||||||
|
"serializer = HTMLDocSerializer(doc=doc)\n",
|
||||||
|
"ser_result = serializer.serialize()\n",
|
||||||
|
"ser_text = ser_result.text\n",
|
||||||
|
"\n",
|
||||||
|
"# here we only print an excerpt to keep the output brief:\n",
|
||||||
|
"print_in_console(ser_text[ser_text.find(start_cue) : ser_text.find(stop_cue)])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"In the following example, we use a `MarkdownDocSerializer`:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">╭────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
|
||||||
|
"│ Copyright © 2024, Association for the Advancement of Artificial Intelligence (www.aaai.org). All rights reserved. │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ | Report | Question | Answer | │\n",
|
||||||
|
"│ |----------------|------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------| │\n",
|
||||||
|
"│ | IBM 2022 | How many hours were spent on employee learning in 2021? | 22.5 million hours | │\n",
|
||||||
|
"│ | IBM 2022 | What was the rate of fatalities in 2021? | The rate of fatalities in 2021 was 0.0016. | │\n",
|
||||||
|
"│ | IBM 2022 | How many full audits were con- ducted in 2022 in India? | 2 | │\n",
|
||||||
|
"│ | Starbucks 2022 | What is the percentage of women in the Board of Directors? | 25% | │\n",
|
||||||
|
"│ | Starbucks 2022 | What was the total energy con- sumption in 2021? | According to the table, the total energy consumption in 2021 was 2,491,543 MWh. | │\n",
|
||||||
|
"│ | Starbucks 2022 | How much packaging material was made from renewable mate- rials? | According to the given data, 31% of packaging materials were made from recycled or renewable materials in FY22. | │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ Table 1: Example question answers from the ESG reports of IBM and Starbucks using Deep Search DocQA system. │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ ESG report in our library via our QA conversational assistant. Our assistant generates answers and also presents the information (paragraph or table), in the ESG report, from which it has generated the │\n",
|
||||||
|
"│ response. │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ ## Related Work │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ The DocQA integrates multiple AI technologies, namely: │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ Document Conversion: Converting unstructured documents, such as PDF files, into a machine-readable format is a challenging task in AI. Early strategies for document conversion were based on geometric layout │\n",
|
||||||
|
"│ analysis (Cattoni et al. 2000; Breuel 2002). Thanks to the availability of large annotated datasets (PubLayNet (Zhong et al. 2019), DocBank (Li et al. 2020), DocLayNet (Pfitzmann et al. 2022; Auer et al. │\n",
|
||||||
|
"│ 2023), deep learning-based methods are routinely used. Modern approaches for recovering the structure of a document can be broadly divided into two categories: image-based or PDF representation-based . │\n",
|
||||||
|
"│ Imagebased methods usually employ Transformer or CNN architectures on the images of pages (Zhang et al. 2023; Li et al. 2022; Huang et al. 2022). On the other hand, deep learning- │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ Figure 1: System architecture: Simplified sketch of document question-answering pipeline. │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ <!-- image --> │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ based language processing methods are applied on the native PDF content (generated by a single PDF printing command) (Auer et al. 2022; Livathinos et al. 2021; Staar et al. 2018). │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
|
||||||
|
"</pre>\n"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"╭────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
|
||||||
|
"│ Copyright © 2024, Association for the Advancement of Artificial Intelligence (www.aaai.org). All rights reserved. │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ | Report | Question | Answer | │\n",
|
||||||
|
"│ |----------------|------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------| │\n",
|
||||||
|
"│ | IBM 2022 | How many hours were spent on employee learning in 2021? | 22.5 million hours | │\n",
|
||||||
|
"│ | IBM 2022 | What was the rate of fatalities in 2021? | The rate of fatalities in 2021 was 0.0016. | │\n",
|
||||||
|
"│ | IBM 2022 | How many full audits were con- ducted in 2022 in India? | 2 | │\n",
|
||||||
|
"│ | Starbucks 2022 | What is the percentage of women in the Board of Directors? | 25% | │\n",
|
||||||
|
"│ | Starbucks 2022 | What was the total energy con- sumption in 2021? | According to the table, the total energy consumption in 2021 was 2,491,543 MWh. | │\n",
|
||||||
|
"│ | Starbucks 2022 | How much packaging material was made from renewable mate- rials? | According to the given data, 31% of packaging materials were made from recycled or renewable materials in FY22. | │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ Table 1: Example question answers from the ESG reports of IBM and Starbucks using Deep Search DocQA system. │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ ESG report in our library via our QA conversational assistant. Our assistant generates answers and also presents the information (paragraph or table), in the ESG report, from which it has generated the │\n",
|
||||||
|
"│ response. │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ ## Related Work │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ The DocQA integrates multiple AI technologies, namely: │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ Document Conversion: Converting unstructured documents, such as PDF files, into a machine-readable format is a challenging task in AI. Early strategies for document conversion were based on geometric layout │\n",
|
||||||
|
"│ analysis (Cattoni et al. 2000; Breuel 2002). Thanks to the availability of large annotated datasets (PubLayNet (Zhong et al. 2019), DocBank (Li et al. 2020), DocLayNet (Pfitzmann et al. 2022; Auer et al. │\n",
|
||||||
|
"│ 2023), deep learning-based methods are routinely used. Modern approaches for recovering the structure of a document can be broadly divided into two categories: image-based or PDF representation-based . │\n",
|
||||||
|
"│ Imagebased methods usually employ Transformer or CNN architectures on the images of pages (Zhang et al. 2023; Li et al. 2022; Huang et al. 2022). On the other hand, deep learning- │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ Figure 1: System architecture: Simplified sketch of document question-answering pipeline. │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ <!-- image --> │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ based language processing methods are applied on the native PDF content (generated by a single PDF printing command) (Auer et al. 2022; Livathinos et al. 2021; Staar et al. 2018). │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from docling_core.transforms.serializer.markdown import MarkdownDocSerializer\n",
|
||||||
|
"\n",
|
||||||
|
"serializer = MarkdownDocSerializer(doc=doc)\n",
|
||||||
|
"ser_result = serializer.serialize()\n",
|
||||||
|
"ser_text = ser_result.text\n",
|
||||||
|
"\n",
|
||||||
|
"print_in_console(ser_text[ser_text.find(start_cue) : ser_text.find(stop_cue)])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Configuring a serializer"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Let's now assume we would like to reconfigure the Markdown serialization such that:\n",
|
||||||
|
"- it uses a different component serializer, e.g. if we'd prefer tables to be printed in a triplet format (which could potentially improve the vector representation compared to Markdown tables)\n",
|
||||||
|
"- it uses specific user-defined parameters, e.g. if we'd prefer a different image placeholder text than the default one\n",
|
||||||
|
"\n",
|
||||||
|
"Check out the following configuration and notice the serialization differences in the output further below:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">╭────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
|
||||||
|
"│ Copyright © 2024, Association for the Advancement of Artificial Intelligence (www.aaai.org). All rights reserved. │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ IBM 2022, Question = How many hours were spent on employee learning in 2021?. IBM 2022, Answer = 22.5 million hours. IBM 2022, Question = What was the rate of fatalities in 2021?. IBM 2022, Answer = The │\n",
|
||||||
|
"│ rate of fatalities in 2021 was 0.0016.. IBM 2022, Question = How many full audits were con- ducted in 2022 in India?. IBM 2022, Answer = 2. Starbucks 2022, Question = What is the percentage of women in the │\n",
|
||||||
|
"│ Board of Directors?. Starbucks 2022, Answer = 25%. Starbucks 2022, Question = What was the total energy con- sumption in 2021?. Starbucks 2022, Answer = According to the table, the total energy consumption │\n",
|
||||||
|
"│ in 2021 was 2,491,543 MWh.. Starbucks 2022, Question = How much packaging material was made from renewable mate- rials?. Starbucks 2022, Answer = According to the given data, 31% of packaging materials were │\n",
|
||||||
|
"│ made from recycled or renewable materials in FY22. │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ Table 1: Example question answers from the ESG reports of IBM and Starbucks using Deep Search DocQA system. │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ ESG report in our library via our QA conversational assistant. Our assistant generates answers and also presents the information (paragraph or table), in the ESG report, from which it has generated the │\n",
|
||||||
|
"│ response. │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ ## Related Work │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ The DocQA integrates multiple AI technologies, namely: │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ Document Conversion: Converting unstructured documents, such as PDF files, into a machine-readable format is a challenging task in AI. Early strategies for document conversion were based on geometric layout │\n",
|
||||||
|
"│ analysis (Cattoni et al. 2000; Breuel 2002). Thanks to the availability of large annotated datasets (PubLayNet (Zhong et al. 2019), DocBank (Li et al. 2020), DocLayNet (Pfitzmann et al. 2022; Auer et al. │\n",
|
||||||
|
"│ 2023), deep learning-based methods are routinely used. Modern approaches for recovering the structure of a document can be broadly divided into two categories: image-based or PDF representation-based . │\n",
|
||||||
|
"│ Imagebased methods usually employ Transformer or CNN architectures on the images of pages (Zhang et al. 2023; Li et al. 2022; Huang et al. 2022). On the other hand, deep learning- │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ Figure 1: System architecture: Simplified sketch of document question-answering pipeline. │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ <!-- demo picture placeholder --> │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ based language processing methods are applied on the native PDF content (generated by a single PDF printing command) (Auer et al. 2022; Livathinos et al. 2021; Staar et al. 2018). │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
|
||||||
|
"</pre>\n"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"╭────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
|
||||||
|
"│ Copyright © 2024, Association for the Advancement of Artificial Intelligence (www.aaai.org). All rights reserved. │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ IBM 2022, Question = How many hours were spent on employee learning in 2021?. IBM 2022, Answer = 22.5 million hours. IBM 2022, Question = What was the rate of fatalities in 2021?. IBM 2022, Answer = The │\n",
|
||||||
|
"│ rate of fatalities in 2021 was 0.0016.. IBM 2022, Question = How many full audits were con- ducted in 2022 in India?. IBM 2022, Answer = 2. Starbucks 2022, Question = What is the percentage of women in the │\n",
|
||||||
|
"│ Board of Directors?. Starbucks 2022, Answer = 25%. Starbucks 2022, Question = What was the total energy con- sumption in 2021?. Starbucks 2022, Answer = According to the table, the total energy consumption │\n",
|
||||||
|
"│ in 2021 was 2,491,543 MWh.. Starbucks 2022, Question = How much packaging material was made from renewable mate- rials?. Starbucks 2022, Answer = According to the given data, 31% of packaging materials were │\n",
|
||||||
|
"│ made from recycled or renewable materials in FY22. │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ Table 1: Example question answers from the ESG reports of IBM and Starbucks using Deep Search DocQA system. │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ ESG report in our library via our QA conversational assistant. Our assistant generates answers and also presents the information (paragraph or table), in the ESG report, from which it has generated the │\n",
|
||||||
|
"│ response. │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ ## Related Work │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ The DocQA integrates multiple AI technologies, namely: │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ Document Conversion: Converting unstructured documents, such as PDF files, into a machine-readable format is a challenging task in AI. Early strategies for document conversion were based on geometric layout │\n",
|
||||||
|
"│ analysis (Cattoni et al. 2000; Breuel 2002). Thanks to the availability of large annotated datasets (PubLayNet (Zhong et al. 2019), DocBank (Li et al. 2020), DocLayNet (Pfitzmann et al. 2022; Auer et al. │\n",
|
||||||
|
"│ 2023), deep learning-based methods are routinely used. Modern approaches for recovering the structure of a document can be broadly divided into two categories: image-based or PDF representation-based . │\n",
|
||||||
|
"│ Imagebased methods usually employ Transformer or CNN architectures on the images of pages (Zhang et al. 2023; Li et al. 2022; Huang et al. 2022). On the other hand, deep learning- │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ Figure 1: System architecture: Simplified sketch of document question-answering pipeline. │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ <!-- demo picture placeholder --> │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ based language processing methods are applied on the native PDF content (generated by a single PDF printing command) (Auer et al. 2022; Livathinos et al. 2021; Staar et al. 2018). │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from docling_core.transforms.chunker.hierarchical_chunker import TripletTableSerializer\n",
|
||||||
|
"from docling_core.transforms.serializer.markdown import MarkdownParams\n",
|
||||||
|
"\n",
|
||||||
|
"serializer = MarkdownDocSerializer(\n",
|
||||||
|
" doc=doc,\n",
|
||||||
|
" table_serializer=TripletTableSerializer(),\n",
|
||||||
|
" params=MarkdownParams(\n",
|
||||||
|
" image_placeholder=\"<!-- demo picture placeholder -->\",\n",
|
||||||
|
" # ...\n",
|
||||||
|
" ),\n",
|
||||||
|
")\n",
|
||||||
|
"ser_result = serializer.serialize()\n",
|
||||||
|
"ser_text = ser_result.text\n",
|
||||||
|
"\n",
|
||||||
|
"print_in_console(ser_text[ser_text.find(start_cue) : ser_text.find(stop_cue)])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Creating a custom serializer"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"In the examples above, we were able to reuse existing implementations for our desired\n",
|
||||||
|
"serialization strategy, but let's now assume we want to define custom serialization\n",
|
||||||
|
"logic, e.g. we would like picture serialization to include any available picture\n",
|
||||||
|
"description (captioning) annotations.\n",
|
||||||
|
"\n",
|
||||||
|
"To that end, we first need to revisit our conversion and include all pipeline options\n",
|
||||||
|
"needed for\n",
|
||||||
|
"[picture description enrichment](https://docling-project.github.io/docling/usage/enrichments/#picture-description)."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"/Users/pva/work/github.com/DS4SD/docling/.venv/lib/python3.13/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n",
|
||||||
|
" warnings.warn(warn_msg)\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from docling.datamodel.base_models import InputFormat\n",
|
||||||
|
"from docling.datamodel.pipeline_options import (\n",
|
||||||
|
" PdfPipelineOptions,\n",
|
||||||
|
" PictureDescriptionVlmOptions,\n",
|
||||||
|
")\n",
|
||||||
|
"from docling.document_converter import DocumentConverter, PdfFormatOption\n",
|
||||||
|
"\n",
|
||||||
|
"pipeline_options = PdfPipelineOptions(\n",
|
||||||
|
" do_picture_description=True,\n",
|
||||||
|
" picture_description_options=PictureDescriptionVlmOptions(\n",
|
||||||
|
" repo_id=\"HuggingFaceTB/SmolVLM-256M-Instruct\",\n",
|
||||||
|
" prompt=\"Describe this picture in three to five sentences. Be precise and concise.\",\n",
|
||||||
|
" ),\n",
|
||||||
|
" generate_picture_images=True,\n",
|
||||||
|
" images_scale=2,\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"converter = DocumentConverter(\n",
|
||||||
|
" format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}\n",
|
||||||
|
")\n",
|
||||||
|
"doc = converter.convert(source=DOC_SOURCE).document"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"We can then define our custom picture serializer:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from typing import Any, Optional\n",
|
||||||
|
"\n",
|
||||||
|
"from docling_core.transforms.serializer.base import (\n",
|
||||||
|
" BaseDocSerializer,\n",
|
||||||
|
" SerializationResult,\n",
|
||||||
|
")\n",
|
||||||
|
"from docling_core.transforms.serializer.common import create_ser_result\n",
|
||||||
|
"from docling_core.transforms.serializer.markdown import (\n",
|
||||||
|
" MarkdownParams,\n",
|
||||||
|
" MarkdownPictureSerializer,\n",
|
||||||
|
")\n",
|
||||||
|
"from docling_core.types.doc.document import (\n",
|
||||||
|
" DoclingDocument,\n",
|
||||||
|
" ImageRefMode,\n",
|
||||||
|
" PictureDescriptionData,\n",
|
||||||
|
" PictureItem,\n",
|
||||||
|
")\n",
|
||||||
|
"from typing_extensions import override\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"class AnnotationPictureSerializer(MarkdownPictureSerializer):\n",
|
||||||
|
" @override\n",
|
||||||
|
" def serialize(\n",
|
||||||
|
" self,\n",
|
||||||
|
" *,\n",
|
||||||
|
" item: PictureItem,\n",
|
||||||
|
" doc_serializer: BaseDocSerializer,\n",
|
||||||
|
" doc: DoclingDocument,\n",
|
||||||
|
" separator: Optional[str] = None,\n",
|
||||||
|
" **kwargs: Any,\n",
|
||||||
|
" ) -> SerializationResult:\n",
|
||||||
|
" text_parts: list[str] = []\n",
|
||||||
|
"\n",
|
||||||
|
" # reusing the existing result:\n",
|
||||||
|
" parent_res = super().serialize(\n",
|
||||||
|
" item=item,\n",
|
||||||
|
" doc_serializer=doc_serializer,\n",
|
||||||
|
" doc=doc,\n",
|
||||||
|
" **kwargs,\n",
|
||||||
|
" )\n",
|
||||||
|
" text_parts.append(parent_res.text)\n",
|
||||||
|
"\n",
|
||||||
|
" # appending annotations:\n",
|
||||||
|
" for annotation in item.annotations:\n",
|
||||||
|
" if isinstance(annotation, PictureDescriptionData):\n",
|
||||||
|
" text_parts.append(f\"<!-- Picture description: {annotation.text} -->\")\n",
|
||||||
|
"\n",
|
||||||
|
" text_res = (separator or \"\\n\").join(text_parts)\n",
|
||||||
|
" return create_ser_result(text=text_res, span_source=item)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Last but not least, we define a new doc serializer which leverages our custom picture\n",
|
||||||
|
"serializer.\n",
|
||||||
|
"\n",
|
||||||
|
"Notice the picture description annotations in the output below:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">╭────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
|
||||||
|
"│ Copyright © 2024, Association for the Advancement of Artificial Intelligence (www.aaai.org). All rights reserved. │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ | Report | Question | Answer | │\n",
|
||||||
|
"│ |----------------|------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------| │\n",
|
||||||
|
"│ | IBM 2022 | How many hours were spent on employee learning in 2021? | 22.5 million hours | │\n",
|
||||||
|
"│ | IBM 2022 | What was the rate of fatalities in 2021? | The rate of fatalities in 2021 was 0.0016. | │\n",
|
||||||
|
"│ | IBM 2022 | How many full audits were con- ducted in 2022 in India? | 2 | │\n",
|
||||||
|
"│ | Starbucks 2022 | What is the percentage of women in the Board of Directors? | 25% | │\n",
|
||||||
|
"│ | Starbucks 2022 | What was the total energy con- sumption in 2021? | According to the table, the total energy consumption in 2021 was 2,491,543 MWh. | │\n",
|
||||||
|
"│ | Starbucks 2022 | How much packaging material was made from renewable mate- rials? | According to the given data, 31% of packaging materials were made from recycled or renewable materials in FY22. | │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ Table 1: Example question answers from the ESG reports of IBM and Starbucks using Deep Search DocQA system. │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ ESG report in our library via our QA conversational assistant. Our assistant generates answers and also presents the information (paragraph or table), in the ESG report, from which it has generated the │\n",
|
||||||
|
"│ response. │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ ## Related Work │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ The DocQA integrates multiple AI technologies, namely: │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ Document Conversion: Converting unstructured documents, such as PDF files, into a machine-readable format is a challenging task in AI. Early strategies for document conversion were based on geometric layout │\n",
|
||||||
|
"│ analysis (Cattoni et al. 2000; Breuel 2002). Thanks to the availability of large annotated datasets (PubLayNet (Zhong et al. 2019), DocBank (Li et al. 2020), DocLayNet (Pfitzmann et al. 2022; Auer et al. │\n",
|
||||||
|
"│ 2023), deep learning-based methods are routinely used. Modern approaches for recovering the structure of a document can be broadly divided into two categories: image-based or PDF representation-based . │\n",
|
||||||
|
"│ Imagebased methods usually employ Transformer or CNN architectures on the images of pages (Zhang et al. 2023; Li et al. 2022; Huang et al. 2022). On the other hand, deep learning- │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ Figure 1: System architecture: Simplified sketch of document question-answering pipeline. │\n",
|
||||||
|
"│ <!-- Picture description: The image depicts a document conversion process. It is a sequence of steps that includes document conversion, information retrieval, and response generation. The document │\n",
|
||||||
|
"│ conversion step involves converting the document from a text format to a markdown format. The information retrieval step involves retrieving the document from a database or other source. The response │\n",
|
||||||
|
"│ generation step involves generating a response from the information retrieval step. --> │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ based language processing methods are applied on the native PDF content (generated by a single PDF printing command) (Auer et al. 2022; Livathinos et al. 2021; Staar et al. 2018). │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
|
||||||
|
"</pre>\n"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"╭────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
|
||||||
|
"│ Copyright © 2024, Association for the Advancement of Artificial Intelligence (www.aaai.org). All rights reserved. │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ | Report | Question | Answer | │\n",
|
||||||
|
"│ |----------------|------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------| │\n",
|
||||||
|
"│ | IBM 2022 | How many hours were spent on employee learning in 2021? | 22.5 million hours | │\n",
|
||||||
|
"│ | IBM 2022 | What was the rate of fatalities in 2021? | The rate of fatalities in 2021 was 0.0016. | │\n",
|
||||||
|
"│ | IBM 2022 | How many full audits were con- ducted in 2022 in India? | 2 | │\n",
|
||||||
|
"│ | Starbucks 2022 | What is the percentage of women in the Board of Directors? | 25% | │\n",
|
||||||
|
"│ | Starbucks 2022 | What was the total energy con- sumption in 2021? | According to the table, the total energy consumption in 2021 was 2,491,543 MWh. | │\n",
|
||||||
|
"│ | Starbucks 2022 | How much packaging material was made from renewable mate- rials? | According to the given data, 31% of packaging materials were made from recycled or renewable materials in FY22. | │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ Table 1: Example question answers from the ESG reports of IBM and Starbucks using Deep Search DocQA system. │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ ESG report in our library via our QA conversational assistant. Our assistant generates answers and also presents the information (paragraph or table), in the ESG report, from which it has generated the │\n",
|
||||||
|
"│ response. │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ ## Related Work │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ The DocQA integrates multiple AI technologies, namely: │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ Document Conversion: Converting unstructured documents, such as PDF files, into a machine-readable format is a challenging task in AI. Early strategies for document conversion were based on geometric layout │\n",
|
||||||
|
"│ analysis (Cattoni et al. 2000; Breuel 2002). Thanks to the availability of large annotated datasets (PubLayNet (Zhong et al. 2019), DocBank (Li et al. 2020), DocLayNet (Pfitzmann et al. 2022; Auer et al. │\n",
|
||||||
|
"│ 2023), deep learning-based methods are routinely used. Modern approaches for recovering the structure of a document can be broadly divided into two categories: image-based or PDF representation-based . │\n",
|
||||||
|
"│ Imagebased methods usually employ Transformer or CNN architectures on the images of pages (Zhang et al. 2023; Li et al. 2022; Huang et al. 2022). On the other hand, deep learning- │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ Figure 1: System architecture: Simplified sketch of document question-answering pipeline. │\n",
|
||||||
|
"│ <!-- Picture description: The image depicts a document conversion process. It is a sequence of steps that includes document conversion, information retrieval, and response generation. The document │\n",
|
||||||
|
"│ conversion step involves converting the document from a text format to a markdown format. The information retrieval step involves retrieving the document from a database or other source. The response │\n",
|
||||||
|
"│ generation step involves generating a response from the information retrieval step. --> │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ based language processing methods are applied on the native PDF content (generated by a single PDF printing command) (Auer et al. 2022; Livathinos et al. 2021; Staar et al. 2018). │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"│ │\n",
|
||||||
|
"╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"serializer = MarkdownDocSerializer(\n",
|
||||||
|
" doc=doc,\n",
|
||||||
|
" picture_serializer=AnnotationPictureSerializer(),\n",
|
||||||
|
" params=MarkdownParams(\n",
|
||||||
|
" image_mode=ImageRefMode.PLACEHOLDER,\n",
|
||||||
|
" image_placeholder=\"\",\n",
|
||||||
|
" ),\n",
|
||||||
|
")\n",
|
||||||
|
"ser_result = serializer.serialize()\n",
|
||||||
|
"ser_text = ser_result.text\n",
|
||||||
|
"\n",
|
||||||
|
"print_in_console(ser_text[ser_text.find(start_cue) : ser_text.find(stop_cue)])"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": ".venv",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.13.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
10
docs/integrations/data_prep_kit.md
vendored
10
docs/integrations/data_prep_kit.md
vendored
@ -1,10 +1,10 @@
|
|||||||
Docling is used by the [Data Prep Kit](https://ibm.github.io/data-prep-kit/) open-source toolkit for preparing unstructured data for LLM application development ranging from laptop scale to datacenter scale.
|
Docling is used by the [Data Prep Kit](https://data-prep-kit.github.io/data-prep-kit/) open-source toolkit for preparing unstructured data for LLM application development ranging from laptop scale to datacenter scale.
|
||||||
|
|
||||||
## Components
|
## Components
|
||||||
### PDF ingestion to Parquet
|
### PDF ingestion to Parquet
|
||||||
- 💻 [PDF-to-Parquet GitHub](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/pdf2parquet)
|
- 💻 [Docling2Parquet source](https://github.com/data-prep-kit/data-prep-kit/tree/dev/transforms/language/docling2parquet)
|
||||||
- 📖 [PDF-to-Parquet docs](https://ibm.github.io/data-prep-kit/transforms/language/pdf2parquet/python/)
|
- 📖 [Docling2Parquet docs](https://data-prep-kit.github.io/data-prep-kit/transforms/language/pdf2parquet/)
|
||||||
|
|
||||||
### Document chunking
|
### Document chunking
|
||||||
- 💻 [Doc Chunking GitHub](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/doc_chunk)
|
- 💻 [Doc Chunking source](https://github.com/data-prep-kit/data-prep-kit/tree/dev/transforms/language/doc_chunk)
|
||||||
- 📖 [Doc Chunking docs](https://ibm.github.io/data-prep-kit/transforms/language/doc_chunk/python/)
|
- 📖 [Doc Chunking docs](https://data-prep-kit.github.io/data-prep-kit/transforms/language/doc_chunk/)
|
||||||
|
2
docs/usage/supported_formats.md
vendored
2
docs/usage/supported_formats.md
vendored
@ -14,7 +14,7 @@ Below you can find a listing of all supported input and output formats.
|
|||||||
| AsciiDoc | |
|
| AsciiDoc | |
|
||||||
| HTML, XHTML | |
|
| HTML, XHTML | |
|
||||||
| CSV | |
|
| CSV | |
|
||||||
| PNG, JPEG, TIFF, BMP | Image formats |
|
| PNG, JPEG, TIFF, BMP, WEBP | Image formats |
|
||||||
|
|
||||||
Schema-specific support:
|
Schema-specific support:
|
||||||
|
|
||||||
|
@ -66,6 +66,7 @@ nav:
|
|||||||
- Concepts: concepts/index.md
|
- Concepts: concepts/index.md
|
||||||
- Architecture: concepts/architecture.md
|
- Architecture: concepts/architecture.md
|
||||||
- Docling Document: concepts/docling_document.md
|
- Docling Document: concepts/docling_document.md
|
||||||
|
- Serialization: concepts/serialization.md
|
||||||
- Chunking: concepts/chunking.md
|
- Chunking: concepts/chunking.md
|
||||||
- Plugins: concepts/plugins.md
|
- Plugins: concepts/plugins.md
|
||||||
- Examples:
|
- Examples:
|
||||||
@ -87,8 +88,10 @@ nav:
|
|||||||
- "Simple translation": examples/translate.py
|
- "Simple translation": examples/translate.py
|
||||||
- examples/backend_csv.ipynb
|
- examples/backend_csv.ipynb
|
||||||
- examples/backend_xml_rag.ipynb
|
- examples/backend_xml_rag.ipynb
|
||||||
- ✂️ Chunking:
|
- ✂️ Serialization & chunking:
|
||||||
|
- examples/serialization.ipynb
|
||||||
- examples/hybrid_chunking.ipynb
|
- examples/hybrid_chunking.ipynb
|
||||||
|
- examples/advanced_chunking_and_serialization.ipynb
|
||||||
- 🤖 RAG with AI dev frameworks:
|
- 🤖 RAG with AI dev frameworks:
|
||||||
- examples/rag_haystack.ipynb
|
- examples/rag_haystack.ipynb
|
||||||
- examples/rag_langchain.ipynb
|
- examples/rag_langchain.ipynb
|
||||||
|
1413
poetry.lock
generated
1413
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -1,6 +1,6 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "docling"
|
name = "docling"
|
||||||
version = "2.31.0" # DO NOT EDIT, updated automatically
|
version = "2.34.0" # DO NOT EDIT, updated automatically
|
||||||
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
||||||
authors = [
|
authors = [
|
||||||
"Christoph Auer <cau@zurich.ibm.com>",
|
"Christoph Auer <cau@zurich.ibm.com>",
|
||||||
@ -46,7 +46,7 @@ packages = [{ include = "docling" }]
|
|||||||
######################
|
######################
|
||||||
python = "^3.9"
|
python = "^3.9"
|
||||||
pydantic = "^2.0.0"
|
pydantic = "^2.0.0"
|
||||||
docling-core = {version = "^2.26.0", extras = ["chunking"]}
|
docling-core = {version = "^2.31.2", extras = ["chunking"]}
|
||||||
|
|
||||||
# Nikos: Pinpoint to the docling-ibm-models@nli/layout_dfine
|
# Nikos: Pinpoint to the docling-ibm-models@nli/layout_dfine
|
||||||
# docling-ibm-models = "^3.4.0"
|
# docling-ibm-models = "^3.4.0"
|
||||||
@ -97,6 +97,7 @@ pillow = ">=10.0.0,<12.0.0"
|
|||||||
tqdm = "^4.65.0"
|
tqdm = "^4.65.0"
|
||||||
pluggy = "^1.0.0"
|
pluggy = "^1.0.0"
|
||||||
pylatexenc = "^2.10"
|
pylatexenc = "^2.10"
|
||||||
|
click = "<8.2.0"
|
||||||
|
|
||||||
[tool.poetry.group.dev.dependencies]
|
[tool.poetry.group.dev.dependencies]
|
||||||
python = "^3.9.2"
|
python = "^3.9.2"
|
||||||
|
BIN
tests/data/docx/textbox.docx
vendored
Normal file
BIN
tests/data/docx/textbox.docx
vendored
Normal file
Binary file not shown.
@ -23,6 +23,7 @@
|
|||||||
<location><page_1><loc_52><loc_37><loc_88><loc_45></location>
|
<location><page_1><loc_52><loc_37><loc_88><loc_45></location>
|
||||||
<caption>Figure 1: Picture of a table with subtle, complex features such as (1) multi-column headers, (2) cell with multi-row text and (3) cells with no content. Image from PubTabNet evaluation set, filename: 'PMC2944238 004 02'.</caption>
|
<caption>Figure 1: Picture of a table with subtle, complex features such as (1) multi-column headers, (2) cell with multi-row text and (3) cells with no content. Image from PubTabNet evaluation set, filename: 'PMC2944238 004 02'.</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_1><loc_50><loc_29><loc_89><loc_35></location>Figure 1: Picture of a table with subtle, complex features such as (1) multi-column headers, (2) cell with multi-row text and (3) cells with no content. Image from PubTabNet evaluation set, filename: 'PMC2944238 004 02'.</caption>
|
||||||
<table>
|
<table>
|
||||||
<location><page_1><loc_52><loc_37><loc_88><loc_45></location>
|
<location><page_1><loc_52><loc_37><loc_88><loc_45></location>
|
||||||
<row_0><col_0><body>0</col_0><col_1><body>1 2 1</col_1><col_2><body>1 2 1</col_2><col_3><body>1 2 1</col_3><col_4><body>1 2 1</col_4></row_0>
|
<row_0><col_0><body>0</col_0><col_1><body>1 2 1</col_1><col_2><body>1 2 1</col_2><col_3><body>1 2 1</col_3><col_4><body>1 2 1</col_4></row_0>
|
||||||
@ -57,6 +58,7 @@
|
|||||||
<location><page_3><loc_51><loc_68><loc_90><loc_90></location>
|
<location><page_3><loc_51><loc_68><loc_90><loc_90></location>
|
||||||
<caption>Figure 2: Distribution of the tables across different table dimensions in PubTabNet + FinTabNet datasets</caption>
|
<caption>Figure 2: Distribution of the tables across different table dimensions in PubTabNet + FinTabNet datasets</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_3><loc_50><loc_64><loc_89><loc_66></location>Figure 2: Distribution of the tables across different table dimensions in PubTabNet + FinTabNet datasets</caption>
|
||||||
<paragraph><location><page_3><loc_50><loc_59><loc_71><loc_60></location>balance in the previous datasets.</paragraph>
|
<paragraph><location><page_3><loc_50><loc_59><loc_71><loc_60></location>balance in the previous datasets.</paragraph>
|
||||||
<paragraph><location><page_3><loc_50><loc_21><loc_89><loc_58></location>The PubTabNet dataset contains 509k tables delivered as annotated PNG images. The annotations consist of the table structure represented in HTML format, the tokenized text and its bounding boxes per table cell. Fig. 1 shows the appearance style of PubTabNet. Depending on its complexity, a table is characterized as "simple" when it does not contain row spans or column spans, otherwise it is "complex". The dataset is divided into Train and Val splits (roughly 98% and 2%). The Train split consists of 54% simple and 46% complex tables and the Val split of 51% and 49% respectively. The FinTabNet dataset contains 112k tables delivered as single-page PDF documents with mixed table structures and text content. Similarly to the PubTabNet, the annotations of FinTabNet include the table structure in HTML, the tokenized text and the bounding boxes on a table cell basis. The dataset is divided into Train, Test and Val splits (81%, 9.5%, 9.5%), and each one is almost equally divided into simple and complex tables (Train: 48% simple, 52% complex, Test: 48% simple, 52% complex, Test: 53% simple, 47% complex). Finally the TableBank dataset consists of 145k tables provided as JPEG images. The latter has annotations for the table structure, but only few with bounding boxes of the table cells. The entire dataset consists of simple tables and it is divided into 90% Train, 3% Test and 7% Val splits.</paragraph>
|
<paragraph><location><page_3><loc_50><loc_21><loc_89><loc_58></location>The PubTabNet dataset contains 509k tables delivered as annotated PNG images. The annotations consist of the table structure represented in HTML format, the tokenized text and its bounding boxes per table cell. Fig. 1 shows the appearance style of PubTabNet. Depending on its complexity, a table is characterized as "simple" when it does not contain row spans or column spans, otherwise it is "complex". The dataset is divided into Train and Val splits (roughly 98% and 2%). The Train split consists of 54% simple and 46% complex tables and the Val split of 51% and 49% respectively. The FinTabNet dataset contains 112k tables delivered as single-page PDF documents with mixed table structures and text content. Similarly to the PubTabNet, the annotations of FinTabNet include the table structure in HTML, the tokenized text and the bounding boxes on a table cell basis. The dataset is divided into Train, Test and Val splits (81%, 9.5%, 9.5%), and each one is almost equally divided into simple and complex tables (Train: 48% simple, 52% complex, Test: 48% simple, 52% complex, Test: 53% simple, 47% complex). Finally the TableBank dataset consists of 145k tables provided as JPEG images. The latter has annotations for the table structure, but only few with bounding boxes of the table cells. The entire dataset consists of simple tables and it is divided into 90% Train, 3% Test and 7% Val splits.</paragraph>
|
||||||
<paragraph><location><page_3><loc_50><loc_10><loc_89><loc_20></location>Due to the heterogeneity across the dataset formats, it was necessary to combine all available data into one homogenized dataset before we could train our models for practical purposes. Given the size of PubTabNet, we adopted its annotation format and we extracted and converted all tables as PNG images with a resolution of 72 dpi. Additionally, we have filtered out tables with extreme sizes due to small</paragraph>
|
<paragraph><location><page_3><loc_50><loc_10><loc_89><loc_20></location>Due to the heterogeneity across the dataset formats, it was necessary to combine all available data into one homogenized dataset before we could train our models for practical purposes. Given the size of PubTabNet, we adopted its annotation format and we extracted and converted all tables as PNG images with a resolution of 72 dpi. Additionally, we have filtered out tables with extreme sizes due to small</paragraph>
|
||||||
@ -88,10 +90,12 @@
|
|||||||
<location><page_5><loc_12><loc_77><loc_85><loc_90></location>
|
<location><page_5><loc_12><loc_77><loc_85><loc_90></location>
|
||||||
<caption>Figure 3: TableFormer takes in an image of the PDF and creates bounding box and HTML structure predictions that are synchronized. The bounding boxes grabs the content from the PDF and inserts it in the structure.</caption>
|
<caption>Figure 3: TableFormer takes in an image of the PDF and creates bounding box and HTML structure predictions that are synchronized. The bounding boxes grabs the content from the PDF and inserts it in the structure.</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_5><loc_8><loc_72><loc_89><loc_74></location>Figure 3: TableFormer takes in an image of the PDF and creates bounding box and HTML structure predictions that are synchronized. The bounding boxes grabs the content from the PDF and inserts it in the structure.</caption>
|
||||||
<figure>
|
<figure>
|
||||||
<location><page_5><loc_9><loc_36><loc_47><loc_67></location>
|
<location><page_5><loc_9><loc_36><loc_47><loc_67></location>
|
||||||
<caption>Figure 4: Given an input image of a table, the Encoder produces fixed-length features that represent the input image. The features are then passed to both the Structure Decoder and Cell BBox Decoder . During training, the Structure Decoder receives 'tokenized tags' of the HTML code that represent the table structure. Afterwards, a transformer encoder and decoder architecture is employed to produce features that are received by a linear layer, and the Cell BBox Decoder. The linear layer is applied to the features to predict the tags. Simultaneously, the Cell BBox Decoder selects features referring to the data cells (' < td > ', ' < ') and passes them through an attention network, an MLP, and a linear layer to predict the bounding boxes.</caption>
|
<caption>Figure 4: Given an input image of a table, the Encoder produces fixed-length features that represent the input image. The features are then passed to both the Structure Decoder and Cell BBox Decoder . During training, the Structure Decoder receives 'tokenized tags' of the HTML code that represent the table structure. Afterwards, a transformer encoder and decoder architecture is employed to produce features that are received by a linear layer, and the Cell BBox Decoder. The linear layer is applied to the features to predict the tags. Simultaneously, the Cell BBox Decoder selects features referring to the data cells (' < td > ', ' < ') and passes them through an attention network, an MLP, and a linear layer to predict the bounding boxes.</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_5><loc_8><loc_14><loc_47><loc_33></location>Figure 4: Given an input image of a table, the Encoder produces fixed-length features that represent the input image. The features are then passed to both the Structure Decoder and Cell BBox Decoder . During training, the Structure Decoder receives 'tokenized tags' of the HTML code that represent the table structure. Afterwards, a transformer encoder and decoder architecture is employed to produce features that are received by a linear layer, and the Cell BBox Decoder. The linear layer is applied to the features to predict the tags. Simultaneously, the Cell BBox Decoder selects features referring to the data cells (' < td > ', ' < ') and passes them through an attention network, an MLP, and a linear layer to predict the bounding boxes.</caption>
|
||||||
<paragraph><location><page_5><loc_50><loc_63><loc_89><loc_68></location>forming classification, and adding an adaptive pooling layer of size 28*28. ResNet by default downsamples the image resolution by 32 and then the encoded image is provided to both the Structure Decoder , and Cell BBox Decoder .</paragraph>
|
<paragraph><location><page_5><loc_50><loc_63><loc_89><loc_68></location>forming classification, and adding an adaptive pooling layer of size 28*28. ResNet by default downsamples the image resolution by 32 and then the encoded image is provided to both the Structure Decoder , and Cell BBox Decoder .</paragraph>
|
||||||
<paragraph><location><page_5><loc_50><loc_48><loc_89><loc_62></location>Structure Decoder. The transformer architecture of this component is based on the work proposed in [31]. After extensive experimentation, the Structure Decoder is modeled as a transformer encoder with two encoder layers and a transformer decoder made from a stack of 4 decoder layers that comprise mainly of multi-head attention and feed forward layers. This configuration uses fewer layers and heads in comparison to networks applied to other problems (e.g. "Scene Understanding", "Image Captioning"), something which we relate to the simplicity of table images.</paragraph>
|
<paragraph><location><page_5><loc_50><loc_48><loc_89><loc_62></location>Structure Decoder. The transformer architecture of this component is based on the work proposed in [31]. After extensive experimentation, the Structure Decoder is modeled as a transformer encoder with two encoder layers and a transformer decoder made from a stack of 4 decoder layers that comprise mainly of multi-head attention and feed forward layers. This configuration uses fewer layers and heads in comparison to networks applied to other problems (e.g. "Scene Understanding", "Image Captioning"), something which we relate to the simplicity of table images.</paragraph>
|
||||||
<paragraph><location><page_5><loc_50><loc_31><loc_89><loc_47></location>The transformer encoder receives an encoded image from the CNN Backbone Network and refines it through a multi-head dot-product attention layer, followed by a Feed Forward Network. During training, the transformer decoder receives as input the output feature produced by the transformer encoder, and the tokenized input of the HTML ground-truth tags. Using a stack of multi-head attention layers, different aspects of the tag sequence could be inferred. This is achieved by each attention head on a layer operating in a different subspace, and then combining altogether their attention score.</paragraph>
|
<paragraph><location><page_5><loc_50><loc_31><loc_89><loc_47></location>The transformer encoder receives an encoded image from the CNN Backbone Network and refines it through a multi-head dot-product attention layer, followed by a Feed Forward Network. During training, the transformer decoder receives as input the output feature produced by the transformer encoder, and the tokenized input of the HTML ground-truth tags. Using a stack of multi-head attention layers, different aspects of the tag sequence could be inferred. This is achieved by each attention head on a layer operating in a different subspace, and then combining altogether their attention score.</paragraph>
|
||||||
@ -167,6 +171,7 @@
|
|||||||
<location><page_8><loc_50><loc_77><loc_91><loc_88></location>
|
<location><page_8><loc_50><loc_77><loc_91><loc_88></location>
|
||||||
<caption>b. Structure predicted by TableFormer, with superimposed matched PDF cell text:</caption>
|
<caption>b. Structure predicted by TableFormer, with superimposed matched PDF cell text:</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_8><loc_9><loc_73><loc_63><loc_74></location>b. Structure predicted by TableFormer, with superimposed matched PDF cell text:</caption>
|
||||||
<table>
|
<table>
|
||||||
<location><page_8><loc_9><loc_63><loc_49><loc_72></location>
|
<location><page_8><loc_9><loc_63><loc_49><loc_72></location>
|
||||||
<caption>Text is aligned to match original for ease of viewing</caption>
|
<caption>Text is aligned to match original for ease of viewing</caption>
|
||||||
@ -196,10 +201,12 @@
|
|||||||
<location><page_8><loc_8><loc_44><loc_35><loc_52></location>
|
<location><page_8><loc_8><loc_44><loc_35><loc_52></location>
|
||||||
<caption>Figure 6: An example of TableFormer predictions (bounding boxes and structure) from generated SynthTabNet table.</caption>
|
<caption>Figure 6: An example of TableFormer predictions (bounding boxes and structure) from generated SynthTabNet table.</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_8><loc_10><loc_41><loc_87><loc_42></location>Figure 6: An example of TableFormer predictions (bounding boxes and structure) from generated SynthTabNet table.</caption>
|
||||||
<figure>
|
<figure>
|
||||||
<location><page_8><loc_35><loc_44><loc_61><loc_52></location>
|
<location><page_8><loc_35><loc_44><loc_61><loc_52></location>
|
||||||
<caption>Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.</caption>
|
<caption>Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_8><loc_8><loc_54><loc_89><loc_59></location>Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.</caption>
|
||||||
<figure>
|
<figure>
|
||||||
<location><page_8><loc_63><loc_44><loc_89><loc_52></location>
|
<location><page_8><loc_63><loc_44><loc_89><loc_52></location>
|
||||||
</figure>
|
</figure>
|
||||||
@ -269,6 +276,7 @@
|
|||||||
<location><page_12><loc_9><loc_81><loc_89><loc_91></location>
|
<location><page_12><loc_9><loc_81><loc_89><loc_91></location>
|
||||||
<caption>Figure 7: Distribution of the tables across different dimensions per dataset. Simple vs complex tables per dataset and split, strict vs non strict html structures per dataset and table complexity, missing bboxes per dataset and table complexity.</caption>
|
<caption>Figure 7: Distribution of the tables across different dimensions per dataset. Simple vs complex tables per dataset and split, strict vs non strict html structures per dataset and table complexity, missing bboxes per dataset and table complexity.</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_12><loc_8><loc_76><loc_89><loc_79></location>Figure 7: Distribution of the tables across different dimensions per dataset. Simple vs complex tables per dataset and split, strict vs non strict html structures per dataset and table complexity, missing bboxes per dataset and table complexity.</caption>
|
||||||
<paragraph><location><page_12><loc_10><loc_71><loc_47><loc_73></location>- · TableFormer output does not include the table cell content.</paragraph>
|
<paragraph><location><page_12><loc_10><loc_71><loc_47><loc_73></location>- · TableFormer output does not include the table cell content.</paragraph>
|
||||||
<paragraph><location><page_12><loc_10><loc_67><loc_47><loc_69></location>- · There are occasional inaccuracies in the predictions of the bounding boxes.</paragraph>
|
<paragraph><location><page_12><loc_10><loc_67><loc_47><loc_69></location>- · There are occasional inaccuracies in the predictions of the bounding boxes.</paragraph>
|
||||||
<paragraph><location><page_12><loc_50><loc_68><loc_89><loc_73></location>dian cell size for all table cells. The usage of median during the computations, helps to eliminate outliers caused by occasional column spans which are usually wider than the normal.</paragraph>
|
<paragraph><location><page_12><loc_50><loc_68><loc_89><loc_73></location>dian cell size for all table cells. The usage of median during the computations, helps to eliminate outliers caused by occasional column spans which are usually wider than the normal.</paragraph>
|
||||||
@ -373,6 +381,7 @@
|
|||||||
<location><page_14><loc_52><loc_55><loc_87><loc_89></location>
|
<location><page_14><loc_52><loc_55><loc_87><loc_89></location>
|
||||||
<caption>Figure 13: Table predictions example on colorful table.</caption>
|
<caption>Figure 13: Table predictions example on colorful table.</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_14><loc_52><loc_52><loc_88><loc_53></location>Figure 13: Table predictions example on colorful table.</caption>
|
||||||
<table>
|
<table>
|
||||||
<location><page_14><loc_52><loc_40><loc_85><loc_46></location>
|
<location><page_14><loc_52><loc_40><loc_85><loc_46></location>
|
||||||
<caption>Figure 14: Example with multi-line text.</caption>
|
<caption>Figure 14: Example with multi-line text.</caption>
|
||||||
@ -433,4 +442,5 @@
|
|||||||
<location><page_16><loc_11><loc_37><loc_86><loc_68></location>
|
<location><page_16><loc_11><loc_37><loc_86><loc_68></location>
|
||||||
<caption>Figure 17: Example of long table. End-to-end example from initial PDF cells to prediction of bounding boxes, post processing and prediction of structure.</caption>
|
<caption>Figure 17: Example of long table. End-to-end example from initial PDF cells to prediction of bounding boxes, post processing and prediction of structure.</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_16><loc_8><loc_33><loc_89><loc_36></location>Figure 17: Example of long table. End-to-end example from initial PDF cells to prediction of bounding boxes, post processing and prediction of structure.</caption>
|
||||||
</document>
|
</document>
|
230
tests/data/groundtruth/docling_v1/2203.01017v2.json
vendored
230
tests/data/groundtruth/docling_v1/2203.01017v2.json
vendored
@ -365,6 +365,29 @@
|
|||||||
"type": "figure",
|
"type": "figure",
|
||||||
"$ref": "#/figures/2"
|
"$ref": "#/figures/2"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"bbox": [
|
||||||
|
308.862,
|
||||||
|
232.72709999999995,
|
||||||
|
545.11517,
|
||||||
|
277.49963
|
||||||
|
],
|
||||||
|
"page": 1,
|
||||||
|
"span": [
|
||||||
|
0,
|
||||||
|
220
|
||||||
|
],
|
||||||
|
"__ref_s3_data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"text": "Figure 1: Picture of a table with subtle, complex features such as (1) multi-column headers, (2) cell with multi-row text and (3) cells with no content. Image from PubTabNet evaluation set, filename: 'PMC2944238 004 02'.",
|
||||||
|
"type": "caption",
|
||||||
|
"payload": null,
|
||||||
|
"name": "Caption",
|
||||||
|
"font": null
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"name": "Table",
|
"name": "Table",
|
||||||
"type": "table",
|
"type": "table",
|
||||||
@ -904,6 +927,29 @@
|
|||||||
"type": "figure",
|
"type": "figure",
|
||||||
"$ref": "#/figures/3"
|
"$ref": "#/figures/3"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"bbox": [
|
||||||
|
308.862,
|
||||||
|
503.3020900000001,
|
||||||
|
545.11511,
|
||||||
|
524.16364
|
||||||
|
],
|
||||||
|
"page": 3,
|
||||||
|
"span": [
|
||||||
|
0,
|
||||||
|
104
|
||||||
|
],
|
||||||
|
"__ref_s3_data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"text": "Figure 2: Distribution of the tables across different table dimensions in PubTabNet + FinTabNet datasets",
|
||||||
|
"type": "caption",
|
||||||
|
"payload": null,
|
||||||
|
"name": "Caption",
|
||||||
|
"font": null
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"prov": [
|
"prov": [
|
||||||
{
|
{
|
||||||
@ -1282,11 +1328,57 @@
|
|||||||
"type": "figure",
|
"type": "figure",
|
||||||
"$ref": "#/figures/4"
|
"$ref": "#/figures/4"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"bbox": [
|
||||||
|
50.111992,
|
||||||
|
567.03308,
|
||||||
|
545.10846,
|
||||||
|
588.01422
|
||||||
|
],
|
||||||
|
"page": 5,
|
||||||
|
"span": [
|
||||||
|
0,
|
||||||
|
212
|
||||||
|
],
|
||||||
|
"__ref_s3_data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"text": "Figure 3: TableFormer takes in an image of the PDF and creates bounding box and HTML structure predictions that are synchronized. The bounding boxes grabs the content from the PDF and inserts it in the structure.",
|
||||||
|
"type": "caption",
|
||||||
|
"payload": null,
|
||||||
|
"name": "Caption",
|
||||||
|
"font": null
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"name": "Picture",
|
"name": "Picture",
|
||||||
"type": "figure",
|
"type": "figure",
|
||||||
"$ref": "#/figures/5"
|
"$ref": "#/figures/5"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"bbox": [
|
||||||
|
50.112,
|
||||||
|
111.72906,
|
||||||
|
286.36597,
|
||||||
|
264.2171900000001
|
||||||
|
],
|
||||||
|
"page": 5,
|
||||||
|
"span": [
|
||||||
|
0,
|
||||||
|
745
|
||||||
|
],
|
||||||
|
"__ref_s3_data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"text": "Figure 4: Given an input image of a table, the Encoder produces fixed-length features that represent the input image. The features are then passed to both the Structure Decoder and Cell BBox Decoder . During training, the Structure Decoder receives 'tokenized tags' of the HTML code that represent the table structure. Afterwards, a transformer encoder and decoder architecture is employed to produce features that are received by a linear layer, and the Cell BBox Decoder. The linear layer is applied to the features to predict the tags. Simultaneously, the Cell BBox Decoder selects features referring to the data cells (' < td > ', ' < ') and passes them through an attention network, an MLP, and a linear layer to predict the bounding boxes.",
|
||||||
|
"type": "caption",
|
||||||
|
"payload": null,
|
||||||
|
"name": "Caption",
|
||||||
|
"font": null
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"prov": [
|
"prov": [
|
||||||
{
|
{
|
||||||
@ -2214,6 +2306,29 @@
|
|||||||
"type": "figure",
|
"type": "figure",
|
||||||
"$ref": "#/figures/7"
|
"$ref": "#/figures/7"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"bbox": [
|
||||||
|
53.811783000000005,
|
||||||
|
575.89355,
|
||||||
|
385.93451,
|
||||||
|
583.76672
|
||||||
|
],
|
||||||
|
"page": 8,
|
||||||
|
"span": [
|
||||||
|
0,
|
||||||
|
79
|
||||||
|
],
|
||||||
|
"__ref_s3_data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"text": "b. Structure predicted by TableFormer, with superimposed matched PDF cell text:",
|
||||||
|
"type": "caption",
|
||||||
|
"payload": null,
|
||||||
|
"name": "Caption",
|
||||||
|
"font": null
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"name": "Table",
|
"name": "Table",
|
||||||
"type": "table",
|
"type": "table",
|
||||||
@ -2252,11 +2367,57 @@
|
|||||||
"type": "figure",
|
"type": "figure",
|
||||||
"$ref": "#/figures/8"
|
"$ref": "#/figures/8"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"bbox": [
|
||||||
|
62.595001,
|
||||||
|
324.36508,
|
||||||
|
532.63049,
|
||||||
|
333.27164
|
||||||
|
],
|
||||||
|
"page": 8,
|
||||||
|
"span": [
|
||||||
|
0,
|
||||||
|
112
|
||||||
|
],
|
||||||
|
"__ref_s3_data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"text": "Figure 6: An example of TableFormer predictions (bounding boxes and structure) from generated SynthTabNet table.",
|
||||||
|
"type": "caption",
|
||||||
|
"payload": null,
|
||||||
|
"name": "Caption",
|
||||||
|
"font": null
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"name": "Picture",
|
"name": "Picture",
|
||||||
"type": "figure",
|
"type": "figure",
|
||||||
"$ref": "#/figures/9"
|
"$ref": "#/figures/9"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"bbox": [
|
||||||
|
50.112,
|
||||||
|
426.35013,
|
||||||
|
545.11377,
|
||||||
|
471.12265
|
||||||
|
],
|
||||||
|
"page": 8,
|
||||||
|
"span": [
|
||||||
|
0,
|
||||||
|
397
|
||||||
|
],
|
||||||
|
"__ref_s3_data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"text": "Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.",
|
||||||
|
"type": "caption",
|
||||||
|
"payload": null,
|
||||||
|
"name": "Caption",
|
||||||
|
"font": null
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"name": "Picture",
|
"name": "Picture",
|
||||||
"type": "figure",
|
"type": "figure",
|
||||||
@ -3707,6 +3868,29 @@
|
|||||||
"type": "figure",
|
"type": "figure",
|
||||||
"$ref": "#/figures/11"
|
"$ref": "#/figures/11"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"bbox": [
|
||||||
|
50.112,
|
||||||
|
605.63605,
|
||||||
|
545.11371,
|
||||||
|
626.49762
|
||||||
|
],
|
||||||
|
"page": 12,
|
||||||
|
"span": [
|
||||||
|
0,
|
||||||
|
245
|
||||||
|
],
|
||||||
|
"__ref_s3_data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"text": "Figure 7: Distribution of the tables across different dimensions per dataset. Simple vs complex tables per dataset and split, strict vs non strict html structures per dataset and table complexity, missing bboxes per dataset and table complexity.",
|
||||||
|
"type": "caption",
|
||||||
|
"payload": null,
|
||||||
|
"name": "Caption",
|
||||||
|
"font": null
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"prov": [
|
"prov": [
|
||||||
{
|
{
|
||||||
@ -4517,6 +4701,29 @@
|
|||||||
"type": "figure",
|
"type": "figure",
|
||||||
"$ref": "#/figures/16"
|
"$ref": "#/figures/16"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"bbox": [
|
||||||
|
315.79001,
|
||||||
|
411.40909,
|
||||||
|
538.18524,
|
||||||
|
420.31564
|
||||||
|
],
|
||||||
|
"page": 14,
|
||||||
|
"span": [
|
||||||
|
0,
|
||||||
|
55
|
||||||
|
],
|
||||||
|
"__ref_s3_data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"text": "Figure 13: Table predictions example on colorful table.",
|
||||||
|
"type": "caption",
|
||||||
|
"payload": null,
|
||||||
|
"name": "Caption",
|
||||||
|
"font": null
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"name": "Table",
|
"name": "Table",
|
||||||
"type": "table",
|
"type": "table",
|
||||||
@ -4675,6 +4882,29 @@
|
|||||||
"name": "Picture",
|
"name": "Picture",
|
||||||
"type": "figure",
|
"type": "figure",
|
||||||
"$ref": "#/figures/23"
|
"$ref": "#/figures/23"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"bbox": [
|
||||||
|
50.112,
|
||||||
|
262.80108999999993,
|
||||||
|
545.11383,
|
||||||
|
283.66263
|
||||||
|
],
|
||||||
|
"page": 16,
|
||||||
|
"span": [
|
||||||
|
0,
|
||||||
|
153
|
||||||
|
],
|
||||||
|
"__ref_s3_data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"text": "Figure 17: Example of long table. End-to-end example from initial PDF cells to prediction of bounding boxes, post processing and prediction of structure.",
|
||||||
|
"type": "caption",
|
||||||
|
"payload": null,
|
||||||
|
"name": "Caption",
|
||||||
|
"font": null
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"figures": [
|
"figures": [
|
||||||
|
@ -18,6 +18,7 @@
|
|||||||
<location><page_1><loc_53><loc_34><loc_90><loc_68></location>
|
<location><page_1><loc_53><loc_34><loc_90><loc_68></location>
|
||||||
<caption>Figure 1: Four examples of complex page layouts across different document categories</caption>
|
<caption>Figure 1: Four examples of complex page layouts across different document categories</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_1><loc_52><loc_29><loc_91><loc_32></location>Figure 1: Four examples of complex page layouts across different document categories</caption>
|
||||||
<subtitle-level-1><location><page_1><loc_52><loc_24><loc_62><loc_25></location>KEYWORDS</subtitle-level-1>
|
<subtitle-level-1><location><page_1><loc_52><loc_24><loc_62><loc_25></location>KEYWORDS</subtitle-level-1>
|
||||||
<paragraph><location><page_1><loc_52><loc_21><loc_91><loc_23></location>PDF document conversion, layout segmentation, object-detection, data set, Machine Learning</paragraph>
|
<paragraph><location><page_1><loc_52><loc_21><loc_91><loc_23></location>PDF document conversion, layout segmentation, object-detection, data set, Machine Learning</paragraph>
|
||||||
<subtitle-level-1><location><page_1><loc_52><loc_18><loc_66><loc_19></location>ACM Reference Format:</subtitle-level-1>
|
<subtitle-level-1><location><page_1><loc_52><loc_18><loc_66><loc_19></location>ACM Reference Format:</subtitle-level-1>
|
||||||
@ -44,6 +45,7 @@
|
|||||||
<location><page_3><loc_14><loc_72><loc_43><loc_88></location>
|
<location><page_3><loc_14><loc_72><loc_43><loc_88></location>
|
||||||
<caption>Figure 2: Distribution of DocLayNet pages across document categories.</caption>
|
<caption>Figure 2: Distribution of DocLayNet pages across document categories.</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_3><loc_9><loc_68><loc_48><loc_70></location>Figure 2: Distribution of DocLayNet pages across document categories.</caption>
|
||||||
<paragraph><location><page_3><loc_9><loc_54><loc_48><loc_64></location>to a minimum, since they introduce difficulties in annotation (see Section 4). As a second condition, we focussed on medium to large documents ( > 10 pages) with technical content, dense in complex tables, figures, plots and captions. Such documents carry a lot of information value, but are often hard to analyse with high accuracy due to their challenging layouts. Counterexamples of documents not included in the dataset are receipts, invoices, hand-written documents or photographs showing "text in the wild".</paragraph>
|
<paragraph><location><page_3><loc_9><loc_54><loc_48><loc_64></location>to a minimum, since they introduce difficulties in annotation (see Section 4). As a second condition, we focussed on medium to large documents ( > 10 pages) with technical content, dense in complex tables, figures, plots and captions. Such documents carry a lot of information value, but are often hard to analyse with high accuracy due to their challenging layouts. Counterexamples of documents not included in the dataset are receipts, invoices, hand-written documents or photographs showing "text in the wild".</paragraph>
|
||||||
<paragraph><location><page_3><loc_9><loc_36><loc_48><loc_53></location>The pages in DocLayNet can be grouped into six distinct categories, namely Financial Reports , Manuals , Scientific Articles , Laws & Regulations , Patents and Government Tenders . Each document category was sourced from various repositories. For example, Financial Reports contain both free-style format annual reports 2 which expose company-specific, artistic layouts as well as the more formal SEC filings. The two largest categories ( Financial Reports and Manuals ) contain a large amount of free-style layouts in order to obtain maximum variability. In the other four categories, we boosted the variability by mixing documents from independent providers, such as different government websites or publishers. In Figure 2, we show the document categories contained in DocLayNet with their respective sizes.</paragraph>
|
<paragraph><location><page_3><loc_9><loc_36><loc_48><loc_53></location>The pages in DocLayNet can be grouped into six distinct categories, namely Financial Reports , Manuals , Scientific Articles , Laws & Regulations , Patents and Government Tenders . Each document category was sourced from various repositories. For example, Financial Reports contain both free-style format annual reports 2 which expose company-specific, artistic layouts as well as the more formal SEC filings. The two largest categories ( Financial Reports and Manuals ) contain a large amount of free-style layouts in order to obtain maximum variability. In the other four categories, we boosted the variability by mixing documents from independent providers, such as different government websites or publishers. In Figure 2, we show the document categories contained in DocLayNet with their respective sizes.</paragraph>
|
||||||
<paragraph><location><page_3><loc_9><loc_23><loc_48><loc_35></location>We did not control the document selection with regard to language. The vast majority of documents contained in DocLayNet (close to 95%) are published in English language. However, DocLayNet also contains a number of documents in other languages such as German (2.5%), French (1.0%) and Japanese (1.0%). While the document language has negligible impact on the performance of computer vision methods such as object detection and segmentation models, it might prove challenging for layout analysis methods which exploit textual features.</paragraph>
|
<paragraph><location><page_3><loc_9><loc_23><loc_48><loc_35></location>We did not control the document selection with regard to language. The vast majority of documents contained in DocLayNet (close to 95%) are published in English language. However, DocLayNet also contains a number of documents in other languages such as German (2.5%), French (1.0%) and Japanese (1.0%). While the document language has negligible impact on the performance of computer vision methods such as object detection and segmentation models, it might prove challenging for layout analysis methods which exploit textual features.</paragraph>
|
||||||
@ -76,6 +78,7 @@
|
|||||||
<location><page_4><loc_9><loc_32><loc_48><loc_61></location>
|
<location><page_4><loc_9><loc_32><loc_48><loc_61></location>
|
||||||
<caption>Figure 3: Corpus Conversion Service annotation user interface. The PDF page is shown in the background, with overlaid text-cells (in darker shades). The annotation boxes can be drawn by dragging a rectangle over each segment with the respective label from the palette on the right.</caption>
|
<caption>Figure 3: Corpus Conversion Service annotation user interface. The PDF page is shown in the background, with overlaid text-cells (in darker shades). The annotation boxes can be drawn by dragging a rectangle over each segment with the respective label from the palette on the right.</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_4><loc_9><loc_23><loc_48><loc_30></location>Figure 3: Corpus Conversion Service annotation user interface. The PDF page is shown in the background, with overlaid text-cells (in darker shades). The annotation boxes can be drawn by dragging a rectangle over each segment with the respective label from the palette on the right.</caption>
|
||||||
<paragraph><location><page_4><loc_9><loc_15><loc_48><loc_20></location>we distributed the annotation workload and performed continuous quality controls. Phase one and two required a small team of experts only. For phases three and four, a group of 40 dedicated annotators were assembled and supervised.</paragraph>
|
<paragraph><location><page_4><loc_9><loc_15><loc_48><loc_20></location>we distributed the annotation workload and performed continuous quality controls. Phase one and two required a small team of experts only. For phases three and four, a group of 40 dedicated annotators were assembled and supervised.</paragraph>
|
||||||
<paragraph><location><page_4><loc_9><loc_11><loc_48><loc_14></location><location><page_4><loc_9><loc_11><loc_48><loc_14></location>Phase 1: Data selection and preparation. Our inclusion criteria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. The data sources include publication repositories such as arXiv$^{3}$, government offices, company websites as well as data directory services for financial reports and patents. Scanned documents were excluded wherever possible because they can be rotated or skewed. This would not allow us to perform annotation with rectangular bounding-boxes and therefore complicate the annotation process.</paragraph>
|
<paragraph><location><page_4><loc_9><loc_11><loc_48><loc_14></location><location><page_4><loc_9><loc_11><loc_48><loc_14></location>Phase 1: Data selection and preparation. Our inclusion criteria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. The data sources include publication repositories such as arXiv$^{3}$, government offices, company websites as well as data directory services for financial reports and patents. Scanned documents were excluded wherever possible because they can be rotated or skewed. This would not allow us to perform annotation with rectangular bounding-boxes and therefore complicate the annotation process.</paragraph>
|
||||||
<paragraph><location><page_4><loc_52><loc_36><loc_91><loc_52></location>Preparation work included uploading and parsing the sourced PDF documents in the Corpus Conversion Service (CCS) [22], a cloud-native platform which provides a visual annotation interface and allows for dataset inspection and analysis. The annotation interface of CCS is shown in Figure 3. The desired balance of pages between the different document categories was achieved by selective subsampling of pages with certain desired properties. For example, we made sure to include the title page of each document and bias the remaining page selection to those with figures or tables. The latter was achieved by leveraging pre-trained object detection models from PubLayNet, which helped us estimate how many figures and tables a given page contains.</paragraph>
|
<paragraph><location><page_4><loc_52><loc_36><loc_91><loc_52></location>Preparation work included uploading and parsing the sourced PDF documents in the Corpus Conversion Service (CCS) [22], a cloud-native platform which provides a visual annotation interface and allows for dataset inspection and analysis. The annotation interface of CCS is shown in Figure 3. The desired balance of pages between the different document categories was achieved by selective subsampling of pages with certain desired properties. For example, we made sure to include the title page of each document and bias the remaining page selection to those with figures or tables. The latter was achieved by leveraging pre-trained object detection models from PubLayNet, which helped us estimate how many figures and tables a given page contains.</paragraph>
|
||||||
@ -123,6 +126,7 @@
|
|||||||
<location><page_6><loc_53><loc_67><loc_90><loc_89></location>
|
<location><page_6><loc_53><loc_67><loc_90><loc_89></location>
|
||||||
<caption>Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNN network with ResNet50 backbone trained on increasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.</caption>
|
<caption>Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNN network with ResNet50 backbone trained on increasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_6><loc_52><loc_57><loc_91><loc_65></location>Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNN network with ResNet50 backbone trained on increasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.</caption>
|
||||||
<paragraph><location><page_6><loc_52><loc_49><loc_91><loc_52></location>paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.</paragraph>
|
<paragraph><location><page_6><loc_52><loc_49><loc_91><loc_52></location>paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.</paragraph>
|
||||||
<paragraph><location><page_6><loc_52><loc_39><loc_91><loc_49></location>In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are computed by leveraging the evaluation code provided by the COCO API [16].</paragraph>
|
<paragraph><location><page_6><loc_52><loc_39><loc_91><loc_49></location>In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are computed by leveraging the evaluation code provided by the COCO API [16].</paragraph>
|
||||||
<subtitle-level-1><location><page_6><loc_52><loc_36><loc_76><loc_37></location>Baselines for Object Detection</subtitle-level-1>
|
<subtitle-level-1><location><page_6><loc_52><loc_36><loc_76><loc_37></location>Baselines for Object Detection</subtitle-level-1>
|
||||||
@ -216,6 +220,7 @@
|
|||||||
<location><page_9><loc_9><loc_44><loc_91><loc_89></location>
|
<location><page_9><loc_9><loc_44><loc_91><loc_89></location>
|
||||||
<caption>Text Caption List-Item Formula Table Section-Header Picture Page-Header Page-Footer Title</caption>
|
<caption>Text Caption List-Item Formula Table Section-Header Picture Page-Header Page-Footer Title</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_9><loc_10><loc_43><loc_52><loc_44></location>Text Caption List-Item Formula Table Section-Header Picture Page-Header Page-Footer Title</caption>
|
||||||
<paragraph><location><page_9><loc_9><loc_36><loc_91><loc_41></location>Figure 6: Example layout predictions on selected pages from the DocLayNet test-set. (A, D) exhibit favourable results on coloured backgrounds. (B, C) show accurate list-item and paragraph differentiation despite densely-spaced lines. (E) demonstrates good table and figure distinction. (F) shows predictions on a Chinese patent with multiple overlaps, label confusion and missing boxes.</paragraph>
|
<paragraph><location><page_9><loc_9><loc_36><loc_91><loc_41></location>Figure 6: Example layout predictions on selected pages from the DocLayNet test-set. (A, D) exhibit favourable results on coloured backgrounds. (B, C) show accurate list-item and paragraph differentiation despite densely-spaced lines. (E) demonstrates good table and figure distinction. (F) shows predictions on a Chinese patent with multiple overlaps, label confusion and missing boxes.</paragraph>
|
||||||
<paragraph><location><page_9><loc_11><loc_31><loc_48><loc_33></location>Diaconu, Mai Thanh Minh, Marc, albinxavi, fatih, oleg, and wanghao yang. ultralytics/yolov5: v6.0 - yolov5n nano models, roboflow integration, tensorflow export, opencv dnn support, October 2021.</paragraph>
|
<paragraph><location><page_9><loc_11><loc_31><loc_48><loc_33></location>Diaconu, Mai Thanh Minh, Marc, albinxavi, fatih, oleg, and wanghao yang. ultralytics/yolov5: v6.0 - yolov5n nano models, roboflow integration, tensorflow export, opencv dnn support, October 2021.</paragraph>
|
||||||
<paragraph><location><page_9><loc_52><loc_32><loc_91><loc_33></location>- [20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021.</paragraph>
|
<paragraph><location><page_9><loc_52><loc_32><loc_91><loc_33></location>- [20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021.</paragraph>
|
||||||
|
115
tests/data/groundtruth/docling_v1/2206.01062.json
vendored
115
tests/data/groundtruth/docling_v1/2206.01062.json
vendored
@ -430,6 +430,29 @@
|
|||||||
"type": "figure",
|
"type": "figure",
|
||||||
"$ref": "#/figures/0"
|
"$ref": "#/figures/0"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"bbox": [
|
||||||
|
317.95499,
|
||||||
|
232.48476000000005,
|
||||||
|
559.80579,
|
||||||
|
251.91701
|
||||||
|
],
|
||||||
|
"page": 1,
|
||||||
|
"span": [
|
||||||
|
0,
|
||||||
|
84
|
||||||
|
],
|
||||||
|
"__ref_s3_data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"text": "Figure 1: Four examples of complex page layouts across different document categories",
|
||||||
|
"type": "caption",
|
||||||
|
"payload": null,
|
||||||
|
"name": "Caption",
|
||||||
|
"font": null
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"prov": [
|
"prov": [
|
||||||
{
|
{
|
||||||
@ -964,6 +987,29 @@
|
|||||||
"type": "figure",
|
"type": "figure",
|
||||||
"$ref": "#/figures/1"
|
"$ref": "#/figures/1"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"bbox": [
|
||||||
|
53.79800000000001,
|
||||||
|
536.45276,
|
||||||
|
294.04373,
|
||||||
|
555.88501
|
||||||
|
],
|
||||||
|
"page": 3,
|
||||||
|
"span": [
|
||||||
|
0,
|
||||||
|
69
|
||||||
|
],
|
||||||
|
"__ref_s3_data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"text": "Figure 2: Distribution of DocLayNet pages across document categories.",
|
||||||
|
"type": "caption",
|
||||||
|
"payload": null,
|
||||||
|
"name": "Caption",
|
||||||
|
"font": null
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"prov": [
|
"prov": [
|
||||||
{
|
{
|
||||||
@ -1227,6 +1273,29 @@
|
|||||||
"type": "figure",
|
"type": "figure",
|
||||||
"$ref": "#/figures/2"
|
"$ref": "#/figures/2"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"bbox": [
|
||||||
|
53.79800000000001,
|
||||||
|
185.68075999999996,
|
||||||
|
295.64874,
|
||||||
|
237.99000999999998
|
||||||
|
],
|
||||||
|
"page": 4,
|
||||||
|
"span": [
|
||||||
|
0,
|
||||||
|
281
|
||||||
|
],
|
||||||
|
"__ref_s3_data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"text": "Figure 3: Corpus Conversion Service annotation user interface. The PDF page is shown in the background, with overlaid text-cells (in darker shades). The annotation boxes can be drawn by dragging a rectangle over each segment with the respective label from the palette on the right.",
|
||||||
|
"type": "caption",
|
||||||
|
"payload": null,
|
||||||
|
"name": "Caption",
|
||||||
|
"font": null
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"prov": [
|
"prov": [
|
||||||
{
|
{
|
||||||
@ -1808,6 +1877,29 @@
|
|||||||
"type": "figure",
|
"type": "figure",
|
||||||
"$ref": "#/figures/4"
|
"$ref": "#/figures/4"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"bbox": [
|
||||||
|
317.95499,
|
||||||
|
449.71581999999995,
|
||||||
|
559.80579,
|
||||||
|
512.98401
|
||||||
|
],
|
||||||
|
"page": 6,
|
||||||
|
"span": [
|
||||||
|
0,
|
||||||
|
329
|
||||||
|
],
|
||||||
|
"__ref_s3_data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"text": "Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNN network with ResNet50 backbone trained on increasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.",
|
||||||
|
"type": "caption",
|
||||||
|
"payload": null,
|
||||||
|
"name": "Caption",
|
||||||
|
"font": null
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"prov": [
|
"prov": [
|
||||||
{
|
{
|
||||||
@ -2702,6 +2794,29 @@
|
|||||||
"type": "figure",
|
"type": "figure",
|
||||||
"$ref": "#/figures/5"
|
"$ref": "#/figures/5"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"bbox": [
|
||||||
|
62.323874999999994,
|
||||||
|
343.73517,
|
||||||
|
318.50473,
|
||||||
|
349.71457
|
||||||
|
],
|
||||||
|
"page": 9,
|
||||||
|
"span": [
|
||||||
|
0,
|
||||||
|
89
|
||||||
|
],
|
||||||
|
"__ref_s3_data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"text": "Text Caption List-Item Formula Table Section-Header Picture Page-Header Page-Footer Title",
|
||||||
|
"type": "caption",
|
||||||
|
"payload": null,
|
||||||
|
"name": "Caption",
|
||||||
|
"font": null
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"prov": [
|
"prov": [
|
||||||
{
|
{
|
||||||
|
@ -213,10 +213,10 @@
|
|||||||
"prov": [
|
"prov": [
|
||||||
{
|
{
|
||||||
"bbox": [
|
"bbox": [
|
||||||
139.66741943359375,
|
139.66746520996094,
|
||||||
322.5054626464844,
|
322.5054626464844,
|
||||||
475.00927734375,
|
475.0093078613281,
|
||||||
454.45458984375
|
454.4546203613281
|
||||||
],
|
],
|
||||||
"page": 1,
|
"page": 1,
|
||||||
"span": [
|
"span": [
|
||||||
|
@ -2646,7 +2646,7 @@
|
|||||||
"b": 102.78223000000003,
|
"b": 102.78223000000003,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"confidence": 0.9373533725738525,
|
"confidence": 0.9373531937599182,
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"index": 0,
|
"index": 0,
|
||||||
@ -2686,7 +2686,7 @@
|
|||||||
"b": 102.78223000000003,
|
"b": 102.78223000000003,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"confidence": 0.8858679533004761,
|
"confidence": 0.8858677744865417,
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"index": 1,
|
"index": 1,
|
||||||
@ -2726,7 +2726,7 @@
|
|||||||
"b": 152.90697999999998,
|
"b": 152.90697999999998,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"confidence": 0.9806433916091919,
|
"confidence": 0.9806435108184814,
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"index": 2,
|
"index": 2,
|
||||||
@ -3096,7 +3096,7 @@
|
|||||||
"b": 327.98218,
|
"b": 327.98218,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"confidence": 0.9591909050941467,
|
"confidence": 0.9591910243034363,
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"index": 15,
|
"index": 15,
|
||||||
@ -3280,9 +3280,9 @@
|
|||||||
"id": 0,
|
"id": 0,
|
||||||
"label": "table",
|
"label": "table",
|
||||||
"bbox": {
|
"bbox": {
|
||||||
"l": 139.66741943359375,
|
"l": 139.66746520996094,
|
||||||
"t": 337.54541015625,
|
"t": 337.5453796386719,
|
||||||
"r": 475.00927734375,
|
"r": 475.0093078613281,
|
||||||
"b": 469.4945373535156,
|
"b": 469.4945373535156,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
@ -7787,7 +7787,7 @@
|
|||||||
"b": 518.17419,
|
"b": 518.17419,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"confidence": 0.9589294195175171,
|
"confidence": 0.9589295387268066,
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"index": 91,
|
"index": 91,
|
||||||
@ -8184,9 +8184,9 @@
|
|||||||
"id": 0,
|
"id": 0,
|
||||||
"label": "table",
|
"label": "table",
|
||||||
"bbox": {
|
"bbox": {
|
||||||
"l": 139.66741943359375,
|
"l": 139.66746520996094,
|
||||||
"t": 337.54541015625,
|
"t": 337.5453796386719,
|
||||||
"r": 475.00927734375,
|
"r": 475.0093078613281,
|
||||||
"b": 469.4945373535156,
|
"b": 469.4945373535156,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
@ -13582,7 +13582,7 @@
|
|||||||
"b": 102.78223000000003,
|
"b": 102.78223000000003,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"confidence": 0.9373533725738525,
|
"confidence": 0.9373531937599182,
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"index": 0,
|
"index": 0,
|
||||||
@ -13628,7 +13628,7 @@
|
|||||||
"b": 102.78223000000003,
|
"b": 102.78223000000003,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"confidence": 0.8858679533004761,
|
"confidence": 0.8858677744865417,
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"index": 1,
|
"index": 1,
|
||||||
@ -13674,7 +13674,7 @@
|
|||||||
"b": 152.90697999999998,
|
"b": 152.90697999999998,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"confidence": 0.9806433916091919,
|
"confidence": 0.9806435108184814,
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"index": 2,
|
"index": 2,
|
||||||
@ -14062,7 +14062,7 @@
|
|||||||
"b": 327.98218,
|
"b": 327.98218,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"confidence": 0.9591909050941467,
|
"confidence": 0.9591910243034363,
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"index": 15,
|
"index": 15,
|
||||||
@ -14252,9 +14252,9 @@
|
|||||||
"id": 0,
|
"id": 0,
|
||||||
"label": "table",
|
"label": "table",
|
||||||
"bbox": {
|
"bbox": {
|
||||||
"l": 139.66741943359375,
|
"l": 139.66746520996094,
|
||||||
"t": 337.54541015625,
|
"t": 337.5453796386719,
|
||||||
"r": 475.00927734375,
|
"r": 475.0093078613281,
|
||||||
"b": 469.4945373535156,
|
"b": 469.4945373535156,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
@ -19642,7 +19642,7 @@
|
|||||||
"b": 518.17419,
|
"b": 518.17419,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"confidence": 0.9589294195175171,
|
"confidence": 0.9589295387268066,
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"index": 91,
|
"index": 91,
|
||||||
@ -20057,7 +20057,7 @@
|
|||||||
"b": 152.90697999999998,
|
"b": 152.90697999999998,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"confidence": 0.9806433916091919,
|
"confidence": 0.9806435108184814,
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"index": 2,
|
"index": 2,
|
||||||
@ -20445,7 +20445,7 @@
|
|||||||
"b": 327.98218,
|
"b": 327.98218,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"confidence": 0.9591909050941467,
|
"confidence": 0.9591910243034363,
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"index": 15,
|
"index": 15,
|
||||||
@ -20635,9 +20635,9 @@
|
|||||||
"id": 0,
|
"id": 0,
|
||||||
"label": "table",
|
"label": "table",
|
||||||
"bbox": {
|
"bbox": {
|
||||||
"l": 139.66741943359375,
|
"l": 139.66746520996094,
|
||||||
"t": 337.54541015625,
|
"t": 337.5453796386719,
|
||||||
"r": 475.00927734375,
|
"r": 475.0093078613281,
|
||||||
"b": 469.4945373535156,
|
"b": 469.4945373535156,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
@ -26025,7 +26025,7 @@
|
|||||||
"b": 518.17419,
|
"b": 518.17419,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"confidence": 0.9589294195175171,
|
"confidence": 0.9589295387268066,
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"index": 91,
|
"index": 91,
|
||||||
@ -26440,7 +26440,7 @@
|
|||||||
"b": 102.78223000000003,
|
"b": 102.78223000000003,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"confidence": 0.9373533725738525,
|
"confidence": 0.9373531937599182,
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"index": 0,
|
"index": 0,
|
||||||
@ -26486,7 +26486,7 @@
|
|||||||
"b": 102.78223000000003,
|
"b": 102.78223000000003,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"confidence": 0.8858679533004761,
|
"confidence": 0.8858677744865417,
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"index": 1,
|
"index": 1,
|
||||||
|
@ -13,6 +13,7 @@
|
|||||||
<location><page_2><loc_24><loc_46><loc_76><loc_74></location>
|
<location><page_2><loc_24><loc_46><loc_76><loc_74></location>
|
||||||
<caption>Fig. 1. Comparison between HTML and OTSL table structure representation: (A) table-example with complex row and column headers, including a 2D empty span, (B) minimal graphical representation of table structure using rectangular layout, (C) HTML representation, (D) OTSL representation. This example demonstrates many of the key-features of OTSL, namely its reduced vocabulary size (12 versus 5 in this case), its reduced sequence length (55 versus 30) and a enhanced internal structure (variable token sequence length per row in HTML versus a fixed length of rows in OTSL).</caption>
|
<caption>Fig. 1. Comparison between HTML and OTSL table structure representation: (A) table-example with complex row and column headers, including a 2D empty span, (B) minimal graphical representation of table structure using rectangular layout, (C) HTML representation, (D) OTSL representation. This example demonstrates many of the key-features of OTSL, namely its reduced vocabulary size (12 versus 5 in this case), its reduced sequence length (55 versus 30) and a enhanced internal structure (variable token sequence length per row in HTML versus a fixed length of rows in OTSL).</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_2><loc_22><loc_75><loc_79><loc_84></location>Fig. 1. Comparison between HTML and OTSL table structure representation: (A) table-example with complex row and column headers, including a 2D empty span, (B) minimal graphical representation of table structure using rectangular layout, (C) HTML representation, (D) OTSL representation. This example demonstrates many of the key-features of OTSL, namely its reduced vocabulary size (12 versus 5 in this case), its reduced sequence length (55 versus 30) and a enhanced internal structure (variable token sequence length per row in HTML versus a fixed length of rows in OTSL).</caption>
|
||||||
<paragraph><location><page_2><loc_22><loc_34><loc_79><loc_43></location>today, table detection in documents is a well understood problem, and the latest state-of-the-art (SOTA) object detection methods provide an accuracy comparable to human observers [7,8,10,14,23]. On the other hand, the problem of table structure recognition (TSR) is a lot more challenging and remains a very active area of research, in which many novel machine learning algorithms are being explored [3,4,5,9,11,12,13,14,17,18,21,22].</paragraph>
|
<paragraph><location><page_2><loc_22><loc_34><loc_79><loc_43></location>today, table detection in documents is a well understood problem, and the latest state-of-the-art (SOTA) object detection methods provide an accuracy comparable to human observers [7,8,10,14,23]. On the other hand, the problem of table structure recognition (TSR) is a lot more challenging and remains a very active area of research, in which many novel machine learning algorithms are being explored [3,4,5,9,11,12,13,14,17,18,21,22].</paragraph>
|
||||||
<paragraph><location><page_2><loc_22><loc_16><loc_79><loc_34></location>Recently emerging SOTA methods for table structure recognition employ transformer-based models, in which an image of the table is provided to the network in order to predict the structure of the table as a sequence of tokens. These image-to-sequence (Im2Seq) models are extremely powerful, since they allow for a purely data-driven solution. The tokens of the sequence typically belong to a markup language such as HTML, Latex or Markdown, which allow to describe table structure as rows, columns and spanning cells in various configurations. In Figure 1, we illustrate how HTML is used to represent the table-structure of a particular example table. Public table-structure data sets such as PubTabNet [22], and FinTabNet [21], which were created in a semi-automated way from paired PDF and HTML sources (e.g. PubMed Central), popularized primarily the use of HTML as ground-truth representation format for TSR.</paragraph>
|
<paragraph><location><page_2><loc_22><loc_16><loc_79><loc_34></location>Recently emerging SOTA methods for table structure recognition employ transformer-based models, in which an image of the table is provided to the network in order to predict the structure of the table as a sequence of tokens. These image-to-sequence (Im2Seq) models are extremely powerful, since they allow for a purely data-driven solution. The tokens of the sequence typically belong to a markup language such as HTML, Latex or Markdown, which allow to describe table structure as rows, columns and spanning cells in various configurations. In Figure 1, we illustrate how HTML is used to represent the table-structure of a particular example table. Public table-structure data sets such as PubTabNet [22], and FinTabNet [21], which were created in a semi-automated way from paired PDF and HTML sources (e.g. PubMed Central), popularized primarily the use of HTML as ground-truth representation format for TSR.</paragraph>
|
||||||
<paragraph><location><page_3><loc_22><loc_73><loc_79><loc_85></location>While the majority of research in TSR is currently focused on the development and application of novel neural model architectures, the table structure representation language (e.g. HTML in PubTabNet and FinTabNet) is usually adopted as is for the sequence tokenization in Im2Seq models. In this paper, we aim for the opposite and investigate the impact of the table structure representation language with an otherwise unmodified Im2Seq transformer-based architecture. Since the current state-of-the-art Im2Seq model is TableFormer [9], we select this model to perform our experiments.</paragraph>
|
<paragraph><location><page_3><loc_22><loc_73><loc_79><loc_85></location>While the majority of research in TSR is currently focused on the development and application of novel neural model architectures, the table structure representation language (e.g. HTML in PubTabNet and FinTabNet) is usually adopted as is for the sequence tokenization in Im2Seq models. In this paper, we aim for the opposite and investigate the impact of the table structure representation language with an otherwise unmodified Im2Seq transformer-based architecture. Since the current state-of-the-art Im2Seq model is TableFormer [9], we select this model to perform our experiments.</paragraph>
|
||||||
@ -30,6 +31,7 @@
|
|||||||
<location><page_5><loc_22><loc_57><loc_78><loc_71></location>
|
<location><page_5><loc_22><loc_57><loc_78><loc_71></location>
|
||||||
<caption>Fig. 2. Frequency of tokens in HTML and OTSL as they appear in PubTabNet.</caption>
|
<caption>Fig. 2. Frequency of tokens in HTML and OTSL as they appear in PubTabNet.</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_5><loc_24><loc_71><loc_77><loc_72></location>Fig. 2. Frequency of tokens in HTML and OTSL as they appear in PubTabNet.</caption>
|
||||||
<paragraph><location><page_5><loc_22><loc_33><loc_79><loc_54></location>Obviously, HTML and other general-purpose markup languages were not designed for Im2Seq models. As such, they have some serious drawbacks. First, the token vocabulary needs to be artificially large in order to describe all plausible tabular structures. Since most Im2Seq models use an autoregressive approach, they generate the sequence token by token. Therefore, to reduce inference time, a shorter sequence length is critical. Every table-cell is represented by at least two tokens ( <td> and </td> ). Furthermore, when tokenizing the HTML structure, one needs to explicitly enumerate possible column-spans and row-spans as words. In practice, this ends up requiring 28 different HTML tokens (when including column- and row-spans up to 10 cells) just to describe every table in the PubTabNet dataset. Clearly, not every token is equally represented, as is depicted in Figure 2. This skewed distribution of tokens in combination with variable token row-length makes it challenging for models to learn the HTML structure.</paragraph>
|
<paragraph><location><page_5><loc_22><loc_33><loc_79><loc_54></location>Obviously, HTML and other general-purpose markup languages were not designed for Im2Seq models. As such, they have some serious drawbacks. First, the token vocabulary needs to be artificially large in order to describe all plausible tabular structures. Since most Im2Seq models use an autoregressive approach, they generate the sequence token by token. Therefore, to reduce inference time, a shorter sequence length is critical. Every table-cell is represented by at least two tokens ( <td> and </td> ). Furthermore, when tokenizing the HTML structure, one needs to explicitly enumerate possible column-spans and row-spans as words. In practice, this ends up requiring 28 different HTML tokens (when including column- and row-spans up to 10 cells) just to describe every table in the PubTabNet dataset. Clearly, not every token is equally represented, as is depicted in Figure 2. This skewed distribution of tokens in combination with variable token row-length makes it challenging for models to learn the HTML structure.</paragraph>
|
||||||
<paragraph><location><page_5><loc_22><loc_27><loc_79><loc_32></location>Additionally, it would be desirable if the representation would easily allow an early detection of invalid sequences on-the-go, before the prediction of the entire table structure is completed. HTML is not well-suited for this purpose as the verification of incomplete sequences is non-trivial or even impossible.</paragraph>
|
<paragraph><location><page_5><loc_22><loc_27><loc_79><loc_32></location>Additionally, it would be desirable if the representation would easily allow an early detection of invalid sequences on-the-go, before the prediction of the entire table structure is completed. HTML is not well-suited for this purpose as the verification of incomplete sequences is non-trivial or even impossible.</paragraph>
|
||||||
<paragraph><location><page_5><loc_22><loc_16><loc_79><loc_26></location>In a valid HTML table, the token sequence must describe a 2D grid of table cells, serialised in row-major ordering, where each row and each column have the same length (while considering row- and column-spans). Furthermore, every opening tag in HTML needs to be matched by a closing tag in a correct hierarchical manner. Since the number of tokens for each table row and column can vary significantly, especially for large tables with many row- and column-spans, it is complex to verify the consistency of predicted structures during sequence</paragraph>
|
<paragraph><location><page_5><loc_22><loc_16><loc_79><loc_26></location>In a valid HTML table, the token sequence must describe a 2D grid of table cells, serialised in row-major ordering, where each row and each column have the same length (while considering row- and column-spans). Furthermore, every opening tag in HTML needs to be matched by a closing tag in a correct hierarchical manner. Since the number of tokens for each table row and column can vary significantly, especially for large tables with many row- and column-spans, it is complex to verify the consistency of predicted structures during sequence</paragraph>
|
||||||
@ -50,6 +52,7 @@
|
|||||||
<location><page_7><loc_27><loc_65><loc_73><loc_79></location>
|
<location><page_7><loc_27><loc_65><loc_73><loc_79></location>
|
||||||
<caption>Fig. 3. OTSL description of table structure: A - table example; B - graphical representation of table structure; C - mapping structure on a grid; D - OTSL structure encoding; E - explanation on cell encoding</caption>
|
<caption>Fig. 3. OTSL description of table structure: A - table example; B - graphical representation of table structure; C - mapping structure on a grid; D - OTSL structure encoding; E - explanation on cell encoding</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_7><loc_22><loc_80><loc_79><loc_84></location>Fig. 3. OTSL description of table structure: A - table example; B - graphical representation of table structure; C - mapping structure on a grid; D - OTSL structure encoding; E - explanation on cell encoding</caption>
|
||||||
<subtitle-level-1><location><page_7><loc_22><loc_60><loc_40><loc_61></location>4.2 Language Syntax</subtitle-level-1>
|
<subtitle-level-1><location><page_7><loc_22><loc_60><loc_40><loc_61></location>4.2 Language Syntax</subtitle-level-1>
|
||||||
<paragraph><location><page_7><loc_22><loc_58><loc_59><loc_59></location>The OTSL representation follows these syntax rules:</paragraph>
|
<paragraph><location><page_7><loc_22><loc_58><loc_59><loc_59></location>The OTSL representation follows these syntax rules:</paragraph>
|
||||||
<paragraph><location><page_7><loc_23><loc_54><loc_79><loc_56></location>- 1. Left-looking cell rule : The left neighbour of an "L" cell must be either another "L" cell or a "C" cell.</paragraph>
|
<paragraph><location><page_7><loc_23><loc_54><loc_79><loc_56></location>- 1. Left-looking cell rule : The left neighbour of an "L" cell must be either another "L" cell or a "C" cell.</paragraph>
|
||||||
@ -70,6 +73,7 @@
|
|||||||
<location><page_8><loc_23><loc_25><loc_77><loc_36></location>
|
<location><page_8><loc_23><loc_25><loc_77><loc_36></location>
|
||||||
<caption>Fig. 4. Architecture sketch of the TableFormer model, which is a representative for the Im2Seq approach.</caption>
|
<caption>Fig. 4. Architecture sketch of the TableFormer model, which is a representative for the Im2Seq approach.</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_8><loc_22><loc_36><loc_79><loc_39></location>Fig. 4. Architecture sketch of the TableFormer model, which is a representative for the Im2Seq approach.</caption>
|
||||||
<paragraph><location><page_8><loc_22><loc_16><loc_79><loc_22></location>We rely on standard metrics such as Tree Edit Distance score (TEDs) for table structure prediction, and Mean Average Precision (mAP) with 0.75 Intersection Over Union (IOU) threshold for the bounding-box predictions of table cells. The predicted OTSL structures were converted back to HTML format in</paragraph>
|
<paragraph><location><page_8><loc_22><loc_16><loc_79><loc_22></location>We rely on standard metrics such as Tree Edit Distance score (TEDs) for table structure prediction, and Mean Average Precision (mAP) with 0.75 Intersection Over Union (IOU) threshold for the bounding-box predictions of table cells. The predicted OTSL structures were converted back to HTML format in</paragraph>
|
||||||
<paragraph><location><page_9><loc_22><loc_81><loc_79><loc_85></location>order to compute the TED score. Inference timing results for all experiments were obtained from the same machine on a single core with AMD EPYC 7763 CPU @2.45 GHz.</paragraph>
|
<paragraph><location><page_9><loc_22><loc_81><loc_79><loc_85></location>order to compute the TED score. Inference timing results for all experiments were obtained from the same machine on a single core with AMD EPYC 7763 CPU @2.45 GHz.</paragraph>
|
||||||
<subtitle-level-1><location><page_9><loc_22><loc_78><loc_52><loc_79></location>5.1 Hyper Parameter Optimization</subtitle-level-1>
|
<subtitle-level-1><location><page_9><loc_22><loc_78><loc_52><loc_79></location>5.1 Hyper Parameter Optimization</subtitle-level-1>
|
||||||
@ -104,12 +108,14 @@
|
|||||||
<location><page_10><loc_27><loc_16><loc_74><loc_44></location>
|
<location><page_10><loc_27><loc_16><loc_74><loc_44></location>
|
||||||
<caption>Fig. 5. The OTSL model produces more accurate bounding boxes with less overlap (E) than the HTML model (D), when predicting the structure of a sparse table (A), at twice the inference speed because of shorter sequence length (B),(C). "PMC2807444_006_00.png" PubTabNet. μ</caption>
|
<caption>Fig. 5. The OTSL model produces more accurate bounding boxes with less overlap (E) than the HTML model (D), when predicting the structure of a sparse table (A), at twice the inference speed because of shorter sequence length (B),(C). "PMC2807444_006_00.png" PubTabNet. μ</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_10><loc_22><loc_44><loc_79><loc_50></location>Fig. 5. The OTSL model produces more accurate bounding boxes with less overlap (E) than the HTML model (D), when predicting the structure of a sparse table (A), at twice the inference speed because of shorter sequence length (B),(C). "PMC2807444_006_00.png" PubTabNet. μ</caption>
|
||||||
<paragraph><location><page_10><loc_37><loc_15><loc_38><loc_16></location>μ</paragraph>
|
<paragraph><location><page_10><loc_37><loc_15><loc_38><loc_16></location>μ</paragraph>
|
||||||
<paragraph><location><page_10><loc_49><loc_12><loc_49><loc_14></location>≥</paragraph>
|
<paragraph><location><page_10><loc_49><loc_12><loc_49><loc_14></location>≥</paragraph>
|
||||||
<figure>
|
<figure>
|
||||||
<location><page_11><loc_28><loc_20><loc_73><loc_77></location>
|
<location><page_11><loc_28><loc_20><loc_73><loc_77></location>
|
||||||
<caption>Fig. 6. Visualization of predicted structure and detected bounding boxes on a complex table with many rows. The OTSL model (B) captured repeating pattern of horizontally merged cells from the GT (A), unlike the HTML model (C). The HTML model also didn't complete the HTML sequence correctly and displayed a lot more of drift and overlap of bounding boxes. "PMC5406406_003_01.png" PubTabNet.</caption>
|
<caption>Fig. 6. Visualization of predicted structure and detected bounding boxes on a complex table with many rows. The OTSL model (B) captured repeating pattern of horizontally merged cells from the GT (A), unlike the HTML model (C). The HTML model also didn't complete the HTML sequence correctly and displayed a lot more of drift and overlap of bounding boxes. "PMC5406406_003_01.png" PubTabNet.</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_11><loc_22><loc_78><loc_79><loc_84></location>Fig. 6. Visualization of predicted structure and detected bounding boxes on a complex table with many rows. The OTSL model (B) captured repeating pattern of horizontally merged cells from the GT (A), unlike the HTML model (C). The HTML model also didn't complete the HTML sequence correctly and displayed a lot more of drift and overlap of bounding boxes. "PMC5406406_003_01.png" PubTabNet.</caption>
|
||||||
<subtitle-level-1><location><page_12><loc_22><loc_84><loc_36><loc_85></location>6 Conclusion</subtitle-level-1>
|
<subtitle-level-1><location><page_12><loc_22><loc_84><loc_36><loc_85></location>6 Conclusion</subtitle-level-1>
|
||||||
<paragraph><location><page_12><loc_22><loc_74><loc_79><loc_81></location>We demonstrated that representing tables in HTML for the task of table structure recognition with Im2Seq models is ill-suited and has serious limitations. Furthermore, we presented in this paper an Optimized Table Structure Language (OTSL) which, when compared to commonly used general purpose languages, has several key benefits.</paragraph>
|
<paragraph><location><page_12><loc_22><loc_74><loc_79><loc_81></location>We demonstrated that representing tables in HTML for the task of table structure recognition with Im2Seq models is ill-suited and has serious limitations. Furthermore, we presented in this paper an Optimized Table Structure Language (OTSL) which, when compared to commonly used general purpose languages, has several key benefits.</paragraph>
|
||||||
<paragraph><location><page_12><loc_22><loc_59><loc_79><loc_74></location>First and foremost, given the same network configuration, inference time for a table-structure prediction is about 2 times faster compared to the conventional HTML approach. This is primarily owed to the shorter sequence length of the OTSL representation. Additional performance benefits can be obtained with HPO (hyper parameter optimization). As we demonstrate in our experiments, models trained on OTSL can be significantly smaller, e.g. by reducing the number of encoder and decoder layers, while preserving comparatively good prediction quality. This can further improve inference performance, yielding 5-6 times faster inference speed in OTSL with prediction quality comparable to models trained on HTML (see Table 1).</paragraph>
|
<paragraph><location><page_12><loc_22><loc_59><loc_79><loc_74></location>First and foremost, given the same network configuration, inference time for a table-structure prediction is about 2 times faster compared to the conventional HTML approach. This is primarily owed to the shorter sequence length of the OTSL representation. Additional performance benefits can be obtained with HPO (hyper parameter optimization). As we demonstrate in our experiments, models trained on OTSL can be significantly smaller, e.g. by reducing the number of encoder and decoder layers, while preserving comparatively good prediction quality. This can further improve inference performance, yielding 5-6 times faster inference speed in OTSL with prediction quality comparable to models trained on HTML (see Table 1).</paragraph>
|
||||||
|
138
tests/data/groundtruth/docling_v1/2305.03393v1.json
vendored
138
tests/data/groundtruth/docling_v1/2305.03393v1.json
vendored
@ -340,6 +340,29 @@
|
|||||||
"type": "figure",
|
"type": "figure",
|
||||||
"$ref": "#/figures/0"
|
"$ref": "#/figures/0"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"bbox": [
|
||||||
|
134.765,
|
||||||
|
591.77942,
|
||||||
|
480.59189,
|
||||||
|
665.66583
|
||||||
|
],
|
||||||
|
"page": 2,
|
||||||
|
"span": [
|
||||||
|
0,
|
||||||
|
574
|
||||||
|
],
|
||||||
|
"__ref_s3_data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"text": "Fig. 1. Comparison between HTML and OTSL table structure representation: (A) table-example with complex row and column headers, including a 2D empty span, (B) minimal graphical representation of table structure using rectangular layout, (C) HTML representation, (D) OTSL representation. This example demonstrates many of the key-features of OTSL, namely its reduced vocabulary size (12 versus 5 in this case), its reduced sequence length (55 versus 30) and a enhanced internal structure (variable token sequence length per row in HTML versus a fixed length of rows in OTSL).",
|
||||||
|
"type": "caption",
|
||||||
|
"payload": null,
|
||||||
|
"name": "Caption",
|
||||||
|
"font": null
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"prov": [
|
"prov": [
|
||||||
{
|
{
|
||||||
@ -644,6 +667,29 @@
|
|||||||
"type": "figure",
|
"type": "figure",
|
||||||
"$ref": "#/figures/1"
|
"$ref": "#/figures/1"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"bbox": [
|
||||||
|
145.60701,
|
||||||
|
562.78821,
|
||||||
|
469.75223000000005,
|
||||||
|
570.92072
|
||||||
|
],
|
||||||
|
"page": 5,
|
||||||
|
"span": [
|
||||||
|
0,
|
||||||
|
73
|
||||||
|
],
|
||||||
|
"__ref_s3_data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"text": "Fig. 2. Frequency of tokens in HTML and OTSL as they appear in PubTabNet.",
|
||||||
|
"type": "caption",
|
||||||
|
"payload": null,
|
||||||
|
"name": "Caption",
|
||||||
|
"font": null
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"prov": [
|
"prov": [
|
||||||
{
|
{
|
||||||
@ -1017,6 +1063,29 @@
|
|||||||
"type": "figure",
|
"type": "figure",
|
||||||
"$ref": "#/figures/2"
|
"$ref": "#/figures/2"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"bbox": [
|
||||||
|
134.765,
|
||||||
|
636.15033,
|
||||||
|
480.5874,
|
||||||
|
666.2008100000002
|
||||||
|
],
|
||||||
|
"page": 7,
|
||||||
|
"span": [
|
||||||
|
0,
|
||||||
|
207
|
||||||
|
],
|
||||||
|
"__ref_s3_data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"text": "Fig. 3. OTSL description of table structure: A - table example; B - graphical representation of table structure; C - mapping structure on a grid; D - OTSL structure encoding; E - explanation on cell encoding",
|
||||||
|
"type": "caption",
|
||||||
|
"payload": null,
|
||||||
|
"name": "Caption",
|
||||||
|
"font": null
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"prov": [
|
"prov": [
|
||||||
{
|
{
|
||||||
@ -1390,6 +1459,29 @@
|
|||||||
"type": "figure",
|
"type": "figure",
|
||||||
"$ref": "#/figures/3"
|
"$ref": "#/figures/3"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"bbox": [
|
||||||
|
134.76501,
|
||||||
|
288.26035,
|
||||||
|
480.59082,
|
||||||
|
307.35187
|
||||||
|
],
|
||||||
|
"page": 8,
|
||||||
|
"span": [
|
||||||
|
0,
|
||||||
|
104
|
||||||
|
],
|
||||||
|
"__ref_s3_data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"text": "Fig. 4. Architecture sketch of the TableFormer model, which is a representative for the Im2Seq approach.",
|
||||||
|
"type": "caption",
|
||||||
|
"payload": null,
|
||||||
|
"name": "Caption",
|
||||||
|
"font": null
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"prov": [
|
"prov": [
|
||||||
{
|
{
|
||||||
@ -1658,6 +1750,29 @@
|
|||||||
"type": "figure",
|
"type": "figure",
|
||||||
"$ref": "#/figures/4"
|
"$ref": "#/figures/4"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"bbox": [
|
||||||
|
134.765,
|
||||||
|
352.28284,
|
||||||
|
480.59106,
|
||||||
|
394.40988
|
||||||
|
],
|
||||||
|
"page": 10,
|
||||||
|
"span": [
|
||||||
|
0,
|
||||||
|
270
|
||||||
|
],
|
||||||
|
"__ref_s3_data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"text": "Fig. 5. The OTSL model produces more accurate bounding boxes with less overlap (E) than the HTML model (D), when predicting the structure of a sparse table (A), at twice the inference speed because of shorter sequence length (B),(C). \"PMC2807444_006_00.png\" PubTabNet. \u03bc",
|
||||||
|
"type": "caption",
|
||||||
|
"payload": null,
|
||||||
|
"name": "Caption",
|
||||||
|
"font": null
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"prov": [
|
"prov": [
|
||||||
{
|
{
|
||||||
@ -1709,6 +1824,29 @@
|
|||||||
"type": "figure",
|
"type": "figure",
|
||||||
"$ref": "#/figures/5"
|
"$ref": "#/figures/5"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"bbox": [
|
||||||
|
134.765,
|
||||||
|
614.23236,
|
||||||
|
480.58838000000003,
|
||||||
|
666.2008100000002
|
||||||
|
],
|
||||||
|
"page": 11,
|
||||||
|
"span": [
|
||||||
|
0,
|
||||||
|
390
|
||||||
|
],
|
||||||
|
"__ref_s3_data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"text": "Fig. 6. Visualization of predicted structure and detected bounding boxes on a complex table with many rows. The OTSL model (B) captured repeating pattern of horizontally merged cells from the GT (A), unlike the HTML model (C). The HTML model also didn't complete the HTML sequence correctly and displayed a lot more of drift and overlap of bounding boxes. \"PMC5406406_003_01.png\" PubTabNet.",
|
||||||
|
"type": "caption",
|
||||||
|
"payload": null,
|
||||||
|
"name": "Caption",
|
||||||
|
"font": null
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"prov": [
|
"prov": [
|
||||||
{
|
{
|
||||||
|
@ -10,6 +10,7 @@
|
|||||||
<location><page_1><loc_12><loc_10><loc_52><loc_31></location>
|
<location><page_1><loc_12><loc_10><loc_52><loc_31></location>
|
||||||
<caption>Figure 7-26. Self-locking nuts.</caption>
|
<caption>Figure 7-26. Self-locking nuts.</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_1><loc_12><loc_8><loc_31><loc_9></location>Figure 7-26. Self-locking nuts.</caption>
|
||||||
<paragraph><location><page_1><loc_54><loc_85><loc_95><loc_94></location>the most common ranges in size for No. 6 up to 1 / 4 inch, the Rol-top ranges from 1 / 4 inch to 1 / 6 inch, and the bellows type ranges in size from No. 8 up to 3 / 8 inch. Wing-type nuts are made of anodized aluminum alloy, cadmium-plated carbon steel, or stainless steel. The Rol-top nut is cadmium-plated steel, and the bellows type is made of aluminum alloy only.</paragraph>
|
<paragraph><location><page_1><loc_54><loc_85><loc_95><loc_94></location>the most common ranges in size for No. 6 up to 1 / 4 inch, the Rol-top ranges from 1 / 4 inch to 1 / 6 inch, and the bellows type ranges in size from No. 8 up to 3 / 8 inch. Wing-type nuts are made of anodized aluminum alloy, cadmium-plated carbon steel, or stainless steel. The Rol-top nut is cadmium-plated steel, and the bellows type is made of aluminum alloy only.</paragraph>
|
||||||
<paragraph><location><page_1><loc_54><loc_83><loc_55><loc_85></location>.</paragraph>
|
<paragraph><location><page_1><loc_54><loc_83><loc_55><loc_85></location>.</paragraph>
|
||||||
<subtitle-level-1><location><page_1><loc_54><loc_82><loc_76><loc_83></location>Stainless Steel Self-Locking Nut</subtitle-level-1>
|
<subtitle-level-1><location><page_1><loc_54><loc_82><loc_76><loc_83></location>Stainless Steel Self-Locking Nut</subtitle-level-1>
|
||||||
@ -20,4 +21,5 @@
|
|||||||
<location><page_1><loc_54><loc_11><loc_94><loc_46></location>
|
<location><page_1><loc_54><loc_11><loc_94><loc_46></location>
|
||||||
<caption>Figure 7-27. Stainless steel self-locking nut.</caption>
|
<caption>Figure 7-27. Stainless steel self-locking nut.</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_1><loc_54><loc_8><loc_81><loc_10></location>Figure 7-27. Stainless steel self-locking nut.</caption>
|
||||||
</document>
|
</document>
|
@ -206,6 +206,29 @@
|
|||||||
"type": "figure",
|
"type": "figure",
|
||||||
"$ref": "#/figures/0"
|
"$ref": "#/figures/0"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"bbox": [
|
||||||
|
72.0,
|
||||||
|
60.99040200000002,
|
||||||
|
184.14828,
|
||||||
|
71.80239900000004
|
||||||
|
],
|
||||||
|
"page": 1,
|
||||||
|
"span": [
|
||||||
|
0,
|
||||||
|
31
|
||||||
|
],
|
||||||
|
"__ref_s3_data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"text": "Figure 7-26. Self-locking nuts.",
|
||||||
|
"type": "caption",
|
||||||
|
"payload": null,
|
||||||
|
"name": "Caption",
|
||||||
|
"font": null
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"prov": [
|
"prov": [
|
||||||
{
|
{
|
||||||
@ -348,6 +371,29 @@
|
|||||||
"name": "Picture",
|
"name": "Picture",
|
||||||
"type": "figure",
|
"type": "figure",
|
||||||
"$ref": "#/figures/1"
|
"$ref": "#/figures/1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"bbox": [
|
||||||
|
321.0,
|
||||||
|
63.010403,
|
||||||
|
481.64931999999993,
|
||||||
|
73.82240300000001
|
||||||
|
],
|
||||||
|
"page": 1,
|
||||||
|
"span": [
|
||||||
|
0,
|
||||||
|
46
|
||||||
|
],
|
||||||
|
"__ref_s3_data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"text": "Figure 7-27. Stainless steel self-locking nut.",
|
||||||
|
"type": "caption",
|
||||||
|
"payload": null,
|
||||||
|
"name": "Caption",
|
||||||
|
"font": null
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"figures": [
|
"figures": [
|
||||||
|
55
tests/data/groundtruth/docling_v1/multi_page.doctags.txt
vendored
Normal file
55
tests/data/groundtruth/docling_v1/multi_page.doctags.txt
vendored
Normal file
@ -0,0 +1,55 @@
|
|||||||
|
<document>
|
||||||
|
<subtitle-level-1><location><page_1><loc_12><loc_90><loc_44><loc_91></location>The Evolution of the Word Processor</subtitle-level-1>
|
||||||
|
<paragraph><location><page_1><loc_12><loc_85><loc_84><loc_88></location>The concept of the word processor predates modern computers and has evolved through several technological milestones.</paragraph>
|
||||||
|
<subtitle-level-1><location><page_1><loc_12><loc_81><loc_55><loc_83></location>Pre-Digital Era (19th - Early 20th Century)</subtitle-level-1>
|
||||||
|
<paragraph><location><page_1><loc_12><loc_73><loc_85><loc_80></location>The origins of word processing can be traced back to the invention of the typewriter in the mid-19th century. Patented in 1868 by Christopher Latham Sholes, the typewriter revolutionized written communication by enabling people to produce legible, professional documents more efficiently than handwriting.</paragraph>
|
||||||
|
<paragraph><location><page_1><loc_12><loc_65><loc_85><loc_71></location>During this period, the term "word processing" didn't exist, but the typewriter laid the groundwork for future developments. Over time, advancements such as carbon paper (for copies) and the electric typewriter (introduced by IBM in 1935) improved the speed and convenience of document creation.</paragraph>
|
||||||
|
<subtitle-level-1><location><page_1><loc_12><loc_58><loc_57><loc_60></location>The Birth of Word Processing (1960s - 1970s)</subtitle-level-1>
|
||||||
|
<paragraph><location><page_1><loc_12><loc_52><loc_88><loc_56></location>The term "word processor" first emerged in the 1960s and referred to any system designed to streamline written communication and document production. Early word processors were not software programs but rather standalone machines.</paragraph>
|
||||||
|
<paragraph><location><page_1><loc_15><loc_43><loc_87><loc_50></location>- · IBM MT/ST (Magnetic Tape/Selectric Typewriter) : Introduced in 1964, this machine combined IBM's Selectric typewriter with magnetic tape storage. It allowed users to record, edit, and replay typed content-an early example of digital text storage.</paragraph>
|
||||||
|
<paragraph><location><page_1><loc_15><loc_38><loc_84><loc_43></location>- · Wang Laboratories : In the 1970s, Wang introduced dedicated word processing machines. These devices, like the Wang 1200, featured small screens and floppy disks, making them revolutionary for their time.</paragraph>
|
||||||
|
<paragraph><location><page_1><loc_12><loc_33><loc_86><loc_37></location>These machines were primarily used in offices, where secretarial pools benefited from their ability to make revisions without retyping entire documents.</paragraph>
|
||||||
|
<subtitle-level-1><location><page_1><loc_12><loc_27><loc_52><loc_28></location>The Rise of Personal Computers (1980s)</subtitle-level-1>
|
||||||
|
<paragraph><location><page_1><loc_12><loc_22><loc_87><loc_25></location>The advent of personal computers in the late 1970s and early 1980s transformed word processing from a niche tool to an essential technology for businesses and individuals alike.</paragraph>
|
||||||
|
<paragraph><location><page_1><loc_15><loc_15><loc_88><loc_20></location>- · WordStar (1978) : Developed for the CP/M operating system, WordStar was one of the first widely used word processing programs. It featured early examples of modern features like cut, copy, and paste.</paragraph>
|
||||||
|
<paragraph><location><page_1><loc_15><loc_10><loc_88><loc_15></location>- · Microsoft Word (1983) : Microsoft launched Word for MS-DOS in 1983, introducing a graphical user interface (GUI) and mouse support. Over the years, Microsoft Word became the industry standard for word processing.</paragraph>
|
||||||
|
<paragraph><location><page_2><loc_12><loc_87><loc_87><loc_91></location>Other notable software from this era included WordPerfect, which was popular among legal professionals, and Apple's MacWrite, which leveraged the Macintosh's graphical capabilities.</paragraph>
|
||||||
|
<subtitle-level-1><location><page_2><loc_12><loc_80><loc_46><loc_81></location>The Modern Era (1990s - Present)</subtitle-level-1>
|
||||||
|
<paragraph><location><page_2><loc_12><loc_75><loc_86><loc_78></location>By the 1990s, word processing software had become more sophisticated, with features like spell check, grammar check, templates, and collaborative tools.</paragraph>
|
||||||
|
<paragraph><location><page_2><loc_15><loc_70><loc_83><loc_73></location>- · Microsoft Office Suite : Microsoft continued to dominate with its Office Suite, integrating Word with other productivity tools like Excel and PowerPoint.</paragraph>
|
||||||
|
<paragraph><location><page_2><loc_15><loc_67><loc_87><loc_70></location>- · OpenOffice and LibreOffice : Open-source alternatives emerged in the early 2000s, offering free and flexible word processing options.</paragraph>
|
||||||
|
<paragraph><location><page_2><loc_15><loc_62><loc_88><loc_67></location>- · Google Docs (2006) : The introduction of cloud-based word processing revolutionized collaboration. Google Docs enabled real-time editing and sharing, making it a staple for teams and remote work.</paragraph>
|
||||||
|
<subtitle-level-1><location><page_2><loc_12><loc_55><loc_39><loc_57></location>Future of Word Processing</subtitle-level-1>
|
||||||
|
<paragraph><location><page_2><loc_12><loc_45><loc_87><loc_53></location>Today, word processors are more than just tools for typing. They integrate artificial intelligence for grammar and style suggestions (e.g., Grammarly), voice-to-text features, and advanced layout options. As AI continues to advance, word processors may evolve into even more intuitive tools that predict user needs, automate repetitive tasks, and support richer multimedia integration.</paragraph>
|
||||||
|
<paragraph><location><page_2><loc_12><loc_35><loc_87><loc_40></location>From the clunky typewriters of the 19th century to the AI-powered cloud tools of today, the word processor has come a long way. It remains an essential tool for communication and creativity, shaping how we write and share ideas.</paragraph>
|
||||||
|
<subtitle-level-1><location><page_3><loc_12><loc_90><loc_46><loc_91></location>Specialized Word Processing Tools</subtitle-level-1>
|
||||||
|
<paragraph><location><page_3><loc_12><loc_83><loc_86><loc_88></location>In addition to general-purpose word processors, specialized tools have emerged to cater to specific industries and needs. These tools incorporate unique features tailored to their users' workflows:</paragraph>
|
||||||
|
<paragraph><location><page_3><loc_15><loc_73><loc_87><loc_81></location>- · Academic and Technical Writing : Tools like LaTeX gained popularity among academics, scientists, and engineers. Unlike traditional word processors, LaTeX focuses on precise formatting, particularly for complex mathematical equations, scientific papers, and technical documents. It relies on a markup language to produce polished documents suitable for publishing.</paragraph>
|
||||||
|
<paragraph><location><page_3><loc_15><loc_67><loc_85><loc_73></location>- · Screenwriting Software : For screenwriters, tools like Final Draft and Celtx are specialized to handle scripts for film and television. These programs automate the formatting of dialogue, scene descriptions, and other elements unique to screenwriting.</paragraph>
|
||||||
|
<paragraph><location><page_3><loc_15><loc_60><loc_88><loc_67></location>- · Legal Document Processors : Word processors tailored for legal professionals, like WordPerfect, offered features such as redlining (early version tracking) and document comparison. Even today, many law firms rely on these tools due to their robust formatting options for contracts and legal briefs.</paragraph>
|
||||||
|
<subtitle-level-1><location><page_3><loc_12><loc_53><loc_57><loc_55></location>Key Features That Changed Word Processing</subtitle-level-1>
|
||||||
|
<paragraph><location><page_3><loc_12><loc_47><loc_86><loc_52></location>The evolution of word processors wasn't just about hardware or software improvements-it was about the features that revolutionized how people wrote and edited. Some of these transformative features include:</paragraph>
|
||||||
|
<paragraph><location><page_3><loc_15><loc_42><loc_86><loc_45></location>- 1. Undo/Redo : Introduced in the 1980s, the ability to undo mistakes and redo actions made experimentation and error correction much easier.</paragraph>
|
||||||
|
<paragraph><location><page_3><loc_15><loc_38><loc_87><loc_42></location>- 2. Spell Check and Grammar Check : By the 1990s, these became standard, allowing users to spot errors automatically.</paragraph>
|
||||||
|
<paragraph><location><page_3><loc_15><loc_35><loc_82><loc_38></location>- 3. Templates : Pre-designed formats for documents, such as resumes, letters, and invoices, helped users save time.</paragraph>
|
||||||
|
<paragraph><location><page_3><loc_15><loc_32><loc_84><loc_35></location>- 4. Track Changes : A game-changer for collaboration, this feature allowed multiple users to suggest edits while maintaining the original text.</paragraph>
|
||||||
|
<paragraph><location><page_3><loc_15><loc_27><loc_88><loc_32></location>- 5. Real-Time Collaboration : Tools like Google Docs and Microsoft 365 enabled multiple users to edit the same document simultaneously, forever changing teamwork dynamics.</paragraph>
|
||||||
|
<subtitle-level-1><location><page_3><loc_12><loc_20><loc_52><loc_22></location>The Cultural Impact of Word Processors</subtitle-level-1>
|
||||||
|
<paragraph><location><page_3><loc_12><loc_14><loc_87><loc_18></location>The word processor didn't just change workplaces-it changed culture. It democratized writing, enabling anyone with access to a computer to produce professional-quality documents. This shift had profound implications for education, business, and creative fields:</paragraph>
|
||||||
|
<paragraph><location><page_4><loc_15><loc_87><loc_86><loc_91></location>- · Accessibility : Writers no longer needed expensive publishing equipment or training in typesetting to create polished work. This accessibility paved the way for selfpublishing, blogging, and even fan fiction communities.</paragraph>
|
||||||
|
<paragraph><location><page_4><loc_15><loc_82><loc_88><loc_87></location>- · Education : Word processors became a cornerstone of education, teaching students not only how to write essays but also how to use technology effectively. Features like bibliography generators and integrated research tools enhanced learning.</paragraph>
|
||||||
|
<paragraph><location><page_4><loc_15><loc_77><loc_87><loc_82></location>- · Creative Writing : Writers gained powerful tools to organize their ideas. Programs like Scrivener allowed authors to manage large projects, from novels to screenplays, with features like chapter outlines and character notes.</paragraph>
|
||||||
|
<subtitle-level-1><location><page_4><loc_12><loc_70><loc_50><loc_72></location>Word Processors in a Post-Digital Era</subtitle-level-1>
|
||||||
|
<paragraph><location><page_4><loc_12><loc_67><loc_88><loc_68></location>As we move further into the 21st century, the role of the word processor continues to evolve:</paragraph>
|
||||||
|
<paragraph><location><page_4><loc_15><loc_58><loc_88><loc_65></location>- 1. Artificial Intelligence : Modern word processors are leveraging AI to suggest content improvements. Tools like Grammarly, ProWritingAid, and even native features in Word now analyze tone, conciseness, and clarity. Some AI systems can even generate entire paragraphs or rewrite sentences.</paragraph>
|
||||||
|
<paragraph><location><page_4><loc_15><loc_52><loc_86><loc_58></location>- 2. Integration with Other Tools : Word processors are no longer standalone. They integrate with task managers, cloud storage, and project management platforms. For instance, Google Docs syncs with Google Drive, while Microsoft Word integrates seamlessly with OneDrive and Teams.</paragraph>
|
||||||
|
<paragraph><location><page_4><loc_15><loc_45><loc_84><loc_52></location>- 3. Voice Typing : Speech-to-text capabilities have made word processing more accessible, particularly for those with disabilities. Tools like Dragon NaturallySpeaking and built-in options in Google Docs and Microsoft Word have made dictation mainstream.</paragraph>
|
||||||
|
<paragraph><location><page_4><loc_15><loc_40><loc_87><loc_45></location>- 4. Multimedia Documents : Word processing has expanded beyond text. Modern tools allow users to embed images, videos, charts, and interactive elements, transforming simple documents into rich multimedia experiences.</paragraph>
|
||||||
|
<paragraph><location><page_4><loc_15><loc_35><loc_86><loc_40></location>- 5. Cross-Platform Accessibility : Thanks to cloud computing, documents can now be accessed and edited across devices. Whether you're on a desktop, tablet, or smartphone, you can continue working seamlessly.</paragraph>
|
||||||
|
<subtitle-level-1><location><page_4><loc_12><loc_29><loc_38><loc_30></location>A Glimpse Into the Future</subtitle-level-1>
|
||||||
|
<paragraph><location><page_4><loc_12><loc_24><loc_87><loc_27></location>The word processor's future lies in adaptability and intelligence. Some exciting possibilities include:</paragraph>
|
||||||
|
<paragraph><location><page_4><loc_15><loc_19><loc_87><loc_22></location>- · Fully AI-Assisted Writing : Imagine a word processor that understands your writing style, drafts emails, or creates entire essays based on minimal input.</paragraph>
|
||||||
|
<paragraph><location><page_4><loc_15><loc_14><loc_88><loc_19></location>- · Immersive Interfaces : As augmented reality (AR) and virtual reality (VR) technology advance, users may be able to write and edit in 3D spaces, collaborating in virtual environments.</paragraph>
|
||||||
|
<paragraph><location><page_4><loc_15><loc_11><loc_87><loc_14></location>- · Hyper-Personalization : Word processors could offer dynamic suggestions based on industry-specific needs, user habits, or even regional language variations.</paragraph>
|
||||||
|
<paragraph><location><page_5><loc_12><loc_80><loc_86><loc_88></location>The journey of the word processor-from clunky typewriters to AI-powered platformsreflects humanity's broader technological progress. What began as a tool to simply replace handwriting has transformed into a powerful ally for creativity, communication, and collaboration. As technology continues to advance, the word processor will undoubtedly remain at the heart of how we express ideas and connect with one another.</paragraph>
|
||||||
|
</document>
|
1319
tests/data/groundtruth/docling_v1/multi_page.json
vendored
Normal file
1319
tests/data/groundtruth/docling_v1/multi_page.json
vendored
Normal file
File diff suppressed because it is too large
Load Diff
105
tests/data/groundtruth/docling_v1/multi_page.md
vendored
Normal file
105
tests/data/groundtruth/docling_v1/multi_page.md
vendored
Normal file
@ -0,0 +1,105 @@
|
|||||||
|
## The Evolution of the Word Processor
|
||||||
|
|
||||||
|
The concept of the word processor predates modern computers and has evolved through several technological milestones.
|
||||||
|
|
||||||
|
## Pre-Digital Era (19th - Early 20th Century)
|
||||||
|
|
||||||
|
The origins of word processing can be traced back to the invention of the typewriter in the mid-19th century. Patented in 1868 by Christopher Latham Sholes, the typewriter revolutionized written communication by enabling people to produce legible, professional documents more efficiently than handwriting.
|
||||||
|
|
||||||
|
During this period, the term "word processing" didn't exist, but the typewriter laid the groundwork for future developments. Over time, advancements such as carbon paper (for copies) and the electric typewriter (introduced by IBM in 1935) improved the speed and convenience of document creation.
|
||||||
|
|
||||||
|
## The Birth of Word Processing (1960s - 1970s)
|
||||||
|
|
||||||
|
The term "word processor" first emerged in the 1960s and referred to any system designed to streamline written communication and document production. Early word processors were not software programs but rather standalone machines.
|
||||||
|
|
||||||
|
- · IBM MT/ST (Magnetic Tape/Selectric Typewriter) : Introduced in 1964, this machine combined IBM's Selectric typewriter with magnetic tape storage. It allowed users to record, edit, and replay typed content-an early example of digital text storage.
|
||||||
|
|
||||||
|
- · Wang Laboratories : In the 1970s, Wang introduced dedicated word processing machines. These devices, like the Wang 1200, featured small screens and floppy disks, making them revolutionary for their time.
|
||||||
|
|
||||||
|
These machines were primarily used in offices, where secretarial pools benefited from their ability to make revisions without retyping entire documents.
|
||||||
|
|
||||||
|
## The Rise of Personal Computers (1980s)
|
||||||
|
|
||||||
|
The advent of personal computers in the late 1970s and early 1980s transformed word processing from a niche tool to an essential technology for businesses and individuals alike.
|
||||||
|
|
||||||
|
- · WordStar (1978) : Developed for the CP/M operating system, WordStar was one of the first widely used word processing programs. It featured early examples of modern features like cut, copy, and paste.
|
||||||
|
|
||||||
|
- · Microsoft Word (1983) : Microsoft launched Word for MS-DOS in 1983, introducing a graphical user interface (GUI) and mouse support. Over the years, Microsoft Word became the industry standard for word processing.
|
||||||
|
|
||||||
|
Other notable software from this era included WordPerfect, which was popular among legal professionals, and Apple's MacWrite, which leveraged the Macintosh's graphical capabilities.
|
||||||
|
|
||||||
|
## The Modern Era (1990s - Present)
|
||||||
|
|
||||||
|
By the 1990s, word processing software had become more sophisticated, with features like spell check, grammar check, templates, and collaborative tools.
|
||||||
|
|
||||||
|
- · Microsoft Office Suite : Microsoft continued to dominate with its Office Suite, integrating Word with other productivity tools like Excel and PowerPoint.
|
||||||
|
|
||||||
|
- · OpenOffice and LibreOffice : Open-source alternatives emerged in the early 2000s, offering free and flexible word processing options.
|
||||||
|
|
||||||
|
- · Google Docs (2006) : The introduction of cloud-based word processing revolutionized collaboration. Google Docs enabled real-time editing and sharing, making it a staple for teams and remote work.
|
||||||
|
|
||||||
|
## Future of Word Processing
|
||||||
|
|
||||||
|
Today, word processors are more than just tools for typing. They integrate artificial intelligence for grammar and style suggestions (e.g., Grammarly), voice-to-text features, and advanced layout options. As AI continues to advance, word processors may evolve into even more intuitive tools that predict user needs, automate repetitive tasks, and support richer multimedia integration.
|
||||||
|
|
||||||
|
From the clunky typewriters of the 19th century to the AI-powered cloud tools of today, the word processor has come a long way. It remains an essential tool for communication and creativity, shaping how we write and share ideas.
|
||||||
|
|
||||||
|
## Specialized Word Processing Tools
|
||||||
|
|
||||||
|
In addition to general-purpose word processors, specialized tools have emerged to cater to specific industries and needs. These tools incorporate unique features tailored to their users' workflows:
|
||||||
|
|
||||||
|
- · Academic and Technical Writing : Tools like LaTeX gained popularity among academics, scientists, and engineers. Unlike traditional word processors, LaTeX focuses on precise formatting, particularly for complex mathematical equations, scientific papers, and technical documents. It relies on a markup language to produce polished documents suitable for publishing.
|
||||||
|
|
||||||
|
- · Screenwriting Software : For screenwriters, tools like Final Draft and Celtx are specialized to handle scripts for film and television. These programs automate the formatting of dialogue, scene descriptions, and other elements unique to screenwriting.
|
||||||
|
|
||||||
|
- · Legal Document Processors : Word processors tailored for legal professionals, like WordPerfect, offered features such as redlining (early version tracking) and document comparison. Even today, many law firms rely on these tools due to their robust formatting options for contracts and legal briefs.
|
||||||
|
|
||||||
|
## Key Features That Changed Word Processing
|
||||||
|
|
||||||
|
The evolution of word processors wasn't just about hardware or software improvements-it was about the features that revolutionized how people wrote and edited. Some of these transformative features include:
|
||||||
|
|
||||||
|
- 1. Undo/Redo : Introduced in the 1980s, the ability to undo mistakes and redo actions made experimentation and error correction much easier.
|
||||||
|
|
||||||
|
- 2. Spell Check and Grammar Check : By the 1990s, these became standard, allowing users to spot errors automatically.
|
||||||
|
|
||||||
|
- 3. Templates : Pre-designed formats for documents, such as resumes, letters, and invoices, helped users save time.
|
||||||
|
|
||||||
|
- 4. Track Changes : A game-changer for collaboration, this feature allowed multiple users to suggest edits while maintaining the original text.
|
||||||
|
|
||||||
|
- 5. Real-Time Collaboration : Tools like Google Docs and Microsoft 365 enabled multiple users to edit the same document simultaneously, forever changing teamwork dynamics.
|
||||||
|
|
||||||
|
## The Cultural Impact of Word Processors
|
||||||
|
|
||||||
|
The word processor didn't just change workplaces-it changed culture. It democratized writing, enabling anyone with access to a computer to produce professional-quality documents. This shift had profound implications for education, business, and creative fields:
|
||||||
|
|
||||||
|
- · Accessibility : Writers no longer needed expensive publishing equipment or training in typesetting to create polished work. This accessibility paved the way for selfpublishing, blogging, and even fan fiction communities.
|
||||||
|
|
||||||
|
- · Education : Word processors became a cornerstone of education, teaching students not only how to write essays but also how to use technology effectively. Features like bibliography generators and integrated research tools enhanced learning.
|
||||||
|
|
||||||
|
- · Creative Writing : Writers gained powerful tools to organize their ideas. Programs like Scrivener allowed authors to manage large projects, from novels to screenplays, with features like chapter outlines and character notes.
|
||||||
|
|
||||||
|
## Word Processors in a Post-Digital Era
|
||||||
|
|
||||||
|
As we move further into the 21st century, the role of the word processor continues to evolve:
|
||||||
|
|
||||||
|
- 1. Artificial Intelligence : Modern word processors are leveraging AI to suggest content improvements. Tools like Grammarly, ProWritingAid, and even native features in Word now analyze tone, conciseness, and clarity. Some AI systems can even generate entire paragraphs or rewrite sentences.
|
||||||
|
|
||||||
|
- 2. Integration with Other Tools : Word processors are no longer standalone. They integrate with task managers, cloud storage, and project management platforms. For instance, Google Docs syncs with Google Drive, while Microsoft Word integrates seamlessly with OneDrive and Teams.
|
||||||
|
|
||||||
|
- 3. Voice Typing : Speech-to-text capabilities have made word processing more accessible, particularly for those with disabilities. Tools like Dragon NaturallySpeaking and built-in options in Google Docs and Microsoft Word have made dictation mainstream.
|
||||||
|
|
||||||
|
- 4. Multimedia Documents : Word processing has expanded beyond text. Modern tools allow users to embed images, videos, charts, and interactive elements, transforming simple documents into rich multimedia experiences.
|
||||||
|
|
||||||
|
- 5. Cross-Platform Accessibility : Thanks to cloud computing, documents can now be accessed and edited across devices. Whether you're on a desktop, tablet, or smartphone, you can continue working seamlessly.
|
||||||
|
|
||||||
|
## A Glimpse Into the Future
|
||||||
|
|
||||||
|
The word processor's future lies in adaptability and intelligence. Some exciting possibilities include:
|
||||||
|
|
||||||
|
- · Fully AI-Assisted Writing : Imagine a word processor that understands your writing style, drafts emails, or creates entire essays based on minimal input.
|
||||||
|
|
||||||
|
- · Immersive Interfaces : As augmented reality (AR) and virtual reality (VR) technology advance, users may be able to write and edit in 3D spaces, collaborating in virtual environments.
|
||||||
|
|
||||||
|
- · Hyper-Personalization : Word processors could offer dynamic suggestions based on industry-specific needs, user habits, or even regional language variations.
|
||||||
|
|
||||||
|
The journey of the word processor-from clunky typewriters to AI-powered platformsreflects humanity's broader technological progress. What began as a tool to simply replace handwriting has transformed into a powerful ally for creativity, communication, and collaboration. As technology continues to advance, the word processor will undoubtedly remain at the heart of how we express ideas and connect with one another.
|
21968
tests/data/groundtruth/docling_v1/multi_page.pages.json
vendored
Normal file
21968
tests/data/groundtruth/docling_v1/multi_page.pages.json
vendored
Normal file
File diff suppressed because it is too large
Load Diff
@ -5,11 +5,13 @@
|
|||||||
<location><page_1><loc_22><loc_36><loc_78><loc_62></location>
|
<location><page_1><loc_22><loc_36><loc_78><loc_62></location>
|
||||||
<caption>Figure 1: This is an example image.</caption>
|
<caption>Figure 1: This is an example image.</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_1><loc_37><loc_32><loc_63><loc_33></location>Figure 1: This is an example image.</caption>
|
||||||
<paragraph><location><page_1><loc_22><loc_15><loc_78><loc_30></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua.</paragraph>
|
<paragraph><location><page_1><loc_22><loc_15><loc_78><loc_30></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua.</paragraph>
|
||||||
<paragraph><location><page_2><loc_22><loc_66><loc_78><loc_84></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</paragraph>
|
<paragraph><location><page_2><loc_22><loc_66><loc_78><loc_84></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</paragraph>
|
||||||
<figure>
|
<figure>
|
||||||
<location><page_2><loc_36><loc_36><loc_64><loc_65></location>
|
<location><page_2><loc_36><loc_36><loc_64><loc_65></location>
|
||||||
<caption>Figure 2: This is an example image.</caption>
|
<caption>Figure 2: This is an example image.</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_2><loc_37><loc_33><loc_63><loc_34></location>Figure 2: This is an example image.</caption>
|
||||||
<paragraph><location><page_2><loc_22><loc_15><loc_78><loc_31></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum.</paragraph>
|
<paragraph><location><page_2><loc_22><loc_15><loc_78><loc_31></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum.</paragraph>
|
||||||
</document>
|
</document>
|
@ -96,6 +96,29 @@
|
|||||||
"type": "figure",
|
"type": "figure",
|
||||||
"$ref": "#/figures/0"
|
"$ref": "#/figures/0"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"bbox": [
|
||||||
|
226.89101,
|
||||||
|
254.01826000000005,
|
||||||
|
384.3548,
|
||||||
|
262.86505
|
||||||
|
],
|
||||||
|
"page": 1,
|
||||||
|
"span": [
|
||||||
|
0,
|
||||||
|
35
|
||||||
|
],
|
||||||
|
"__ref_s3_data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"text": "Figure 1: This is an example image.",
|
||||||
|
"type": "caption",
|
||||||
|
"payload": null,
|
||||||
|
"name": "Caption",
|
||||||
|
"font": null
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"prov": [
|
"prov": [
|
||||||
{
|
{
|
||||||
@ -147,6 +170,29 @@
|
|||||||
"type": "figure",
|
"type": "figure",
|
||||||
"$ref": "#/figures/1"
|
"$ref": "#/figures/1"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"bbox": [
|
||||||
|
226.89101,
|
||||||
|
259.94226000000003,
|
||||||
|
384.3548,
|
||||||
|
268.78903
|
||||||
|
],
|
||||||
|
"page": 2,
|
||||||
|
"span": [
|
||||||
|
0,
|
||||||
|
35
|
||||||
|
],
|
||||||
|
"__ref_s3_data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"text": "Figure 2: This is an example image.",
|
||||||
|
"type": "caption",
|
||||||
|
"payload": null,
|
||||||
|
"name": "Caption",
|
||||||
|
"font": null
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"prov": [
|
"prov": [
|
||||||
{
|
{
|
||||||
|
@ -87,6 +87,7 @@
|
|||||||
<location><page_7><loc_22><loc_13><loc_89><loc_53></location>
|
<location><page_7><loc_22><loc_13><loc_89><loc_53></location>
|
||||||
<caption>Figure 1-2 Existing row and column controls</caption>
|
<caption>Figure 1-2 Existing row and column controls</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_7><loc_22><loc_12><loc_52><loc_13></location>Figure 1-2 Existing row and column controls</caption>
|
||||||
<subtitle-level-1><location><page_8><loc_11><loc_89><loc_55><loc_91></location>2.1.6 Change Function Usage CL command</subtitle-level-1>
|
<subtitle-level-1><location><page_8><loc_11><loc_89><loc_55><loc_91></location>2.1.6 Change Function Usage CL command</subtitle-level-1>
|
||||||
<paragraph><location><page_8><loc_22><loc_87><loc_89><loc_88></location>The following CL commands can be used to work with, display, or change function usage IDs:</paragraph>
|
<paragraph><location><page_8><loc_22><loc_87><loc_89><loc_88></location>The following CL commands can be used to work with, display, or change function usage IDs:</paragraph>
|
||||||
<paragraph><location><page_8><loc_22><loc_84><loc_49><loc_86></location>- GLYPH<SM590000> Work Function Usage ( WRKFCNUSG )</paragraph>
|
<paragraph><location><page_8><loc_22><loc_84><loc_49><loc_86></location>- GLYPH<SM590000> Work Function Usage ( WRKFCNUSG )</paragraph>
|
||||||
@ -150,6 +151,7 @@
|
|||||||
<location><page_10><loc_22><loc_48><loc_89><loc_86></location>
|
<location><page_10><loc_22><loc_48><loc_89><loc_86></location>
|
||||||
<caption>Figure 3-1 CREATE PERMISSION SQL statement</caption>
|
<caption>Figure 3-1 CREATE PERMISSION SQL statement</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_10><loc_22><loc_47><loc_56><loc_48></location>Figure 3-1 CREATE PERMISSION SQL statement</caption>
|
||||||
<subtitle-level-1><location><page_10><loc_22><loc_43><loc_35><loc_44></location>Column mask</subtitle-level-1>
|
<subtitle-level-1><location><page_10><loc_22><loc_43><loc_35><loc_44></location>Column mask</subtitle-level-1>
|
||||||
<paragraph><location><page_10><loc_22><loc_37><loc_89><loc_43></location>A column mask is a database object that manifests a column value access control rule for a specific column in a specific table. It uses a CASE expression that describes what you see when you access the column. For example, a teller can see only the last four digits of a tax identification number.</paragraph>
|
<paragraph><location><page_10><loc_22><loc_37><loc_89><loc_43></location>A column mask is a database object that manifests a column value access control rule for a specific column in a specific table. It uses a CASE expression that describes what you see when you access the column. For example, a teller can see only the last four digits of a tax identification number.</paragraph>
|
||||||
<caption><location><page_11><loc_22><loc_90><loc_67><loc_91></location>Table 3-1 summarizes these special registers and their values.</caption>
|
<caption><location><page_11><loc_22><loc_90><loc_67><loc_91></location>Table 3-1 summarizes these special registers and their values.</caption>
|
||||||
@ -172,6 +174,7 @@
|
|||||||
<location><page_11><loc_22><loc_25><loc_49><loc_51></location>
|
<location><page_11><loc_22><loc_25><loc_49><loc_51></location>
|
||||||
<caption>Figure 3-5 Special registers and adopted authority</caption>
|
<caption>Figure 3-5 Special registers and adopted authority</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_11><loc_22><loc_24><loc_56><loc_25></location>Figure 3-5 Special registers and adopted authority</caption>
|
||||||
<subtitle-level-1><location><page_11><loc_11><loc_20><loc_40><loc_21></location>3.2.2 Built-in global variables</subtitle-level-1>
|
<subtitle-level-1><location><page_11><loc_11><loc_20><loc_40><loc_21></location>3.2.2 Built-in global variables</subtitle-level-1>
|
||||||
<paragraph><location><page_11><loc_22><loc_15><loc_85><loc_18></location>Built-in global variables are provided with the database manager and are used in SQL statements to retrieve scalar values that are associated with the variables.</paragraph>
|
<paragraph><location><page_11><loc_22><loc_15><loc_85><loc_18></location>Built-in global variables are provided with the database manager and are used in SQL statements to retrieve scalar values that are associated with the variables.</paragraph>
|
||||||
<paragraph><location><page_11><loc_22><loc_9><loc_87><loc_13></location>IBM DB2 for i supports nine different built-in global variables that are read only and maintained by the system. These global variables can be used to identify attributes of the database connection and used as part of the RCAC logic.</paragraph>
|
<paragraph><location><page_11><loc_22><loc_9><loc_87><loc_13></location>IBM DB2 for i supports nine different built-in global variables that are read only and maintained by the system. These global variables can be used to identify attributes of the database connection and used as part of the RCAC logic.</paragraph>
|
||||||
@ -215,6 +218,7 @@
|
|||||||
<location><page_14><loc_10><loc_79><loc_89><loc_88></location>
|
<location><page_14><loc_10><loc_79><loc_89><loc_88></location>
|
||||||
<caption>Figure 3-10 Column masks shown in System i Navigator</caption>
|
<caption>Figure 3-10 Column masks shown in System i Navigator</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_14><loc_11><loc_77><loc_48><loc_78></location>Figure 3-10 Column masks shown in System i Navigator</caption>
|
||||||
<subtitle-level-1><location><page_14><loc_11><loc_73><loc_33><loc_74></location>3.6.6 Activating RCAC</subtitle-level-1>
|
<subtitle-level-1><location><page_14><loc_11><loc_73><loc_33><loc_74></location>3.6.6 Activating RCAC</subtitle-level-1>
|
||||||
<paragraph><location><page_14><loc_22><loc_67><loc_89><loc_71></location>Now that you have created the row permission and the two column masks, RCAC must be activated. The row permission and the two column masks are enabled (last clause in the scripts), but now you must activate RCAC on the table. To do so, complete the following steps:</paragraph>
|
<paragraph><location><page_14><loc_22><loc_67><loc_89><loc_71></location>Now that you have created the row permission and the two column masks, RCAC must be activated. The row permission and the two column masks are enabled (last clause in the scripts), but now you must activate RCAC on the table. To do so, complete the following steps:</paragraph>
|
||||||
<paragraph><location><page_14><loc_22><loc_65><loc_67><loc_66></location>- 1. Run the SQL statements that are shown in Example 3-10.</paragraph>
|
<paragraph><location><page_14><loc_22><loc_65><loc_67><loc_66></location>- 1. Run the SQL statements that are shown in Example 3-10.</paragraph>
|
||||||
@ -230,16 +234,19 @@
|
|||||||
<location><page_14><loc_10><loc_18><loc_87><loc_46></location>
|
<location><page_14><loc_10><loc_18><loc_87><loc_46></location>
|
||||||
<caption>Figure 3-11 Selecting the EMPLOYEES table from System i Navigator</caption>
|
<caption>Figure 3-11 Selecting the EMPLOYEES table from System i Navigator</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_14><loc_11><loc_17><loc_57><loc_18></location>Figure 3-11 Selecting the EMPLOYEES table from System i Navigator</caption>
|
||||||
<paragraph><location><page_15><loc_22><loc_87><loc_84><loc_91></location>- 2. Figure 4-68 shows the Visual Explain of the same SQL statement, but with RCAC enabled. It is clear that the implementation of the SQL statement is more complex because the row permission rule becomes part of the WHERE clause.</paragraph>
|
<paragraph><location><page_15><loc_22><loc_87><loc_84><loc_91></location>- 2. Figure 4-68 shows the Visual Explain of the same SQL statement, but with RCAC enabled. It is clear that the implementation of the SQL statement is more complex because the row permission rule becomes part of the WHERE clause.</paragraph>
|
||||||
<paragraph><location><page_15><loc_22><loc_32><loc_89><loc_36></location>- 3. Compare the advised indexes that are provided by the Optimizer without RCAC and with RCAC enabled. Figure 4-69 shows the index advice for the SQL statement without RCAC enabled. The index being advised is for the ORDER BY clause.</paragraph>
|
<paragraph><location><page_15><loc_22><loc_32><loc_89><loc_36></location>- 3. Compare the advised indexes that are provided by the Optimizer without RCAC and with RCAC enabled. Figure 4-69 shows the index advice for the SQL statement without RCAC enabled. The index being advised is for the ORDER BY clause.</paragraph>
|
||||||
<figure>
|
<figure>
|
||||||
<location><page_15><loc_22><loc_40><loc_89><loc_85></location>
|
<location><page_15><loc_22><loc_40><loc_89><loc_85></location>
|
||||||
<caption>Figure 4-68 Visual Explain with RCAC enabled</caption>
|
<caption>Figure 4-68 Visual Explain with RCAC enabled</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_15><loc_22><loc_38><loc_53><loc_39></location>Figure 4-68 Visual Explain with RCAC enabled</caption>
|
||||||
<figure>
|
<figure>
|
||||||
<location><page_15><loc_11><loc_16><loc_83><loc_30></location>
|
<location><page_15><loc_11><loc_16><loc_83><loc_30></location>
|
||||||
<caption>Figure 4-69 Index advice with no RCAC</caption>
|
<caption>Figure 4-69 Index advice with no RCAC</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_15><loc_11><loc_15><loc_37><loc_16></location>Figure 4-69 Index advice with no RCAC</caption>
|
||||||
<paragraph><location><page_16><loc_11><loc_11><loc_82><loc_91></location>THEN C . CUSTOMER_TAX_ID WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'TELLER' ) = 1 THEN ( 'XXX-XX-' CONCAT QSYS2 . SUBSTR ( C . CUSTOMER_TAX_ID , 8 , 4 ) ) WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'CUSTOMER' ) = 1 THEN C . CUSTOMER_TAX_ID ELSE 'XXX-XX-XXXX' END ENABLE ; CREATE MASK BANK_SCHEMA.MASK_DRIVERS_LICENSE_ON_CUSTOMERS ON BANK_SCHEMA.CUSTOMERS AS C FOR COLUMN CUSTOMER_DRIVERS_LICENSE_NUMBER RETURN CASE WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'ADMIN' ) = 1 THEN C . CUSTOMER_DRIVERS_LICENSE_NUMBER WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'TELLER' ) = 1 THEN C . CUSTOMER_DRIVERS_LICENSE_NUMBER WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'CUSTOMER' ) = 1 THEN C . CUSTOMER_DRIVERS_LICENSE_NUMBER ELSE '*************' END ENABLE ; CREATE MASK BANK_SCHEMA.MASK_LOGIN_ID_ON_CUSTOMERS ON BANK_SCHEMA.CUSTOMERS AS C FOR COLUMN CUSTOMER_LOGIN_ID RETURN CASE WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'ADMIN' ) = 1 THEN C . CUSTOMER_LOGIN_ID WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'CUSTOMER' ) = 1 THEN C . CUSTOMER_LOGIN_ID ELSE '*****' END ENABLE ; CREATE MASK BANK_SCHEMA.MASK_SECURITY_QUESTION_ON_CUSTOMERS ON BANK_SCHEMA.CUSTOMERS AS C FOR COLUMN CUSTOMER_SECURITY_QUESTION RETURN CASE WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'ADMIN' ) = 1 THEN C . CUSTOMER_SECURITY_QUESTION WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'CUSTOMER' ) = 1 THEN C . CUSTOMER_SECURITY_QUESTION ELSE '*****' END ENABLE ; CREATE MASK BANK_SCHEMA.MASK_SECURITY_QUESTION_ANSWER_ON_CUSTOMERS ON BANK_SCHEMA.CUSTOMERS AS C FOR COLUMN CUSTOMER_SECURITY_QUESTION_ANSWER RETURN CASE WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'ADMIN' ) = 1 THEN C . CUSTOMER_SECURITY_QUESTION_ANSWER WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'CUSTOMER' ) = 1 THEN C . 
CUSTOMER_SECURITY_QUESTION_ANSWER ELSE '*****' END ENABLE ; ALTER TABLE BANK_SCHEMA.CUSTOMERS ACTIVATE ROW ACCESS CONTROL ACTIVATE COLUMN ACCESS CONTROL ;</paragraph>
|
<paragraph><location><page_16><loc_11><loc_11><loc_82><loc_91></location>THEN C . CUSTOMER_TAX_ID WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'TELLER' ) = 1 THEN ( 'XXX-XX-' CONCAT QSYS2 . SUBSTR ( C . CUSTOMER_TAX_ID , 8 , 4 ) ) WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'CUSTOMER' ) = 1 THEN C . CUSTOMER_TAX_ID ELSE 'XXX-XX-XXXX' END ENABLE ; CREATE MASK BANK_SCHEMA.MASK_DRIVERS_LICENSE_ON_CUSTOMERS ON BANK_SCHEMA.CUSTOMERS AS C FOR COLUMN CUSTOMER_DRIVERS_LICENSE_NUMBER RETURN CASE WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'ADMIN' ) = 1 THEN C . CUSTOMER_DRIVERS_LICENSE_NUMBER WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'TELLER' ) = 1 THEN C . CUSTOMER_DRIVERS_LICENSE_NUMBER WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'CUSTOMER' ) = 1 THEN C . CUSTOMER_DRIVERS_LICENSE_NUMBER ELSE '*************' END ENABLE ; CREATE MASK BANK_SCHEMA.MASK_LOGIN_ID_ON_CUSTOMERS ON BANK_SCHEMA.CUSTOMERS AS C FOR COLUMN CUSTOMER_LOGIN_ID RETURN CASE WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'ADMIN' ) = 1 THEN C . CUSTOMER_LOGIN_ID WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'CUSTOMER' ) = 1 THEN C . CUSTOMER_LOGIN_ID ELSE '*****' END ENABLE ; CREATE MASK BANK_SCHEMA.MASK_SECURITY_QUESTION_ON_CUSTOMERS ON BANK_SCHEMA.CUSTOMERS AS C FOR COLUMN CUSTOMER_SECURITY_QUESTION RETURN CASE WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'ADMIN' ) = 1 THEN C . CUSTOMER_SECURITY_QUESTION WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'CUSTOMER' ) = 1 THEN C . CUSTOMER_SECURITY_QUESTION ELSE '*****' END ENABLE ; CREATE MASK BANK_SCHEMA.MASK_SECURITY_QUESTION_ANSWER_ON_CUSTOMERS ON BANK_SCHEMA.CUSTOMERS AS C FOR COLUMN CUSTOMER_SECURITY_QUESTION_ANSWER RETURN CASE WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'ADMIN' ) = 1 THEN C . CUSTOMER_SECURITY_QUESTION_ANSWER WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'CUSTOMER' ) = 1 THEN C . 
CUSTOMER_SECURITY_QUESTION_ANSWER ELSE '*****' END ENABLE ; ALTER TABLE BANK_SCHEMA.CUSTOMERS ACTIVATE ROW ACCESS CONTROL ACTIVATE COLUMN ACCESS CONTROL ;</paragraph>
|
||||||
<paragraph><location><page_18><loc_47><loc_94><loc_68><loc_96></location>Back cover</paragraph>
|
<paragraph><location><page_18><loc_47><loc_94><loc_68><loc_96></location>Back cover</paragraph>
|
||||||
<subtitle-level-1><location><page_18><loc_4><loc_82><loc_73><loc_91></location>Row and Column Access Control Support in IBM DB2 for i</subtitle-level-1>
|
<subtitle-level-1><location><page_18><loc_4><loc_82><loc_73><loc_91></location>Row and Column Access Control Support in IBM DB2 for i</subtitle-level-1>
|
||||||
|
@ -1601,6 +1601,29 @@
|
|||||||
"type": "figure",
|
"type": "figure",
|
||||||
"$ref": "#/figures/8"
|
"$ref": "#/figures/8"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"bbox": [
|
||||||
|
136.8,
|
||||||
|
91.85700199999997,
|
||||||
|
316.44727,
|
||||||
|
100.18200000000002
|
||||||
|
],
|
||||||
|
"page": 7,
|
||||||
|
"span": [
|
||||||
|
0,
|
||||||
|
43
|
||||||
|
],
|
||||||
|
"__ref_s3_data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"text": "Figure 1-2 Existing row and column controls",
|
||||||
|
"type": "caption",
|
||||||
|
"payload": null,
|
||||||
|
"name": "Caption",
|
||||||
|
"font": null
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"prov": [
|
"prov": [
|
||||||
{
|
{
|
||||||
@ -2375,6 +2398,29 @@
|
|||||||
"type": "figure",
|
"type": "figure",
|
||||||
"$ref": "#/figures/9"
|
"$ref": "#/figures/9"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"bbox": [
|
||||||
|
136.8,
|
||||||
|
369.53699,
|
||||||
|
341.97659,
|
||||||
|
377.862
|
||||||
|
],
|
||||||
|
"page": 10,
|
||||||
|
"span": [
|
||||||
|
0,
|
||||||
|
42
|
||||||
|
],
|
||||||
|
"__ref_s3_data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"text": "Figure 3-1 CREATE PERMISSION SQL statement",
|
||||||
|
"type": "caption",
|
||||||
|
"payload": null,
|
||||||
|
"name": "Caption",
|
||||||
|
"font": null
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"prov": [
|
"prov": [
|
||||||
{
|
{
|
||||||
@ -2615,6 +2661,29 @@
|
|||||||
"type": "figure",
|
"type": "figure",
|
||||||
"$ref": "#/figures/10"
|
"$ref": "#/figures/10"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"bbox": [
|
||||||
|
136.8,
|
||||||
|
186.95709,
|
||||||
|
341.25662,
|
||||||
|
195.2821
|
||||||
|
],
|
||||||
|
"page": 11,
|
||||||
|
"span": [
|
||||||
|
0,
|
||||||
|
50
|
||||||
|
],
|
||||||
|
"__ref_s3_data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"text": "Figure 3-5 Special registers and adopted authority",
|
||||||
|
"type": "caption",
|
||||||
|
"payload": null,
|
||||||
|
"name": "Caption",
|
||||||
|
"font": null
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"prov": [
|
"prov": [
|
||||||
{
|
{
|
||||||
@ -3200,6 +3269,29 @@
|
|||||||
"type": "figure",
|
"type": "figure",
|
||||||
"$ref": "#/figures/11"
|
"$ref": "#/figures/11"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"bbox": [
|
||||||
|
64.800003,
|
||||||
|
610.13702,
|
||||||
|
293.13809,
|
||||||
|
618.46198
|
||||||
|
],
|
||||||
|
"page": 14,
|
||||||
|
"span": [
|
||||||
|
0,
|
||||||
|
52
|
||||||
|
],
|
||||||
|
"__ref_s3_data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"text": "Figure 3-10 Column masks shown in System i Navigator",
|
||||||
|
"type": "caption",
|
||||||
|
"payload": null,
|
||||||
|
"name": "Caption",
|
||||||
|
"font": null
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"prov": [
|
"prov": [
|
||||||
{
|
{
|
||||||
@ -3458,6 +3550,29 @@
|
|||||||
"type": "figure",
|
"type": "figure",
|
||||||
"$ref": "#/figures/12"
|
"$ref": "#/figures/12"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"bbox": [
|
||||||
|
64.800003,
|
||||||
|
134.63710000000003,
|
||||||
|
347.43054,
|
||||||
|
142.96210999999994
|
||||||
|
],
|
||||||
|
"page": 14,
|
||||||
|
"span": [
|
||||||
|
0,
|
||||||
|
65
|
||||||
|
],
|
||||||
|
"__ref_s3_data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"text": "Figure 3-11 Selecting the EMPLOYEES table from System i Navigator",
|
||||||
|
"type": "caption",
|
||||||
|
"payload": null,
|
||||||
|
"name": "Caption",
|
||||||
|
"font": null
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"prov": [
|
"prov": [
|
||||||
{
|
{
|
||||||
@ -3509,11 +3624,57 @@
|
|||||||
"type": "figure",
|
"type": "figure",
|
||||||
"$ref": "#/figures/13"
|
"$ref": "#/figures/13"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"bbox": [
|
||||||
|
136.8,
|
||||||
|
303.117,
|
||||||
|
327.09329,
|
||||||
|
311.44202
|
||||||
|
],
|
||||||
|
"page": 15,
|
||||||
|
"span": [
|
||||||
|
0,
|
||||||
|
44
|
||||||
|
],
|
||||||
|
"__ref_s3_data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"text": "Figure 4-68 Visual Explain with RCAC enabled",
|
||||||
|
"type": "caption",
|
||||||
|
"payload": null,
|
||||||
|
"name": "Caption",
|
||||||
|
"font": null
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"name": "Picture",
|
"name": "Picture",
|
||||||
"type": "figure",
|
"type": "figure",
|
||||||
"$ref": "#/figures/14"
|
"$ref": "#/figures/14"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"bbox": [
|
||||||
|
64.800003,
|
||||||
|
116.15710000000001,
|
||||||
|
227.10149,
|
||||||
|
124.48209999999995
|
||||||
|
],
|
||||||
|
"page": 15,
|
||||||
|
"span": [
|
||||||
|
0,
|
||||||
|
37
|
||||||
|
],
|
||||||
|
"__ref_s3_data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"text": "Figure 4-69 Index advice with no RCAC",
|
||||||
|
"type": "caption",
|
||||||
|
"payload": null,
|
||||||
|
"name": "Caption",
|
||||||
|
"font": null
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"prov": [
|
"prov": [
|
||||||
{
|
{
|
||||||
|
@ -336,9 +336,9 @@
|
|||||||
{
|
{
|
||||||
"page_no": 1,
|
"page_no": 1,
|
||||||
"bbox": {
|
"bbox": {
|
||||||
"l": 139.66741943359375,
|
"l": 139.66746520996094,
|
||||||
"t": 454.45458984375,
|
"t": 454.4546203613281,
|
||||||
"r": 475.00927734375,
|
"r": 475.0093078613281,
|
||||||
"b": 322.5054626464844,
|
"b": 322.5054626464844,
|
||||||
"coord_origin": "BOTTOMLEFT"
|
"coord_origin": "BOTTOMLEFT"
|
||||||
},
|
},
|
||||||
|
@ -2646,7 +2646,7 @@
|
|||||||
"b": 102.78223000000003,
|
"b": 102.78223000000003,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"confidence": 0.9373533725738525,
|
"confidence": 0.9373531937599182,
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"index": 0,
|
"index": 0,
|
||||||
@ -2686,7 +2686,7 @@
|
|||||||
"b": 102.78223000000003,
|
"b": 102.78223000000003,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"confidence": 0.8858679533004761,
|
"confidence": 0.8858677744865417,
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"index": 1,
|
"index": 1,
|
||||||
@ -2726,7 +2726,7 @@
|
|||||||
"b": 152.90697999999998,
|
"b": 152.90697999999998,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"confidence": 0.9806433916091919,
|
"confidence": 0.9806435108184814,
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"index": 2,
|
"index": 2,
|
||||||
@ -3096,7 +3096,7 @@
|
|||||||
"b": 327.98218,
|
"b": 327.98218,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"confidence": 0.9591909050941467,
|
"confidence": 0.9591910243034363,
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"index": 15,
|
"index": 15,
|
||||||
@ -3280,9 +3280,9 @@
|
|||||||
"id": 0,
|
"id": 0,
|
||||||
"label": "table",
|
"label": "table",
|
||||||
"bbox": {
|
"bbox": {
|
||||||
"l": 139.66741943359375,
|
"l": 139.66746520996094,
|
||||||
"t": 337.54541015625,
|
"t": 337.5453796386719,
|
||||||
"r": 475.00927734375,
|
"r": 475.0093078613281,
|
||||||
"b": 469.4945373535156,
|
"b": 469.4945373535156,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
@ -7787,7 +7787,7 @@
|
|||||||
"b": 518.17419,
|
"b": 518.17419,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"confidence": 0.9589294195175171,
|
"confidence": 0.9589295387268066,
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"index": 91,
|
"index": 91,
|
||||||
@ -8184,9 +8184,9 @@
|
|||||||
"id": 0,
|
"id": 0,
|
||||||
"label": "table",
|
"label": "table",
|
||||||
"bbox": {
|
"bbox": {
|
||||||
"l": 139.66741943359375,
|
"l": 139.66746520996094,
|
||||||
"t": 337.54541015625,
|
"t": 337.5453796386719,
|
||||||
"r": 475.00927734375,
|
"r": 475.0093078613281,
|
||||||
"b": 469.4945373535156,
|
"b": 469.4945373535156,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
@ -13582,7 +13582,7 @@
|
|||||||
"b": 102.78223000000003,
|
"b": 102.78223000000003,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"confidence": 0.9373533725738525,
|
"confidence": 0.9373531937599182,
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"index": 0,
|
"index": 0,
|
||||||
@ -13628,7 +13628,7 @@
|
|||||||
"b": 102.78223000000003,
|
"b": 102.78223000000003,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"confidence": 0.8858679533004761,
|
"confidence": 0.8858677744865417,
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"index": 1,
|
"index": 1,
|
||||||
@ -13674,7 +13674,7 @@
|
|||||||
"b": 152.90697999999998,
|
"b": 152.90697999999998,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"confidence": 0.9806433916091919,
|
"confidence": 0.9806435108184814,
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"index": 2,
|
"index": 2,
|
||||||
@ -14062,7 +14062,7 @@
|
|||||||
"b": 327.98218,
|
"b": 327.98218,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"confidence": 0.9591909050941467,
|
"confidence": 0.9591910243034363,
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"index": 15,
|
"index": 15,
|
||||||
@ -14252,9 +14252,9 @@
|
|||||||
"id": 0,
|
"id": 0,
|
||||||
"label": "table",
|
"label": "table",
|
||||||
"bbox": {
|
"bbox": {
|
||||||
"l": 139.66741943359375,
|
"l": 139.66746520996094,
|
||||||
"t": 337.54541015625,
|
"t": 337.5453796386719,
|
||||||
"r": 475.00927734375,
|
"r": 475.0093078613281,
|
||||||
"b": 469.4945373535156,
|
"b": 469.4945373535156,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
@ -19642,7 +19642,7 @@
|
|||||||
"b": 518.17419,
|
"b": 518.17419,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"confidence": 0.9589294195175171,
|
"confidence": 0.9589295387268066,
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"index": 91,
|
"index": 91,
|
||||||
@ -20057,7 +20057,7 @@
|
|||||||
"b": 152.90697999999998,
|
"b": 152.90697999999998,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"confidence": 0.9806433916091919,
|
"confidence": 0.9806435108184814,
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"index": 2,
|
"index": 2,
|
||||||
@ -20445,7 +20445,7 @@
|
|||||||
"b": 327.98218,
|
"b": 327.98218,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"confidence": 0.9591909050941467,
|
"confidence": 0.9591910243034363,
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"index": 15,
|
"index": 15,
|
||||||
@ -20635,9 +20635,9 @@
|
|||||||
"id": 0,
|
"id": 0,
|
||||||
"label": "table",
|
"label": "table",
|
||||||
"bbox": {
|
"bbox": {
|
||||||
"l": 139.66741943359375,
|
"l": 139.66746520996094,
|
||||||
"t": 337.54541015625,
|
"t": 337.5453796386719,
|
||||||
"r": 475.00927734375,
|
"r": 475.0093078613281,
|
||||||
"b": 469.4945373535156,
|
"b": 469.4945373535156,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
@ -26025,7 +26025,7 @@
|
|||||||
"b": 518.17419,
|
"b": 518.17419,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"confidence": 0.9589294195175171,
|
"confidence": 0.9589295387268066,
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"index": 91,
|
"index": 91,
|
||||||
@ -26440,7 +26440,7 @@
|
|||||||
"b": 102.78223000000003,
|
"b": 102.78223000000003,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"confidence": 0.9373533725738525,
|
"confidence": 0.9373531937599182,
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"index": 0,
|
"index": 0,
|
||||||
@ -26486,7 +26486,7 @@
|
|||||||
"b": 102.78223000000003,
|
"b": 102.78223000000003,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"confidence": 0.8858679533004761,
|
"confidence": 0.8858677744865417,
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"index": 1,
|
"index": 1,
|
||||||
|
@ -245,7 +245,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "And that is an equation by itself. Cheers!",
|
"orig": "And that is an equation by itself. Cheers!",
|
||||||
"text": "And that is an equation by itself. Cheers!"
|
"text": "And that is an equation by itself. Cheers!",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/6",
|
"self_ref": "#/texts/6",
|
||||||
@ -269,7 +275,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "This is another equation:",
|
"orig": "This is another equation:",
|
||||||
"text": "This is another equation:"
|
"text": "This is another equation:",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/8",
|
"self_ref": "#/texts/8",
|
||||||
@ -305,7 +317,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
|
"orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
|
||||||
"text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text."
|
"text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/11",
|
"self_ref": "#/texts/11",
|
||||||
@ -413,7 +431,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "And that is an equation by itself. Cheers!",
|
"orig": "And that is an equation by itself. Cheers!",
|
||||||
"text": "And that is an equation by itself. Cheers!"
|
"text": "And that is an equation by itself. Cheers!",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/20",
|
"self_ref": "#/texts/20",
|
||||||
@ -437,7 +461,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "This is another equation:",
|
"orig": "This is another equation:",
|
||||||
"text": "This is another equation:"
|
"text": "This is another equation:",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/22",
|
"self_ref": "#/texts/22",
|
||||||
@ -485,7 +515,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
|
"orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
|
||||||
"text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text."
|
"text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/26",
|
"self_ref": "#/texts/26",
|
||||||
@ -593,7 +629,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "And that is an equation by itself. Cheers!",
|
"orig": "And that is an equation by itself. Cheers!",
|
||||||
"text": "And that is an equation by itself. Cheers!"
|
"text": "And that is an equation by itself. Cheers!",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/35",
|
"self_ref": "#/texts/35",
|
||||||
|
8
tests/data/groundtruth/docling_v2/example_08.html.itxt
vendored
Normal file
8
tests/data/groundtruth/docling_v2/example_08.html.itxt
vendored
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
item-0 at level 0: unspecified: group _root_
|
||||||
|
item-1 at level 1: section: group header-1
|
||||||
|
item-2 at level 2: section_header: Pivot table with with 1 row header
|
||||||
|
item-3 at level 3: table with [6x4]
|
||||||
|
item-4 at level 2: section_header: Pivot table with 2 row headers
|
||||||
|
item-5 at level 3: table with [6x5]
|
||||||
|
item-6 at level 2: section_header: Equivalent pivot table
|
||||||
|
item-7 at level 3: table with [6x5]
|
2008
tests/data/groundtruth/docling_v2/example_08.html.json
vendored
Normal file
2008
tests/data/groundtruth/docling_v2/example_08.html.json
vendored
Normal file
File diff suppressed because it is too large
Load Diff
29
tests/data/groundtruth/docling_v2/example_08.html.md
vendored
Normal file
29
tests/data/groundtruth/docling_v2/example_08.html.md
vendored
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
## Pivot table with with 1 row header
|
||||||
|
|
||||||
|
| Year | Month | Revenue | Cost |
|
||||||
|
|--------|----------|-----------|--------|
|
||||||
|
| 2025 | January | $134 | $162 |
|
||||||
|
| 2025 | February | $150 | $155 |
|
||||||
|
| 2025 | March | $160 | $143 |
|
||||||
|
| 2025 | April | $210 | $150 |
|
||||||
|
| 2025 | May | $280 | $120 |
|
||||||
|
|
||||||
|
## Pivot table with 2 row headers
|
||||||
|
|
||||||
|
| Year | Quarter | Month | Revenue | Cost |
|
||||||
|
|--------|-----------|----------|-----------|--------|
|
||||||
|
| 2025 | Q1 | January | $134 | $162 |
|
||||||
|
| 2025 | Q1 | February | $150 | $155 |
|
||||||
|
| 2025 | Q1 | March | $160 | $143 |
|
||||||
|
| 2025 | Q2 | April | $210 | $150 |
|
||||||
|
| 2025 | Q2 | May | $280 | $120 |
|
||||||
|
|
||||||
|
## Equivalent pivot table
|
||||||
|
|
||||||
|
| Year | Quarter | Month | Revenue | Cost |
|
||||||
|
|--------|-----------|----------|-----------|--------|
|
||||||
|
| 2025 | Q1 | January | $134 | $162 |
|
||||||
|
| 2025 | Q1 | February | $150 | $155 |
|
||||||
|
| 2025 | Q1 | March | $160 | $143 |
|
||||||
|
| 2025 | Q2 | April | $210 | $150 |
|
||||||
|
| 2025 | Q2 | May | $280 | $120 |
|
@ -61,7 +61,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin elit mi, fermentum vitae dolor facilisis, porttitor mollis quam. Cras quam massa, venenatis faucibus libero vel, euismod sollicitudin ipsum. Aliquam semper sapien leo, ac ultrices nibh mollis congue. Cras luctus ultrices est, ut scelerisque eros euismod ut. Curabitur ac tincidunt felis, non scelerisque lectus. Praesent sollicitudin vulputate est id consequat. Vestibulum pharetra ligula sit amet varius porttitor. Sed eros diam, gravida non varius at, scelerisque in libero. Ut auctor finibus mauris sit amet ornare. Sed facilisis leo at urna rhoncus, in facilisis arcu eleifend. Sed tincidunt lacinia fermentum. Cras non purus fringilla, semper quam non, sodales sem. Nulla facilisi.",
|
"orig": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin elit mi, fermentum vitae dolor facilisis, porttitor mollis quam. Cras quam massa, venenatis faucibus libero vel, euismod sollicitudin ipsum. Aliquam semper sapien leo, ac ultrices nibh mollis congue. Cras luctus ultrices est, ut scelerisque eros euismod ut. Curabitur ac tincidunt felis, non scelerisque lectus. Praesent sollicitudin vulputate est id consequat. Vestibulum pharetra ligula sit amet varius porttitor. Sed eros diam, gravida non varius at, scelerisque in libero. Ut auctor finibus mauris sit amet ornare. Sed facilisis leo at urna rhoncus, in facilisis arcu eleifend. Sed tincidunt lacinia fermentum. Cras non purus fringilla, semper quam non, sodales sem. Nulla facilisi.",
|
||||||
"text": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin elit mi, fermentum vitae dolor facilisis, porttitor mollis quam. Cras quam massa, venenatis faucibus libero vel, euismod sollicitudin ipsum. Aliquam semper sapien leo, ac ultrices nibh mollis congue. Cras luctus ultrices est, ut scelerisque eros euismod ut. Curabitur ac tincidunt felis, non scelerisque lectus. Praesent sollicitudin vulputate est id consequat. Vestibulum pharetra ligula sit amet varius porttitor. Sed eros diam, gravida non varius at, scelerisque in libero. Ut auctor finibus mauris sit amet ornare. Sed facilisis leo at urna rhoncus, in facilisis arcu eleifend. Sed tincidunt lacinia fermentum. Cras non purus fringilla, semper quam non, sodales sem. Nulla facilisi."
|
"text": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin elit mi, fermentum vitae dolor facilisis, porttitor mollis quam. Cras quam massa, venenatis faucibus libero vel, euismod sollicitudin ipsum. Aliquam semper sapien leo, ac ultrices nibh mollis congue. Cras luctus ultrices est, ut scelerisque eros euismod ut. Curabitur ac tincidunt felis, non scelerisque lectus. Praesent sollicitudin vulputate est id consequat. Vestibulum pharetra ligula sit amet varius porttitor. Sed eros diam, gravida non varius at, scelerisque in libero. Ut auctor finibus mauris sit amet ornare. Sed facilisis leo at urna rhoncus, in facilisis arcu eleifend. Sed tincidunt lacinia fermentum. Cras non purus fringilla, semper quam non, sodales sem. Nulla facilisi.",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/1",
|
"self_ref": "#/texts/1",
|
||||||
@ -85,7 +91,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Duis condimentum dui eget ullamcorper maximus. Nulla tortor lectus, hendrerit at diam fermentum, euismod ornare orci. Integer ac mauris sed augue ultricies pellentesque. Etiam condimentum turpis a risus dictum, sed tempor arcu vestibulum. Quisque at venenatis tellus. Morbi id lobortis elit. In gravida metus at ornare suscipit. Donec euismod nibh sit amet commodo porttitor. Integer commodo sit amet nisi vel accumsan. Donec lacinia posuere porta. Pellentesque vulputate porta risus, vel consectetur nisl gravida sit amet. Nam scelerisque enim sodales lacus tempor, et tristique ante aliquet.",
|
"orig": "Duis condimentum dui eget ullamcorper maximus. Nulla tortor lectus, hendrerit at diam fermentum, euismod ornare orci. Integer ac mauris sed augue ultricies pellentesque. Etiam condimentum turpis a risus dictum, sed tempor arcu vestibulum. Quisque at venenatis tellus. Morbi id lobortis elit. In gravida metus at ornare suscipit. Donec euismod nibh sit amet commodo porttitor. Integer commodo sit amet nisi vel accumsan. Donec lacinia posuere porta. Pellentesque vulputate porta risus, vel consectetur nisl gravida sit amet. Nam scelerisque enim sodales lacus tempor, et tristique ante aliquet.",
|
||||||
"text": "Duis condimentum dui eget ullamcorper maximus. Nulla tortor lectus, hendrerit at diam fermentum, euismod ornare orci. Integer ac mauris sed augue ultricies pellentesque. Etiam condimentum turpis a risus dictum, sed tempor arcu vestibulum. Quisque at venenatis tellus. Morbi id lobortis elit. In gravida metus at ornare suscipit. Donec euismod nibh sit amet commodo porttitor. Integer commodo sit amet nisi vel accumsan. Donec lacinia posuere porta. Pellentesque vulputate porta risus, vel consectetur nisl gravida sit amet. Nam scelerisque enim sodales lacus tempor, et tristique ante aliquet."
|
"text": "Duis condimentum dui eget ullamcorper maximus. Nulla tortor lectus, hendrerit at diam fermentum, euismod ornare orci. Integer ac mauris sed augue ultricies pellentesque. Etiam condimentum turpis a risus dictum, sed tempor arcu vestibulum. Quisque at venenatis tellus. Morbi id lobortis elit. In gravida metus at ornare suscipit. Donec euismod nibh sit amet commodo porttitor. Integer commodo sit amet nisi vel accumsan. Donec lacinia posuere porta. Pellentesque vulputate porta risus, vel consectetur nisl gravida sit amet. Nam scelerisque enim sodales lacus tempor, et tristique ante aliquet.",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/3",
|
"self_ref": "#/texts/3",
|
||||||
@ -109,7 +121,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Maecenas id neque pharetra, eleifend lectus a, vehicula sapien. Aliquam erat volutpat. Ut arcu erat, blandit id elementum at, aliquet pretium mauris. Nulla at semper orci. Nunc sed maximus metus. Duis eget tristique arcu. Phasellus fringilla augue est, ut bibendum est bibendum vitae. Nam et urna interdum, egestas velit a, consectetur metus. Pellentesque facilisis vehicula orci, eu posuere justo imperdiet non. Vestibulum tincidunt orci ac lorem consequat semper. Fusce semper sollicitudin orci, id lacinia nulla faucibus eu. Donec ut nisl metus.",
|
"orig": "Maecenas id neque pharetra, eleifend lectus a, vehicula sapien. Aliquam erat volutpat. Ut arcu erat, blandit id elementum at, aliquet pretium mauris. Nulla at semper orci. Nunc sed maximus metus. Duis eget tristique arcu. Phasellus fringilla augue est, ut bibendum est bibendum vitae. Nam et urna interdum, egestas velit a, consectetur metus. Pellentesque facilisis vehicula orci, eu posuere justo imperdiet non. Vestibulum tincidunt orci ac lorem consequat semper. Fusce semper sollicitudin orci, id lacinia nulla faucibus eu. Donec ut nisl metus.",
|
||||||
"text": "Maecenas id neque pharetra, eleifend lectus a, vehicula sapien. Aliquam erat volutpat. Ut arcu erat, blandit id elementum at, aliquet pretium mauris. Nulla at semper orci. Nunc sed maximus metus. Duis eget tristique arcu. Phasellus fringilla augue est, ut bibendum est bibendum vitae. Nam et urna interdum, egestas velit a, consectetur metus. Pellentesque facilisis vehicula orci, eu posuere justo imperdiet non. Vestibulum tincidunt orci ac lorem consequat semper. Fusce semper sollicitudin orci, id lacinia nulla faucibus eu. Donec ut nisl metus."
|
"text": "Maecenas id neque pharetra, eleifend lectus a, vehicula sapien. Aliquam erat volutpat. Ut arcu erat, blandit id elementum at, aliquet pretium mauris. Nulla at semper orci. Nunc sed maximus metus. Duis eget tristique arcu. Phasellus fringilla augue est, ut bibendum est bibendum vitae. Nam et urna interdum, egestas velit a, consectetur metus. Pellentesque facilisis vehicula orci, eu posuere justo imperdiet non. Vestibulum tincidunt orci ac lorem consequat semper. Fusce semper sollicitudin orci, id lacinia nulla faucibus eu. Donec ut nisl metus.",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/5",
|
"self_ref": "#/texts/5",
|
||||||
@ -133,7 +151,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Duis ac tellus sed turpis feugiat aliquam sed vel justo. Fusce sit amet volutpat massa. Duis tristique finibus metus quis tincidunt. Etiam dapibus fringilla diam at pharetra. Vivamus dolor est, hendrerit ac ligula nec, pharetra lacinia sapien. Phasellus at malesuada orci. Maecenas est justo, mollis non ultrices ut, sagittis commodo odio. Integer viverra mauris pellentesque bibendum vestibulum. Sed eu felis mattis, efficitur justo non, finibus lorem. Phasellus viverra diam et sapien imperdiet interdum. Cras a convallis libero. Integer maximus dui vel lorem hendrerit, sit amet convallis ligula lobortis. Duis eu lacus elementum, scelerisque nunc eget, dignissim libero. Suspendisse mi quam, vehicula sit amet pellentesque rhoncus, blandit eu nisl.",
|
"orig": "Duis ac tellus sed turpis feugiat aliquam sed vel justo. Fusce sit amet volutpat massa. Duis tristique finibus metus quis tincidunt. Etiam dapibus fringilla diam at pharetra. Vivamus dolor est, hendrerit ac ligula nec, pharetra lacinia sapien. Phasellus at malesuada orci. Maecenas est justo, mollis non ultrices ut, sagittis commodo odio. Integer viverra mauris pellentesque bibendum vestibulum. Sed eu felis mattis, efficitur justo non, finibus lorem. Phasellus viverra diam et sapien imperdiet interdum. Cras a convallis libero. Integer maximus dui vel lorem hendrerit, sit amet convallis ligula lobortis. Duis eu lacus elementum, scelerisque nunc eget, dignissim libero. Suspendisse mi quam, vehicula sit amet pellentesque rhoncus, blandit eu nisl.",
|
||||||
"text": "Duis ac tellus sed turpis feugiat aliquam sed vel justo. Fusce sit amet volutpat massa. Duis tristique finibus metus quis tincidunt. Etiam dapibus fringilla diam at pharetra. Vivamus dolor est, hendrerit ac ligula nec, pharetra lacinia sapien. Phasellus at malesuada orci. Maecenas est justo, mollis non ultrices ut, sagittis commodo odio. Integer viverra mauris pellentesque bibendum vestibulum. Sed eu felis mattis, efficitur justo non, finibus lorem. Phasellus viverra diam et sapien imperdiet interdum. Cras a convallis libero. Integer maximus dui vel lorem hendrerit, sit amet convallis ligula lobortis. Duis eu lacus elementum, scelerisque nunc eget, dignissim libero. Suspendisse mi quam, vehicula sit amet pellentesque rhoncus, blandit eu nisl."
|
"text": "Duis ac tellus sed turpis feugiat aliquam sed vel justo. Fusce sit amet volutpat massa. Duis tristique finibus metus quis tincidunt. Etiam dapibus fringilla diam at pharetra. Vivamus dolor est, hendrerit ac ligula nec, pharetra lacinia sapien. Phasellus at malesuada orci. Maecenas est justo, mollis non ultrices ut, sagittis commodo odio. Integer viverra mauris pellentesque bibendum vestibulum. Sed eu felis mattis, efficitur justo non, finibus lorem. Phasellus viverra diam et sapien imperdiet interdum. Cras a convallis libero. Integer maximus dui vel lorem hendrerit, sit amet convallis ligula lobortis. Duis eu lacus elementum, scelerisque nunc eget, dignissim libero. Suspendisse mi quam, vehicula sit amet pellentesque rhoncus, blandit eu nisl.",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/7",
|
"self_ref": "#/texts/7",
|
||||||
@ -157,7 +181,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Nunc vehicula mattis erat ac consectetur. Etiam pharetra mauris ut tempor pellentesque. Sed vel libero vitae ante tempus sagittis vel sit amet dolor. Etiam faucibus viverra sodales. Pellentesque ullamcorper magna libero, non malesuada dui bibendum quis. Donec sed dolor non sem luctus volutpat. Morbi vel diam ut urna euismod gravida a id lectus. Vestibulum vel mauris eu tellus hendrerit dapibus. Etiam scelerisque lacus vel ante ultricies vulputate. In ullamcorper malesuada justo, vel scelerisque nisl lacinia at. Donec sodales interdum ipsum, ac bibendum ipsum pharetra interdum. Vivamus condimentum ac ante vel aliquam. Ut consectetur eu nibh nec gravida. Vestibulum accumsan, purus at mollis rutrum, sapien tortor accumsan purus, vitae fermentum urna mauris ut lacus. Fusce vitae leo sollicitudin, vehicula turpis eu, tempus nibh.",
|
"orig": "Nunc vehicula mattis erat ac consectetur. Etiam pharetra mauris ut tempor pellentesque. Sed vel libero vitae ante tempus sagittis vel sit amet dolor. Etiam faucibus viverra sodales. Pellentesque ullamcorper magna libero, non malesuada dui bibendum quis. Donec sed dolor non sem luctus volutpat. Morbi vel diam ut urna euismod gravida a id lectus. Vestibulum vel mauris eu tellus hendrerit dapibus. Etiam scelerisque lacus vel ante ultricies vulputate. In ullamcorper malesuada justo, vel scelerisque nisl lacinia at. Donec sodales interdum ipsum, ac bibendum ipsum pharetra interdum. Vivamus condimentum ac ante vel aliquam. Ut consectetur eu nibh nec gravida. Vestibulum accumsan, purus at mollis rutrum, sapien tortor accumsan purus, vitae fermentum urna mauris ut lacus. Fusce vitae leo sollicitudin, vehicula turpis eu, tempus nibh.",
|
||||||
"text": "Nunc vehicula mattis erat ac consectetur. Etiam pharetra mauris ut tempor pellentesque. Sed vel libero vitae ante tempus sagittis vel sit amet dolor. Etiam faucibus viverra sodales. Pellentesque ullamcorper magna libero, non malesuada dui bibendum quis. Donec sed dolor non sem luctus volutpat. Morbi vel diam ut urna euismod gravida a id lectus. Vestibulum vel mauris eu tellus hendrerit dapibus. Etiam scelerisque lacus vel ante ultricies vulputate. In ullamcorper malesuada justo, vel scelerisque nisl lacinia at. Donec sodales interdum ipsum, ac bibendum ipsum pharetra interdum. Vivamus condimentum ac ante vel aliquam. Ut consectetur eu nibh nec gravida. Vestibulum accumsan, purus at mollis rutrum, sapien tortor accumsan purus, vitae fermentum urna mauris ut lacus. Fusce vitae leo sollicitudin, vehicula turpis eu, tempus nibh."
|
"text": "Nunc vehicula mattis erat ac consectetur. Etiam pharetra mauris ut tempor pellentesque. Sed vel libero vitae ante tempus sagittis vel sit amet dolor. Etiam faucibus viverra sodales. Pellentesque ullamcorper magna libero, non malesuada dui bibendum quis. Donec sed dolor non sem luctus volutpat. Morbi vel diam ut urna euismod gravida a id lectus. Vestibulum vel mauris eu tellus hendrerit dapibus. Etiam scelerisque lacus vel ante ultricies vulputate. In ullamcorper malesuada justo, vel scelerisque nisl lacinia at. Donec sodales interdum ipsum, ac bibendum ipsum pharetra interdum. Vivamus condimentum ac ante vel aliquam. Ut consectetur eu nibh nec gravida. Vestibulum accumsan, purus at mollis rutrum, sapien tortor accumsan purus, vitae fermentum urna mauris ut lacus. Fusce vitae leo sollicitudin, vehicula turpis eu, tempus nibh.",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"pictures": [],
|
"pictures": [],
|
||||||
|
66
tests/data/groundtruth/docling_v2/multi_page.doctags.txt
vendored
Normal file
66
tests/data/groundtruth/docling_v2/multi_page.doctags.txt
vendored
Normal file
@ -0,0 +1,66 @@
|
|||||||
|
<doctag><section_header_level_1><loc_60><loc_43><loc_221><loc_51>The Evolution of the Word Processor</section_header_level_1>
|
||||||
|
<text><loc_60><loc_59><loc_418><loc_76>The concept of the word processor predates modern computers and has evolved through several technological milestones.</text>
|
||||||
|
<section_header_level_1><loc_60><loc_84><loc_274><loc_93>Pre-Digital Era (19th - Early 20th Century)</section_header_level_1>
|
||||||
|
<text><loc_60><loc_102><loc_427><loc_134>The origins of word processing can be traced back to the invention of the typewriter in the mid-19th century. Patented in 1868 by Christopher Latham Sholes, the typewriter revolutionized written communication by enabling people to produce legible, professional documents more efficiently than handwriting.</text>
|
||||||
|
<text><loc_60><loc_143><loc_424><loc_175>During this period, the term "word processing" didn't exist, but the typewriter laid the groundwork for future developments. Over time, advancements such as carbon paper (for copies) and the electric typewriter (introduced by IBM in 1935) improved the speed and convenience of document creation.</text>
|
||||||
|
<section_header_level_1><loc_60><loc_201><loc_283><loc_209>The Birth of Word Processing (1960s - 1970s)</section_header_level_1>
|
||||||
|
<text><loc_60><loc_218><loc_440><loc_242>The term "word processor" first emerged in the 1960s and referred to any system designed to streamline written communication and document production. Early word processors were not software programs but rather standalone machines.</text>
|
||||||
|
<unordered_list><list_item><loc_76><loc_251><loc_435><loc_283>· IBM MT/ST (Magnetic Tape/Selectric Typewriter) : Introduced in 1964, this machine combined IBM's Selectric typewriter with magnetic tape storage. It allowed users to record, edit, and replay typed content-an early example of digital text storage.</list_item>
|
||||||
|
<list_item><loc_76><loc_284><loc_418><loc_308>· Wang Laboratories : In the 1970s, Wang introduced dedicated word processing machines. These devices, like the Wang 1200, featured small screens and floppy disks, making them revolutionary for their time.</list_item>
|
||||||
|
</unordered_list>
|
||||||
|
<text><loc_60><loc_316><loc_432><loc_333>These machines were primarily used in offices, where secretarial pools benefited from their ability to make revisions without retyping entire documents.</text>
|
||||||
|
<section_header_level_1><loc_60><loc_358><loc_258><loc_367>The Rise of Personal Computers (1980s)</section_header_level_1>
|
||||||
|
<text><loc_60><loc_375><loc_433><loc_392>The advent of personal computers in the late 1970s and early 1980s transformed word processing from a niche tool to an essential technology for businesses and individuals alike.</text>
|
||||||
|
<unordered_list><list_item><loc_76><loc_400><loc_439><loc_424>· WordStar (1978) : Developed for the CP/M operating system, WordStar was one of the first widely used word processing programs. It featured early examples of modern features like cut, copy, and paste.</list_item>
|
||||||
|
<list_item><loc_76><loc_425><loc_441><loc_449>· Microsoft Word (1983) : Microsoft launched Word for MS-DOS in 1983, introducing a graphical user interface (GUI) and mouse support. Over the years, Microsoft Word became the industry standard for word processing.</list_item>
|
||||||
|
</unordered_list>
|
||||||
|
<page_break>
|
||||||
|
<text><loc_60><loc_43><loc_434><loc_67>Other notable software from this era included WordPerfect, which was popular among legal professionals, and Apple's MacWrite, which leveraged the Macintosh's graphical capabilities.</text>
|
||||||
|
<section_header_level_1><loc_60><loc_93><loc_229><loc_101>The Modern Era (1990s - Present)</section_header_level_1>
|
||||||
|
<text><loc_60><loc_110><loc_429><loc_126>By the 1990s, word processing software had become more sophisticated, with features like spell check, grammar check, templates, and collaborative tools.</text>
|
||||||
|
<unordered_list><list_item><loc_76><loc_135><loc_413><loc_151>· Microsoft Office Suite : Microsoft continued to dominate with its Office Suite, integrating Word with other productivity tools like Excel and PowerPoint.</list_item>
|
||||||
|
<list_item><loc_76><loc_151><loc_435><loc_167>· OpenOffice and LibreOffice : Open-source alternatives emerged in the early 2000s, offering free and flexible word processing options.</list_item>
|
||||||
|
<list_item><loc_76><loc_167><loc_441><loc_192>· Google Docs (2006) : The introduction of cloud-based word processing revolutionized collaboration. Google Docs enabled real-time editing and sharing, making it a staple for teams and remote work.</list_item>
|
||||||
|
</unordered_list>
|
||||||
|
<section_header_level_1><loc_60><loc_217><loc_195><loc_226>Future of Word Processing</section_header_level_1>
|
||||||
|
<text><loc_60><loc_234><loc_437><loc_275>Today, word processors are more than just tools for typing. They integrate artificial intelligence for grammar and style suggestions (e.g., Grammarly), voice-to-text features, and advanced layout options. As AI continues to advance, word processors may evolve into even more intuitive tools that predict user needs, automate repetitive tasks, and support richer multimedia integration.</text>
|
||||||
|
<text><loc_60><loc_300><loc_433><loc_325>From the clunky typewriters of the 19th century to the AI-powered cloud tools of today, the word processor has come a long way. It remains an essential tool for communication and creativity, shaping how we write and share ideas.</text>
|
||||||
|
<page_break>
|
||||||
|
<section_header_level_1><loc_60><loc_43><loc_232><loc_52>Specialized Word Processing Tools</section_header_level_1>
|
||||||
|
<text><loc_60><loc_60><loc_432><loc_85>In addition to general-purpose word processors, specialized tools have emerged to cater to specific industries and needs. These tools incorporate unique features tailored to their users' workflows:</text>
|
||||||
|
<unordered_list><list_item><loc_76><loc_93><loc_436><loc_134>· Academic and Technical Writing : Tools like LaTeX gained popularity among academics, scientists, and engineers. Unlike traditional word processors, LaTeX focuses on precise formatting, particularly for complex mathematical equations, scientific papers, and technical documents. It relies on a markup language to produce polished documents suitable for publishing.</list_item>
|
||||||
|
<list_item><loc_76><loc_134><loc_423><loc_167>· Screenwriting Software : For screenwriters, tools like Final Draft and Celtx are specialized to handle scripts for film and television. These programs automate the formatting of dialogue, scene descriptions, and other elements unique to screenwriting.</list_item>
|
||||||
|
<list_item><loc_76><loc_167><loc_441><loc_200>· Legal Document Processors : Word processors tailored for legal professionals, like WordPerfect, offered features such as redlining (early version tracking) and document comparison. Even today, many law firms rely on these tools due to their robust formatting options for contracts and legal briefs.</list_item>
|
||||||
|
</unordered_list>
|
||||||
|
<section_header_level_1><loc_60><loc_225><loc_286><loc_234>Key Features That Changed Word Processing</section_header_level_1>
|
||||||
|
<text><loc_60><loc_242><loc_432><loc_267>The evolution of word processors wasn't just about hardware or software improvements-it was about the features that revolutionized how people wrote and edited. Some of these transformative features include:</text>
|
||||||
|
<unordered_list><list_item><loc_76><loc_275><loc_428><loc_291>1. Undo/Redo : Introduced in the 1980s, the ability to undo mistakes and redo actions made experimentation and error correction much easier.</list_item>
|
||||||
|
<list_item><loc_76><loc_292><loc_434><loc_308>2. Spell Check and Grammar Check : By the 1990s, these became standard, allowing users to spot errors automatically.</list_item>
|
||||||
|
<list_item><loc_76><loc_308><loc_409><loc_324>3. Templates : Pre-designed formats for documents, such as resumes, letters, and invoices, helped users save time.</list_item>
|
||||||
|
<list_item><loc_76><loc_324><loc_422><loc_340>4. Track Changes : A game-changer for collaboration, this feature allowed multiple users to suggest edits while maintaining the original text.</list_item>
|
||||||
|
<list_item><loc_76><loc_341><loc_438><loc_365>5. Real-Time Collaboration : Tools like Google Docs and Microsoft 365 enabled multiple users to edit the same document simultaneously, forever changing teamwork dynamics.</list_item>
|
||||||
|
</unordered_list>
|
||||||
|
<section_header_level_1><loc_60><loc_390><loc_262><loc_399>The Cultural Impact of Word Processors</section_header_level_1>
|
||||||
|
<text><loc_60><loc_408><loc_436><loc_432>The word processor didn't just change workplaces-it changed culture. It democratized writing, enabling anyone with access to a computer to produce professional-quality documents. This shift had profound implications for education, business, and creative fields:</text>
|
||||||
|
<page_break>
|
||||||
|
<unordered_list><list_item><loc_76><loc_43><loc_432><loc_67>· Accessibility : Writers no longer needed expensive publishing equipment or training in typesetting to create polished work. This accessibility paved the way for selfpublishing, blogging, and even fan fiction communities.</list_item>
|
||||||
|
<list_item><loc_76><loc_67><loc_438><loc_92>· Education : Word processors became a cornerstone of education, teaching students not only how to write essays but also how to use technology effectively. Features like bibliography generators and integrated research tools enhanced learning.</list_item>
|
||||||
|
<list_item><loc_76><loc_92><loc_433><loc_117>· Creative Writing : Writers gained powerful tools to organize their ideas. Programs like Scrivener allowed authors to manage large projects, from novels to screenplays, with features like chapter outlines and character notes.</list_item>
|
||||||
|
</unordered_list>
|
||||||
|
<section_header_level_1><loc_60><loc_142><loc_248><loc_151>Word Processors in a Post-Digital Era</section_header_level_1>
|
||||||
|
<text><loc_60><loc_159><loc_438><loc_167>As we move further into the 21st century, the role of the word processor continues to evolve:</text>
|
||||||
|
<unordered_list><list_item><loc_76><loc_176><loc_440><loc_208>1. Artificial Intelligence : Modern word processors are leveraging AI to suggest content improvements. Tools like Grammarly, ProWritingAid, and even native features in Word now analyze tone, conciseness, and clarity. Some AI systems can even generate entire paragraphs or rewrite sentences.</list_item>
|
||||||
|
<list_item><loc_76><loc_208><loc_432><loc_241>2. Integration with Other Tools : Word processors are no longer standalone. They integrate with task managers, cloud storage, and project management platforms. For instance, Google Docs syncs with Google Drive, while Microsoft Word integrates seamlessly with OneDrive and Teams.</list_item>
|
||||||
|
<list_item><loc_76><loc_241><loc_422><loc_274>3. Voice Typing : Speech-to-text capabilities have made word processing more accessible, particularly for those with disabilities. Tools like Dragon NaturallySpeaking and built-in options in Google Docs and Microsoft Word have made dictation mainstream.</list_item>
|
||||||
|
<list_item><loc_76><loc_274><loc_434><loc_298>4. Multimedia Documents : Word processing has expanded beyond text. Modern tools allow users to embed images, videos, charts, and interactive elements, transforming simple documents into rich multimedia experiences.</list_item>
|
||||||
|
<list_item><loc_76><loc_299><loc_429><loc_323>5. Cross-Platform Accessibility : Thanks to cloud computing, documents can now be accessed and edited across devices. Whether you're on a desktop, tablet, or smartphone, you can continue working seamlessly.</list_item>
|
||||||
|
</unordered_list>
|
||||||
|
<section_header_level_1><loc_60><loc_348><loc_192><loc_357>A Glimpse Into the Future</section_header_level_1>
|
||||||
|
<text><loc_60><loc_366><loc_433><loc_382>The word processor's future lies in adaptability and intelligence. Some exciting possibilities include:</text>
|
||||||
|
<unordered_list><list_item><loc_76><loc_390><loc_435><loc_406>· Fully AI-Assisted Writing : Imagine a word processor that understands your writing style, drafts emails, or creates entire essays based on minimal input.</list_item>
|
||||||
|
<list_item><loc_76><loc_407><loc_441><loc_431>· Immersive Interfaces : As augmented reality (AR) and virtual reality (VR) technology advance, users may be able to write and edit in 3D spaces, collaborating in virtual environments.</list_item>
|
||||||
|
<list_item><loc_76><loc_431><loc_436><loc_447>· Hyper-Personalization : Word processors could offer dynamic suggestions based on industry-specific needs, user habits, or even regional language variations.</list_item>
|
||||||
|
</unordered_list>
|
||||||
|
<page_break>
|
||||||
|
<text><loc_60><loc_59><loc_429><loc_100>The journey of the word processor-from clunky typewriters to AI-powered platformsreflects humanity's broader technological progress. What began as a tool to simply replace handwriting has transformed into a powerful ally for creativity, communication, and collaboration. As technology continues to advance, the word processor will undoubtedly remain at the heart of how we express ideas and connect with one another.</text>
|
||||||
|
</doctag>
|
1949
tests/data/groundtruth/docling_v2/multi_page.json
vendored
Normal file
1949
tests/data/groundtruth/docling_v2/multi_page.json
vendored
Normal file
File diff suppressed because it is too large
Load Diff
87
tests/data/groundtruth/docling_v2/multi_page.md
vendored
Normal file
87
tests/data/groundtruth/docling_v2/multi_page.md
vendored
Normal file
@ -0,0 +1,87 @@
|
|||||||
|
## The Evolution of the Word Processor
|
||||||
|
|
||||||
|
The concept of the word processor predates modern computers and has evolved through several technological milestones.
|
||||||
|
|
||||||
|
## Pre-Digital Era (19th - Early 20th Century)
|
||||||
|
|
||||||
|
The origins of word processing can be traced back to the invention of the typewriter in the mid-19th century. Patented in 1868 by Christopher Latham Sholes, the typewriter revolutionized written communication by enabling people to produce legible, professional documents more efficiently than handwriting.
|
||||||
|
|
||||||
|
During this period, the term "word processing" didn't exist, but the typewriter laid the groundwork for future developments. Over time, advancements such as carbon paper (for copies) and the electric typewriter (introduced by IBM in 1935) improved the speed and convenience of document creation.
|
||||||
|
|
||||||
|
## The Birth of Word Processing (1960s - 1970s)
|
||||||
|
|
||||||
|
The term "word processor" first emerged in the 1960s and referred to any system designed to streamline written communication and document production. Early word processors were not software programs but rather standalone machines.
|
||||||
|
|
||||||
|
- · IBM MT/ST (Magnetic Tape/Selectric Typewriter) : Introduced in 1964, this machine combined IBM's Selectric typewriter with magnetic tape storage. It allowed users to record, edit, and replay typed content-an early example of digital text storage.
|
||||||
|
- · Wang Laboratories : In the 1970s, Wang introduced dedicated word processing machines. These devices, like the Wang 1200, featured small screens and floppy disks, making them revolutionary for their time.
|
||||||
|
|
||||||
|
These machines were primarily used in offices, where secretarial pools benefited from their ability to make revisions without retyping entire documents.
|
||||||
|
|
||||||
|
## The Rise of Personal Computers (1980s)
|
||||||
|
|
||||||
|
The advent of personal computers in the late 1970s and early 1980s transformed word processing from a niche tool to an essential technology for businesses and individuals alike.
|
||||||
|
|
||||||
|
- · WordStar (1978) : Developed for the CP/M operating system, WordStar was one of the first widely used word processing programs. It featured early examples of modern features like cut, copy, and paste.
|
||||||
|
- · Microsoft Word (1983) : Microsoft launched Word for MS-DOS in 1983, introducing a graphical user interface (GUI) and mouse support. Over the years, Microsoft Word became the industry standard for word processing.
|
||||||
|
|
||||||
|
Other notable software from this era included WordPerfect, which was popular among legal professionals, and Apple's MacWrite, which leveraged the Macintosh's graphical capabilities.
|
||||||
|
|
||||||
|
## The Modern Era (1990s - Present)
|
||||||
|
|
||||||
|
By the 1990s, word processing software had become more sophisticated, with features like spell check, grammar check, templates, and collaborative tools.
|
||||||
|
|
||||||
|
- · Microsoft Office Suite : Microsoft continued to dominate with its Office Suite, integrating Word with other productivity tools like Excel and PowerPoint.
|
||||||
|
- · OpenOffice and LibreOffice : Open-source alternatives emerged in the early 2000s, offering free and flexible word processing options.
|
||||||
|
- · Google Docs (2006) : The introduction of cloud-based word processing revolutionized collaboration. Google Docs enabled real-time editing and sharing, making it a staple for teams and remote work.
|
||||||
|
|
||||||
|
## Future of Word Processing
|
||||||
|
|
||||||
|
Today, word processors are more than just tools for typing. They integrate artificial intelligence for grammar and style suggestions (e.g., Grammarly), voice-to-text features, and advanced layout options. As AI continues to advance, word processors may evolve into even more intuitive tools that predict user needs, automate repetitive tasks, and support richer multimedia integration.
|
||||||
|
|
||||||
|
From the clunky typewriters of the 19th century to the AI-powered cloud tools of today, the word processor has come a long way. It remains an essential tool for communication and creativity, shaping how we write and share ideas.
|
||||||
|
|
||||||
|
## Specialized Word Processing Tools
|
||||||
|
|
||||||
|
In addition to general-purpose word processors, specialized tools have emerged to cater to specific industries and needs. These tools incorporate unique features tailored to their users' workflows:
|
||||||
|
|
||||||
|
- · Academic and Technical Writing : Tools like LaTeX gained popularity among academics, scientists, and engineers. Unlike traditional word processors, LaTeX focuses on precise formatting, particularly for complex mathematical equations, scientific papers, and technical documents. It relies on a markup language to produce polished documents suitable for publishing.
|
||||||
|
- · Screenwriting Software : For screenwriters, tools like Final Draft and Celtx are specialized to handle scripts for film and television. These programs automate the formatting of dialogue, scene descriptions, and other elements unique to screenwriting.
|
||||||
|
- · Legal Document Processors : Word processors tailored for legal professionals, like WordPerfect, offered features such as redlining (early version tracking) and document comparison. Even today, many law firms rely on these tools due to their robust formatting options for contracts and legal briefs.
|
||||||
|
|
||||||
|
## Key Features That Changed Word Processing
|
||||||
|
|
||||||
|
The evolution of word processors wasn't just about hardware or software improvements-it was about the features that revolutionized how people wrote and edited. Some of these transformative features include:
|
||||||
|
|
||||||
|
- 1. Undo/Redo : Introduced in the 1980s, the ability to undo mistakes and redo actions made experimentation and error correction much easier.
|
||||||
|
- 2. Spell Check and Grammar Check : By the 1990s, these became standard, allowing users to spot errors automatically.
|
||||||
|
- 3. Templates : Pre-designed formats for documents, such as resumes, letters, and invoices, helped users save time.
|
||||||
|
- 4. Track Changes : A game-changer for collaboration, this feature allowed multiple users to suggest edits while maintaining the original text.
|
||||||
|
- 5. Real-Time Collaboration : Tools like Google Docs and Microsoft 365 enabled multiple users to edit the same document simultaneously, forever changing teamwork dynamics.
|
||||||
|
|
||||||
|
## The Cultural Impact of Word Processors
|
||||||
|
|
||||||
|
The word processor didn't just change workplaces-it changed culture. It democratized writing, enabling anyone with access to a computer to produce professional-quality documents. This shift had profound implications for education, business, and creative fields:
|
||||||
|
|
||||||
|
- · Accessibility : Writers no longer needed expensive publishing equipment or training in typesetting to create polished work. This accessibility paved the way for selfpublishing, blogging, and even fan fiction communities.
|
||||||
|
- · Education : Word processors became a cornerstone of education, teaching students not only how to write essays but also how to use technology effectively. Features like bibliography generators and integrated research tools enhanced learning.
|
||||||
|
- · Creative Writing : Writers gained powerful tools to organize their ideas. Programs like Scrivener allowed authors to manage large projects, from novels to screenplays, with features like chapter outlines and character notes.
|
||||||
|
|
||||||
|
## Word Processors in a Post-Digital Era
|
||||||
|
|
||||||
|
As we move further into the 21st century, the role of the word processor continues to evolve:
|
||||||
|
|
||||||
|
- 1. Artificial Intelligence : Modern word processors are leveraging AI to suggest content improvements. Tools like Grammarly, ProWritingAid, and even native features in Word now analyze tone, conciseness, and clarity. Some AI systems can even generate entire paragraphs or rewrite sentences.
|
||||||
|
- 2. Integration with Other Tools : Word processors are no longer standalone. They integrate with task managers, cloud storage, and project management platforms. For instance, Google Docs syncs with Google Drive, while Microsoft Word integrates seamlessly with OneDrive and Teams.
|
||||||
|
- 3. Voice Typing : Speech-to-text capabilities have made word processing more accessible, particularly for those with disabilities. Tools like Dragon NaturallySpeaking and built-in options in Google Docs and Microsoft Word have made dictation mainstream.
|
||||||
|
- 4. Multimedia Documents : Word processing has expanded beyond text. Modern tools allow users to embed images, videos, charts, and interactive elements, transforming simple documents into rich multimedia experiences.
|
||||||
|
- 5. Cross-Platform Accessibility : Thanks to cloud computing, documents can now be accessed and edited across devices. Whether you're on a desktop, tablet, or smartphone, you can continue working seamlessly.
|
||||||
|
|
||||||
|
## A Glimpse Into the Future
|
||||||
|
|
||||||
|
The word processor's future lies in adaptability and intelligence. Some exciting possibilities include:
|
||||||
|
|
||||||
|
- · Fully AI-Assisted Writing : Imagine a word processor that understands your writing style, drafts emails, or creates entire essays based on minimal input.
|
||||||
|
- · Immersive Interfaces : As augmented reality (AR) and virtual reality (VR) technology advance, users may be able to write and edit in 3D spaces, collaborating in virtual environments.
|
||||||
|
- · Hyper-Personalization : Word processors could offer dynamic suggestions based on industry-specific needs, user habits, or even regional language variations.
|
||||||
|
|
||||||
|
The journey of the word processor-from clunky typewriters to AI-powered platformsreflects humanity's broader technological progress. What began as a tool to simply replace handwriting has transformed into a powerful ally for creativity, communication, and collaboration. As technology continues to advance, the word processor will undoubtedly remain at the heart of how we express ideas and connect with one another.
|
21968
tests/data/groundtruth/docling_v2/multi_page.pages.json
vendored
Normal file
21968
tests/data/groundtruth/docling_v2/multi_page.pages.json
vendored
Normal file
File diff suppressed because it is too large
Load Diff
@ -326,8 +326,8 @@
|
|||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"orig": "Let\u2019s introduce a list",
|
"orig": "Let’s introduce a list",
|
||||||
"text": "Let\u2019s introduce a list"
|
"text": "Let’s introduce a list"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/4",
|
"self_ref": "#/texts/4",
|
||||||
|
@ -74,6 +74,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Hello world1",
|
"orig": "Hello world1",
|
||||||
"text": "Hello world1",
|
"text": "Hello world1",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -88,6 +94,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Hello2",
|
"orig": "Hello2",
|
||||||
"text": "Hello2",
|
"text": "Hello2",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -113,7 +125,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Some text before",
|
"orig": "Some text before",
|
||||||
"text": "Some text before"
|
"text": "Some text before",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/4",
|
"self_ref": "#/texts/4",
|
||||||
@ -149,7 +167,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Some text after",
|
"orig": "Some text after",
|
||||||
"text": "Some text after"
|
"text": "Some text after",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"pictures": [],
|
"pictures": [],
|
||||||
|
@ -55,7 +55,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Test with three images in unusual formats",
|
"orig": "Test with three images in unusual formats",
|
||||||
"text": "Test with three images in unusual formats"
|
"text": "Test with three images in unusual formats",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/1",
|
"self_ref": "#/texts/1",
|
||||||
@ -67,7 +73,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Raster in emf:",
|
"orig": "Raster in emf:",
|
||||||
"text": "Raster in emf:"
|
"text": "Raster in emf:",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/2",
|
"self_ref": "#/texts/2",
|
||||||
@ -79,7 +91,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Vector in emf:",
|
"orig": "Vector in emf:",
|
||||||
"text": "Vector in emf:"
|
"text": "Vector in emf:",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/3",
|
"self_ref": "#/texts/3",
|
||||||
@ -91,7 +109,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Raster in webp:",
|
"orig": "Raster in webp:",
|
||||||
"text": "Raster in webp:"
|
"text": "Raster in webp:",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"pictures": [
|
"pictures": [
|
||||||
|
94
tests/data/groundtruth/docling_v2/textbox.docx.itxt
vendored
Normal file
94
tests/data/groundtruth/docling_v2/textbox.docx.itxt
vendored
Normal file
@ -0,0 +1,94 @@
|
|||||||
|
item-0 at level 0: unspecified: group _root_
|
||||||
|
item-1 at level 1: paragraph: Chiayi County Shuishang Township ... mentary School Affiliated Kindergarten
|
||||||
|
item-2 at level 1: paragraph: Infectious Disease Reporting Pro ... r the 113th Academic Year Kindergarten
|
||||||
|
item-3 at level 1: paragraph:
|
||||||
|
item-4 at level 1: section: group textbox
|
||||||
|
item-5 at level 2: paragraph: Student falls ill
|
||||||
|
item-6 at level 2: paragraph:
|
||||||
|
item-7 at level 2: paragraph:
|
||||||
|
item-8 at level 2: list: group list
|
||||||
|
item-9 at level 3: list_item: Suggested Reportable Symptoms:
|
||||||
|
* ... sh
|
||||||
|
* Blisters
|
||||||
|
* Headache
|
||||||
|
* Sore throat
|
||||||
|
item-10 at level 1: list_item:
|
||||||
|
item-11 at level 1: paragraph:
|
||||||
|
item-12 at level 1: paragraph:
|
||||||
|
item-13 at level 1: section: group textbox
|
||||||
|
item-14 at level 2: paragraph: If a caregiver suspects that wit ... the same suggested reportable symptoms
|
||||||
|
item-15 at level 1: paragraph:
|
||||||
|
item-16 at level 1: paragraph:
|
||||||
|
item-17 at level 1: paragraph:
|
||||||
|
item-18 at level 1: paragraph:
|
||||||
|
item-19 at level 1: section: group textbox
|
||||||
|
item-20 at level 2: paragraph: Yes
|
||||||
|
item-21 at level 1: paragraph:
|
||||||
|
item-22 at level 1: paragraph:
|
||||||
|
item-23 at level 1: section: group textbox
|
||||||
|
item-24 at level 2: paragraph: A report must be submitted wi ... saster Prevention Information Network.
|
||||||
|
item-25 at level 2: paragraph: A report must also be submitt ... d Infectious Disease Reporting System.
|
||||||
|
item-26 at level 2: paragraph:
|
||||||
|
item-27 at level 2: paragraph:
|
||||||
|
item-28 at level 1: paragraph:
|
||||||
|
item-29 at level 1: paragraph:
|
||||||
|
item-30 at level 1: paragraph:
|
||||||
|
item-31 at level 1: paragraph:
|
||||||
|
item-32 at level 1: paragraph:
|
||||||
|
item-33 at level 1: paragraph:
|
||||||
|
item-34 at level 1: section: group textbox
|
||||||
|
item-35 at level 2: paragraph: Health Bureau:
|
||||||
|
item-36 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control.
|
||||||
|
item-37 at level 2: list: group list
|
||||||
|
item-38 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection.
|
||||||
|
item-39 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act.
|
||||||
|
item-40 at level 2: paragraph:
|
||||||
|
item-41 at level 2: paragraph:
|
||||||
|
item-42 at level 1: list: group list
|
||||||
|
item-43 at level 2: list_item:
|
||||||
|
item-44 at level 1: paragraph:
|
||||||
|
item-45 at level 1: section: group textbox
|
||||||
|
item-46 at level 2: paragraph: Department of Education:
|
||||||
|
Collabo ... vention measures at all school levels.
|
||||||
|
item-47 at level 1: paragraph:
|
||||||
|
item-48 at level 1: paragraph:
|
||||||
|
item-49 at level 1: paragraph:
|
||||||
|
item-50 at level 1: paragraph:
|
||||||
|
item-51 at level 1: paragraph:
|
||||||
|
item-52 at level 1: paragraph:
|
||||||
|
item-53 at level 1: paragraph:
|
||||||
|
item-54 at level 1: section: group textbox
|
||||||
|
item-55 at level 2: inline: group group
|
||||||
|
item-56 at level 3: paragraph: The Health Bureau will handle
|
||||||
|
item-57 at level 3: paragraph: reporting and specimen collection
|
||||||
|
item-58 at level 3: paragraph: .
|
||||||
|
item-59 at level 2: paragraph:
|
||||||
|
item-60 at level 2: paragraph:
|
||||||
|
item-61 at level 1: paragraph:
|
||||||
|
item-62 at level 1: paragraph:
|
||||||
|
item-63 at level 1: paragraph:
|
||||||
|
item-64 at level 1: section: group textbox
|
||||||
|
item-65 at level 2: paragraph: Whether the epidemic has eased.
|
||||||
|
item-66 at level 2: paragraph:
|
||||||
|
item-67 at level 2: paragraph:
|
||||||
|
item-68 at level 1: paragraph:
|
||||||
|
item-69 at level 1: section: group textbox
|
||||||
|
item-70 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease.
|
||||||
|
item-71 at level 2: paragraph: No
|
||||||
|
item-72 at level 1: paragraph:
|
||||||
|
item-73 at level 1: paragraph:
|
||||||
|
item-74 at level 1: section: group textbox
|
||||||
|
item-75 at level 1: paragraph:
|
||||||
|
item-76 at level 1: section: group textbox
|
||||||
|
item-77 at level 1: paragraph:
|
||||||
|
item-78 at level 1: paragraph:
|
||||||
|
item-79 at level 1: section: group textbox
|
||||||
|
item-80 at level 2: paragraph: Case closed.
|
||||||
|
item-81 at level 2: paragraph:
|
||||||
|
item-82 at level 2: paragraph:
|
||||||
|
item-83 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary.
|
||||||
|
item-84 at level 1: paragraph:
|
||||||
|
item-85 at level 1: section: group textbox
|
||||||
|
item-86 at level 1: paragraph:
|
||||||
|
item-87 at level 1: paragraph:
|
||||||
|
item-88 at level 1: paragraph:
|
1470
tests/data/groundtruth/docling_v2/textbox.docx.json
vendored
Normal file
1470
tests/data/groundtruth/docling_v2/textbox.docx.json
vendored
Normal file
File diff suppressed because it is too large
Load Diff
46
tests/data/groundtruth/docling_v2/textbox.docx.md
vendored
Normal file
46
tests/data/groundtruth/docling_v2/textbox.docx.md
vendored
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
**Chiayi County Shuishang Township Nanjing Elementary School Affiliated Kindergarten**
|
||||||
|
|
||||||
|
**Infectious Disease Reporting Procedure for the 113th Academic Year Kindergarten**
|
||||||
|
|
||||||
|
**Student falls ill**
|
||||||
|
|
||||||
|
- Suggested Reportable Symptoms:
|
||||||
|
* Fever
|
||||||
|
* Cough
|
||||||
|
* Diarrhea
|
||||||
|
* Vomiting
|
||||||
|
* Rash
|
||||||
|
* Blisters
|
||||||
|
* Headache
|
||||||
|
* Sore throat
|
||||||
|
|
||||||
|
If a caregiver suspects that within one week, a fifth of the class (for classes with more than 15 students) or more than three students (for classes with 15 or fewer students)
|
||||||
|
show the same suggested reportable symptoms
|
||||||
|
|
||||||
|
Yes
|
||||||
|
|
||||||
|
A report must be submitted within 24 hours via the Ministry of Education’s Campus Safety and Disaster Prevention Information Network.
|
||||||
|
|
||||||
|
A report must also be submitted within 48 hours through Chiayi County’s School Suspected Infectious Disease Reporting System.
|
||||||
|
|
||||||
|
**Health Bureau:**
|
||||||
|
|
||||||
|
Upon receiving a report from the kindergarten, conduct a preliminary assessment of the case, and depending on the situation and type of illness, carry out an epidemiological investigation and report to the Centers for Disease Control.
|
||||||
|
|
||||||
|
- If necessary, provide health education and important reminders at the kindergarten, or notify the individual to undergo specimen collection.
|
||||||
|
- Implement appropriate epidemic prevention measures in accordance with the Communicable Disease Control Act.
|
||||||
|
|
||||||
|
Department of Education:
|
||||||
|
Collaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.
|
||||||
|
|
||||||
|
The Health Bureau will handle **reporting and specimen collection** .
|
||||||
|
|
||||||
|
**Whether the epidemic has eased.**
|
||||||
|
|
||||||
|
**Whether the test results are positive for a legally designated infectious disease.**
|
||||||
|
|
||||||
|
No
|
||||||
|
|
||||||
|
**Case closed.**
|
||||||
|
|
||||||
|
The Health Bureau will carry out subsequent related epidemic prevention measures and follow-up, and will request assistance from the Centers for Disease Control if necessary.
|
@ -232,6 +232,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "hyperlink",
|
"orig": "hyperlink",
|
||||||
"text": "hyperlink",
|
"text": "hyperlink",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"hyperlink": "https:/github.com/DS4SD/docling"
|
"hyperlink": "https:/github.com/DS4SD/docling"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -263,7 +269,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Normal",
|
"orig": "Normal",
|
||||||
"text": "Normal"
|
"text": "Normal",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/6",
|
"self_ref": "#/texts/6",
|
||||||
@ -329,7 +341,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "and",
|
"orig": "and",
|
||||||
"text": "and"
|
"text": "and",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/10",
|
"self_ref": "#/texts/10",
|
||||||
@ -342,6 +360,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "hyperlink",
|
"orig": "hyperlink",
|
||||||
"text": "hyperlink",
|
"text": "hyperlink",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"hyperlink": "https:/github.com/DS4SD/docling"
|
"hyperlink": "https:/github.com/DS4SD/docling"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -354,7 +378,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "on the same line",
|
"orig": "on the same line",
|
||||||
"text": "on the same line"
|
"text": "on the same line",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/12",
|
"self_ref": "#/texts/12",
|
||||||
@ -439,6 +469,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Some",
|
"orig": "Some",
|
||||||
"text": "Some",
|
"text": "Some",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -513,6 +549,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Nested",
|
"orig": "Nested",
|
||||||
"text": "Nested",
|
"text": "Nested",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
|
@ -133,7 +133,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Paragraph 1.1",
|
"orig": "Paragraph 1.1",
|
||||||
"text": "Paragraph 1.1"
|
"text": "Paragraph 1.1",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/5",
|
"self_ref": "#/texts/5",
|
||||||
@ -157,7 +163,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Paragraph 1.2",
|
"orig": "Paragraph 1.2",
|
||||||
"text": "Paragraph 1.2"
|
"text": "Paragraph 1.2",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/7",
|
"self_ref": "#/texts/7",
|
||||||
@ -222,7 +234,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Paragraph 1.1.1",
|
"orig": "Paragraph 1.1.1",
|
||||||
"text": "Paragraph 1.1.1"
|
"text": "Paragraph 1.1.1",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/11",
|
"self_ref": "#/texts/11",
|
||||||
@ -246,7 +264,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Paragraph 1.1.2",
|
"orig": "Paragraph 1.1.2",
|
||||||
"text": "Paragraph 1.1.2"
|
"text": "Paragraph 1.1.2",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/13",
|
"self_ref": "#/texts/13",
|
||||||
@ -314,7 +338,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Paragraph 1.1.1",
|
"orig": "Paragraph 1.1.1",
|
||||||
"text": "Paragraph 1.1.1"
|
"text": "Paragraph 1.1.1",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/17",
|
"self_ref": "#/texts/17",
|
||||||
@ -338,7 +368,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Paragraph 1.1.2",
|
"orig": "Paragraph 1.1.2",
|
||||||
"text": "Paragraph 1.1.2"
|
"text": "Paragraph 1.1.2",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/19",
|
"self_ref": "#/texts/19",
|
||||||
@ -406,7 +442,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Paragraph 1.2.3.1",
|
"orig": "Paragraph 1.2.3.1",
|
||||||
"text": "Paragraph 1.2.3.1"
|
"text": "Paragraph 1.2.3.1",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/23",
|
"self_ref": "#/texts/23",
|
||||||
@ -430,7 +472,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Paragraph 1.2.3.1",
|
"orig": "Paragraph 1.2.3.1",
|
||||||
"text": "Paragraph 1.2.3.1"
|
"text": "Paragraph 1.2.3.1",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/25",
|
"self_ref": "#/texts/25",
|
||||||
@ -513,7 +561,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Paragraph 2.1",
|
"orig": "Paragraph 2.1",
|
||||||
"text": "Paragraph 2.1"
|
"text": "Paragraph 2.1",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/30",
|
"self_ref": "#/texts/30",
|
||||||
@ -537,7 +591,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Paragraph 2.2",
|
"orig": "Paragraph 2.2",
|
||||||
"text": "Paragraph 2.2"
|
"text": "Paragraph 2.2",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/32",
|
"self_ref": "#/texts/32",
|
||||||
@ -602,7 +662,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Paragraph 2.1.1.1",
|
"orig": "Paragraph 2.1.1.1",
|
||||||
"text": "Paragraph 2.1.1.1"
|
"text": "Paragraph 2.1.1.1",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/36",
|
"self_ref": "#/texts/36",
|
||||||
@ -626,7 +692,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Paragraph 2.1.1.1",
|
"orig": "Paragraph 2.1.1.1",
|
||||||
"text": "Paragraph 2.1.1.1"
|
"text": "Paragraph 2.1.1.1",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/38",
|
"self_ref": "#/texts/38",
|
||||||
@ -694,7 +766,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Paragraph 2.1.1",
|
"orig": "Paragraph 2.1.1",
|
||||||
"text": "Paragraph 2.1.1"
|
"text": "Paragraph 2.1.1",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/42",
|
"self_ref": "#/texts/42",
|
||||||
@ -718,7 +796,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Paragraph 2.1.2",
|
"orig": "Paragraph 2.1.2",
|
||||||
"text": "Paragraph 2.1.2"
|
"text": "Paragraph 2.1.2",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/44",
|
"self_ref": "#/texts/44",
|
||||||
|
@ -209,7 +209,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Paragraph 1.1",
|
"orig": "Paragraph 1.1",
|
||||||
"text": "Paragraph 1.1"
|
"text": "Paragraph 1.1",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/5",
|
"self_ref": "#/texts/5",
|
||||||
@ -233,7 +239,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Paragraph 1.2",
|
"orig": "Paragraph 1.2",
|
||||||
"text": "Paragraph 1.2"
|
"text": "Paragraph 1.2",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/7",
|
"self_ref": "#/texts/7",
|
||||||
@ -298,7 +310,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Paragraph 1.1.1",
|
"orig": "Paragraph 1.1.1",
|
||||||
"text": "Paragraph 1.1.1"
|
"text": "Paragraph 1.1.1",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/11",
|
"self_ref": "#/texts/11",
|
||||||
@ -322,7 +340,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Paragraph 1.1.2",
|
"orig": "Paragraph 1.1.2",
|
||||||
"text": "Paragraph 1.1.2"
|
"text": "Paragraph 1.1.2",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/13",
|
"self_ref": "#/texts/13",
|
||||||
@ -390,7 +414,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Paragraph 1.1.1",
|
"orig": "Paragraph 1.1.1",
|
||||||
"text": "Paragraph 1.1.1"
|
"text": "Paragraph 1.1.1",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/17",
|
"self_ref": "#/texts/17",
|
||||||
@ -414,7 +444,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Paragraph 1.1.2",
|
"orig": "Paragraph 1.1.2",
|
||||||
"text": "Paragraph 1.1.2"
|
"text": "Paragraph 1.1.2",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/19",
|
"self_ref": "#/texts/19",
|
||||||
@ -482,7 +518,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Paragraph 1.2.3.1",
|
"orig": "Paragraph 1.2.3.1",
|
||||||
"text": "Paragraph 1.2.3.1"
|
"text": "Paragraph 1.2.3.1",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/23",
|
"self_ref": "#/texts/23",
|
||||||
@ -506,7 +548,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Paragraph 1.2.3.1",
|
"orig": "Paragraph 1.2.3.1",
|
||||||
"text": "Paragraph 1.2.3.1"
|
"text": "Paragraph 1.2.3.1",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/25",
|
"self_ref": "#/texts/25",
|
||||||
@ -567,7 +615,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Paragraph 2.1",
|
"orig": "Paragraph 2.1",
|
||||||
"text": "Paragraph 2.1"
|
"text": "Paragraph 2.1",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/30",
|
"self_ref": "#/texts/30",
|
||||||
@ -591,7 +645,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Paragraph 2.2",
|
"orig": "Paragraph 2.2",
|
||||||
"text": "Paragraph 2.2"
|
"text": "Paragraph 2.2",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/32",
|
"self_ref": "#/texts/32",
|
||||||
@ -656,7 +716,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Paragraph 2.1.1.1",
|
"orig": "Paragraph 2.1.1.1",
|
||||||
"text": "Paragraph 2.1.1.1"
|
"text": "Paragraph 2.1.1.1",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/36",
|
"self_ref": "#/texts/36",
|
||||||
@ -680,7 +746,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Paragraph 2.1.1.1",
|
"orig": "Paragraph 2.1.1.1",
|
||||||
"text": "Paragraph 2.1.1.1"
|
"text": "Paragraph 2.1.1.1",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/38",
|
"self_ref": "#/texts/38",
|
||||||
@ -748,7 +820,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Paragraph 2.1.1",
|
"orig": "Paragraph 2.1.1",
|
||||||
"text": "Paragraph 2.1.1"
|
"text": "Paragraph 2.1.1",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/42",
|
"self_ref": "#/texts/42",
|
||||||
@ -772,7 +850,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Paragraph 2.1.2",
|
"orig": "Paragraph 2.1.2",
|
||||||
"text": "Paragraph 2.1.2"
|
"text": "Paragraph 2.1.2",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/44",
|
"self_ref": "#/texts/44",
|
||||||
|
@ -365,7 +365,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Paragraph 2.1.1",
|
"orig": "Paragraph 2.1.1",
|
||||||
"text": "Paragraph 2.1.1"
|
"text": "Paragraph 2.1.1",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/4",
|
"self_ref": "#/texts/4",
|
||||||
@ -389,7 +395,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Paragraph 2.1.2",
|
"orig": "Paragraph 2.1.2",
|
||||||
"text": "Paragraph 2.1.2"
|
"text": "Paragraph 2.1.2",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/6",
|
"self_ref": "#/texts/6",
|
||||||
@ -434,6 +446,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "List item 1",
|
"orig": "List item 1",
|
||||||
"text": "List item 1",
|
"text": "List item 1",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -448,6 +466,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "List item 2",
|
"orig": "List item 2",
|
||||||
"text": "List item 2",
|
"text": "List item 2",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -462,6 +486,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "List item 3",
|
"orig": "List item 3",
|
||||||
"text": "List item 3",
|
"text": "List item 3",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -508,6 +538,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "List item a",
|
"orig": "List item a",
|
||||||
"text": "List item a",
|
"text": "List item a",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -522,6 +558,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "List item b",
|
"orig": "List item b",
|
||||||
"text": "List item b",
|
"text": "List item b",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -536,6 +578,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "List item c",
|
"orig": "List item c",
|
||||||
"text": "List item c",
|
"text": "List item c",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -582,6 +630,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "List item 1",
|
"orig": "List item 1",
|
||||||
"text": "List item 1",
|
"text": "List item 1",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -596,6 +650,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "List item 2",
|
"orig": "List item 2",
|
||||||
"text": "List item 2",
|
"text": "List item 2",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -610,6 +670,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "List item 1.1",
|
"orig": "List item 1.1",
|
||||||
"text": "List item 1.1",
|
"text": "List item 1.1",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -624,6 +690,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "List item 1.2",
|
"orig": "List item 1.2",
|
||||||
"text": "List item 1.2",
|
"text": "List item 1.2",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -638,6 +710,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "List item 1.3",
|
"orig": "List item 1.3",
|
||||||
"text": "List item 1.3",
|
"text": "List item 1.3",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -652,6 +730,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "List item 3",
|
"orig": "List item 3",
|
||||||
"text": "List item 3",
|
"text": "List item 3",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -698,6 +782,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "List item 1",
|
"orig": "List item 1",
|
||||||
"text": "List item 1",
|
"text": "List item 1",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -712,6 +802,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "List item 1.1",
|
"orig": "List item 1.1",
|
||||||
"text": "List item 1.1",
|
"text": "List item 1.1",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -726,6 +822,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "List item 2",
|
"orig": "List item 2",
|
||||||
"text": "List item 2",
|
"text": "List item 2",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -772,6 +874,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "List item 1",
|
"orig": "List item 1",
|
||||||
"text": "List item 1",
|
"text": "List item 1",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -786,6 +894,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "List item 1.1",
|
"orig": "List item 1.1",
|
||||||
"text": "List item 1.1",
|
"text": "List item 1.1",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -800,6 +914,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "List item 1.1.1",
|
"orig": "List item 1.1.1",
|
||||||
"text": "List item 1.1.1",
|
"text": "List item 1.1.1",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -814,6 +934,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "List item 3",
|
"orig": "List item 3",
|
||||||
"text": "List item 3",
|
"text": "List item 3",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -866,6 +992,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "List item 1",
|
"orig": "List item 1",
|
||||||
"text": "List item 1",
|
"text": "List item 1",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -880,6 +1012,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "List item 2",
|
"orig": "List item 2",
|
||||||
"text": "List item 2",
|
"text": "List item 2",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -894,6 +1032,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "List item 1.1",
|
"orig": "List item 1.1",
|
||||||
"text": "List item 1.1",
|
"text": "List item 1.1",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -908,6 +1052,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "List item 1.2",
|
"orig": "List item 1.2",
|
||||||
"text": "List item 1.2",
|
"text": "List item 1.2",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -922,6 +1072,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "List item 1.2.1",
|
"orig": "List item 1.2.1",
|
||||||
"text": "List item 1.2.1",
|
"text": "List item 1.2.1",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -936,6 +1092,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "List item 3",
|
"orig": "List item 3",
|
||||||
"text": "List item 3",
|
"text": "List item 3",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -101,7 +101,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Summer activities",
|
"orig": "Summer activities",
|
||||||
"text": "Summer activities"
|
"text": "Summer activities",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/1",
|
"self_ref": "#/texts/1",
|
||||||
@ -138,7 +144,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Duck",
|
"orig": "Duck",
|
||||||
"text": "Duck"
|
"text": "Duck",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/3",
|
"self_ref": "#/texts/3",
|
||||||
@ -150,7 +162,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Figure 1: This is a cute duckling",
|
"orig": "Figure 1: This is a cute duckling",
|
||||||
"text": "Figure 1: This is a cute duckling"
|
"text": "Figure 1: This is a cute duckling",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/4",
|
"self_ref": "#/texts/4",
|
||||||
@ -180,8 +198,8 @@
|
|||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
"label": "section_header",
|
"label": "section_header",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Let\u2019s swim!",
|
"orig": "Let’s swim!",
|
||||||
"text": "Let\u2019s swim!",
|
"text": "Let’s swim!",
|
||||||
"level": 1
|
"level": 1
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -194,7 +212,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "To get started with swimming, first lay down in a water and try not to drown:",
|
"orig": "To get started with swimming, first lay down in a water and try not to drown:",
|
||||||
"text": "To get started with swimming, first lay down in a water and try not to drown:"
|
"text": "To get started with swimming, first lay down in a water and try not to drown:",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/6",
|
"self_ref": "#/texts/6",
|
||||||
@ -207,6 +231,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "You can relax and look around",
|
"orig": "You can relax and look around",
|
||||||
"text": "You can relax and look around",
|
"text": "You can relax and look around",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -221,6 +251,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Paddle about",
|
"orig": "Paddle about",
|
||||||
"text": "Paddle about",
|
"text": "Paddle about",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -235,6 +271,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Enjoy summer warmth",
|
"orig": "Enjoy summer warmth",
|
||||||
"text": "Enjoy summer warmth",
|
"text": "Enjoy summer warmth",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -247,8 +289,14 @@
|
|||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Also, don\u2019t forget:",
|
"orig": "Also, don’t forget:",
|
||||||
"text": "Also, don\u2019t forget:"
|
"text": "Also, don’t forget:",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/10",
|
"self_ref": "#/texts/10",
|
||||||
@ -261,6 +309,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Wear sunglasses",
|
"orig": "Wear sunglasses",
|
||||||
"text": "Wear sunglasses",
|
"text": "Wear sunglasses",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -273,8 +327,14 @@
|
|||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
"label": "list_item",
|
"label": "list_item",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Don\u2019t forget to drink water",
|
"orig": "Don’t forget to drink water",
|
||||||
"text": "Don\u2019t forget to drink water",
|
"text": "Don’t forget to drink water",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -289,6 +349,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Use sun cream",
|
"orig": "Use sun cream",
|
||||||
"text": "Use sun cream",
|
"text": "Use sun cream",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -301,8 +367,14 @@
|
|||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Hmm, what else\u2026",
|
"orig": "Hmm, what else…",
|
||||||
"text": "Hmm, what else\u2026"
|
"text": "Hmm, what else…",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/14",
|
"self_ref": "#/texts/14",
|
||||||
@ -335,8 +407,8 @@
|
|||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
"label": "section_header",
|
"label": "section_header",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Let\u2019s eat",
|
"orig": "Let’s eat",
|
||||||
"text": "Let\u2019s eat",
|
"text": "Let’s eat",
|
||||||
"level": 2
|
"level": 2
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -348,8 +420,14 @@
|
|||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "After we had a good day of swimming in the lake, it\u2019s important to eat something nice",
|
"orig": "After we had a good day of swimming in the lake, it’s important to eat something nice",
|
||||||
"text": "After we had a good day of swimming in the lake, it\u2019s important to eat something nice"
|
"text": "After we had a good day of swimming in the lake, it’s important to eat something nice",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/16",
|
"self_ref": "#/texts/16",
|
||||||
@ -361,7 +439,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "I like to eat leaves",
|
"orig": "I like to eat leaves",
|
||||||
"text": "I like to eat leaves"
|
"text": "I like to eat leaves",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/17",
|
"self_ref": "#/texts/17",
|
||||||
@ -373,7 +457,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Here are some interesting things a respectful duck could eat:",
|
"orig": "Here are some interesting things a respectful duck could eat:",
|
||||||
"text": "Here are some interesting things a respectful duck could eat:"
|
"text": "Here are some interesting things a respectful duck could eat:",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/18",
|
"self_ref": "#/texts/18",
|
||||||
@ -396,8 +486,14 @@
|
|||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "And let\u2019s add another list in the end:",
|
"orig": "And let’s add another list in the end:",
|
||||||
"text": "And let\u2019s add another list in the end:"
|
"text": "And let’s add another list in the end:",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/20",
|
"self_ref": "#/texts/20",
|
||||||
@ -410,6 +506,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Leaves",
|
"orig": "Leaves",
|
||||||
"text": "Leaves",
|
"text": "Leaves",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -424,6 +526,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Berries",
|
"orig": "Berries",
|
||||||
"text": "Berries",
|
"text": "Berries",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -438,6 +546,12 @@
|
|||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Grain",
|
"orig": "Grain",
|
||||||
"text": "Grain",
|
"text": "Grain",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
}
|
}
|
||||||
|
@ -114,7 +114,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "A uniform table",
|
"orig": "A uniform table",
|
||||||
"text": "A uniform table"
|
"text": "A uniform table",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/2",
|
"self_ref": "#/texts/2",
|
||||||
@ -138,7 +144,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "A non-uniform table with horizontal spans",
|
"orig": "A non-uniform table with horizontal spans",
|
||||||
"text": "A non-uniform table with horizontal spans"
|
"text": "A non-uniform table with horizontal spans",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/4",
|
"self_ref": "#/texts/4",
|
||||||
@ -162,7 +174,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "A non-uniform table with horizontal spans in inner columns",
|
"orig": "A non-uniform table with horizontal spans in inner columns",
|
||||||
"text": "A non-uniform table with horizontal spans in inner columns"
|
"text": "A non-uniform table with horizontal spans in inner columns",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/6",
|
"self_ref": "#/texts/6",
|
||||||
@ -186,7 +204,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "A non-uniform table with vertical spans",
|
"orig": "A non-uniform table with vertical spans",
|
||||||
"text": "A non-uniform table with vertical spans"
|
"text": "A non-uniform table with vertical spans",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/8",
|
"self_ref": "#/texts/8",
|
||||||
@ -210,7 +234,13 @@
|
|||||||
"label": "paragraph",
|
"label": "paragraph",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "A non-uniform table with all kinds of spans and empty cells",
|
"orig": "A non-uniform table with all kinds of spans and empty cells",
|
||||||
"text": "A non-uniform table with all kinds of spans and empty cells"
|
"text": "A non-uniform table with all kinds of spans and empty cells",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/10",
|
"self_ref": "#/texts/10",
|
||||||
|
145
tests/data/html/example_08.html
vendored
Normal file
145
tests/data/html/example_08.html
vendored
Normal file
@ -0,0 +1,145 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
|
||||||
|
<head>
|
||||||
|
<style>
|
||||||
|
table,
|
||||||
|
th,
|
||||||
|
td {
|
||||||
|
border: 1px solid black;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
|
||||||
|
<body>
|
||||||
|
|
||||||
|
<h2>Pivot table with with 1 row header</h2>
|
||||||
|
|
||||||
|
<table>
|
||||||
|
<tr>
|
||||||
|
<th>Year</th>
|
||||||
|
<th>Month</th>
|
||||||
|
<th>Revenue</th>
|
||||||
|
<th>Cost</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th rowspan="6">2025</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>January</td>
|
||||||
|
<td>$134</td>
|
||||||
|
<td>$162</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>February</td>
|
||||||
|
<td>$150</td>
|
||||||
|
<td>$155</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>March</td>
|
||||||
|
<td>$160</td>
|
||||||
|
<td>$143</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>April</td>
|
||||||
|
<td>$210</td>
|
||||||
|
<td>$150</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>May</td>
|
||||||
|
<td>$280</td>
|
||||||
|
<td>$120</td>
|
||||||
|
</tr>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
<h2>Pivot table with 2 row headers</h2>
|
||||||
|
|
||||||
|
<table>
|
||||||
|
<tr>
|
||||||
|
<th>Year</th>
|
||||||
|
<th>Quarter</th>
|
||||||
|
<th>Month</th>
|
||||||
|
<th>Revenue</th>
|
||||||
|
<th>Cost</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th rowspan="7">2025</th>
|
||||||
|
<th rowspan="4">Q1</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>January</td>
|
||||||
|
<td>$134</td>
|
||||||
|
<td>$162</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>February</td>
|
||||||
|
<td>$150</td>
|
||||||
|
<td>$155</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>March</td>
|
||||||
|
<td>$160</td>
|
||||||
|
<td>$143</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th rowspan="3">Q2</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>April</td>
|
||||||
|
<td>$210</td>
|
||||||
|
<td>$150</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>May</td>
|
||||||
|
<td>$280</td>
|
||||||
|
<td>$120</td>
|
||||||
|
</tr>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
<h2>Equivalent pivot table</h2>
|
||||||
|
|
||||||
|
<table>
|
||||||
|
<tr>
|
||||||
|
<th>Year</th>
|
||||||
|
<th>Quarter</th>
|
||||||
|
<th>Month</th>
|
||||||
|
<th>Revenue</th>
|
||||||
|
<th>Cost</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th rowspan="8">2025</th>
|
||||||
|
<th rowspan="4">Q1</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>January</td>
|
||||||
|
<td>$134</td>
|
||||||
|
<td>$162</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>February</td>
|
||||||
|
<td>$150</td>
|
||||||
|
<td>$155</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>March</td>
|
||||||
|
<td>$160</td>
|
||||||
|
<td>$143</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th rowspan="3">Q2</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>April</td>
|
||||||
|
<td>$210</td>
|
||||||
|
<td>$150</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>May</td>
|
||||||
|
<td>$280</td>
|
||||||
|
<td>$120</td>
|
||||||
|
</tr>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
</body>
|
||||||
|
|
||||||
|
</html>
|
BIN
tests/data/pdf/multi_page.pdf
vendored
Normal file
BIN
tests/data/pdf/multi_page.pdf
vendored
Normal file
Binary file not shown.
2
tests/data/webp/groundtruth/docling_v2/webp-test.doctags.txt
vendored
Normal file
2
tests/data/webp/groundtruth/docling_v2/webp-test.doctags.txt
vendored
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
<doctag><text><loc_60><loc_46><loc_424><loc_91>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
|
||||||
|
</doctag>
|
77
tests/data/webp/groundtruth/docling_v2/webp-test.json
vendored
Normal file
77
tests/data/webp/groundtruth/docling_v2/webp-test.json
vendored
Normal file
@ -0,0 +1,77 @@
|
|||||||
|
{
|
||||||
|
"schema_name": "DoclingDocument",
|
||||||
|
"version": "1.3.0",
|
||||||
|
"name": "webp-test",
|
||||||
|
"origin": {
|
||||||
|
"mimetype": "application/pdf",
|
||||||
|
"binary_hash": 16115062463007057787,
|
||||||
|
"filename": "webp-test.webp",
|
||||||
|
"uri": null
|
||||||
|
},
|
||||||
|
"furniture": {
|
||||||
|
"self_ref": "#/furniture",
|
||||||
|
"parent": null,
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "furniture",
|
||||||
|
"name": "_root_",
|
||||||
|
"label": "unspecified"
|
||||||
|
},
|
||||||
|
"body": {
|
||||||
|
"self_ref": "#/body",
|
||||||
|
"parent": null,
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"cref": "#/texts/0"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "body",
|
||||||
|
"name": "_root_",
|
||||||
|
"label": "unspecified"
|
||||||
|
},
|
||||||
|
"groups": [],
|
||||||
|
"texts": [
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/0",
|
||||||
|
"parent": {
|
||||||
|
"cref": "#/body"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [
|
||||||
|
{
|
||||||
|
"page_no": 1,
|
||||||
|
"bbox": {
|
||||||
|
"l": 238.19302423176944,
|
||||||
|
"t": 2570.0959833241664,
|
||||||
|
"r": 1696.0985546594009,
|
||||||
|
"b": 2315.204273887442,
|
||||||
|
"coord_origin": "BOTTOMLEFT"
|
||||||
|
},
|
||||||
|
"charspan": [
|
||||||
|
0,
|
||||||
|
94
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package",
|
||||||
|
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package",
|
||||||
|
"formatting": null,
|
||||||
|
"hyperlink": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"pictures": [],
|
||||||
|
"tables": [],
|
||||||
|
"key_value_items": [],
|
||||||
|
"form_items": [],
|
||||||
|
"pages": {
|
||||||
|
"1": {
|
||||||
|
"size": {
|
||||||
|
"width": 2000.0,
|
||||||
|
"height": 2829.0
|
||||||
|
},
|
||||||
|
"image": null,
|
||||||
|
"page_no": 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
1
tests/data/webp/groundtruth/docling_v2/webp-test.md
vendored
Normal file
1
tests/data/webp/groundtruth/docling_v2/webp-test.md
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package
|
388
tests/data/webp/groundtruth/docling_v2/webp-test.pages.json
vendored
Normal file
388
tests/data/webp/groundtruth/docling_v2/webp-test.pages.json
vendored
Normal file
@ -0,0 +1,388 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"page_no": 0,
|
||||||
|
"size": {
|
||||||
|
"width": 2000.0,
|
||||||
|
"height": 2829.0
|
||||||
|
},
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"index": 0,
|
||||||
|
"rgba": {
|
||||||
|
"r": 0,
|
||||||
|
"g": 0,
|
||||||
|
"b": 0,
|
||||||
|
"a": 255
|
||||||
|
},
|
||||||
|
"rect": {
|
||||||
|
"r_x0": 246.4065456254215,
|
||||||
|
"r_y0": 329.06770715202435,
|
||||||
|
"r_x1": 1691.991797818404,
|
||||||
|
"r_y1": 329.06770715202435,
|
||||||
|
"r_x2": 1691.991797818404,
|
||||||
|
"r_y2": 258.9040166758338,
|
||||||
|
"r_x3": 246.4065456254215,
|
||||||
|
"r_y3": 258.9040166758338,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"text": "Docling bundles PDF document conversion to",
|
||||||
|
"orig": "Docling bundles PDF document conversion to",
|
||||||
|
"text_direction": "left_to_right",
|
||||||
|
"confidence": 1.0,
|
||||||
|
"from_ocr": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"index": 1,
|
||||||
|
"rgba": {
|
||||||
|
"r": 0,
|
||||||
|
"g": 0,
|
||||||
|
"b": 0,
|
||||||
|
"a": 255
|
||||||
|
},
|
||||||
|
"rect": {
|
||||||
|
"r_x0": 238.19302423176944,
|
||||||
|
"r_y0": 415.36904822716525,
|
||||||
|
"r_x1": 1696.0985546594009,
|
||||||
|
"r_y1": 415.36904822716525,
|
||||||
|
"r_x2": 1696.0985546594009,
|
||||||
|
"r_y2": 345.20535775097477,
|
||||||
|
"r_x3": 238.19302423176944,
|
||||||
|
"r_y3": 345.20535775097477,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"text": "JSON and Markdown in an easy self contained",
|
||||||
|
"orig": "JSON and Markdown in an easy self contained",
|
||||||
|
"text_direction": "left_to_right",
|
||||||
|
"confidence": 1.0,
|
||||||
|
"from_ocr": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"index": 2,
|
||||||
|
"rgba": {
|
||||||
|
"r": 0,
|
||||||
|
"g": 0,
|
||||||
|
"b": 0,
|
||||||
|
"a": 255
|
||||||
|
},
|
||||||
|
"rect": {
|
||||||
|
"r_x0": 245.43122061153045,
|
||||||
|
"r_y0": 513.795726112558,
|
||||||
|
"r_x1": 514.3223724413002,
|
||||||
|
"r_y1": 513.795726112558,
|
||||||
|
"r_x2": 514.3223724413002,
|
||||||
|
"r_y2": 436.0574704074058,
|
||||||
|
"r_x3": 245.43122061153045,
|
||||||
|
"r_y3": 436.0574704074058,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"text": "package",
|
||||||
|
"orig": "package",
|
||||||
|
"text_direction": "left_to_right",
|
||||||
|
"confidence": 1.0,
|
||||||
|
"from_ocr": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"parsed_page": null,
|
||||||
|
"predictions": {
|
||||||
|
"layout": {
|
||||||
|
"clusters": [
|
||||||
|
{
|
||||||
|
"id": 0,
|
||||||
|
"label": "text",
|
||||||
|
"bbox": {
|
||||||
|
"l": 238.19302423176944,
|
||||||
|
"t": 258.9040166758338,
|
||||||
|
"r": 1696.0985546594009,
|
||||||
|
"b": 513.795726112558,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"confidence": 0.9721010327339172,
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"index": 0,
|
||||||
|
"rgba": {
|
||||||
|
"r": 0,
|
||||||
|
"g": 0,
|
||||||
|
"b": 0,
|
||||||
|
"a": 255
|
||||||
|
},
|
||||||
|
"rect": {
|
||||||
|
"r_x0": 246.4065456254215,
|
||||||
|
"r_y0": 329.06770715202435,
|
||||||
|
"r_x1": 1691.991797818404,
|
||||||
|
"r_y1": 329.06770715202435,
|
||||||
|
"r_x2": 1691.991797818404,
|
||||||
|
"r_y2": 258.9040166758338,
|
||||||
|
"r_x3": 246.4065456254215,
|
||||||
|
"r_y3": 258.9040166758338,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"text": "Docling bundles PDF document conversion to",
|
||||||
|
"orig": "Docling bundles PDF document conversion to",
|
||||||
|
"text_direction": "left_to_right",
|
||||||
|
"confidence": 1.0,
|
||||||
|
"from_ocr": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"index": 1,
|
||||||
|
"rgba": {
|
||||||
|
"r": 0,
|
||||||
|
"g": 0,
|
||||||
|
"b": 0,
|
||||||
|
"a": 255
|
||||||
|
},
|
||||||
|
"rect": {
|
||||||
|
"r_x0": 238.19302423176944,
|
||||||
|
"r_y0": 415.36904822716525,
|
||||||
|
"r_x1": 1696.0985546594009,
|
||||||
|
"r_y1": 415.36904822716525,
|
||||||
|
"r_x2": 1696.0985546594009,
|
||||||
|
"r_y2": 345.20535775097477,
|
||||||
|
"r_x3": 238.19302423176944,
|
||||||
|
"r_y3": 345.20535775097477,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"text": "JSON and Markdown in an easy self contained",
|
||||||
|
"orig": "JSON and Markdown in an easy self contained",
|
||||||
|
"text_direction": "left_to_right",
|
||||||
|
"confidence": 1.0,
|
||||||
|
"from_ocr": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"index": 2,
|
||||||
|
"rgba": {
|
||||||
|
"r": 0,
|
||||||
|
"g": 0,
|
||||||
|
"b": 0,
|
||||||
|
"a": 255
|
||||||
|
},
|
||||||
|
"rect": {
|
||||||
|
"r_x0": 245.43122061153045,
|
||||||
|
"r_y0": 513.795726112558,
|
||||||
|
"r_x1": 514.3223724413002,
|
||||||
|
"r_y1": 513.795726112558,
|
||||||
|
"r_x2": 514.3223724413002,
|
||||||
|
"r_y2": 436.0574704074058,
|
||||||
|
"r_x3": 245.43122061153045,
|
||||||
|
"r_y3": 436.0574704074058,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"text": "package",
|
||||||
|
"orig": "package",
|
||||||
|
"text_direction": "left_to_right",
|
||||||
|
"confidence": 1.0,
|
||||||
|
"from_ocr": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"children": []
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"tablestructure": {
|
||||||
|
"table_map": {}
|
||||||
|
},
|
||||||
|
"figures_classification": null,
|
||||||
|
"equations_prediction": null,
|
||||||
|
"vlm_response": null
|
||||||
|
},
|
||||||
|
"assembled": {
|
||||||
|
"elements": [
|
||||||
|
{
|
||||||
|
"label": "text",
|
||||||
|
"id": 0,
|
||||||
|
"page_no": 0,
|
||||||
|
"cluster": {
|
||||||
|
"id": 0,
|
||||||
|
"label": "text",
|
||||||
|
"bbox": {
|
||||||
|
"l": 238.19302423176944,
|
||||||
|
"t": 258.9040166758338,
|
||||||
|
"r": 1696.0985546594009,
|
||||||
|
"b": 513.795726112558,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"confidence": 0.9721010327339172,
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"index": 0,
|
||||||
|
"rgba": {
|
||||||
|
"r": 0,
|
||||||
|
"g": 0,
|
||||||
|
"b": 0,
|
||||||
|
"a": 255
|
||||||
|
},
|
||||||
|
"rect": {
|
||||||
|
"r_x0": 246.4065456254215,
|
||||||
|
"r_y0": 329.06770715202435,
|
||||||
|
"r_x1": 1691.991797818404,
|
||||||
|
"r_y1": 329.06770715202435,
|
||||||
|
"r_x2": 1691.991797818404,
|
||||||
|
"r_y2": 258.9040166758338,
|
||||||
|
"r_x3": 246.4065456254215,
|
||||||
|
"r_y3": 258.9040166758338,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"text": "Docling bundles PDF document conversion to",
|
||||||
|
"orig": "Docling bundles PDF document conversion to",
|
||||||
|
"text_direction": "left_to_right",
|
||||||
|
"confidence": 1.0,
|
||||||
|
"from_ocr": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"index": 1,
|
||||||
|
"rgba": {
|
||||||
|
"r": 0,
|
||||||
|
"g": 0,
|
||||||
|
"b": 0,
|
||||||
|
"a": 255
|
||||||
|
},
|
||||||
|
"rect": {
|
||||||
|
"r_x0": 238.19302423176944,
|
||||||
|
"r_y0": 415.36904822716525,
|
||||||
|
"r_x1": 1696.0985546594009,
|
||||||
|
"r_y1": 415.36904822716525,
|
||||||
|
"r_x2": 1696.0985546594009,
|
||||||
|
"r_y2": 345.20535775097477,
|
||||||
|
"r_x3": 238.19302423176944,
|
||||||
|
"r_y3": 345.20535775097477,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"text": "JSON and Markdown in an easy self contained",
|
||||||
|
"orig": "JSON and Markdown in an easy self contained",
|
||||||
|
"text_direction": "left_to_right",
|
||||||
|
"confidence": 1.0,
|
||||||
|
"from_ocr": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"index": 2,
|
||||||
|
"rgba": {
|
||||||
|
"r": 0,
|
||||||
|
"g": 0,
|
||||||
|
"b": 0,
|
||||||
|
"a": 255
|
||||||
|
},
|
||||||
|
"rect": {
|
||||||
|
"r_x0": 245.43122061153045,
|
||||||
|
"r_y0": 513.795726112558,
|
||||||
|
"r_x1": 514.3223724413002,
|
||||||
|
"r_y1": 513.795726112558,
|
||||||
|
"r_x2": 514.3223724413002,
|
||||||
|
"r_y2": 436.0574704074058,
|
||||||
|
"r_x3": 245.43122061153045,
|
||||||
|
"r_y3": 436.0574704074058,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"text": "package",
|
||||||
|
"orig": "package",
|
||||||
|
"text_direction": "left_to_right",
|
||||||
|
"confidence": 1.0,
|
||||||
|
"from_ocr": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"children": []
|
||||||
|
},
|
||||||
|
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"body": [
|
||||||
|
{
|
||||||
|
"label": "text",
|
||||||
|
"id": 0,
|
||||||
|
"page_no": 0,
|
||||||
|
"cluster": {
|
||||||
|
"id": 0,
|
||||||
|
"label": "text",
|
||||||
|
"bbox": {
|
||||||
|
"l": 238.19302423176944,
|
||||||
|
"t": 258.9040166758338,
|
||||||
|
"r": 1696.0985546594009,
|
||||||
|
"b": 513.795726112558,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"confidence": 0.9721010327339172,
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"index": 0,
|
||||||
|
"rgba": {
|
||||||
|
"r": 0,
|
||||||
|
"g": 0,
|
||||||
|
"b": 0,
|
||||||
|
"a": 255
|
||||||
|
},
|
||||||
|
"rect": {
|
||||||
|
"r_x0": 246.4065456254215,
|
||||||
|
"r_y0": 329.06770715202435,
|
||||||
|
"r_x1": 1691.991797818404,
|
||||||
|
"r_y1": 329.06770715202435,
|
||||||
|
"r_x2": 1691.991797818404,
|
||||||
|
"r_y2": 258.9040166758338,
|
||||||
|
"r_x3": 246.4065456254215,
|
||||||
|
"r_y3": 258.9040166758338,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"text": "Docling bundles PDF document conversion to",
|
||||||
|
"orig": "Docling bundles PDF document conversion to",
|
||||||
|
"text_direction": "left_to_right",
|
||||||
|
"confidence": 1.0,
|
||||||
|
"from_ocr": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"index": 1,
|
||||||
|
"rgba": {
|
||||||
|
"r": 0,
|
||||||
|
"g": 0,
|
||||||
|
"b": 0,
|
||||||
|
"a": 255
|
||||||
|
},
|
||||||
|
"rect": {
|
||||||
|
"r_x0": 238.19302423176944,
|
||||||
|
"r_y0": 415.36904822716525,
|
||||||
|
"r_x1": 1696.0985546594009,
|
||||||
|
"r_y1": 415.36904822716525,
|
||||||
|
"r_x2": 1696.0985546594009,
|
||||||
|
"r_y2": 345.20535775097477,
|
||||||
|
"r_x3": 238.19302423176944,
|
||||||
|
"r_y3": 345.20535775097477,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"text": "JSON and Markdown in an easy self contained",
|
||||||
|
"orig": "JSON and Markdown in an easy self contained",
|
||||||
|
"text_direction": "left_to_right",
|
||||||
|
"confidence": 1.0,
|
||||||
|
"from_ocr": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"index": 2,
|
||||||
|
"rgba": {
|
||||||
|
"r": 0,
|
||||||
|
"g": 0,
|
||||||
|
"b": 0,
|
||||||
|
"a": 255
|
||||||
|
},
|
||||||
|
"rect": {
|
||||||
|
"r_x0": 245.43122061153045,
|
||||||
|
"r_y0": 513.795726112558,
|
||||||
|
"r_x1": 514.3223724413002,
|
||||||
|
"r_y1": 513.795726112558,
|
||||||
|
"r_x2": 514.3223724413002,
|
||||||
|
"r_y2": 436.0574704074058,
|
||||||
|
"r_x3": 245.43122061153045,
|
||||||
|
"r_y3": 436.0574704074058,
|
||||||
|
"coord_origin": "TOPLEFT"
|
||||||
|
},
|
||||||
|
"text": "package",
|
||||||
|
"orig": "package",
|
||||||
|
"text_direction": "left_to_right",
|
||||||
|
"confidence": 1.0,
|
||||||
|
"from_ocr": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"children": []
|
||||||
|
},
|
||||||
|
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"headers": []
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
BIN
tests/data/webp/webp-test.webp
vendored
Normal file
BIN
tests/data/webp/webp-test.webp
vendored
Normal file
Binary file not shown.
After Width: | Height: | Size: 29 KiB |
@ -44,10 +44,10 @@
|
|||||||
"prov": [
|
"prov": [
|
||||||
{
|
{
|
||||||
"bbox": [
|
"bbox": [
|
||||||
69.0,
|
70.90211866351085,
|
||||||
688.5883585611979,
|
689.216658542347,
|
||||||
506.6666666666667,
|
504.8720079864275,
|
||||||
767.2550252278646
|
764.9216921155637
|
||||||
],
|
],
|
||||||
"page": 1,
|
"page": 1,
|
||||||
"span": [
|
"span": [
|
||||||
|
@ -15,20 +15,20 @@
|
|||||||
"a": 255
|
"a": 255
|
||||||
},
|
},
|
||||||
"rect": {
|
"rect": {
|
||||||
"r_x0": 71.33333333333333,
|
"r_x0": 73.34702132031646,
|
||||||
"r_y0": 99.33333333333333,
|
"r_y0": 97.99999977896755,
|
||||||
"r_x1": 506.6666666666667,
|
"r_x1": 503.64955224479564,
|
||||||
"r_y1": 99.33333333333333,
|
"r_y1": 97.99999977896755,
|
||||||
"r_x2": 506.6666666666667,
|
"r_x2": 503.64955224479564,
|
||||||
"r_y2": 74.66666666666667,
|
"r_y2": 76.99999977896756,
|
||||||
"r_x3": 71.33333333333333,
|
"r_x3": 73.34702132031646,
|
||||||
"r_y3": 74.66666666666667,
|
"r_y3": 76.99999977896756,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"text": "Docling bundles PDF document conversion to",
|
"text": "Docling bundles PDF document conversion to",
|
||||||
"orig": "Docling bundles PDF document conversion to",
|
"orig": "Docling bundles PDF document conversion to",
|
||||||
"text_direction": "left_to_right",
|
"text_direction": "left_to_right",
|
||||||
"confidence": 0.9555703127793324,
|
"confidence": 1.0,
|
||||||
"from_ocr": true
|
"from_ocr": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -40,20 +40,20 @@
|
|||||||
"a": 255
|
"a": 255
|
||||||
},
|
},
|
||||||
"rect": {
|
"rect": {
|
||||||
"r_x0": 69.0,
|
"r_x0": 70.90211866351085,
|
||||||
"r_y0": 126.66666666666667,
|
"r_y0": 124.83139551297342,
|
||||||
"r_x1": 506.6666666666667,
|
"r_x1": 504.8720079864275,
|
||||||
"r_y1": 126.66666666666667,
|
"r_y1": 124.83139551297342,
|
||||||
"r_x2": 506.6666666666667,
|
"r_x2": 504.8720079864275,
|
||||||
"r_y2": 100.66666666666667,
|
"r_y2": 102.66666671251768,
|
||||||
"r_x3": 69.0,
|
"r_x3": 70.90211866351085,
|
||||||
"r_y3": 100.66666666666667,
|
"r_y3": 102.66666671251768,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"text": "JSON and Markdown in an easy self contained",
|
"text": "JSON and Markdown in an easy self contained",
|
||||||
"orig": "JSON and Markdown in an easy self contained",
|
"orig": "JSON and Markdown in an easy self contained",
|
||||||
"text_direction": "left_to_right",
|
"text_direction": "left_to_right",
|
||||||
"confidence": 0.9741098171752292,
|
"confidence": 1.0,
|
||||||
"from_ocr": true
|
"from_ocr": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -65,20 +65,20 @@
|
|||||||
"a": 255
|
"a": 255
|
||||||
},
|
},
|
||||||
"rect": {
|
"rect": {
|
||||||
"r_x0": 70.66666666666667,
|
"r_x0": 73.10852522817731,
|
||||||
"r_y0": 153.33333333333334,
|
"r_y0": 152.70503335218433,
|
||||||
"r_x1": 154.0,
|
"r_x1": 153.04479435252625,
|
||||||
"r_y1": 153.33333333333334,
|
"r_y1": 152.70503335218433,
|
||||||
"r_x2": 154.0,
|
"r_x2": 153.04479435252625,
|
||||||
"r_y2": 128.66666666666666,
|
"r_y2": 130.00136157890958,
|
||||||
"r_x3": 70.66666666666667,
|
"r_x3": 73.10852522817731,
|
||||||
"r_y3": 128.66666666666666,
|
"r_y3": 130.00136157890958,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"text": "package",
|
"text": "package",
|
||||||
"orig": "package",
|
"orig": "package",
|
||||||
"text_direction": "left_to_right",
|
"text_direction": "left_to_right",
|
||||||
"confidence": 0.6702765056141881,
|
"confidence": 1.0,
|
||||||
"from_ocr": true
|
"from_ocr": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -90,10 +90,10 @@
|
|||||||
"id": 0,
|
"id": 0,
|
||||||
"label": "text",
|
"label": "text",
|
||||||
"bbox": {
|
"bbox": {
|
||||||
"l": 69.0,
|
"l": 70.90211866351085,
|
||||||
"t": 74.66666666666667,
|
"t": 76.99999977896756,
|
||||||
"r": 506.6666666666667,
|
"r": 504.8720079864275,
|
||||||
"b": 153.33333333333334,
|
"b": 152.70503335218433,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"confidence": 0.9715733528137207,
|
"confidence": 0.9715733528137207,
|
||||||
@ -107,20 +107,20 @@
|
|||||||
"a": 255
|
"a": 255
|
||||||
},
|
},
|
||||||
"rect": {
|
"rect": {
|
||||||
"r_x0": 71.33333333333333,
|
"r_x0": 73.34702132031646,
|
||||||
"r_y0": 99.33333333333333,
|
"r_y0": 97.99999977896755,
|
||||||
"r_x1": 506.6666666666667,
|
"r_x1": 503.64955224479564,
|
||||||
"r_y1": 99.33333333333333,
|
"r_y1": 97.99999977896755,
|
||||||
"r_x2": 506.6666666666667,
|
"r_x2": 503.64955224479564,
|
||||||
"r_y2": 74.66666666666667,
|
"r_y2": 76.99999977896756,
|
||||||
"r_x3": 71.33333333333333,
|
"r_x3": 73.34702132031646,
|
||||||
"r_y3": 74.66666666666667,
|
"r_y3": 76.99999977896756,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"text": "Docling bundles PDF document conversion to",
|
"text": "Docling bundles PDF document conversion to",
|
||||||
"orig": "Docling bundles PDF document conversion to",
|
"orig": "Docling bundles PDF document conversion to",
|
||||||
"text_direction": "left_to_right",
|
"text_direction": "left_to_right",
|
||||||
"confidence": 0.9555703127793324,
|
"confidence": 1.0,
|
||||||
"from_ocr": true
|
"from_ocr": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -132,20 +132,20 @@
|
|||||||
"a": 255
|
"a": 255
|
||||||
},
|
},
|
||||||
"rect": {
|
"rect": {
|
||||||
"r_x0": 69.0,
|
"r_x0": 70.90211866351085,
|
||||||
"r_y0": 126.66666666666667,
|
"r_y0": 124.83139551297342,
|
||||||
"r_x1": 506.6666666666667,
|
"r_x1": 504.8720079864275,
|
||||||
"r_y1": 126.66666666666667,
|
"r_y1": 124.83139551297342,
|
||||||
"r_x2": 506.6666666666667,
|
"r_x2": 504.8720079864275,
|
||||||
"r_y2": 100.66666666666667,
|
"r_y2": 102.66666671251768,
|
||||||
"r_x3": 69.0,
|
"r_x3": 70.90211866351085,
|
||||||
"r_y3": 100.66666666666667,
|
"r_y3": 102.66666671251768,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"text": "JSON and Markdown in an easy self contained",
|
"text": "JSON and Markdown in an easy self contained",
|
||||||
"orig": "JSON and Markdown in an easy self contained",
|
"orig": "JSON and Markdown in an easy self contained",
|
||||||
"text_direction": "left_to_right",
|
"text_direction": "left_to_right",
|
||||||
"confidence": 0.9741098171752292,
|
"confidence": 1.0,
|
||||||
"from_ocr": true
|
"from_ocr": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -157,20 +157,20 @@
|
|||||||
"a": 255
|
"a": 255
|
||||||
},
|
},
|
||||||
"rect": {
|
"rect": {
|
||||||
"r_x0": 70.66666666666667,
|
"r_x0": 73.10852522817731,
|
||||||
"r_y0": 153.33333333333334,
|
"r_y0": 152.70503335218433,
|
||||||
"r_x1": 154.0,
|
"r_x1": 153.04479435252625,
|
||||||
"r_y1": 153.33333333333334,
|
"r_y1": 152.70503335218433,
|
||||||
"r_x2": 154.0,
|
"r_x2": 153.04479435252625,
|
||||||
"r_y2": 128.66666666666666,
|
"r_y2": 130.00136157890958,
|
||||||
"r_x3": 70.66666666666667,
|
"r_x3": 73.10852522817731,
|
||||||
"r_y3": 128.66666666666666,
|
"r_y3": 130.00136157890958,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"text": "package",
|
"text": "package",
|
||||||
"orig": "package",
|
"orig": "package",
|
||||||
"text_direction": "left_to_right",
|
"text_direction": "left_to_right",
|
||||||
"confidence": 0.6702765056141881,
|
"confidence": 1.0,
|
||||||
"from_ocr": true
|
"from_ocr": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -195,10 +195,10 @@
|
|||||||
"id": 0,
|
"id": 0,
|
||||||
"label": "text",
|
"label": "text",
|
||||||
"bbox": {
|
"bbox": {
|
||||||
"l": 69.0,
|
"l": 70.90211866351085,
|
||||||
"t": 74.66666666666667,
|
"t": 76.99999977896756,
|
||||||
"r": 506.6666666666667,
|
"r": 504.8720079864275,
|
||||||
"b": 153.33333333333334,
|
"b": 152.70503335218433,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"confidence": 0.9715733528137207,
|
"confidence": 0.9715733528137207,
|
||||||
@ -212,20 +212,20 @@
|
|||||||
"a": 255
|
"a": 255
|
||||||
},
|
},
|
||||||
"rect": {
|
"rect": {
|
||||||
"r_x0": 71.33333333333333,
|
"r_x0": 73.34702132031646,
|
||||||
"r_y0": 99.33333333333333,
|
"r_y0": 97.99999977896755,
|
||||||
"r_x1": 506.6666666666667,
|
"r_x1": 503.64955224479564,
|
||||||
"r_y1": 99.33333333333333,
|
"r_y1": 97.99999977896755,
|
||||||
"r_x2": 506.6666666666667,
|
"r_x2": 503.64955224479564,
|
||||||
"r_y2": 74.66666666666667,
|
"r_y2": 76.99999977896756,
|
||||||
"r_x3": 71.33333333333333,
|
"r_x3": 73.34702132031646,
|
||||||
"r_y3": 74.66666666666667,
|
"r_y3": 76.99999977896756,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"text": "Docling bundles PDF document conversion to",
|
"text": "Docling bundles PDF document conversion to",
|
||||||
"orig": "Docling bundles PDF document conversion to",
|
"orig": "Docling bundles PDF document conversion to",
|
||||||
"text_direction": "left_to_right",
|
"text_direction": "left_to_right",
|
||||||
"confidence": 0.9555703127793324,
|
"confidence": 1.0,
|
||||||
"from_ocr": true
|
"from_ocr": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -237,20 +237,20 @@
|
|||||||
"a": 255
|
"a": 255
|
||||||
},
|
},
|
||||||
"rect": {
|
"rect": {
|
||||||
"r_x0": 69.0,
|
"r_x0": 70.90211866351085,
|
||||||
"r_y0": 126.66666666666667,
|
"r_y0": 124.83139551297342,
|
||||||
"r_x1": 506.6666666666667,
|
"r_x1": 504.8720079864275,
|
||||||
"r_y1": 126.66666666666667,
|
"r_y1": 124.83139551297342,
|
||||||
"r_x2": 506.6666666666667,
|
"r_x2": 504.8720079864275,
|
||||||
"r_y2": 100.66666666666667,
|
"r_y2": 102.66666671251768,
|
||||||
"r_x3": 69.0,
|
"r_x3": 70.90211866351085,
|
||||||
"r_y3": 100.66666666666667,
|
"r_y3": 102.66666671251768,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"text": "JSON and Markdown in an easy self contained",
|
"text": "JSON and Markdown in an easy self contained",
|
||||||
"orig": "JSON and Markdown in an easy self contained",
|
"orig": "JSON and Markdown in an easy self contained",
|
||||||
"text_direction": "left_to_right",
|
"text_direction": "left_to_right",
|
||||||
"confidence": 0.9741098171752292,
|
"confidence": 1.0,
|
||||||
"from_ocr": true
|
"from_ocr": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -262,20 +262,20 @@
|
|||||||
"a": 255
|
"a": 255
|
||||||
},
|
},
|
||||||
"rect": {
|
"rect": {
|
||||||
"r_x0": 70.66666666666667,
|
"r_x0": 73.10852522817731,
|
||||||
"r_y0": 153.33333333333334,
|
"r_y0": 152.70503335218433,
|
||||||
"r_x1": 154.0,
|
"r_x1": 153.04479435252625,
|
||||||
"r_y1": 153.33333333333334,
|
"r_y1": 152.70503335218433,
|
||||||
"r_x2": 154.0,
|
"r_x2": 153.04479435252625,
|
||||||
"r_y2": 128.66666666666666,
|
"r_y2": 130.00136157890958,
|
||||||
"r_x3": 70.66666666666667,
|
"r_x3": 73.10852522817731,
|
||||||
"r_y3": 128.66666666666666,
|
"r_y3": 130.00136157890958,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"text": "package",
|
"text": "package",
|
||||||
"orig": "package",
|
"orig": "package",
|
||||||
"text_direction": "left_to_right",
|
"text_direction": "left_to_right",
|
||||||
"confidence": 0.6702765056141881,
|
"confidence": 1.0,
|
||||||
"from_ocr": true
|
"from_ocr": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -293,10 +293,10 @@
|
|||||||
"id": 0,
|
"id": 0,
|
||||||
"label": "text",
|
"label": "text",
|
||||||
"bbox": {
|
"bbox": {
|
||||||
"l": 69.0,
|
"l": 70.90211866351085,
|
||||||
"t": 74.66666666666667,
|
"t": 76.99999977896756,
|
||||||
"r": 506.6666666666667,
|
"r": 504.8720079864275,
|
||||||
"b": 153.33333333333334,
|
"b": 152.70503335218433,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"confidence": 0.9715733528137207,
|
"confidence": 0.9715733528137207,
|
||||||
@ -310,20 +310,20 @@
|
|||||||
"a": 255
|
"a": 255
|
||||||
},
|
},
|
||||||
"rect": {
|
"rect": {
|
||||||
"r_x0": 71.33333333333333,
|
"r_x0": 73.34702132031646,
|
||||||
"r_y0": 99.33333333333333,
|
"r_y0": 97.99999977896755,
|
||||||
"r_x1": 506.6666666666667,
|
"r_x1": 503.64955224479564,
|
||||||
"r_y1": 99.33333333333333,
|
"r_y1": 97.99999977896755,
|
||||||
"r_x2": 506.6666666666667,
|
"r_x2": 503.64955224479564,
|
||||||
"r_y2": 74.66666666666667,
|
"r_y2": 76.99999977896756,
|
||||||
"r_x3": 71.33333333333333,
|
"r_x3": 73.34702132031646,
|
||||||
"r_y3": 74.66666666666667,
|
"r_y3": 76.99999977896756,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"text": "Docling bundles PDF document conversion to",
|
"text": "Docling bundles PDF document conversion to",
|
||||||
"orig": "Docling bundles PDF document conversion to",
|
"orig": "Docling bundles PDF document conversion to",
|
||||||
"text_direction": "left_to_right",
|
"text_direction": "left_to_right",
|
||||||
"confidence": 0.9555703127793324,
|
"confidence": 1.0,
|
||||||
"from_ocr": true
|
"from_ocr": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -335,20 +335,20 @@
|
|||||||
"a": 255
|
"a": 255
|
||||||
},
|
},
|
||||||
"rect": {
|
"rect": {
|
||||||
"r_x0": 69.0,
|
"r_x0": 70.90211866351085,
|
||||||
"r_y0": 126.66666666666667,
|
"r_y0": 124.83139551297342,
|
||||||
"r_x1": 506.6666666666667,
|
"r_x1": 504.8720079864275,
|
||||||
"r_y1": 126.66666666666667,
|
"r_y1": 124.83139551297342,
|
||||||
"r_x2": 506.6666666666667,
|
"r_x2": 504.8720079864275,
|
||||||
"r_y2": 100.66666666666667,
|
"r_y2": 102.66666671251768,
|
||||||
"r_x3": 69.0,
|
"r_x3": 70.90211866351085,
|
||||||
"r_y3": 100.66666666666667,
|
"r_y3": 102.66666671251768,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"text": "JSON and Markdown in an easy self contained",
|
"text": "JSON and Markdown in an easy self contained",
|
||||||
"orig": "JSON and Markdown in an easy self contained",
|
"orig": "JSON and Markdown in an easy self contained",
|
||||||
"text_direction": "left_to_right",
|
"text_direction": "left_to_right",
|
||||||
"confidence": 0.9741098171752292,
|
"confidence": 1.0,
|
||||||
"from_ocr": true
|
"from_ocr": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -360,20 +360,20 @@
|
|||||||
"a": 255
|
"a": 255
|
||||||
},
|
},
|
||||||
"rect": {
|
"rect": {
|
||||||
"r_x0": 70.66666666666667,
|
"r_x0": 73.10852522817731,
|
||||||
"r_y0": 153.33333333333334,
|
"r_y0": 152.70503335218433,
|
||||||
"r_x1": 154.0,
|
"r_x1": 153.04479435252625,
|
||||||
"r_y1": 153.33333333333334,
|
"r_y1": 152.70503335218433,
|
||||||
"r_x2": 154.0,
|
"r_x2": 153.04479435252625,
|
||||||
"r_y2": 128.66666666666666,
|
"r_y2": 130.00136157890958,
|
||||||
"r_x3": 70.66666666666667,
|
"r_x3": 73.10852522817731,
|
||||||
"r_y3": 128.66666666666666,
|
"r_y3": 130.00136157890958,
|
||||||
"coord_origin": "TOPLEFT"
|
"coord_origin": "TOPLEFT"
|
||||||
},
|
},
|
||||||
"text": "package",
|
"text": "package",
|
||||||
"orig": "package",
|
"orig": "package",
|
||||||
"text_direction": "left_to_right",
|
"text_direction": "left_to_right",
|
||||||
"confidence": 0.6702765056141881,
|
"confidence": 1.0,
|
||||||
"from_ocr": true
|
"from_ocr": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
3
tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.doctags.txt
vendored
Normal file
3
tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.doctags.txt
vendored
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
<document>
|
||||||
|
<paragraph><location><page_1><loc_16><loc_12><loc_18><loc_26></location>package</paragraph>
|
||||||
|
</document>
|
1
tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.json
vendored
Normal file
1
tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.json
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test_rotated.pdf", "filename-prov": null, "document-hash": "4a282813d93824eaa9bc2a0b2a0d6d626ecc8f5f380bd1320e2dd3e8e53c2ba6", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "f8a4dc72d8b159f69d0bc968b97f3fb9e0ac59dcb3113492432755835935d9b3", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [131.21306574279092, 74.12495603322407, 152.19606490864376, 154.19400205373182], "page": 1, "span": [0, 7], "__ref_s3_data": null}], "text": "package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 595.201171875, "page": 1, "width": 841.9216918945312}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
|
1
tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.md
vendored
Normal file
1
tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated.md
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
package
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user