merged with main

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

This commit is contained in: fc61258273

CHANGELOG.md (23 changes)
@@ -1,3 +1,26 @@
+## [v2.32.0](https://github.com/docling-project/docling/releases/tag/v2.32.0) - 2025-05-14
+
+### Feature
+
+* Improve parallelization for remote services API calls ([#1548](https://github.com/docling-project/docling/issues/1548)) ([`3a04f2a`](https://github.com/docling-project/docling/commit/3a04f2a367e32913f91faa2325f928b85112e632))
+* Support image/webp file type ([#1415](https://github.com/docling-project/docling/issues/1415)) ([`12dab0a`](https://github.com/docling-project/docling/commit/12dab0a1e8d181d99e4711ffdbbc33d158234fb4))
+
+### Fix
+
+* **ocr:** Orig field in TesseractOcrCliModel as str ([#1553](https://github.com/docling-project/docling/issues/1553)) ([`9f8b479`](https://github.com/docling-project/docling/commit/9f8b479f17bbfaf79c3c897980ad15742ec86568))
+* **settings:** Fix nested settings load via environment variables ([#1551](https://github.com/docling-project/docling/issues/1551)) ([`2efb7a7`](https://github.com/docling-project/docling/commit/2efb7a7c06a8e51516cc9b93e5dbcdea69f562fa))
+
+### Documentation
+
+* Add advanced chunking & serialization example ([#1589](https://github.com/docling-project/docling/issues/1589)) ([`9f28abf`](https://github.com/docling-project/docling/commit/9f28abf0610560645b40352dfdfc3525fa86c28d))
+
+## [v2.31.2](https://github.com/docling-project/docling/releases/tag/v2.31.2) - 2025-05-13
+
+### Fix
+
+* AsciiDoc header identification (#1562) ([#1563](https://github.com/docling-project/docling/issues/1563)) ([`4046d0b`](https://github.com/docling-project/docling/commit/4046d0b2f38254679de5fc78aaf2fe630d6bb61c))
+* Restrict click version and update lock file ([#1582](https://github.com/docling-project/docling/issues/1582)) ([`8baa85a`](https://github.com/docling-project/docling/commit/8baa85a49d3a456d198c52aac8e0b4ac70c92e72))
+
 ## [v2.31.1](https://github.com/docling-project/docling/releases/tag/v2.31.1) - 2025-05-12
 
 ### Fix
@@ -287,7 +287,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
 
     # ========= Section headers
     def _is_section_header(self, line):
-        return re.match(r"^==+", line)
+        return re.match(r"^==+\s+", line)
 
     def _parse_section_header(self, line):
        match = re.match(r"^(=+)\s+(.*)", line)
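A minimal sketch of what the tightened regex changes: a line must now have whitespace after the run of `=` characters to count as a section header, so inline AsciiDoc constructs like `==text==` are no longer misclassified.

```python
import re

def is_section_header_old(line):
    return re.match(r"^==+", line)

def is_section_header_new(line):
    return re.match(r"^==+\s+", line)

print(bool(is_section_header_old("== Introduction")))  # True
print(bool(is_section_header_new("== Introduction")))  # True
print(bool(is_section_header_old("==no-space==")))     # True (false positive)
print(bool(is_section_header_new("==no-space==")))     # False (fixed)
```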
@@ -90,6 +90,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
         "image/tiff",
         "image/gif",
         "image/bmp",
+        "image/webp",
     ],
     InputFormat.PDF: ["application/pdf"],
     InputFormat.ASCIIDOC: ["text/asciidoc"],
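Once the MIME type is registered, WebP input flows through the standard converter; a minimal usage sketch (the file name is hypothetical):

```python
from docling.document_converter import DocumentConverter

converter = DocumentConverter()
result = converter.convert("page_scan.webp")  # hypothetical local WebP file
print(result.document.export_to_markdown())
```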
@@ -229,6 +229,7 @@ class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
     headers: Dict[str, str] = {}
     params: Dict[str, Any] = {}
     timeout: float = 20
+    concurrency: int = 1
 
     prompt: str = "Describe this image in a few sentences."
     provenance: str = ""
@@ -56,13 +56,15 @@ class DebugSettings(BaseModel):
 
 
 class AppSettings(BaseSettings):
-    model_config = SettingsConfigDict(env_prefix="DOCLING_", env_nested_delimiter="_")
+    model_config = SettingsConfigDict(
+        env_prefix="DOCLING_", env_nested_delimiter="_", env_nested_max_split=1
+    )
 
-    perf: BatchConcurrencySettings
-    debug: DebugSettings
+    perf: BatchConcurrencySettings = BatchConcurrencySettings()
+    debug: DebugSettings = DebugSettings()
 
     cache_dir: Path = Path.home() / ".cache" / "docling"
     artifacts_path: Optional[Path] = None
 
 
-settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())
+settings = AppSettings()
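A sketch of the behavior this fixes, assuming `page_batch_size` is one of the nested `perf` fields: with `env_nested_delimiter="_"` alone, `DOCLING_PERF_PAGE_BATCH_SIZE` is split at every underscore and never resolves to a nested key, while `env_nested_max_split=1` splits only once past the prefix, into `("perf", "page_batch_size")`.

```python
import os

# Hypothetical value; must be set before the settings module is imported,
# since `settings` is instantiated at import time.
os.environ["DOCLING_PERF_PAGE_BATCH_SIZE"] = "8"

from docling.datamodel.settings import settings

# With env_nested_max_split=1 the variable resolves to perf.page_batch_size.
print(settings.perf.page_batch_size)  # expected: 8
```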
@@ -1,4 +1,5 @@
 from collections.abc import Iterable
+from concurrent.futures import ThreadPoolExecutor
 
 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
@@ -27,6 +28,7 @@ class ApiVlmModel(BasePageModel):
         )
 
         self.timeout = self.vlm_options.timeout
+        self.concurrency = self.vlm_options.concurrency
         self.prompt_content = (
             f"This is a page from a document.\n{self.vlm_options.prompt}"
         )
@@ -38,10 +40,10 @@ class ApiVlmModel(BasePageModel):
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
-        for page in page_batch:
+        def _vlm_request(page):
             assert page._backend is not None
             if not page._backend.is_valid():
-                yield page
+                return page
             else:
                 with TimeRecorder(conv_res, "vlm"):
                     assert page.size is not None
@@ -63,4 +65,7 @@ class ApiVlmModel(BasePageModel):
 
             page.predictions.vlm_response = VlmPrediction(text=page_tags)
 
-            yield page
+            return page
+
+        with ThreadPoolExecutor(max_workers=self.concurrency) as executor:
+            yield from executor.map(_vlm_request, page_batch)
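The refactor pattern here (and in the picture-description model below) is the same: a sequential `for ... yield` loop becomes a worker function mapped over a thread pool, and since `executor.map` preserves input order, downstream consumers still see pages in sequence. A self-contained sketch of the pattern, with illustrative names rather than docling API:

```python
from concurrent.futures import ThreadPoolExecutor
import time

def slow_remote_call(item: int) -> int:
    time.sleep(0.1)  # stand-in for a remote API round trip
    return item * 2

def process(items, concurrency: int = 4):
    # Worker-function-plus-map replacement for `for item in items: yield ...`:
    # up to `concurrency` requests are in flight at once, while map() keeps
    # results in the original input order.
    with ThreadPoolExecutor(max_workers=concurrency) as executor:
        yield from executor.map(slow_remote_call, items)

print(list(process(range(8))))  # [0, 2, 4, 6, 8, 10, 12, 14]
```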
@@ -1,4 +1,5 @@
 from collections.abc import Iterable
+from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
 from typing import Optional, Type, Union
 
@@ -37,6 +38,7 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
             accelerator_options=accelerator_options,
         )
         self.options: PictureDescriptionApiOptions
+        self.concurrency = self.options.concurrency
 
         if self.enabled:
             if not enable_remote_services:
@@ -48,8 +50,8 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
     def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
         # Note: technically we could make a batch request here,
         # but not all APIs will allow for it. For example, vllm won't allow more than 1.
-        for image in images:
-            yield api_image_request(
+        def _api_request(image):
+            return api_image_request(
                 image=image,
                 prompt=self.options.prompt,
                 url=self.options.url,
@@ -57,3 +59,6 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
                 headers=self.options.headers,
                 **self.options.params,
             )
+
+        with ThreadPoolExecutor(max_workers=self.concurrency) as executor:
+            yield from executor.map(_api_request, images)
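To benefit from the new parallelism, callers only need to raise `concurrency` in the options; a hedged usage sketch (the endpoint URL is a placeholder):

```python
from docling.datamodel.pipeline_options import PictureDescriptionApiOptions

options = PictureDescriptionApiOptions(
    url="http://localhost:8000/v1/chat/completions",  # placeholder endpoint
    prompt="Describe this image in a few sentences.",
    timeout=20,
    concurrency=4,  # up to 4 picture-description requests in flight
)
```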
@@ -249,7 +249,7 @@ class TesseractOcrCliModel(BaseOcrModel):
                 cell = TextCell(
                     index=ix,
                     text=str(text),
-                    orig=text,
+                    orig=str(text),
                     from_ocr=True,
                     confidence=conf / 100.0,
                     rect=BoundingRectangle.from_bounding_box(
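A sketch of the failure mode this guards against, assuming the tesseract CLI's TSV output is parsed with pandas: a purely numeric recognized token is inferred as an integer rather than a string, so `orig=text` would hand the `TextCell` model a non-str value.

```python
from io import StringIO

import pandas as pd

# A purely numeric token in a TSV "text" column is inferred as int64 by
# pandas; str(text) restores the type the orig field expects.
df = pd.read_csv(StringIO("text\n1234\n"), sep="\t")
text = df.iloc[0]["text"]
print(type(text))       # <class 'numpy.int64'>
print(type(str(text)))  # <class 'str'>
```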
@@ -71,7 +71,10 @@ tokens), &
 chunks with same headings & captions) — users can opt out of this step via param
 `merge_peers` (by default `True`)
 
-👉 Example: see [here](../examples/hybrid_chunking.ipynb).
+👉 Usage examples:
+
+- [Hybrid chunking](../examples/hybrid_chunking.ipynb)
+- [Advanced chunking & serialization](../examples/advanced_chunking_and_serialization.ipynb)
 
 ## Hierarchical Chunker
 
docs/examples/advanced_chunking_and_serialization.ipynb (new file, 559 lines)

@@ -0,0 +1,559 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Advanced chunking & serialization"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Overview"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In this notebook we show how to customize the serialization strategies that come into\n",
    "play during chunking."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We will work with a document that contains some [picture annotations](../pictures_description):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from docling_core.types.doc.document import DoclingDocument\n",
    "\n",
    "SOURCE = \"https://github.com/docling-project/docling/raw/refs/heads/main/docs/examples/data/2408.09869v3_enriched.json\"\n",
    "\n",
    "doc = DoclingDocument.load_from_json(SOURCE)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Below we define the chunker (for more details check out [Hybrid Chunking](../hybrid_chunking)):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from docling_core.transforms.chunker.hybrid_chunker import HybridChunker\n",
    "from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer\n",
    "from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer\n",
    "from transformers import AutoTokenizer\n",
    "\n",
    "EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
    "\n",
    "tokenizer: BaseTokenizer = HuggingFaceTokenizer(\n",
    "    tokenizer=AutoTokenizer.from_pretrained(EMBED_MODEL_ID),\n",
    ")\n",
    "chunker = HybridChunker(tokenizer=tokenizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tokenizer.get_max_tokens()=512\n"
     ]
    }
   ],
   "source": [
    "print(f\"{tokenizer.get_max_tokens()=}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Defining some helper methods:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import Iterable, Optional\n",
    "\n",
    "from docling_core.transforms.chunker.base import BaseChunk\n",
    "from docling_core.transforms.chunker.hierarchical_chunker import DocChunk\n",
    "from docling_core.types.doc.labels import DocItemLabel\n",
    "from rich.console import Console\n",
    "from rich.panel import Panel\n",
    "\n",
    "console = Console(\n",
    "    width=200,  # for getting Markdown tables rendered nicely\n",
    ")\n",
    "\n",
    "\n",
    "def find_n_th_chunk_with_label(\n",
    "    iter: Iterable[BaseChunk], n: int, label: DocItemLabel\n",
    ") -> Optional[DocChunk]:\n",
    "    num_found = -1\n",
    "    for i, chunk in enumerate(iter):\n",
    "        doc_chunk = DocChunk.model_validate(chunk)\n",
    "        for it in doc_chunk.meta.doc_items:\n",
    "            if it.label == label:\n",
    "                num_found += 1\n",
    "                if num_found == n:\n",
    "                    return i, chunk\n",
    "    return None, None\n",
    "\n",
    "\n",
    "def print_chunk(chunks, chunk_pos):\n",
    "    chunk = chunks[chunk_pos]\n",
    "    ctx_text = chunker.contextualize(chunk=chunk)\n",
    "    num_tokens = tokenizer.count_tokens(text=ctx_text)\n",
    "    doc_items_refs = [it.self_ref for it in chunk.meta.doc_items]\n",
    "    title = f\"{chunk_pos=} {num_tokens=} {doc_items_refs=}\"\n",
    "    console.print(Panel(ctx_text, title=title))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Table serialization"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Using the default strategy"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Below we inspect the first chunk containing a table — using the default serialization strategy:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Token indices sequence length is longer than the specified maximum sequence length for this model (652 > 512). Running this sequence through the model will result in indexing errors\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">╭────────────────────────────────────────────────────────────── chunk_pos=13 num_tokens=426 doc_items_refs=['#/texts/72', '#/tables/0'] ───────────────────────────────────────────────────────────────╮\n",
       "│ Docling Technical Report                                                                                                                                                                              │\n",
       "│ 4 Performance                                                                                                                                                                                         │\n",
       "│ Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution │\n",
       "│ (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads.           │\n",
       "│                                                                                                                                                                                                       │\n",
       "│ Apple M3 Max, Thread budget. = 4. Apple M3 Max, native backend.TTS = 177 s 167 s. Apple M3 Max, native backend.Pages/s = 1.27 1.34. Apple M3 Max, native backend.Mem = 6.20 GB. Apple M3 Max,        │\n",
       "│ pypdfium backend.TTS = 103 s 92 s. Apple M3 Max, pypdfium backend.Pages/s = 2.18 2.45. Apple M3 Max, pypdfium backend.Mem = 2.56 GB. (16 cores) Intel(R) Xeon E5-2690, Thread budget. = 16 4 16. (16 │\n",
       "│ cores) Intel(R) Xeon E5-2690, native backend.TTS = 375 s 244 s. (16 cores) Intel(R) Xeon E5-2690, native backend.Pages/s = 0.60 0.92. (16 cores) Intel(R) Xeon E5-2690, native backend.Mem = 6.16    │\n",
       "│ GB. (16 cores) Intel(R) Xeon E5-2690, pypdfium backend.TTS = 239 s 143 s. (16 cores) Intel(R) Xeon E5-2690, pypdfium backend.Pages/s = 0.94 1.57. (16 cores) Intel(R) Xeon E5-2690, pypdfium         │\n",
       "│ backend.Mem = 2.42 GB                                                                                                                                                                                 │\n",
       "╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
       "</pre>\n"
      ],
      "text/plain": [
       "╭────────────────────────────────────────────────────────────── chunk_pos=13 num_tokens=426 doc_items_refs=['#/texts/72', '#/tables/0'] ───────────────────────────────────────────────────────────────╮\n",
       "│ Docling Technical Report                                                                                                                                                                              │\n",
       "│ 4 Performance                                                                                                                                                                                         │\n",
       "│ Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution │\n",
       "│ (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads.           │\n",
       "│                                                                                                                                                                                                       │\n",
       "│ Apple M3 Max, Thread budget. = 4. Apple M3 Max, native backend.TTS = 177 s 167 s. Apple M3 Max, native backend.Pages/s = 1.27 1.34. Apple M3 Max, native backend.Mem = 6.20 GB. Apple M3 Max,        │\n",
       "│ pypdfium backend.TTS = 103 s 92 s. Apple M3 Max, pypdfium backend.Pages/s = 2.18 2.45. Apple M3 Max, pypdfium backend.Mem = 2.56 GB. (16 cores) Intel(R) Xeon E5-2690, Thread budget. = 16 4 16. (16 │\n",
       "│ cores) Intel(R) Xeon E5-2690, native backend.TTS = 375 s 244 s. (16 cores) Intel(R) Xeon E5-2690, native backend.Pages/s = 0.60 0.92. (16 cores) Intel(R) Xeon E5-2690, native backend.Mem = 6.16    │\n",
       "│ GB. (16 cores) Intel(R) Xeon E5-2690, pypdfium backend.TTS = 239 s 143 s. (16 cores) Intel(R) Xeon E5-2690, pypdfium backend.Pages/s = 0.94 1.57. (16 cores) Intel(R) Xeon E5-2690, pypdfium         │\n",
       "│ backend.Mem = 2.42 GB                                                                                                                                                                                 │\n",
       "╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "chunker = HybridChunker(tokenizer=tokenizer)\n",
    "\n",
    "chunk_iter = chunker.chunk(dl_doc=doc)\n",
    "\n",
    "chunks = list(chunk_iter)\n",
    "i, chunk = find_n_th_chunk_with_label(chunks, n=0, label=DocItemLabel.TABLE)\n",
    "print_chunk(\n",
    "    chunks=chunks,\n",
    "    chunk_pos=i,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-info\">\n",
    "  <strong>INFO</strong>: As you see above, using the <code>HybridChunker</code> can sometimes lead to a warning from the transformers library, however this is a \"false alarm\" — for details check <a href=\"https://docling-project.github.io/docling/faq/#hybridchunker-triggers-warning-token-indices-sequence-length-is-longer-than-the-specified-maximum-sequence-length-for-this-model\">here</a>.\n",
    "</div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Configuring a different strategy"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can configure a different serialization strategy. In the example below, we specify a different table serializer that serializes tables to Markdown instead of the triplet notation used by default:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">╭────────────────────────────────────────────────────────────── chunk_pos=13 num_tokens=431 doc_items_refs=['#/texts/72', '#/tables/0'] ───────────────────────────────────────────────────────────────╮\n",
       "│ Docling Technical Report                                                                                                                                                                              │\n",
       "│ 4 Performance                                                                                                                                                                                         │\n",
       "│ Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution │\n",
       "│ (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads.           │\n",
       "│                                                                                                                                                                                                       │\n",
       "│ | CPU                              | Thread budget   | native backend   | native backend   | native backend   | pypdfium backend   | pypdfium backend   | pypdfium backend   |                       │\n",
       "│ |----------------------------------|-----------------|------------------|------------------|------------------|--------------------|--------------------|--------------------|                      │\n",
       "│ |                                  |                 | TTS              | Pages/s          | Mem              | TTS                | Pages/s            | Mem                |                      │\n",
       "│ | Apple M3 Max                     | 4               | 177 s 167 s      | 1.27 1.34        | 6.20 GB          | 103 s 92 s         | 2.18 2.45          | 2.56 GB            |                      │\n",
       "│ | (16 cores) Intel(R) Xeon E5-2690 | 16 4 16         | 375 s 244 s      | 0.60 0.92        | 6.16 GB          | 239 s 143 s        | 0.94 1.57          | 2.42 GB            |                      │\n",
       "╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
       "</pre>\n"
      ],
      "text/plain": [
       "╭────────────────────────────────────────────────────────────── chunk_pos=13 num_tokens=431 doc_items_refs=['#/texts/72', '#/tables/0'] ───────────────────────────────────────────────────────────────╮\n",
       "│ Docling Technical Report                                                                                                                                                                              │\n",
       "│ 4 Performance                                                                                                                                                                                         │\n",
       "│ Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution │\n",
       "│ (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads.           │\n",
       "│                                                                                                                                                                                                       │\n",
       "│ | CPU                              | Thread budget   | native backend   | native backend   | native backend   | pypdfium backend   | pypdfium backend   | pypdfium backend   |                       │\n",
       "│ |----------------------------------|-----------------|------------------|------------------|------------------|--------------------|--------------------|--------------------|                      │\n",
       "│ |                                  |                 | TTS              | Pages/s          | Mem              | TTS                | Pages/s            | Mem                |                      │\n",
       "│ | Apple M3 Max                     | 4               | 177 s 167 s      | 1.27 1.34        | 6.20 GB          | 103 s 92 s         | 2.18 2.45          | 2.56 GB            |                      │\n",
       "│ | (16 cores) Intel(R) Xeon E5-2690 | 16 4 16         | 375 s 244 s      | 0.60 0.92        | 6.16 GB          | 239 s 143 s        | 0.94 1.57          | 2.42 GB            |                      │\n",
       "╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from docling_core.transforms.chunker.hierarchical_chunker import (\n",
    "    ChunkingDocSerializer,\n",
    "    ChunkingSerializerProvider,\n",
    ")\n",
    "from docling_core.transforms.serializer.markdown import MarkdownTableSerializer\n",
    "\n",
    "\n",
    "class MDTableSerializerProvider(ChunkingSerializerProvider):\n",
    "    def get_serializer(self, doc):\n",
    "        return ChunkingDocSerializer(\n",
    "            doc=doc,\n",
    "            table_serializer=MarkdownTableSerializer(),  # configuring a different table serializer\n",
    "        )\n",
    "\n",
    "\n",
    "chunker = HybridChunker(\n",
    "    tokenizer=tokenizer,\n",
    "    serializer_provider=MDTableSerializerProvider(),\n",
    ")\n",
    "\n",
    "chunk_iter = chunker.chunk(dl_doc=doc)\n",
    "\n",
    "chunks = list(chunk_iter)\n",
    "i, chunk = find_n_th_chunk_with_label(chunks, n=0, label=DocItemLabel.TABLE)\n",
    "print_chunk(\n",
    "    chunks=chunks,\n",
    "    chunk_pos=i,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Picture serialization"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Using the default strategy"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Below we inspect the first chunk containing a picture.\n",
    "\n",
    "Even when using the default strategy, we can modify the relevant parameters, e.g. which placeholder is used for pictures:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">╭───────────────────────────────────────────────── chunk_pos=0 num_tokens=117 doc_items_refs=['#/pictures/0', '#/texts/2', '#/texts/3', '#/texts/4'] ──────────────────────────────────────────────────╮\n",
       "│ Docling Technical Report                                                                                                                                                                              │\n",
       "│ &lt;!-- image --&gt;                                                                                                                                                                                        │\n",
       "│ Version 1.0                                                                                                                                                                                           │\n",
       "│ Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta  │\n",
       "│ Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar                                                                                                   │\n",
       "│ AI4K Group, IBM Research R¨ uschlikon, Switzerland                                                                                                                                                    │\n",
       "╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
       "</pre>\n"
      ],
      "text/plain": [
       "╭───────────────────────────────────────────────── chunk_pos=0 num_tokens=117 doc_items_refs=['#/pictures/0', '#/texts/2', '#/texts/3', '#/texts/4'] ──────────────────────────────────────────────────╮\n",
       "│ Docling Technical Report                                                                                                                                                                              │\n",
       "│ <!-- image -->                                                                                                                                                                                        │\n",
       "│ Version 1.0                                                                                                                                                                                           │\n",
       "│ Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta  │\n",
       "│ Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar                                                                                                   │\n",
       "│ AI4K Group, IBM Research R¨ uschlikon, Switzerland                                                                                                                                                    │\n",
       "╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from docling_core.transforms.serializer.markdown import MarkdownParams\n",
    "\n",
    "\n",
    "class ImgPlaceholderSerializerProvider(ChunkingSerializerProvider):\n",
    "    def get_serializer(self, doc):\n",
    "        return ChunkingDocSerializer(\n",
    "            doc=doc,\n",
    "            params=MarkdownParams(\n",
    "                image_placeholder=\"<!-- image -->\",\n",
    "            ),\n",
    "        )\n",
    "\n",
    "\n",
    "chunker = HybridChunker(\n",
    "    tokenizer=tokenizer,\n",
    "    serializer_provider=ImgPlaceholderSerializerProvider(),\n",
    ")\n",
    "\n",
    "chunk_iter = chunker.chunk(dl_doc=doc)\n",
    "\n",
    "chunks = list(chunk_iter)\n",
    "i, chunk = find_n_th_chunk_with_label(chunks, n=0, label=DocItemLabel.PICTURE)\n",
    "print_chunk(\n",
    "    chunks=chunks,\n",
    "    chunk_pos=i,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Using a custom strategy"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Below we define and use our custom picture serialization strategy which leverages picture annotations:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import Any\n",
    "\n",
    "from docling_core.transforms.serializer.base import (\n",
    "    BaseDocSerializer,\n",
    "    SerializationResult,\n",
    ")\n",
    "from docling_core.transforms.serializer.common import create_ser_result\n",
    "from docling_core.transforms.serializer.markdown import MarkdownPictureSerializer\n",
    "from docling_core.types.doc.document import (\n",
    "    PictureClassificationData,\n",
    "    PictureDescriptionData,\n",
    "    PictureItem,\n",
    "    PictureMoleculeData,\n",
    ")\n",
    "from typing_extensions import override\n",
    "\n",
    "\n",
    "class AnnotationPictureSerializer(MarkdownPictureSerializer):\n",
    "    @override\n",
    "    def serialize(\n",
    "        self,\n",
    "        *,\n",
    "        item: PictureItem,\n",
    "        doc_serializer: BaseDocSerializer,\n",
    "        doc: DoclingDocument,\n",
    "        **kwargs: Any,\n",
    "    ) -> SerializationResult:\n",
    "        text_parts: list[str] = []\n",
    "        for annotation in item.annotations:\n",
    "            if isinstance(annotation, PictureClassificationData):\n",
    "                predicted_class = (\n",
    "                    annotation.predicted_classes[0].class_name\n",
    "                    if annotation.predicted_classes\n",
    "                    else None\n",
    "                )\n",
    "                if predicted_class is not None:\n",
    "                    text_parts.append(f\"Picture type: {predicted_class}\")\n",
    "            elif isinstance(annotation, PictureMoleculeData):\n",
    "                text_parts.append(f\"SMILES: {annotation.smi}\")\n",
    "            elif isinstance(annotation, PictureDescriptionData):\n",
    "                text_parts.append(f\"Picture description: {annotation.text}\")\n",
    "\n",
    "        text_res = \"\\n\".join(text_parts)\n",
    "        text_res = doc_serializer.post_process(text=text_res)\n",
    "        return create_ser_result(text=text_res, span_source=item)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">╭───────────────────────────────────────────────── chunk_pos=0 num_tokens=128 doc_items_refs=['#/pictures/0', '#/texts/2', '#/texts/3', '#/texts/4'] ──────────────────────────────────────────────────╮\n",
       "│ Docling Technical Report                                                                                                                                                                              │\n",
       "│ Picture description: In this image we can see a cartoon image of a duck holding a paper.                                                                                                              │\n",
       "│ Version 1.0                                                                                                                                                                                           │\n",
       "│ Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta  │\n",
       "│ Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar                                                                                                   │\n",
       "│ AI4K Group, IBM Research R¨ uschlikon, Switzerland                                                                                                                                                    │\n",
       "╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
       "</pre>\n"
      ],
      "text/plain": [
       "╭───────────────────────────────────────────────── chunk_pos=0 num_tokens=128 doc_items_refs=['#/pictures/0', '#/texts/2', '#/texts/3', '#/texts/4'] ──────────────────────────────────────────────────╮\n",
       "│ Docling Technical Report                                                                                                                                                                              │\n",
       "│ Picture description: In this image we can see a cartoon image of a duck holding a paper.                                                                                                              │\n",
       "│ Version 1.0                                                                                                                                                                                           │\n",
       "│ Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta  │\n",
       "│ Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar                                                                                                   │\n",
       "│ AI4K Group, IBM Research R¨ uschlikon, Switzerland                                                                                                                                                    │\n",
       "╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "class ImgAnnotationSerializerProvider(ChunkingSerializerProvider):\n",
    "    def get_serializer(self, doc: DoclingDocument):\n",
    "        return ChunkingDocSerializer(\n",
    "            doc=doc,\n",
    "            picture_serializer=AnnotationPictureSerializer(),  # configuring a different picture serializer\n",
    "        )\n",
    "\n",
    "\n",
    "chunker = HybridChunker(\n",
    "    tokenizer=tokenizer,\n",
    "    serializer_provider=ImgAnnotationSerializerProvider(),\n",
    ")\n",
    "\n",
    "chunk_iter = chunker.chunk(dl_doc=doc)\n",
    "\n",
    "chunks = list(chunk_iter)\n",
    "i, chunk = find_n_th_chunk_with_label(chunks, n=0, label=DocItemLabel.PICTURE)\n",
    "print_chunk(\n",
    "    chunks=chunks,\n",
    "    chunk_pos=i,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
docs/examples/data/2408.09869v3_enriched.json (new file, 45197 lines)
File diff suppressed because one or more lines are too long
@@ -410,23 +410,6 @@
     "\n",
     "    print()"
    ]
-   },
-   {
-    "cell_type": "markdown",
-    "metadata": {},
-    "source": [
-     "## Configuring serialization"
-    ]
-   },
-   {
-    "cell_type": "markdown",
-    "metadata": {},
-    "source": [
-     "We can additionally customize the serialization strategy via a user-provided\n",
-     "[serializer provider](../../concepts/serialization).\n",
-     "\n",
-     "For usage examples check out [this notebook](https://github.com/docling-project/docling-core/blob/main/examples/chunking_and_serialization.ipynb)."
-    ]
   }
  ],
  "metadata": {
@@ -1,7 +1,9 @@
 from docling.document_converter import DocumentConverter
 
 source = "https://arxiv.org/pdf/2408.09869"  # document per local path or URL
 
 converter = DocumentConverter()
-result = converter.convert(source)
-print(result.document.export_to_markdown())
+doc = converter.convert(source).document
+
+print(doc.export_to_markdown())
 # output: ## Docling Technical Report [...]"
@@ -14,7 +14,7 @@ Below you can find a listing of all supported input and output formats.
 | AsciiDoc | |
 | HTML, XHTML | |
 | CSV | |
-| PNG, JPEG, TIFF, BMP | Image formats |
+| PNG, JPEG, TIFF, BMP, WEBP | Image formats |
 
 Schema-specific support:
 
@@ -88,10 +88,10 @@ nav:
     - "Simple translation": examples/translate.py
     - examples/backend_csv.ipynb
     - examples/backend_xml_rag.ipynb
-  - 📤 Serialization:
+  - ✂️ Serialization & chunking:
     - examples/serialization.ipynb
-  - ✂️ Chunking:
     - examples/hybrid_chunking.ipynb
+    - examples/advanced_chunking_and_serialization.ipynb
   - 🤖 RAG with AI dev frameworks:
     - examples/rag_haystack.ipynb
     - examples/rag_langchain.ipynb
poetry.lock (generated, 115 changes)
@@ -683,7 +683,7 @@ description = "Composable command line interface toolkit"
 optional = false
 python-versions = ">=3.7"
 groups = ["main", "dev", "docs"]
-markers = "sys_platform != \"darwin\" and python_version == \"3.9\" or platform_machine == \"aarch64\" and python_version < \"3.10\" and platform_system == \"Linux\" or platform_machine != \"x86_64\" and python_version == \"3.9\" or sys_platform == \"darwin\" and platform_machine == \"x86_64\" and python_version < \"3.10\""
+markers = "platform_system == \"Linux\" and sys_platform == \"darwin\" and (platform_machine == \"aarch64\" or platform_machine == \"x86_64\") or platform_machine == \"aarch64\" and platform_system == \"Linux\" or platform_machine == \"x86_64\" and sys_platform == \"darwin\""
 files = [
     {file = "click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2"},
     {file = "click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a"},
@@ -692,22 +692,6 @@ files = [
 [package.dependencies]
 colorama = {version = "*", markers = "platform_system == \"Windows\""}
 
-[[package]]
-name = "click"
-version = "8.2.0"
-description = "Composable command line interface toolkit"
-optional = false
-python-versions = ">=3.10"
-groups = ["main", "dev", "docs"]
-markers = "python_version >= \"3.10\""
-files = [
-    {file = "click-8.2.0-py3-none-any.whl", hash = "sha256:6b303f0b2aa85f1cb4e5303078fadcbcd4e476f114fab9b5007005711839325c"},
-    {file = "click-8.2.0.tar.gz", hash = "sha256:f5452aeddd9988eefa20f90f05ab66f17fce1ee2a36907fd30b05bbb5953814d"},
-]
-
-[package.dependencies]
-colorama = {version = "*", markers = "platform_system == \"Windows\""}
-
 [[package]]
 name = "click-log"
 version = "0.4.0"
@@ -1108,15 +1092,15 @@ files = [
 
 [[package]]
 name = "docling-core"
-version = "2.30.0"
+version = "2.30.1"
 description = "A python library to define and validate data types in Docling."
 optional = false
 python-versions = "<4.0,>=3.9"
 groups = ["main"]
 markers = "platform_system == \"Linux\" and sys_platform == \"darwin\" and (platform_machine == \"aarch64\" or platform_machine == \"x86_64\") or platform_machine == \"aarch64\" and platform_system == \"Linux\" or platform_machine == \"x86_64\" and sys_platform == \"darwin\""
 files = [
-    {file = "docling_core-2.30.0-py3-none-any.whl", hash = "sha256:976648b95b44d1a0f5540b949a38aee284e9f078b4411385f8a31cd50b17573e"},
-    {file = "docling_core-2.30.0.tar.gz", hash = "sha256:5f064f80584803ae1e7b2818d6977ef2df7cd2e20168c7065443b07cc9de435f"},
+    {file = "docling_core-2.30.1-py3-none-any.whl", hash = "sha256:28f39a8745ef36fffc2de2b9e414e42e34e7b8e9610864daaae40fc74c61edef"},
+    {file = "docling_core-2.30.1.tar.gz", hash = "sha256:1620367f8fad976625bb381dce14d70cb50491a07b643f5025ec525df7713599"},
 ]
 
 [package.dependencies]
@@ -1758,28 +1742,6 @@ files = [
     {file = "h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1"},
 ]
 
-[[package]]
-name = "hf-xet"
-version = "1.1.0"
-description = ""
-optional = false
-python-versions = ">=3.8"
-groups = ["main", "examples", "lm"]
-markers = "platform_machine == \"x86_64\" or platform_machine == \"aarch64\" or platform_machine == \"amd64\" or platform_machine == \"arm64\" or platform_system == \"Linux\" and sys_platform == \"darwin\" and (platform_machine == \"aarch64\" or platform_machine == \"x86_64\") or python_version >= \"3.10\" and (platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"arm64\" or platform_machine == \"aarch64\")"
-files = [
-    {file = "hf_xet-1.1.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:0322c42551e275fcb7949c083a54a81b2898e50787c9aa74284fcb8d2c58c12c"},
-    {file = "hf_xet-1.1.0-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:667153a0304ac2debf2af95a8ff7687186f885b493f4cd16344869af270cd110"},
-    {file = "hf_xet-1.1.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:995eeffb119636ea617b96c7d7bf3c3f5ea8727fa57974574e25d700b8532d48"},
-    {file = "hf_xet-1.1.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:3aee847da362393331f515c4010d0aaa1c2669acfcca1f4b28946d6949cc0086"},
-    {file = "hf_xet-1.1.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:68c5813a6074aa36e12ef5983230e3b03148cce61e0fcdd294096493795565b4"},
-    {file = "hf_xet-1.1.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:4ee9222bf9274b1c198b88a929de0b5a49349c4962d89c5b3b2f0f7f47d9761c"},
-    {file = "hf_xet-1.1.0-cp37-abi3-win_amd64.whl", hash = "sha256:73153eab9abf3d6973b21e94a67ccba5d595c3e12feb8c0bf50be02964e7f126"},
-    {file = "hf_xet-1.1.0.tar.gz", hash = "sha256:a7c2a4c2b6eee9ce0a1a367a82b60d95ba634420ef1c250addad7aa4af419cf4"},
-]
-
-[package.extras]
-tests = ["pytest"]
-
 [[package]]
 name = "httpcore"
 version = "1.0.9"
@@ -1831,21 +1793,20 @@ zstd = ["zstandard (>=0.18.0)"]
 
 [[package]]
 name = "huggingface-hub"
-version = "0.31.1"
+version = "0.31.2"
 description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
 optional = false
 python-versions = ">=3.8.0"
 groups = ["main", "examples", "lm"]
 markers = "platform_system == \"Linux\" and sys_platform == \"darwin\" and (platform_machine == \"aarch64\" or platform_machine == \"x86_64\") or platform_machine == \"aarch64\" and platform_system == \"Linux\" or platform_machine == \"x86_64\" and sys_platform == \"darwin\""
 files = [
-    {file = "huggingface_hub-0.31.1-py3-none-any.whl", hash = "sha256:43f73124819b48b42d140cbc0d7a2e6bd15b2853b1b9d728d4d55ad1750cac5b"},
-    {file = "huggingface_hub-0.31.1.tar.gz", hash = "sha256:492bb5f545337aa9e2f59b75ef4c5f535a371e8958a6ce90af056387e67f1180"},
+    {file = "huggingface_hub-0.31.2-py3-none-any.whl", hash = "sha256:8138cd52aa2326b4429bb00a4a1ba8538346b7b8a808cdce30acb6f1f1bdaeec"},
+    {file = "huggingface_hub-0.31.2.tar.gz", hash = "sha256:7053561376ed7f6ffdaecf09cc54d70dc784ac6315fa4bb9b93e19662b029675"},
 ]
 
 [package.dependencies]
 filelock = "*"
 fsspec = ">=2023.5.0"
-hf-xet = {version = ">=1.1.0,<2.0.0", markers = "platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"arm64\" or platform_machine == \"aarch64\""}
 packaging = ">=20.9"
 pyyaml = ">=5.1"
 requests = "*"
@@ -1858,7 +1819,7 @@ cli = ["InquirerPy (==0.3.4)"]
 dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "libcst (==1.4.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.9.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"]
 fastai = ["fastai (>=2.4)", "fastcore (>=1.3.27)", "toml"]
 hf-transfer = ["hf-transfer (>=0.1.4)"]
-hf-xet = ["hf-xet (>=1.1.0,<2.0.0)"]
+hf-xet = ["hf-xet (>=1.1.1,<2.0.0)"]
 inference = ["aiohttp"]
 quality = ["libcst (==1.4.0)", "mypy (==1.5.1)", "ruff (>=0.9.0)"]
 tensorflow = ["graphviz", "pydot", "tensorflow"]
@@ -3239,15 +3200,15 @@ pygments = ">2.12.0"
 
 [[package]]
 name = "mkdocs-material"
-version = "9.6.13"
+version = "9.6.14"
 description = "Documentation that simply works"
 optional = false
 python-versions = ">=3.8"
 groups = ["docs"]
 markers = "platform_system == \"Linux\" and sys_platform == \"darwin\" and (platform_machine == \"aarch64\" or platform_machine == \"x86_64\") or platform_machine == \"aarch64\" and platform_system == \"Linux\" or platform_machine == \"x86_64\" and sys_platform == \"darwin\""
 files = [
-    {file = "mkdocs_material-9.6.13-py3-none-any.whl", hash = "sha256:3730730314e065f422cc04eacbc8c6084530de90f4654a1482472283a38e30d3"},
-    {file = "mkdocs_material-9.6.13.tar.gz", hash = "sha256:7bde7ebf33cfd687c1c86c08ed8f6470d9a5ba737bd89e7b3e5d9f94f8c72c16"},
+    {file = "mkdocs_material-9.6.14-py3-none-any.whl", hash = "sha256:3b9cee6d3688551bf7a8e8f41afda97a3c39a12f0325436d76c86706114b721b"},
+    {file = "mkdocs_material-9.6.14.tar.gz", hash = "sha256:39d795e90dce6b531387c255bd07e866e027828b7346d3eba5ac3de265053754"},
 ]
 
 [package.dependencies]
@@ -5176,21 +5137,21 @@ files = [
 
 [[package]]
 name = "protobuf"
-version = "6.30.2"
+version = "6.31.0"
 description = ""
 optional = false
 python-versions = ">=3.9"
 groups = ["main", "examples"]
 files = [
-    {file = "protobuf-6.30.2-cp310-abi3-win32.whl", hash = "sha256:b12ef7df7b9329886e66404bef5e9ce6a26b54069d7f7436a0853ccdeb91c103"},
-    {file = "protobuf-6.30.2-cp310-abi3-win_amd64.whl", hash = "sha256:7653c99774f73fe6b9301b87da52af0e69783a2e371e8b599b3e9cb4da4b12b9"},
-    {file = "protobuf-6.30.2-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:0eb523c550a66a09a0c20f86dd554afbf4d32b02af34ae53d93268c1f73bc65b"},
-    {file = "protobuf-6.30.2-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:50f32cc9fd9cb09c783ebc275611b4f19dfdfb68d1ee55d2f0c7fa040df96815"},
-    {file = "protobuf-6.30.2-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:4f6c687ae8efae6cf6093389a596548214467778146b7245e886f35e1485315d"},
-    {file = "protobuf-6.30.2-cp39-cp39-win32.whl", hash = "sha256:524afedc03b31b15586ca7f64d877a98b184f007180ce25183d1a5cb230ee72b"},
-    {file = "protobuf-6.30.2-cp39-cp39-win_amd64.whl", hash = "sha256:acec579c39c88bd8fbbacab1b8052c793efe83a0a5bd99db4a31423a25c0a0e2"},
-    {file = "protobuf-6.30.2-py3-none-any.whl", hash = "sha256:ae86b030e69a98e08c77beab574cbcb9fff6d031d57209f574a5aea1445f4b51"},
-    {file = "protobuf-6.30.2.tar.gz", hash = "sha256:35c859ae076d8c56054c25b59e5e59638d86545ed6e2b6efac6be0b6ea3ba048"},
+    {file = "protobuf-6.31.0-cp310-abi3-win32.whl", hash = "sha256:10bd62802dfa0588649740a59354090eaf54b8322f772fbdcca19bc78d27f0d6"},
+    {file = "protobuf-6.31.0-cp310-abi3-win_amd64.whl", hash = "sha256:3e987c99fd634be8347246a02123250f394ba20573c953de133dc8b2c107dd71"},
+    {file = "protobuf-6.31.0-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:2c812f0f96ceb6b514448cefeb1df54ec06dde456783f5099c0e2f8a0f2caa89"},
+    {file = "protobuf-6.31.0-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:67ce50195e4e584275623b8e6bc6d3d3dfd93924bf6116b86b3b8975ab9e4571"},
+    {file = "protobuf-6.31.0-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:5353e38844168a327acd2b2aa440044411cd8d1b6774d5701008bd1dba067c79"},
+    {file = "protobuf-6.31.0-cp39-cp39-win32.whl", hash = "sha256:96d8da25c83b11db5fe9e0376351ce25e7205e13224d939e097b6f82a72af824"},
+    {file = "protobuf-6.31.0-cp39-cp39-win_amd64.whl", hash = "sha256:00a873c06efdfb854145d9ded730b09cf57d206075c38132674093370e2edabb"},
+    {file = "protobuf-6.31.0-py3-none-any.whl", hash = "sha256:6ac2e82556e822c17a8d23aa1190bbc1d06efb9c261981da95c71c9da09e9e23"},
+    {file = "protobuf-6.31.0.tar.gz", hash = "sha256:314fab1a6a316469dc2dd46f993cbbe95c861ea6807da910becfe7475bc26ffe"},
 ]
 markers = {main = "extra == \"rapidocr\"", examples = "platform_system == \"Linux\" and sys_platform == \"darwin\" and (platform_machine == \"aarch64\" or platform_machine == \"x86_64\") or platform_machine == \"aarch64\" and platform_system == \"Linux\" or platform_machine == \"x86_64\" and sys_platform == \"darwin\""}
 
@@ -6278,15 +6239,15 @@ files = [
 
 [[package]]
 name = "pyyaml-env-tag"
-version = "1.0"
+version = "1.1"
 description = "A custom YAML tag for referencing environment variables in YAML files."
 optional = false
 python-versions = ">=3.9"
 groups = ["docs"]
 markers = "platform_system == \"Linux\" and sys_platform == \"darwin\" and (platform_machine == \"aarch64\" or platform_machine == \"x86_64\") or platform_machine == \"aarch64\" and platform_system == \"Linux\" or platform_machine == \"x86_64\" and sys_platform == \"darwin\""
 files = [
-    {file = "pyyaml_env_tag-1.0-py3-none-any.whl", hash = "sha256:37f081041b8dca44ed8eb931ce0056f97de17251450f0ed08773dc2bcaf9e683"},
-    {file = "pyyaml_env_tag-1.0.tar.gz", hash = "sha256:bc952534a872b583f66f916e2dd83e7a7b9087847f4afca6d9c957c48b258ed2"},
+    {file = "pyyaml_env_tag-1.1-py3-none-any.whl", hash = "sha256:17109e1a528561e32f026364712fee1264bc2ea6715120891174ed1b980d2e04"},
+    {file = "pyyaml_env_tag-1.1.tar.gz", hash = "sha256:2eb38b75a2d21ee0475d6d97ec19c63287a7e140231e4214969d0eac923cd7ff"},
 ]
 
 [package.dependencies]
@ -7176,14 +7137,14 @@ train = ["accelerate (>=0.20.3)", "datasets"]
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "setuptools"
|
name = "setuptools"
|
||||||
version = "80.4.0"
|
version = "80.7.1"
|
||||||
description = "Easily download, build, install, upgrade, and uninstall Python packages"
|
description = "Easily download, build, install, upgrade, and uninstall Python packages"
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.9"
|
python-versions = ">=3.9"
|
||||||
groups = ["main", "examples", "lm", "mac_intel"]
|
groups = ["main", "examples", "lm", "mac_intel"]
|
||||||
files = [
|
files = [
|
||||||
{file = "setuptools-80.4.0-py3-none-any.whl", hash = "sha256:6cdc8cb9a7d590b237dbe4493614a9b75d0559b888047c1f67d49ba50fc3edb2"},
|
{file = "setuptools-80.7.1-py3-none-any.whl", hash = "sha256:ca5cc1069b85dc23070a6628e6bcecb3292acac802399c7f8edc0100619f9009"},
|
||||||
{file = "setuptools-80.4.0.tar.gz", hash = "sha256:5a78f61820bc088c8e4add52932ae6b8cf423da2aff268c23f813cfbb13b4006"},
|
{file = "setuptools-80.7.1.tar.gz", hash = "sha256:f6ffc5f0142b1bd8d0ca94ee91b30c0ca862ffd50826da1ea85258a06fd94552"},
|
||||||
]
|
]
|
||||||
markers = {main = "sys_platform != \"darwin\" and platform_system == \"Linux\" and platform_machine == \"x86_64\" or sys_platform != \"darwin\" and python_version >= \"3.12\" or platform_machine != \"x86_64\" and python_version >= \"3.12\"", examples = "platform_system == \"Linux\" and sys_platform == \"darwin\" and (platform_machine == \"aarch64\" or platform_machine == \"x86_64\") or platform_machine == \"aarch64\" and platform_system == \"Linux\" or platform_machine == \"x86_64\" and sys_platform == \"darwin\"", lm = "sys_platform != \"darwin\" and platform_system == \"Linux\" and platform_machine == \"x86_64\" or sys_platform != \"darwin\" and python_version >= \"3.12\" or platform_machine != \"x86_64\" and python_version >= \"3.12\"", mac_intel = "sys_platform != \"darwin\" and platform_system == \"Linux\" and platform_machine == \"x86_64\" or sys_platform != \"darwin\" and python_version >= \"3.12\" or platform_machine != \"x86_64\" and python_version >= \"3.12\""}
|
markers = {main = "sys_platform != \"darwin\" and platform_system == \"Linux\" and platform_machine == \"x86_64\" or sys_platform != \"darwin\" and python_version >= \"3.12\" or platform_machine != \"x86_64\" and python_version >= \"3.12\"", examples = "platform_system == \"Linux\" and sys_platform == \"darwin\" and (platform_machine == \"aarch64\" or platform_machine == \"x86_64\") or platform_machine == \"aarch64\" and platform_system == \"Linux\" or platform_machine == \"x86_64\" and sys_platform == \"darwin\"", lm = "sys_platform != \"darwin\" and platform_system == \"Linux\" and platform_machine == \"x86_64\" or sys_platform != \"darwin\" and python_version >= \"3.12\" or platform_machine != \"x86_64\" and python_version >= \"3.12\"", mac_intel = "sys_platform != \"darwin\" and platform_system == \"Linux\" and platform_machine == \"x86_64\" or sys_platform != \"darwin\" and python_version >= \"3.12\" or platform_machine != \"x86_64\" and python_version >= \"3.12\""}
|
||||||
|
|
||||||
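The `markers` strings above are standard PEP 508 environment markers: per dependency group, they decide whether a wheel gets installed on a given platform. As a side note (not part of this commit), a marker string like the one on the setuptools entry can be evaluated with the `packaging` library; a minimal sketch:

```python
# Sketch only: evaluating a poetry.lock-style environment marker with the
# `packaging` library. The marker string is copied from the lock file above.
from packaging.markers import Marker

marker = Marker(
    'platform_system == "Linux" and sys_platform == "darwin" '
    'and (platform_machine == "aarch64" or platform_machine == "x86_64") '
    'or platform_machine == "aarch64" and platform_system == "Linux" '
    'or platform_machine == "x86_64" and sys_platform == "darwin"'
)

# evaluate() uses the running interpreter's environment by default; passing
# an explicit environment dict lets you test other platforms.
print(marker.evaluate({
    "platform_system": "Linux",
    "sys_platform": "linux",
    "platform_machine": "aarch64",
}))  # True: matches the "aarch64 and Linux" branch
```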
@@ -8287,19 +8248,19 @@ urllib3 = ">=1.26.0"

 [[package]]
 name = "typer"
-version = "0.15.3"
+version = "0.15.4"
 description = "Typer, build great CLIs. Easy to code. Based on Python type hints."
 optional = false
 python-versions = ">=3.7"
 groups = ["main"]
 markers = "platform_system == \"Linux\" and sys_platform == \"darwin\" and (platform_machine == \"aarch64\" or platform_machine == \"x86_64\") or platform_machine == \"aarch64\" and platform_system == \"Linux\" or platform_machine == \"x86_64\" and sys_platform == \"darwin\""
 files = [
-    {file = "typer-0.15.3-py3-none-any.whl", hash = "sha256:c86a65ad77ca531f03de08d1b9cb67cd09ad02ddddf4b34745b5008f43b239bd"},
+    {file = "typer-0.15.4-py3-none-any.whl", hash = "sha256:eb0651654dcdea706780c466cf06d8f174405a659ffff8f163cfbfee98c0e173"},
-    {file = "typer-0.15.3.tar.gz", hash = "sha256:818873625d0569653438316567861899f7e9972f2e6e0c16dab608345ced713c"},
+    {file = "typer-0.15.4.tar.gz", hash = "sha256:89507b104f9b6a0730354f27c39fae5b63ccd0c95b1ce1f1a6ba0cfd329997c3"},
 ]

 [package.dependencies]
-click = ">=8.0.0"
+click = ">=8.0.0,<8.2"
 rich = ">=10.11.0"
 shellingham = ">=1.3.0"
 typing-extensions = ">=3.7.4.3"
@@ -8332,15 +8293,15 @@ files = [

 [[package]]
 name = "types-requests"
-version = "2.32.0.20250328"
+version = "2.32.0.20250515"
 description = "Typing stubs for requests"
 optional = false
 python-versions = ">=3.9"
 groups = ["dev"]
 markers = "platform_system == \"Linux\" and sys_platform == \"darwin\" and (platform_machine == \"aarch64\" or platform_machine == \"x86_64\") or platform_machine == \"aarch64\" and platform_system == \"Linux\" or platform_machine == \"x86_64\" and sys_platform == \"darwin\""
 files = [
-    {file = "types_requests-2.32.0.20250328-py3-none-any.whl", hash = "sha256:72ff80f84b15eb3aa7a8e2625fffb6a93f2ad5a0c20215fc1dcfa61117bcb2a2"},
+    {file = "types_requests-2.32.0.20250515-py3-none-any.whl", hash = "sha256:f8eba93b3a892beee32643ff836993f15a785816acca21ea0ffa006f05ef0fb2"},
-    {file = "types_requests-2.32.0.20250328.tar.gz", hash = "sha256:c9e67228ea103bd811c96984fac36ed2ae8da87a36a633964a21f199d60baf32"},
+    {file = "types_requests-2.32.0.20250515.tar.gz", hash = "sha256:09c8b63c11318cb2460813871aaa48b671002e59fda67ca909e9883777787581"},
 ]

 [package.dependencies]
@@ -8348,15 +8309,15 @@ urllib3 = ">=2"

 [[package]]
 name = "types-tqdm"
-version = "4.67.0.20250417"
+version = "4.67.0.20250513"
 description = "Typing stubs for tqdm"
 optional = false
 python-versions = ">=3.9"
 groups = ["dev"]
 markers = "platform_system == \"Linux\" and sys_platform == \"darwin\" and (platform_machine == \"aarch64\" or platform_machine == \"x86_64\") or platform_machine == \"aarch64\" and platform_system == \"Linux\" or platform_machine == \"x86_64\" and sys_platform == \"darwin\""
 files = [
-    {file = "types_tqdm-4.67.0.20250417-py3-none-any.whl", hash = "sha256:d43fc9a295be1f94083c744a09099c033c4dea293ff9a07bab9f34bfbffaaf80"},
+    {file = "types_tqdm-4.67.0.20250513-py3-none-any.whl", hash = "sha256:73d2bdac28bab49235d8660aece6c415636a0fb406f7a24b39737dfc6bf6a5dd"},
-    {file = "types_tqdm-4.67.0.20250417.tar.gz", hash = "sha256:bfcc4099d8d48df54e53f3ea64708cbcc1d1c4039ca7619594189da8c03c7be2"},
+    {file = "types_tqdm-4.67.0.20250513.tar.gz", hash = "sha256:907028c8d0a8fc20072132cd0cee72a3b6c72abf32f5ff914a7749e7d13b351e"},
 ]

 [package.dependencies]
@@ -9020,4 +8981,4 @@ vlm = ["accelerate", "transformers", "transformers"]
 [metadata]
 lock-version = "2.1"
 python-versions = "^3.9"
-content-hash = "e768140b51251b4389be716e100ef12063c9186c4b9fdd3b8376ceaa7b87172e"
+content-hash = "0476bc946feb1593633972b76c2aeb941951693e3501d742a33e88c9ad81a750"

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "2.31.1" # DO NOT EDIT, updated automatically
+version = "2.32.0" # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 authors = [
     "Christoph Auer <cau@zurich.ibm.com>",
@@ -90,6 +90,7 @@ pillow = ">=10.0.0,<12.0.0"
 tqdm = "^4.65.0"
 pluggy = "^1.0.0"
 pylatexenc = "^2.10"
+click = "<8.2.0"

 [tool.poetry.group.dev.dependencies]
 python = "^3.9.2"
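The new `click = "<8.2.0"` pin above, together with typer's own `click = ">=8.0.0,<8.2"` requirement, excludes the click 8.2 line. A minimal sketch (illustrative only, not repository code) of checking candidate releases against these specifiers with the `packaging` library:

```python
# Sketch only: which click releases satisfy both constraints from the diff.
from packaging.specifiers import SpecifierSet
from packaging.version import Version

pin = SpecifierSet("<8.2.0")               # pyproject.toml pin added above
typer_req = SpecifierSet(">=8.0.0,<8.2")   # typer 0.15.4's requirement

for candidate in ["8.1.8", "8.2.0"]:
    ok = Version(candidate) in pin and Version(candidate) in typer_req
    print(candidate, "allowed" if ok else "excluded")
# 8.1.8 allowed, 8.2.0 excluded
```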
@@ -23,6 +23,7 @@
 <location><page_1><loc_52><loc_37><loc_88><loc_45></location>
 <caption>Figure 1: Picture of a table with subtle, complex features such as (1) multi-column headers, (2) cell with multi-row text and (3) cells with no content. Image from PubTabNet evaluation set, filename: 'PMC2944238 004 02'.</caption>
 </figure>
+<caption><location><page_1><loc_50><loc_29><loc_89><loc_35></location>Figure 1: Picture of a table with subtle, complex features such as (1) multi-column headers, (2) cell with multi-row text and (3) cells with no content. Image from PubTabNet evaluation set, filename: 'PMC2944238 004 02'.</caption>
 <table>
 <location><page_1><loc_52><loc_37><loc_88><loc_45></location>
 <row_0><col_0><body>0</col_0><col_1><body>1 2 1</col_1><col_2><body>1 2 1</col_2><col_3><body>1 2 1</col_3><col_4><body>1 2 1</col_4></row_0>
@@ -57,6 +58,7 @@
 <location><page_3><loc_51><loc_68><loc_90><loc_90></location>
 <caption>Figure 2: Distribution of the tables across different table dimensions in PubTabNet + FinTabNet datasets</caption>
 </figure>
+<caption><location><page_3><loc_50><loc_64><loc_89><loc_66></location>Figure 2: Distribution of the tables across different table dimensions in PubTabNet + FinTabNet datasets</caption>
 <paragraph><location><page_3><loc_50><loc_59><loc_71><loc_60></location>balance in the previous datasets.</paragraph>
 <paragraph><location><page_3><loc_50><loc_21><loc_89><loc_58></location>The PubTabNet dataset contains 509k tables delivered as annotated PNG images. The annotations consist of the table structure represented in HTML format, the tokenized text and its bounding boxes per table cell. Fig. 1 shows the appearance style of PubTabNet. Depending on its complexity, a table is characterized as "simple" when it does not contain row spans or column spans, otherwise it is "complex". The dataset is divided into Train and Val splits (roughly 98% and 2%). The Train split consists of 54% simple and 46% complex tables and the Val split of 51% and 49% respectively. The FinTabNet dataset contains 112k tables delivered as single-page PDF documents with mixed table structures and text content. Similarly to the PubTabNet, the annotations of FinTabNet include the table structure in HTML, the tokenized text and the bounding boxes on a table cell basis. The dataset is divided into Train, Test and Val splits (81%, 9.5%, 9.5%), and each one is almost equally divided into simple and complex tables (Train: 48% simple, 52% complex, Test: 48% simple, 52% complex, Test: 53% simple, 47% complex). Finally the TableBank dataset consists of 145k tables provided as JPEG images. The latter has annotations for the table structure, but only few with bounding boxes of the table cells. The entire dataset consists of simple tables and it is divided into 90% Train, 3% Test and 7% Val splits.</paragraph>
 <paragraph><location><page_3><loc_50><loc_10><loc_89><loc_20></location>Due to the heterogeneity across the dataset formats, it was necessary to combine all available data into one homogenized dataset before we could train our models for practical purposes. Given the size of PubTabNet, we adopted its annotation format and we extracted and converted all tables as PNG images with a resolution of 72 dpi. Additionally, we have filtered out tables with extreme sizes due to small</paragraph>
@@ -88,10 +90,12 @@
 <location><page_5><loc_12><loc_77><loc_85><loc_90></location>
 <caption>Figure 3: TableFormer takes in an image of the PDF and creates bounding box and HTML structure predictions that are synchronized. The bounding boxes grabs the content from the PDF and inserts it in the structure.</caption>
 </figure>
+<caption><location><page_5><loc_8><loc_72><loc_89><loc_74></location>Figure 3: TableFormer takes in an image of the PDF and creates bounding box and HTML structure predictions that are synchronized. The bounding boxes grabs the content from the PDF and inserts it in the structure.</caption>
 <figure>
 <location><page_5><loc_9><loc_36><loc_47><loc_67></location>
 <caption>Figure 4: Given an input image of a table, the Encoder produces fixed-length features that represent the input image. The features are then passed to both the Structure Decoder and Cell BBox Decoder . During training, the Structure Decoder receives 'tokenized tags' of the HTML code that represent the table structure. Afterwards, a transformer encoder and decoder architecture is employed to produce features that are received by a linear layer, and the Cell BBox Decoder. The linear layer is applied to the features to predict the tags. Simultaneously, the Cell BBox Decoder selects features referring to the data cells (' < td > ', ' < ') and passes them through an attention network, an MLP, and a linear layer to predict the bounding boxes.</caption>
 </figure>
+<caption><location><page_5><loc_8><loc_14><loc_47><loc_33></location>Figure 4: Given an input image of a table, the Encoder produces fixed-length features that represent the input image. The features are then passed to both the Structure Decoder and Cell BBox Decoder . During training, the Structure Decoder receives 'tokenized tags' of the HTML code that represent the table structure. Afterwards, a transformer encoder and decoder architecture is employed to produce features that are received by a linear layer, and the Cell BBox Decoder. The linear layer is applied to the features to predict the tags. Simultaneously, the Cell BBox Decoder selects features referring to the data cells (' < td > ', ' < ') and passes them through an attention network, an MLP, and a linear layer to predict the bounding boxes.</caption>
 <paragraph><location><page_5><loc_50><loc_63><loc_89><loc_68></location>forming classification, and adding an adaptive pooling layer of size 28*28. ResNet by default downsamples the image resolution by 32 and then the encoded image is provided to both the Structure Decoder , and Cell BBox Decoder .</paragraph>
 <paragraph><location><page_5><loc_50><loc_48><loc_89><loc_62></location>Structure Decoder. The transformer architecture of this component is based on the work proposed in [31]. After extensive experimentation, the Structure Decoder is modeled as a transformer encoder with two encoder layers and a transformer decoder made from a stack of 4 decoder layers that comprise mainly of multi-head attention and feed forward layers. This configuration uses fewer layers and heads in comparison to networks applied to other problems (e.g. "Scene Understanding", "Image Captioning"), something which we relate to the simplicity of table images.</paragraph>
 <paragraph><location><page_5><loc_50><loc_31><loc_89><loc_47></location>The transformer encoder receives an encoded image from the CNN Backbone Network and refines it through a multi-head dot-product attention layer, followed by a Feed Forward Network. During training, the transformer decoder receives as input the output feature produced by the transformer encoder, and the tokenized input of the HTML ground-truth tags. Using a stack of multi-head attention layers, different aspects of the tag sequence could be inferred. This is achieved by each attention head on a layer operating in a different subspace, and then combining altogether their attention score.</paragraph>
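The ground-truth paragraphs above describe TableFormer's Structure Decoder as a transformer encoder with two layers feeding a four-layer transformer decoder of multi-head attention and feed-forward blocks. A rough PyTorch sketch of that shape, with made-up dimensions (the paper's real hyperparameters may differ), purely for orientation:

```python
# Sketch of the described 2-layer encoder / 4-layer decoder stack.
# All sizes here are hypothetical, not taken from TableFormer.
import torch
import torch.nn as nn

d_model, nhead, vocab = 512, 4, 32  # hypothetical sizes

encoder = nn.TransformerEncoder(
    nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead), num_layers=2
)
decoder = nn.TransformerDecoder(
    nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead), num_layers=4
)
tag_head = nn.Linear(d_model, vocab)  # linear layer predicting HTML tags

img_feats = torch.randn(196, 1, d_model)  # flattened CNN backbone output
tag_embeds = torch.randn(50, 1, d_model)  # embedded ground-truth tag tokens
memory = encoder(img_feats)               # refined image features
logits = tag_head(decoder(tag_embeds, memory))
print(logits.shape)  # torch.Size([50, 1, 32])
```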
@@ -167,6 +171,7 @@
 <location><page_8><loc_50><loc_77><loc_91><loc_88></location>
 <caption>b. Structure predicted by TableFormer, with superimposed matched PDF cell text:</caption>
 </figure>
+<caption><location><page_8><loc_9><loc_73><loc_63><loc_74></location>b. Structure predicted by TableFormer, with superimposed matched PDF cell text:</caption>
 <table>
 <location><page_8><loc_9><loc_63><loc_49><loc_72></location>
 <caption>Text is aligned to match original for ease of viewing</caption>
@@ -196,10 +201,12 @@
 <location><page_8><loc_8><loc_44><loc_35><loc_52></location>
 <caption>Figure 6: An example of TableFormer predictions (bounding boxes and structure) from generated SynthTabNet table.</caption>
 </figure>
+<caption><location><page_8><loc_10><loc_41><loc_87><loc_42></location>Figure 6: An example of TableFormer predictions (bounding boxes and structure) from generated SynthTabNet table.</caption>
 <figure>
 <location><page_8><loc_35><loc_44><loc_61><loc_52></location>
 <caption>Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.</caption>
 </figure>
+<caption><location><page_8><loc_8><loc_54><loc_89><loc_59></location>Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.</caption>
 <figure>
 <location><page_8><loc_63><loc_44><loc_89><loc_52></location>
 </figure>
@@ -269,6 +276,7 @@
 <location><page_12><loc_9><loc_81><loc_89><loc_91></location>
 <caption>Figure 7: Distribution of the tables across different dimensions per dataset. Simple vs complex tables per dataset and split, strict vs non strict html structures per dataset and table complexity, missing bboxes per dataset and table complexity.</caption>
 </figure>
+<caption><location><page_12><loc_8><loc_76><loc_89><loc_79></location>Figure 7: Distribution of the tables across different dimensions per dataset. Simple vs complex tables per dataset and split, strict vs non strict html structures per dataset and table complexity, missing bboxes per dataset and table complexity.</caption>
 <paragraph><location><page_12><loc_10><loc_71><loc_47><loc_73></location>- · TableFormer output does not include the table cell content.</paragraph>
 <paragraph><location><page_12><loc_10><loc_67><loc_47><loc_69></location>- · There are occasional inaccuracies in the predictions of the bounding boxes.</paragraph>
 <paragraph><location><page_12><loc_50><loc_68><loc_89><loc_73></location>dian cell size for all table cells. The usage of median during the computations, helps to eliminate outliers caused by occasional column spans which are usually wider than the normal.</paragraph>
@@ -373,6 +381,7 @@
 <location><page_14><loc_52><loc_55><loc_87><loc_89></location>
 <caption>Figure 13: Table predictions example on colorful table.</caption>
 </figure>
+<caption><location><page_14><loc_52><loc_52><loc_88><loc_53></location>Figure 13: Table predictions example on colorful table.</caption>
 <table>
 <location><page_14><loc_52><loc_40><loc_85><loc_46></location>
 <caption>Figure 14: Example with multi-line text.</caption>
@@ -433,4 +442,5 @@
 <location><page_16><loc_11><loc_37><loc_86><loc_68></location>
 <caption>Figure 17: Example of long table. End-to-end example from initial PDF cells to prediction of bounding boxes, post processing and prediction of structure.</caption>
 </figure>
+<caption><location><page_16><loc_8><loc_33><loc_89><loc_36></location>Figure 17: Example of long table. End-to-end example from initial PDF cells to prediction of bounding boxes, post processing and prediction of structure.</caption>
 </document>
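Each `<caption>` line added above carries a `<location>` prefix encoding the page number and a percentage-based bounding box. A hedged sketch of pulling those fields apart (the tag grammar is inferred from the fixture lines above, not from a documented docling API):

```python
# Sketch only: parsing the <location> prefix used by these doctags fixtures.
import re

LINE = ("<caption><location><page_16><loc_8><loc_33><loc_89><loc_36>"
        "</location>Figure 17: Example of long table. End-to-end example from "
        "initial PDF cells to prediction of bounding boxes, post processing "
        "and prediction of structure.</caption>")

m = re.match(
    r"<caption><location><page_(\d+)>"
    r"<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)></location>(.*)</caption>",
    LINE,
)
page, l, t, r, b, text = m.groups()
print(page, (int(l), int(t), int(r), int(b)), text[:40])
# 16 (8, 33, 89, 36) Figure 17: Example of long table. End-t
```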
@@ -365,6 +365,29 @@
       "type": "figure",
       "$ref": "#/figures/2"
     },
+    {
+      "prov": [
+        {
+          "bbox": [
+            308.862,
+            232.72709999999995,
+            545.11517,
+            277.49963
+          ],
+          "page": 1,
+          "span": [
+            0,
+            220
+          ],
+          "__ref_s3_data": null
+        }
+      ],
+      "text": "Figure 1: Picture of a table with subtle, complex features such as (1) multi-column headers, (2) cell with multi-row text and (3) cells with no content. Image from PubTabNet evaluation set, filename: 'PMC2944238 004 02'.",
+      "type": "caption",
+      "payload": null,
+      "name": "Caption",
+      "font": null
+    },
     {
       "name": "Table",
       "type": "table",
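Every caption object this commit adds to the JSON ground truth has the same shape: a `prov` list holding page, bounding box and character span, plus the caption text. A minimal sketch of reading those items back, assuming they sit in the `main-text` array of docling's legacy JSON format (the file path below is a placeholder, not a repository file):

```python
# Sketch only: collecting the caption items shown in the hunks above.
# Field names mirror the diff; "main-text" is assumed from the legacy format.
import json

def iter_captions(doc: dict):
    for item in doc.get("main-text", []):
        if item.get("type") == "caption":
            prov = item["prov"][0]
            yield prov["page"], prov["bbox"], item["text"]

doc = json.load(open("groundtruth_example.json"))  # placeholder path
for page, bbox, text in iter_captions(doc):
    width = bbox[2] - bbox[0]  # bbox = [x0, y0, x1, y1] in PDF points
    print(f"page {page}, width {width:.1f}pt: {text[:40]}")
```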
@@ -904,6 +927,29 @@
       "type": "figure",
       "$ref": "#/figures/3"
     },
+    {
+      "prov": [
+        {
+          "bbox": [
+            308.862,
+            503.3020900000001,
+            545.11511,
+            524.16364
+          ],
+          "page": 3,
+          "span": [
+            0,
+            104
+          ],
+          "__ref_s3_data": null
+        }
+      ],
+      "text": "Figure 2: Distribution of the tables across different table dimensions in PubTabNet + FinTabNet datasets",
+      "type": "caption",
+      "payload": null,
+      "name": "Caption",
+      "font": null
+    },
     {
       "prov": [
         {
@@ -1282,11 +1328,57 @@
       "type": "figure",
       "$ref": "#/figures/4"
     },
+    {
+      "prov": [
+        {
+          "bbox": [
+            50.111992,
+            567.03308,
+            545.10846,
+            588.01422
+          ],
+          "page": 5,
+          "span": [
+            0,
+            212
+          ],
+          "__ref_s3_data": null
+        }
+      ],
+      "text": "Figure 3: TableFormer takes in an image of the PDF and creates bounding box and HTML structure predictions that are synchronized. The bounding boxes grabs the content from the PDF and inserts it in the structure.",
+      "type": "caption",
+      "payload": null,
+      "name": "Caption",
+      "font": null
+    },
     {
       "name": "Picture",
       "type": "figure",
       "$ref": "#/figures/5"
     },
+    {
+      "prov": [
+        {
+          "bbox": [
+            50.112,
+            111.72906,
+            286.36597,
+            264.2171900000001
+          ],
+          "page": 5,
+          "span": [
+            0,
+            745
+          ],
+          "__ref_s3_data": null
+        }
+      ],
+      "text": "Figure 4: Given an input image of a table, the Encoder produces fixed-length features that represent the input image. The features are then passed to both the Structure Decoder and Cell BBox Decoder . During training, the Structure Decoder receives 'tokenized tags' of the HTML code that represent the table structure. Afterwards, a transformer encoder and decoder architecture is employed to produce features that are received by a linear layer, and the Cell BBox Decoder. The linear layer is applied to the features to predict the tags. Simultaneously, the Cell BBox Decoder selects features referring to the data cells (' < td > ', ' < ') and passes them through an attention network, an MLP, and a linear layer to predict the bounding boxes.",
+      "type": "caption",
+      "payload": null,
+      "name": "Caption",
+      "font": null
+    },
     {
       "prov": [
         {
@@ -2214,6 +2306,29 @@
       "type": "figure",
       "$ref": "#/figures/7"
     },
+    {
+      "prov": [
+        {
+          "bbox": [
+            53.811783000000005,
+            575.89355,
+            385.93451,
+            583.76672
+          ],
+          "page": 8,
+          "span": [
+            0,
+            79
+          ],
+          "__ref_s3_data": null
+        }
+      ],
+      "text": "b. Structure predicted by TableFormer, with superimposed matched PDF cell text:",
+      "type": "caption",
+      "payload": null,
+      "name": "Caption",
+      "font": null
+    },
     {
       "name": "Table",
       "type": "table",
@@ -2252,11 +2367,57 @@
       "type": "figure",
       "$ref": "#/figures/8"
     },
+    {
+      "prov": [
+        {
+          "bbox": [
+            62.595001,
+            324.36508,
+            532.63049,
+            333.27164
+          ],
+          "page": 8,
+          "span": [
+            0,
+            112
+          ],
+          "__ref_s3_data": null
+        }
+      ],
+      "text": "Figure 6: An example of TableFormer predictions (bounding boxes and structure) from generated SynthTabNet table.",
+      "type": "caption",
+      "payload": null,
+      "name": "Caption",
+      "font": null
+    },
     {
       "name": "Picture",
       "type": "figure",
       "$ref": "#/figures/9"
     },
+    {
+      "prov": [
+        {
+          "bbox": [
+            50.112,
+            426.35013,
+            545.11377,
+            471.12265
+          ],
+          "page": 8,
+          "span": [
+            0,
+            397
+          ],
+          "__ref_s3_data": null
+        }
+      ],
+      "text": "Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.",
+      "type": "caption",
+      "payload": null,
+      "name": "Caption",
+      "font": null
+    },
     {
       "name": "Picture",
       "type": "figure",
@@ -3707,6 +3868,29 @@
       "type": "figure",
       "$ref": "#/figures/11"
     },
+    {
+      "prov": [
+        {
+          "bbox": [
+            50.112,
+            605.63605,
+            545.11371,
+            626.49762
+          ],
+          "page": 12,
+          "span": [
+            0,
+            245
+          ],
+          "__ref_s3_data": null
+        }
+      ],
+      "text": "Figure 7: Distribution of the tables across different dimensions per dataset. Simple vs complex tables per dataset and split, strict vs non strict html structures per dataset and table complexity, missing bboxes per dataset and table complexity.",
+      "type": "caption",
+      "payload": null,
+      "name": "Caption",
+      "font": null
+    },
     {
       "prov": [
         {
@@ -4517,6 +4701,29 @@
       "type": "figure",
       "$ref": "#/figures/16"
     },
+    {
+      "prov": [
+        {
+          "bbox": [
+            315.79001,
+            411.40909,
+            538.18524,
+            420.31564
+          ],
+          "page": 14,
+          "span": [
+            0,
+            55
+          ],
+          "__ref_s3_data": null
+        }
+      ],
+      "text": "Figure 13: Table predictions example on colorful table.",
+      "type": "caption",
+      "payload": null,
+      "name": "Caption",
+      "font": null
+    },
     {
       "name": "Table",
       "type": "table",
@@ -4675,6 +4882,29 @@
       "name": "Picture",
       "type": "figure",
       "$ref": "#/figures/23"
+    },
+    {
+      "prov": [
+        {
+          "bbox": [
+            50.112,
+            262.80108999999993,
+            545.11383,
+            283.66263
+          ],
+          "page": 16,
+          "span": [
+            0,
+            153
+          ],
+          "__ref_s3_data": null
+        }
+      ],
+      "text": "Figure 17: Example of long table. End-to-end example from initial PDF cells to prediction of bounding boxes, post processing and prediction of structure.",
+      "type": "caption",
+      "payload": null,
+      "name": "Caption",
+      "font": null
     }
   ],
   "figures": [
@@ -18,6 +18,7 @@
 <location><page_1><loc_53><loc_34><loc_90><loc_68></location>
 <caption>Figure 1: Four examples of complex page layouts across different document categories</caption>
 </figure>
+<caption><location><page_1><loc_52><loc_29><loc_91><loc_32></location>Figure 1: Four examples of complex page layouts across different document categories</caption>
 <subtitle-level-1><location><page_1><loc_52><loc_24><loc_62><loc_25></location>KEYWORDS</subtitle-level-1>
 <paragraph><location><page_1><loc_52><loc_21><loc_91><loc_23></location>PDF document conversion, layout segmentation, object-detection, data set, Machine Learning</paragraph>
 <subtitle-level-1><location><page_1><loc_52><loc_18><loc_66><loc_19></location>ACM Reference Format:</subtitle-level-1>
@@ -44,6 +45,7 @@
 <location><page_3><loc_14><loc_72><loc_43><loc_88></location>
 <caption>Figure 2: Distribution of DocLayNet pages across document categories.</caption>
 </figure>
+<caption><location><page_3><loc_9><loc_68><loc_48><loc_70></location>Figure 2: Distribution of DocLayNet pages across document categories.</caption>
 <paragraph><location><page_3><loc_9><loc_54><loc_48><loc_64></location>to a minimum, since they introduce difficulties in annotation (see Section 4). As a second condition, we focussed on medium to large documents ( > 10 pages) with technical content, dense in complex tables, figures, plots and captions. Such documents carry a lot of information value, but are often hard to analyse with high accuracy due to their challenging layouts. Counterexamples of documents not included in the dataset are receipts, invoices, hand-written documents or photographs showing "text in the wild".</paragraph>
 <paragraph><location><page_3><loc_9><loc_36><loc_48><loc_53></location>The pages in DocLayNet can be grouped into six distinct categories, namely Financial Reports , Manuals , Scientific Articles , Laws & Regulations , Patents and Government Tenders . Each document category was sourced from various repositories. For example, Financial Reports contain both free-style format annual reports 2 which expose company-specific, artistic layouts as well as the more formal SEC filings. The two largest categories ( Financial Reports and Manuals ) contain a large amount of free-style layouts in order to obtain maximum variability. In the other four categories, we boosted the variability by mixing documents from independent providers, such as different government websites or publishers. In Figure 2, we show the document categories contained in DocLayNet with their respective sizes.</paragraph>
 <paragraph><location><page_3><loc_9><loc_23><loc_48><loc_35></location>We did not control the document selection with regard to language. The vast majority of documents contained in DocLayNet (close to 95%) are published in English language. However, DocLayNet also contains a number of documents in other languages such as German (2.5%), French (1.0%) and Japanese (1.0%). While the document language has negligible impact on the performance of computer vision methods such as object detection and segmentation models, it might prove challenging for layout analysis methods which exploit textual features.</paragraph>
@@ -76,6 +78,7 @@
 <location><page_4><loc_9><loc_32><loc_48><loc_61></location>
 <caption>Figure 3: Corpus Conversion Service annotation user interface. The PDF page is shown in the background, with overlaid text-cells (in darker shades). The annotation boxes can be drawn by dragging a rectangle over each segment with the respective label from the palette on the right.</caption>
 </figure>
+<caption><location><page_4><loc_9><loc_23><loc_48><loc_30></location>Figure 3: Corpus Conversion Service annotation user interface. The PDF page is shown in the background, with overlaid text-cells (in darker shades). The annotation boxes can be drawn by dragging a rectangle over each segment with the respective label from the palette on the right.</caption>
 <paragraph><location><page_4><loc_9><loc_15><loc_48><loc_20></location>we distributed the annotation workload and performed continuous quality controls. Phase one and two required a small team of experts only. For phases three and four, a group of 40 dedicated annotators were assembled and supervised.</paragraph>
 <paragraph><location><page_4><loc_9><loc_11><loc_48><loc_14></location><location><page_4><loc_9><loc_11><loc_48><loc_14></location>Phase 1: Data selection and preparation. Our inclusion criteria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. The data sources include publication repositories such as arXiv$^{3}$, government offices, company websites as well as data directory services for financial reports and patents. Scanned documents were excluded wherever possible because they can be rotated or skewed. This would not allow us to perform annotation with rectangular bounding-boxes and therefore complicate the annotation process.</paragraph>
 <paragraph><location><page_4><loc_52><loc_36><loc_91><loc_52></location>Preparation work included uploading and parsing the sourced PDF documents in the Corpus Conversion Service (CCS) [22], a cloud-native platform which provides a visual annotation interface and allows for dataset inspection and analysis. The annotation interface of CCS is shown in Figure 3. The desired balance of pages between the different document categories was achieved by selective subsampling of pages with certain desired properties. For example, we made sure to include the title page of each document and bias the remaining page selection to those with figures or tables. The latter was achieved by leveraging pre-trained object detection models from PubLayNet, which helped us estimate how many figures and tables a given page contains.</paragraph>
@@ -123,6 +126,7 @@
 <location><page_6><loc_53><loc_67><loc_90><loc_89></location>
 <caption>Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNN network with ResNet50 backbone trained on increasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.</caption>
 </figure>
+<caption><location><page_6><loc_52><loc_57><loc_91><loc_65></location>Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNN network with ResNet50 backbone trained on increasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.</caption>
 <paragraph><location><page_6><loc_52><loc_49><loc_91><loc_52></location>paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.</paragraph>
 <paragraph><location><page_6><loc_52><loc_39><loc_91><loc_49></location>In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are computed by leveraging the evaluation code provided by the COCO API [16].</paragraph>
 <subtitle-level-1><location><page_6><loc_52><loc_36><loc_76><loc_37></location>Baselines for Object Detection</subtitle-level-1>
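The ground-truth paragraph above describes the DocLayNet evaluation protocol: mean average precision averaged over IoU thresholds from 0.5 to 0.95 in steps of 0.05, computed with the COCO API. A hedged sketch of that computation with pycocotools (the file names are placeholders, not repository files):

```python
# Sketch only: mAP@0.5-0.95 via the COCO API, as the paragraph describes.
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

coco_gt = COCO("ground_truth.json")            # COCO-format annotations
coco_dt = coco_gt.loadRes("predictions.json")  # detections to score

ev = COCOeval(coco_gt, coco_dt, iouType="bbox")
ev.evaluate()
ev.accumulate()
ev.summarize()      # first summary line is AP@[.50:.95]
print(ev.stats[0])  # the same mAP value, as a float
```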
@@ -216,6 +220,7 @@
 <location><page_9><loc_9><loc_44><loc_91><loc_89></location>
 <caption>Text Caption List-Item Formula Table Section-Header Picture Page-Header Page-Footer Title</caption>
 </figure>
+<caption><location><page_9><loc_10><loc_43><loc_52><loc_44></location>Text Caption List-Item Formula Table Section-Header Picture Page-Header Page-Footer Title</caption>
 <paragraph><location><page_9><loc_9><loc_36><loc_91><loc_41></location>Figure 6: Example layout predictions on selected pages from the DocLayNet test-set. (A, D) exhibit favourable results on coloured backgrounds. (B, C) show accurate list-item and paragraph differentiation despite densely-spaced lines. (E) demonstrates good table and figure distinction. (F) shows predictions on a Chinese patent with multiple overlaps, label confusion and missing boxes.</paragraph>
 <paragraph><location><page_9><loc_11><loc_31><loc_48><loc_33></location>Diaconu, Mai Thanh Minh, Marc, albinxavi, fatih, oleg, and wanghao yang. ultralytics/yolov5: v6.0 - yolov5n nano models, roboflow integration, tensorflow export, opencv dnn support, October 2021.</paragraph>
 <paragraph><location><page_9><loc_52><loc_32><loc_91><loc_33></location>- [20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021.</paragraph>
@@ -430,6 +430,29 @@
 "type": "figure",
 "$ref": "#/figures/0"
 },
+{
+"prov": [
+{
+"bbox": [
+317.95499,
+232.48476000000005,
+559.80579,
+251.91701
+],
+"page": 1,
+"span": [
+0,
+84
+],
+"__ref_s3_data": null
+}
+],
+"text": "Figure 1: Four examples of complex page layouts across different document categories",
+"type": "caption",
+"payload": null,
+"name": "Caption",
+"font": null
+},
 {
 "prov": [
 {
@@ -964,6 +987,29 @@
 "type": "figure",
 "$ref": "#/figures/1"
 },
+{
+"prov": [
+{
+"bbox": [
+53.79800000000001,
+536.45276,
+294.04373,
+555.88501
+],
+"page": 3,
+"span": [
+0,
+69
+],
+"__ref_s3_data": null
+}
+],
+"text": "Figure 2: Distribution of DocLayNet pages across document categories.",
+"type": "caption",
+"payload": null,
+"name": "Caption",
+"font": null
+},
 {
 "prov": [
 {
@@ -1227,6 +1273,29 @@
 "type": "figure",
 "$ref": "#/figures/2"
 },
+{
+"prov": [
+{
+"bbox": [
+53.79800000000001,
+185.68075999999996,
+295.64874,
+237.99000999999998
+],
+"page": 4,
+"span": [
+0,
+281
+],
+"__ref_s3_data": null
+}
+],
+"text": "Figure 3: Corpus Conversion Service annotation user interface. The PDF page is shown in the background, with overlaid text-cells (in darker shades). The annotation boxes can be drawn by dragging a rectangle over each segment with the respective label from the palette on the right.",
+"type": "caption",
+"payload": null,
+"name": "Caption",
+"font": null
+},
 {
 "prov": [
 {
@@ -1808,6 +1877,29 @@
 "type": "figure",
 "$ref": "#/figures/4"
 },
+{
+"prov": [
+{
+"bbox": [
+317.95499,
+449.71581999999995,
+559.80579,
+512.98401
+],
+"page": 6,
+"span": [
+0,
+329
+],
+"__ref_s3_data": null
+}
+],
+"text": "Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNN network with ResNet50 backbone trained on increasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.",
+"type": "caption",
+"payload": null,
+"name": "Caption",
+"font": null
+},
 {
 "prov": [
 {
@@ -2702,6 +2794,29 @@
 "type": "figure",
 "$ref": "#/figures/5"
 },
+{
+"prov": [
+{
+"bbox": [
+62.323874999999994,
+343.73517,
+318.50473,
+349.71457
+],
+"page": 9,
+"span": [
+0,
+89
+],
+"__ref_s3_data": null
+}
+],
+"text": "Text Caption List-Item Formula Table Section-Header Picture Page-Header Page-Footer Title",
+"type": "caption",
+"payload": null,
+"name": "Caption",
+"font": null
+},
 {
 "prov": [
 {
@@ -213,10 +213,10 @@
 "prov": [
 {
 "bbox": [
-139.66741943359375,
+139.6674041748047,
 322.5054626464844,
 475.00927734375,
-454.45458984375
+454.4546203613281
 ],
 "page": 1,
 "span": [
@@ -2646,7 +2646,7 @@
 "b": 102.78223000000003,
 "coord_origin": "TOPLEFT"
 },
-"confidence": 0.9373533725738525,
+"confidence": 0.9373533129692078,
 "cells": [
 {
 "index": 0,
@@ -2726,7 +2726,7 @@
 "b": 152.90697999999998,
 "coord_origin": "TOPLEFT"
 },
-"confidence": 0.9806433916091919,
+"confidence": 0.9806435108184814,
 "cells": [
 {
 "index": 2,
@@ -2881,7 +2881,7 @@
 "b": 255.42400999999995,
 "coord_origin": "TOPLEFT"
 },
-"confidence": 0.98504239320755,
+"confidence": 0.9850425124168396,
 "cells": [
 {
 "index": 7,
@@ -3096,7 +3096,7 @@
 "b": 327.98218,
 "coord_origin": "TOPLEFT"
 },
-"confidence": 0.9591909050941467,
+"confidence": 0.9591907262802124,
 "cells": [
 {
 "index": 15,
@@ -3280,8 +3280,8 @@
 "id": 0,
 "label": "table",
 "bbox": {
-"l": 139.66741943359375,
-"t": 337.54541015625,
+"l": 139.6674041748047,
+"t": 337.5453796386719,
 "r": 475.00927734375,
 "b": 469.4945373535156,
 "coord_origin": "TOPLEFT"
@@ -7787,7 +7787,7 @@
 "b": 518.17419,
 "coord_origin": "TOPLEFT"
 },
-"confidence": 0.9589294195175171,
+"confidence": 0.9589295387268066,
 "cells": [
 {
 "index": 91,
@@ -7852,7 +7852,7 @@
 "b": 618.3,
 "coord_origin": "TOPLEFT"
 },
-"confidence": 0.9849975109100342,
+"confidence": 0.9849976301193237,
 "cells": [
 {
 "index": 93,
@@ -8184,8 +8184,8 @@
 "id": 0,
 "label": "table",
 "bbox": {
-"l": 139.66741943359375,
-"t": 337.54541015625,
+"l": 139.6674041748047,
+"t": 337.5453796386719,
 "r": 475.00927734375,
 "b": 469.4945373535156,
 "coord_origin": "TOPLEFT"
@@ -13582,7 +13582,7 @@
 "b": 102.78223000000003,
 "coord_origin": "TOPLEFT"
 },
-"confidence": 0.9373533725738525,
+"confidence": 0.9373533129692078,
 "cells": [
 {
 "index": 0,
@@ -13674,7 +13674,7 @@
 "b": 152.90697999999998,
 "coord_origin": "TOPLEFT"
 },
-"confidence": 0.9806433916091919,
+"confidence": 0.9806435108184814,
 "cells": [
 {
 "index": 2,
@@ -13841,7 +13841,7 @@
 "b": 255.42400999999995,
 "coord_origin": "TOPLEFT"
 },
-"confidence": 0.98504239320755,
+"confidence": 0.9850425124168396,
 "cells": [
 {
 "index": 7,
@@ -14062,7 +14062,7 @@
 "b": 327.98218,
 "coord_origin": "TOPLEFT"
 },
-"confidence": 0.9591909050941467,
+"confidence": 0.9591907262802124,
 "cells": [
 {
 "index": 15,
@@ -14252,8 +14252,8 @@
 "id": 0,
 "label": "table",
 "bbox": {
-"l": 139.66741943359375,
-"t": 337.54541015625,
+"l": 139.6674041748047,
+"t": 337.5453796386719,
 "r": 475.00927734375,
 "b": 469.4945373535156,
 "coord_origin": "TOPLEFT"
@@ -19642,7 +19642,7 @@
 "b": 518.17419,
 "coord_origin": "TOPLEFT"
 },
-"confidence": 0.9589294195175171,
+"confidence": 0.9589295387268066,
 "cells": [
 {
 "index": 91,
@@ -19713,7 +19713,7 @@
 "b": 618.3,
 "coord_origin": "TOPLEFT"
 },
-"confidence": 0.9849975109100342,
+"confidence": 0.9849976301193237,
 "cells": [
 {
 "index": 93,
@@ -20057,7 +20057,7 @@
 "b": 152.90697999999998,
 "coord_origin": "TOPLEFT"
 },
-"confidence": 0.9806433916091919,
+"confidence": 0.9806435108184814,
 "cells": [
 {
 "index": 2,
@@ -20224,7 +20224,7 @@
 "b": 255.42400999999995,
 "coord_origin": "TOPLEFT"
 },
-"confidence": 0.98504239320755,
+"confidence": 0.9850425124168396,
 "cells": [
 {
 "index": 7,
@@ -20445,7 +20445,7 @@
 "b": 327.98218,
 "coord_origin": "TOPLEFT"
 },
-"confidence": 0.9591909050941467,
+"confidence": 0.9591907262802124,
 "cells": [
 {
 "index": 15,
@@ -20635,8 +20635,8 @@
 "id": 0,
 "label": "table",
 "bbox": {
-"l": 139.66741943359375,
-"t": 337.54541015625,
+"l": 139.6674041748047,
+"t": 337.5453796386719,
 "r": 475.00927734375,
 "b": 469.4945373535156,
 "coord_origin": "TOPLEFT"
@@ -26025,7 +26025,7 @@
 "b": 518.17419,
 "coord_origin": "TOPLEFT"
 },
-"confidence": 0.9589294195175171,
+"confidence": 0.9589295387268066,
 "cells": [
 {
 "index": 91,
@@ -26096,7 +26096,7 @@
 "b": 618.3,
 "coord_origin": "TOPLEFT"
 },
-"confidence": 0.9849975109100342,
+"confidence": 0.9849976301193237,
 "cells": [
 {
 "index": 93,
@@ -26440,7 +26440,7 @@
 "b": 102.78223000000003,
 "coord_origin": "TOPLEFT"
 },
-"confidence": 0.9373533725738525,
+"confidence": 0.9373533129692078,
 "cells": [
 {
 "index": 0,
@@ -13,6 +13,7 @@
 <location><page_2><loc_24><loc_46><loc_76><loc_74></location>
 <caption>Fig. 1. Comparison between HTML and OTSL table structure representation: (A) table-example with complex row and column headers, including a 2D empty span, (B) minimal graphical representation of table structure using rectangular layout, (C) HTML representation, (D) OTSL representation. This example demonstrates many of the key-features of OTSL, namely its reduced vocabulary size (12 versus 5 in this case), its reduced sequence length (55 versus 30) and a enhanced internal structure (variable token sequence length per row in HTML versus a fixed length of rows in OTSL).</caption>
 </figure>
+<caption><location><page_2><loc_22><loc_75><loc_79><loc_84></location>Fig. 1. Comparison between HTML and OTSL table structure representation: (A) table-example with complex row and column headers, including a 2D empty span, (B) minimal graphical representation of table structure using rectangular layout, (C) HTML representation, (D) OTSL representation. This example demonstrates many of the key-features of OTSL, namely its reduced vocabulary size (12 versus 5 in this case), its reduced sequence length (55 versus 30) and a enhanced internal structure (variable token sequence length per row in HTML versus a fixed length of rows in OTSL).</caption>
 <paragraph><location><page_2><loc_22><loc_34><loc_79><loc_43></location>today, table detection in documents is a well understood problem, and the latest state-of-the-art (SOTA) object detection methods provide an accuracy comparable to human observers [7,8,10,14,23]. On the other hand, the problem of table structure recognition (TSR) is a lot more challenging and remains a very active area of research, in which many novel machine learning algorithms are being explored [3,4,5,9,11,12,13,14,17,18,21,22].</paragraph>
 <paragraph><location><page_2><loc_22><loc_16><loc_79><loc_34></location>Recently emerging SOTA methods for table structure recognition employ transformer-based models, in which an image of the table is provided to the network in order to predict the structure of the table as a sequence of tokens. These image-to-sequence (Im2Seq) models are extremely powerful, since they allow for a purely data-driven solution. The tokens of the sequence typically belong to a markup language such as HTML, Latex or Markdown, which allow to describe table structure as rows, columns and spanning cells in various configurations. In Figure 1, we illustrate how HTML is used to represent the table-structure of a particular example table. Public table-structure data sets such as PubTabNet [22], and FinTabNet [21], which were created in a semi-automated way from paired PDF and HTML sources (e.g. PubMed Central), popularized primarily the use of HTML as ground-truth representation format for TSR.</paragraph>
 <paragraph><location><page_3><loc_22><loc_73><loc_79><loc_85></location>While the majority of research in TSR is currently focused on the development and application of novel neural model architectures, the table structure representation language (e.g. HTML in PubTabNet and FinTabNet) is usually adopted as is for the sequence tokenization in Im2Seq models. In this paper, we aim for the opposite and investigate the impact of the table structure representation language with an otherwise unmodified Im2Seq transformer-based architecture. Since the current state-of-the-art Im2Seq model is TableFormer [9], we select this model to perform our experiments.</paragraph>
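
The Fig. 1 caption above contrasts the two representations by vocabulary size and sequence length. To make that contrast concrete, here is an illustrative sketch (a toy table, smaller than the paper's example; the token names follow the OTSL cell classes, "C" for a new cell, "L" for a merge with the left neighbour, "NL" for end of row):

```python
# A 2x2 table whose top row is one horizontally merged cell,
# serialized once as HTML tokens and once as OTSL tokens.
html_tokens = [
    "<table>",
    "<tr>", '<td colspan="2">', "</td>", "</tr>",
    "<tr>", "<td>", "</td>", "<td>", "</td>", "</tr>",
    "</table>",
]
otsl_tokens = ["C", "L", "NL",
               "C", "C", "NL"]

print(len(html_tokens), len(otsl_tokens))            # HTML needs far more tokens
print(len(set(html_tokens)), len(set(otsl_tokens)))  # and a larger vocabulary
```
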
@@ -30,6 +31,7 @@
 <location><page_5><loc_22><loc_57><loc_78><loc_71></location>
 <caption>Fig. 2. Frequency of tokens in HTML and OTSL as they appear in PubTabNet.</caption>
 </figure>
+<caption><location><page_5><loc_24><loc_71><loc_77><loc_72></location>Fig. 2. Frequency of tokens in HTML and OTSL as they appear in PubTabNet.</caption>
 <paragraph><location><page_5><loc_22><loc_33><loc_79><loc_54></location>Obviously, HTML and other general-purpose markup languages were not designed for Im2Seq models. As such, they have some serious drawbacks. First, the token vocabulary needs to be artificially large in order to describe all plausible tabular structures. Since most Im2Seq models use an autoregressive approach, they generate the sequence token by token. Therefore, to reduce inference time, a shorter sequence length is critical. Every table-cell is represented by at least two tokens ( <td> and </td> ). Furthermore, when tokenizing the HTML structure, one needs to explicitly enumerate possible column-spans and row-spans as words. In practice, this ends up requiring 28 different HTML tokens (when including column- and row-spans up to 10 cells) just to describe every table in the PubTabNet dataset. Clearly, not every token is equally represented, as is depicted in Figure 2. This skewed distribution of tokens in combination with variable token row-length makes it challenging for models to learn the HTML structure.</paragraph>
 <paragraph><location><page_5><loc_22><loc_27><loc_79><loc_32></location>Additionally, it would be desirable if the representation would easily allow an early detection of invalid sequences on-the-go, before the prediction of the entire table structure is completed. HTML is not well-suited for this purpose as the verification of incomplete sequences is non-trivial or even impossible.</paragraph>
 <paragraph><location><page_5><loc_22><loc_16><loc_79><loc_26></location>In a valid HTML table, the token sequence must describe a 2D grid of table cells, serialised in row-major ordering, where each row and each column have the same length (while considering row- and column-spans). Furthermore, every opening tag in HTML needs to be matched by a closing tag in a correct hierarchical manner. Since the number of tokens for each table row and column can vary significantly, especially for large tables with many row- and column-spans, it is complex to verify the consistency of predicted structures during sequence</paragraph>
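
The skewed token distribution the paragraph mentions (Fig. 2) is the kind of statistic one can gather with a few lines of counting. A hedged sketch, assuming a corpus of token lists like the toy sequences shown earlier (the two-row corpus here is made up for illustration):

```python
from collections import Counter

# Hypothetical corpus: one structure-token list per table, e.g. loaded
# from PubTabNet ground truth after tokenization.
corpus = [
    ["<tr>", "<td>", "</td>", "<td>", "</td>", "</tr>"],
    ["<tr>", '<td colspan="2">', "</td>", "</tr>"],
]

freq = Counter(tok for seq in corpus for tok in seq)
for tok, n in freq.most_common():
    print(f"{tok!r}: {n}")  # rare span tokens end up in the long tail
```
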
@@ -50,6 +52,7 @@
 <location><page_7><loc_27><loc_65><loc_73><loc_79></location>
 <caption>Fig. 3. OTSL description of table structure: A - table example; B - graphical representation of table structure; C - mapping structure on a grid; D - OTSL structure encoding; E - explanation on cell encoding</caption>
 </figure>
+<caption><location><page_7><loc_22><loc_80><loc_79><loc_84></location>Fig. 3. OTSL description of table structure: A - table example; B - graphical representation of table structure; C - mapping structure on a grid; D - OTSL structure encoding; E - explanation on cell encoding</caption>
 <subtitle-level-1><location><page_7><loc_22><loc_60><loc_40><loc_61></location>4.2 Language Syntax</subtitle-level-1>
 <paragraph><location><page_7><loc_22><loc_58><loc_59><loc_59></location>The OTSL representation follows these syntax rules:</paragraph>
 <paragraph><location><page_7><loc_23><loc_54><loc_79><loc_56></location>- 1. Left-looking cell rule : The left neighbour of an "L" cell must be either another "L" cell or a "C" cell.</paragraph>
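
Only syntax rule 1 is visible in this hunk, but it already shows why OTSL permits the on-the-fly validation motivated earlier: each rule is a local check on neighbouring tokens. A minimal sketch of that check (an illustration, not the paper's reference implementation):

```python
def check_left_looking(row):
    """Rule 1: the left neighbour of an "L" cell must be "L" or "C"."""
    for left, cell in zip(row, row[1:]):
        if cell == "L" and left not in ("L", "C"):
            return False
    return True

# A row can be rejected as soon as the offending token is generated.
assert check_left_looking(["C", "L", "L"])
assert not check_left_looking(["U", "L"])  # "L" after an up-looking cell is invalid
```
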
@@ -70,6 +73,7 @@
 <location><page_8><loc_23><loc_25><loc_77><loc_36></location>
 <caption>Fig. 4. Architecture sketch of the TableFormer model, which is a representative for the Im2Seq approach.</caption>
 </figure>
+<caption><location><page_8><loc_22><loc_36><loc_79><loc_39></location>Fig. 4. Architecture sketch of the TableFormer model, which is a representative for the Im2Seq approach.</caption>
 <paragraph><location><page_8><loc_22><loc_16><loc_79><loc_22></location>We rely on standard metrics such as Tree Edit Distance score (TEDs) for table structure prediction, and Mean Average Precision (mAP) with 0.75 Intersection Over Union (IOU) threshold for the bounding-box predictions of table cells. The predicted OTSL structures were converted back to HTML format in</paragraph>
 <paragraph><location><page_9><loc_22><loc_81><loc_79><loc_85></location>order to compute the TED score. Inference timing results for all experiments were obtained from the same machine on a single core with AMD EPYC 7763 CPU @2.45 GHz.</paragraph>
 <subtitle-level-1><location><page_9><loc_22><loc_78><loc_52><loc_79></location>5.1 Hyper Parameter Optimization</subtitle-level-1>
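
The Tree Edit Distance score mentioned above is conventionally normalised as 1 - TED / max(|T_pred|, |T_gt|). A hedged sketch of that computation, assuming the third-party zss (Zhang-Shasha) package for the raw edit distance and toy HTML-like trees standing in for real tables:

```python
from zss import Node, simple_distance  # assumed dependency: pip install zss

# Toy trees standing in for a predicted and a ground-truth table structure.
gt   = Node("table", [Node("tr", [Node("td"), Node("td")])])
pred = Node("table", [Node("tr", [Node("td")])])

def size(tree):
    return 1 + sum(size(child) for child in tree.children)

ted = simple_distance(pred, gt)                    # raw tree edit distance
teds = 1.0 - ted / max(size(pred), size(gt))       # normalised score in [0, 1]
print(teds)
```
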
@@ -104,12 +108,14 @@
 <location><page_10><loc_27><loc_16><loc_74><loc_44></location>
 <caption>Fig. 5. The OTSL model produces more accurate bounding boxes with less overlap (E) than the HTML model (D), when predicting the structure of a sparse table (A), at twice the inference speed because of shorter sequence length (B),(C). "PMC2807444_006_00.png" PubTabNet. μ</caption>
 </figure>
+<caption><location><page_10><loc_22><loc_44><loc_79><loc_50></location>Fig. 5. The OTSL model produces more accurate bounding boxes with less overlap (E) than the HTML model (D), when predicting the structure of a sparse table (A), at twice the inference speed because of shorter sequence length (B),(C). "PMC2807444_006_00.png" PubTabNet. μ</caption>
 <paragraph><location><page_10><loc_37><loc_15><loc_38><loc_16></location>μ</paragraph>
 <paragraph><location><page_10><loc_49><loc_12><loc_49><loc_14></location>≥</paragraph>
 <figure>
 <location><page_11><loc_28><loc_20><loc_73><loc_77></location>
 <caption>Fig. 6. Visualization of predicted structure and detected bounding boxes on a complex table with many rows. The OTSL model (B) captured repeating pattern of horizontally merged cells from the GT (A), unlike the HTML model (C). The HTML model also didn't complete the HTML sequence correctly and displayed a lot more of drift and overlap of bounding boxes. "PMC5406406_003_01.png" PubTabNet.</caption>
 </figure>
+<caption><location><page_11><loc_22><loc_78><loc_79><loc_84></location>Fig. 6. Visualization of predicted structure and detected bounding boxes on a complex table with many rows. The OTSL model (B) captured repeating pattern of horizontally merged cells from the GT (A), unlike the HTML model (C). The HTML model also didn't complete the HTML sequence correctly and displayed a lot more of drift and overlap of bounding boxes. "PMC5406406_003_01.png" PubTabNet.</caption>
 <subtitle-level-1><location><page_12><loc_22><loc_84><loc_36><loc_85></location>6 Conclusion</subtitle-level-1>
 <paragraph><location><page_12><loc_22><loc_74><loc_79><loc_81></location>We demonstrated that representing tables in HTML for the task of table structure recognition with Im2Seq models is ill-suited and has serious limitations. Furthermore, we presented in this paper an Optimized Table Structure Language (OTSL) which, when compared to commonly used general purpose languages, has several key benefits.</paragraph>
 <paragraph><location><page_12><loc_22><loc_59><loc_79><loc_74></location>First and foremost, given the same network configuration, inference time for a table-structure prediction is about 2 times faster compared to the conventional HTML approach. This is primarily owed to the shorter sequence length of the OTSL representation. Additional performance benefits can be obtained with HPO (hyper parameter optimization). As we demonstrate in our experiments, models trained on OTSL can be significantly smaller, e.g. by reducing the number of encoder and decoder layers, while preserving comparatively good prediction quality. This can further improve inference performance, yielding 5-6 times faster inference speed in OTSL with prediction quality comparable to models trained on HTML (see Table 1).</paragraph>
@@ -340,6 +340,29 @@
 "type": "figure",
 "$ref": "#/figures/0"
 },
+{
+"prov": [
+{
+"bbox": [
+134.765,
+591.77942,
+480.59189,
+665.66583
+],
+"page": 2,
+"span": [
+0,
+574
+],
+"__ref_s3_data": null
+}
+],
+"text": "Fig. 1. Comparison between HTML and OTSL table structure representation: (A) table-example with complex row and column headers, including a 2D empty span, (B) minimal graphical representation of table structure using rectangular layout, (C) HTML representation, (D) OTSL representation. This example demonstrates many of the key-features of OTSL, namely its reduced vocabulary size (12 versus 5 in this case), its reduced sequence length (55 versus 30) and a enhanced internal structure (variable token sequence length per row in HTML versus a fixed length of rows in OTSL).",
+"type": "caption",
+"payload": null,
+"name": "Caption",
+"font": null
+},
 {
 "prov": [
 {
@@ -644,6 +667,29 @@
 "type": "figure",
 "$ref": "#/figures/1"
 },
+{
+"prov": [
+{
+"bbox": [
+145.60701,
+562.78821,
+469.75223000000005,
+570.92072
+],
+"page": 5,
+"span": [
+0,
+73
+],
+"__ref_s3_data": null
+}
+],
+"text": "Fig. 2. Frequency of tokens in HTML and OTSL as they appear in PubTabNet.",
+"type": "caption",
+"payload": null,
+"name": "Caption",
+"font": null
+},
 {
 "prov": [
 {
@@ -1017,6 +1063,29 @@
 "type": "figure",
 "$ref": "#/figures/2"
 },
+{
+"prov": [
+{
+"bbox": [
+134.765,
+636.15033,
+480.5874,
+666.2008100000002
+],
+"page": 7,
+"span": [
+0,
+207
+],
+"__ref_s3_data": null
+}
+],
+"text": "Fig. 3. OTSL description of table structure: A - table example; B - graphical representation of table structure; C - mapping structure on a grid; D - OTSL structure encoding; E - explanation on cell encoding",
+"type": "caption",
+"payload": null,
+"name": "Caption",
+"font": null
+},
 {
 "prov": [
 {
@@ -1390,6 +1459,29 @@
 "type": "figure",
 "$ref": "#/figures/3"
 },
+{
+"prov": [
+{
+"bbox": [
+134.76501,
+288.26035,
+480.59082,
+307.35187
+],
+"page": 8,
+"span": [
+0,
+104
+],
+"__ref_s3_data": null
+}
+],
+"text": "Fig. 4. Architecture sketch of the TableFormer model, which is a representative for the Im2Seq approach.",
+"type": "caption",
+"payload": null,
+"name": "Caption",
+"font": null
+},
 {
 "prov": [
 {
@@ -1658,6 +1750,29 @@
 "type": "figure",
 "$ref": "#/figures/4"
 },
+{
+"prov": [
+{
+"bbox": [
+134.765,
+352.28284,
+480.59106,
+394.40988
+],
+"page": 10,
+"span": [
+0,
+270
+],
+"__ref_s3_data": null
+}
+],
+"text": "Fig. 5. The OTSL model produces more accurate bounding boxes with less overlap (E) than the HTML model (D), when predicting the structure of a sparse table (A), at twice the inference speed because of shorter sequence length (B),(C). \"PMC2807444_006_00.png\" PubTabNet. \u03bc",
+"type": "caption",
+"payload": null,
+"name": "Caption",
+"font": null
+},
 {
 "prov": [
 {
@@ -1709,6 +1824,29 @@
 "type": "figure",
 "$ref": "#/figures/5"
 },
+{
+"prov": [
+{
+"bbox": [
+134.765,
+614.23236,
+480.58838000000003,
+666.2008100000002
+],
+"page": 11,
+"span": [
+0,
+390
+],
+"__ref_s3_data": null
+}
+],
+"text": "Fig. 6. Visualization of predicted structure and detected bounding boxes on a complex table with many rows. The OTSL model (B) captured repeating pattern of horizontally merged cells from the GT (A), unlike the HTML model (C). The HTML model also didn't complete the HTML sequence correctly and displayed a lot more of drift and overlap of bounding boxes. \"PMC5406406_003_01.png\" PubTabNet.",
+"type": "caption",
+"payload": null,
+"name": "Caption",
+"font": null
+},
 {
 "prov": [
 {
@@ -10,6 +10,7 @@
 <location><page_1><loc_12><loc_10><loc_52><loc_31></location>
 <caption>Figure 7-26. Self-locking nuts.</caption>
 </figure>
+<caption><location><page_1><loc_12><loc_8><loc_31><loc_9></location>Figure 7-26. Self-locking nuts.</caption>
 <paragraph><location><page_1><loc_54><loc_85><loc_95><loc_94></location>the most common ranges in size for No. 6 up to 1 / 4 inch, the Rol-top ranges from 1 / 4 inch to 1 / 6 inch, and the bellows type ranges in size from No. 8 up to 3 / 8 inch. Wing-type nuts are made of anodized aluminum alloy, cadmium-plated carbon steel, or stainless steel. The Rol-top nut is cadmium-plated steel, and the bellows type is made of aluminum alloy only.</paragraph>
 <paragraph><location><page_1><loc_54><loc_83><loc_55><loc_85></location>.</paragraph>
 <subtitle-level-1><location><page_1><loc_54><loc_82><loc_76><loc_83></location>Stainless Steel Self-Locking Nut</subtitle-level-1>
@@ -20,4 +21,5 @@
 <location><page_1><loc_54><loc_11><loc_94><loc_46></location>
 <caption>Figure 7-27. Stainless steel self-locking nut.</caption>
 </figure>
+<caption><location><page_1><loc_54><loc_8><loc_81><loc_10></location>Figure 7-27. Stainless steel self-locking nut.</caption>
 </document>
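
The doctags fixtures in these hunks encode provenance inline: every element carries a `<location>` tag with a page number and four `loc_*` coordinates. A minimal parsing sketch (an illustration, not docling's own reader; in these fixtures the loc_* values appear to be percentages of the page extent):

```python
import re

LOC = re.compile(
    r"<location><page_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)></location>"
)

line = ('<caption><location><page_1><loc_54><loc_8><loc_81><loc_10></location>'
        'Figure 7-27. Stainless steel self-locking nut.</caption>')
match = LOC.search(line)
page = int(match.group(1))
coords = tuple(int(g) for g in match.groups()[1:])  # the four loc_* bounds
print(page, coords)
```
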
@@ -206,6 +206,29 @@
 "type": "figure",
 "$ref": "#/figures/0"
 },
+{
+"prov": [
+{
+"bbox": [
+72.0,
+60.99040200000002,
+184.14828,
+71.80239900000004
+],
+"page": 1,
+"span": [
+0,
+31
+],
+"__ref_s3_data": null
+}
+],
+"text": "Figure 7-26. Self-locking nuts.",
+"type": "caption",
+"payload": null,
+"name": "Caption",
+"font": null
+},
 {
 "prov": [
 {
@@ -348,6 +371,29 @@
 "name": "Picture",
 "type": "figure",
 "$ref": "#/figures/1"
+},
+{
+"prov": [
+{
+"bbox": [
+321.0,
+63.010403,
+481.64931999999993,
+73.82240300000001
+],
+"page": 1,
+"span": [
+0,
+46
+],
+"__ref_s3_data": null
+}
+],
+"text": "Figure 7-27. Stainless steel self-locking nut.",
+"type": "caption",
+"payload": null,
+"name": "Caption",
+"font": null
 }
 ],
 "figures": [
@@ -5,11 +5,13 @@
 <location><page_1><loc_22><loc_36><loc_78><loc_62></location>
 <caption>Figure 1: This is an example image.</caption>
 </figure>
+<caption><location><page_1><loc_37><loc_32><loc_63><loc_33></location>Figure 1: This is an example image.</caption>
 <paragraph><location><page_1><loc_22><loc_15><loc_78><loc_30></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua.</paragraph>
 <paragraph><location><page_2><loc_22><loc_66><loc_78><loc_84></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</paragraph>
 <figure>
 <location><page_2><loc_36><loc_36><loc_64><loc_65></location>
 <caption>Figure 2: This is an example image.</caption>
 </figure>
+<caption><location><page_2><loc_37><loc_33><loc_63><loc_34></location>Figure 2: This is an example image.</caption>
 <paragraph><location><page_2><loc_22><loc_15><loc_78><loc_31></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum.</paragraph>
 </document>
@@ -96,6 +96,29 @@
 "type": "figure",
 "$ref": "#/figures/0"
 },
+{
+"prov": [
+{
+"bbox": [
+226.89101,
+254.01826000000005,
+384.3548,
+262.86505
+],
+"page": 1,
+"span": [
+0,
+35
+],
+"__ref_s3_data": null
+}
+],
+"text": "Figure 1: This is an example image.",
+"type": "caption",
+"payload": null,
+"name": "Caption",
+"font": null
+},
 {
 "prov": [
 {
@@ -147,6 +170,29 @@
 "type": "figure",
 "$ref": "#/figures/1"
 },
+{
+"prov": [
+{
+"bbox": [
+226.89101,
+259.94226000000003,
+384.3548,
+268.78903
+],
+"page": 2,
+"span": [
+0,
+35
+],
+"__ref_s3_data": null
+}
+],
+"text": "Figure 2: This is an example image.",
+"type": "caption",
+"payload": null,
+"name": "Caption",
+"font": null
+},
 {
 "prov": [
 {
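
On the JSON side, each added caption object pairs its text with a prov record holding a bbox, a page number, and a character span (the span [0, 35] above covers exactly the 35-character caption string). A short sketch that walks such a file and lists the captions per page, assuming the items sit in a main-text array as in docling's legacy JSON layout (the file name is hypothetical):

```python
import json

with open("groundtruth.json") as fh:  # hypothetical fixture path
    doc = json.load(fh)

for item in doc["main-text"]:
    if item.get("type") != "caption":
        continue
    prov = item["prov"][0]
    # prov["span"] gives the character range of the text that the bbox covers.
    print(prov["page"], prov["bbox"], item["text"])
```
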
@@ -87,6 +87,7 @@
 <location><page_7><loc_22><loc_13><loc_89><loc_53></location>
 <caption>Figure 1-2 Existing row and column controls</caption>
 </figure>
+<caption><location><page_7><loc_22><loc_12><loc_52><loc_13></location>Figure 1-2 Existing row and column controls</caption>
 <subtitle-level-1><location><page_8><loc_11><loc_89><loc_55><loc_91></location>2.1.6 Change Function Usage CL command</subtitle-level-1>
 <paragraph><location><page_8><loc_22><loc_87><loc_89><loc_88></location>The following CL commands can be used to work with, display, or change function usage IDs:</paragraph>
 <paragraph><location><page_8><loc_22><loc_84><loc_49><loc_86></location>- GLYPH<SM590000> Work Function Usage ( WRKFCNUSG )</paragraph>
@@ -150,6 +151,7 @@
 <location><page_10><loc_22><loc_48><loc_89><loc_86></location>
 <caption>Figure 3-1 CREATE PERMISSION SQL statement</caption>
 </figure>
+<caption><location><page_10><loc_22><loc_47><loc_56><loc_48></location>Figure 3-1 CREATE PERMISSION SQL statement</caption>
 <subtitle-level-1><location><page_10><loc_22><loc_43><loc_35><loc_44></location>Column mask</subtitle-level-1>
 <paragraph><location><page_10><loc_22><loc_37><loc_89><loc_43></location>A column mask is a database object that manifests a column value access control rule for a specific column in a specific table. It uses a CASE expression that describes what you see when you access the column. For example, a teller can see only the last four digits of a tax identification number.</paragraph>
 <caption><location><page_11><loc_22><loc_90><loc_67><loc_91></location>Table 3-1 summarizes these special registers and their values.</caption>
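
The fixture paragraph above describes a CASE-style column mask. The real rule is an SQL CASE expression attached to the table; as a hedged Python analogue of the teller example (illustrative only, not part of the fixture):

```python
def mask_tax_id(tax_id: str, role: str) -> str:
    """Show only the last four digits unless the caller's role allows full access."""
    if role == "TELLER":
        return "*" * (len(tax_id) - 4) + tax_id[-4:]
    return tax_id

print(mask_tax_id("123-45-6789", "TELLER"))  # *******6789
```
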
@@ -172,6 +174,7 @@
 <location><page_11><loc_22><loc_25><loc_49><loc_51></location>
 <caption>Figure 3-5 Special registers and adopted authority</caption>
 </figure>
+<caption><location><page_11><loc_22><loc_24><loc_56><loc_25></location>Figure 3-5 Special registers and adopted authority</caption>
 <subtitle-level-1><location><page_11><loc_11><loc_20><loc_40><loc_21></location>3.2.2 Built-in global variables</subtitle-level-1>
 <paragraph><location><page_11><loc_22><loc_15><loc_85><loc_18></location>Built-in global variables are provided with the database manager and are used in SQL statements to retrieve scalar values that are associated with the variables.</paragraph>
 <paragraph><location><page_11><loc_22><loc_9><loc_87><loc_13></location>IBM DB2 for i supports nine different built-in global variables that are read only and maintained by the system. These global variables can be used to identify attributes of the database connection and used as part of the RCAC logic.</paragraph>
@@ -215,6 +218,7 @@
 <location><page_14><loc_10><loc_79><loc_89><loc_88></location>
 <caption>Figure 3-10 Column masks shown in System i Navigator</caption>
 </figure>
+<caption><location><page_14><loc_11><loc_77><loc_48><loc_78></location>Figure 3-10 Column masks shown in System i Navigator</caption>
 <subtitle-level-1><location><page_14><loc_11><loc_73><loc_33><loc_74></location>3.6.6 Activating RCAC</subtitle-level-1>
 <paragraph><location><page_14><loc_22><loc_67><loc_89><loc_71></location>Now that you have created the row permission and the two column masks, RCAC must be activated. The row permission and the two column masks are enabled (last clause in the scripts), but now you must activate RCAC on the table. To do so, complete the following steps:</paragraph>
 <paragraph><location><page_14><loc_22><loc_65><loc_67><loc_66></location>- 1. Run the SQL statements that are shown in Example 3-10.</paragraph>
@ -230,16 +234,19 @@
|
|||||||
<location><page_14><loc_10><loc_18><loc_87><loc_46></location>
|
<location><page_14><loc_10><loc_18><loc_87><loc_46></location>
|
||||||
<caption>Figure 3-11 Selecting the EMPLOYEES table from System i Navigator</caption>
|
<caption>Figure 3-11 Selecting the EMPLOYEES table from System i Navigator</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_14><loc_11><loc_17><loc_57><loc_18></location>Figure 3-11 Selecting the EMPLOYEES table from System i Navigator</caption>
|
||||||
<paragraph><location><page_15><loc_22><loc_87><loc_84><loc_91></location>- 2. Figure 4-68 shows the Visual Explain of the same SQL statement, but with RCAC enabled. It is clear that the implementation of the SQL statement is more complex because the row permission rule becomes part of the WHERE clause.</paragraph>
|
<paragraph><location><page_15><loc_22><loc_87><loc_84><loc_91></location>- 2. Figure 4-68 shows the Visual Explain of the same SQL statement, but with RCAC enabled. It is clear that the implementation of the SQL statement is more complex because the row permission rule becomes part of the WHERE clause.</paragraph>
|
||||||
<paragraph><location><page_15><loc_22><loc_32><loc_89><loc_36></location>- 3. Compare the advised indexes that are provided by the Optimizer without RCAC and with RCAC enabled. Figure 4-69 shows the index advice for the SQL statement without RCAC enabled. The index being advised is for the ORDER BY clause.</paragraph>
|
<paragraph><location><page_15><loc_22><loc_32><loc_89><loc_36></location>- 3. Compare the advised indexes that are provided by the Optimizer without RCAC and with RCAC enabled. Figure 4-69 shows the index advice for the SQL statement without RCAC enabled. The index being advised is for the ORDER BY clause.</paragraph>
|
||||||
<figure>
|
<figure>
|
||||||
<location><page_15><loc_22><loc_40><loc_89><loc_85></location>
|
<location><page_15><loc_22><loc_40><loc_89><loc_85></location>
|
||||||
<caption>Figure 4-68 Visual Explain with RCAC enabled</caption>
|
<caption>Figure 4-68 Visual Explain with RCAC enabled</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_15><loc_22><loc_38><loc_53><loc_39></location>Figure 4-68 Visual Explain with RCAC enabled</caption>
|
||||||
<figure>
|
<figure>
|
||||||
<location><page_15><loc_11><loc_16><loc_83><loc_30></location>
|
<location><page_15><loc_11><loc_16><loc_83><loc_30></location>
|
||||||
<caption>Figure 4-69 Index advice with no RCAC</caption>
|
<caption>Figure 4-69 Index advice with no RCAC</caption>
|
||||||
</figure>
|
</figure>
|
||||||
|
<caption><location><page_15><loc_11><loc_15><loc_37><loc_16></location>Figure 4-69 Index advice with no RCAC</caption>
|
||||||
<paragraph><location><page_16><loc_11><loc_11><loc_82><loc_91></location>THEN C . CUSTOMER_TAX_ID WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'TELLER' ) = 1 THEN ( 'XXX-XX-' CONCAT QSYS2 . SUBSTR ( C . CUSTOMER_TAX_ID , 8 , 4 ) ) WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'CUSTOMER' ) = 1 THEN C . CUSTOMER_TAX_ID ELSE 'XXX-XX-XXXX' END ENABLE ; CREATE MASK BANK_SCHEMA.MASK_DRIVERS_LICENSE_ON_CUSTOMERS ON BANK_SCHEMA.CUSTOMERS AS C FOR COLUMN CUSTOMER_DRIVERS_LICENSE_NUMBER RETURN CASE WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'ADMIN' ) = 1 THEN C . CUSTOMER_DRIVERS_LICENSE_NUMBER WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'TELLER' ) = 1 THEN C . CUSTOMER_DRIVERS_LICENSE_NUMBER WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'CUSTOMER' ) = 1 THEN C . CUSTOMER_DRIVERS_LICENSE_NUMBER ELSE '*************' END ENABLE ; CREATE MASK BANK_SCHEMA.MASK_LOGIN_ID_ON_CUSTOMERS ON BANK_SCHEMA.CUSTOMERS AS C FOR COLUMN CUSTOMER_LOGIN_ID RETURN CASE WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'ADMIN' ) = 1 THEN C . CUSTOMER_LOGIN_ID WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'CUSTOMER' ) = 1 THEN C . CUSTOMER_LOGIN_ID ELSE '*****' END ENABLE ; CREATE MASK BANK_SCHEMA.MASK_SECURITY_QUESTION_ON_CUSTOMERS ON BANK_SCHEMA.CUSTOMERS AS C FOR COLUMN CUSTOMER_SECURITY_QUESTION RETURN CASE WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'ADMIN' ) = 1 THEN C . CUSTOMER_SECURITY_QUESTION WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'CUSTOMER' ) = 1 THEN C . CUSTOMER_SECURITY_QUESTION ELSE '*****' END ENABLE ; CREATE MASK BANK_SCHEMA.MASK_SECURITY_QUESTION_ANSWER_ON_CUSTOMERS ON BANK_SCHEMA.CUSTOMERS AS C FOR COLUMN CUSTOMER_SECURITY_QUESTION_ANSWER RETURN CASE WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'ADMIN' ) = 1 THEN C . CUSTOMER_SECURITY_QUESTION_ANSWER WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'CUSTOMER' ) = 1 THEN C . CUSTOMER_SECURITY_QUESTION_ANSWER ELSE '*****' END ENABLE ; ALTER TABLE BANK_SCHEMA.CUSTOMERS ACTIVATE ROW ACCESS CONTROL ACTIVATE COLUMN ACCESS CONTROL ;</paragraph>
|
<paragraph><location><page_16><loc_11><loc_11><loc_82><loc_91></location>THEN C . CUSTOMER_TAX_ID WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'TELLER' ) = 1 THEN ( 'XXX-XX-' CONCAT QSYS2 . SUBSTR ( C . CUSTOMER_TAX_ID , 8 , 4 ) ) WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'CUSTOMER' ) = 1 THEN C . CUSTOMER_TAX_ID ELSE 'XXX-XX-XXXX' END ENABLE ; CREATE MASK BANK_SCHEMA.MASK_DRIVERS_LICENSE_ON_CUSTOMERS ON BANK_SCHEMA.CUSTOMERS AS C FOR COLUMN CUSTOMER_DRIVERS_LICENSE_NUMBER RETURN CASE WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'ADMIN' ) = 1 THEN C . CUSTOMER_DRIVERS_LICENSE_NUMBER WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'TELLER' ) = 1 THEN C . CUSTOMER_DRIVERS_LICENSE_NUMBER WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'CUSTOMER' ) = 1 THEN C . CUSTOMER_DRIVERS_LICENSE_NUMBER ELSE '*************' END ENABLE ; CREATE MASK BANK_SCHEMA.MASK_LOGIN_ID_ON_CUSTOMERS ON BANK_SCHEMA.CUSTOMERS AS C FOR COLUMN CUSTOMER_LOGIN_ID RETURN CASE WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'ADMIN' ) = 1 THEN C . CUSTOMER_LOGIN_ID WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'CUSTOMER' ) = 1 THEN C . CUSTOMER_LOGIN_ID ELSE '*****' END ENABLE ; CREATE MASK BANK_SCHEMA.MASK_SECURITY_QUESTION_ON_CUSTOMERS ON BANK_SCHEMA.CUSTOMERS AS C FOR COLUMN CUSTOMER_SECURITY_QUESTION RETURN CASE WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'ADMIN' ) = 1 THEN C . CUSTOMER_SECURITY_QUESTION WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'CUSTOMER' ) = 1 THEN C . CUSTOMER_SECURITY_QUESTION ELSE '*****' END ENABLE ; CREATE MASK BANK_SCHEMA.MASK_SECURITY_QUESTION_ANSWER_ON_CUSTOMERS ON BANK_SCHEMA.CUSTOMERS AS C FOR COLUMN CUSTOMER_SECURITY_QUESTION_ANSWER RETURN CASE WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'ADMIN' ) = 1 THEN C . CUSTOMER_SECURITY_QUESTION_ANSWER WHEN QSYS2 . VERIFY_GROUP_FOR_USER ( SESSION_USER , 'CUSTOMER' ) = 1 THEN C . CUSTOMER_SECURITY_QUESTION_ANSWER ELSE '*****' END ENABLE ; ALTER TABLE BANK_SCHEMA.CUSTOMERS ACTIVATE ROW ACCESS CONTROL ACTIVATE COLUMN ACCESS CONTROL ;</paragraph>
|
||||||
<paragraph><location><page_18><loc_47><loc_94><loc_68><loc_96></location>Back cover</paragraph>
|
<paragraph><location><page_18><loc_47><loc_94><loc_68><loc_96></location>Back cover</paragraph>
|
||||||
<subtitle-level-1><location><page_18><loc_4><loc_82><loc_73><loc_91></location>Row and Column Access Control Support in IBM DB2 for i</subtitle-level-1>
|
<subtitle-level-1><location><page_18><loc_4><loc_82><loc_73><loc_91></location>Row and Column Access Control Support in IBM DB2 for i</subtitle-level-1>
|
||||||
|
@ -1601,6 +1601,29 @@
"type": "figure",
"$ref": "#/figures/8"
},
+{
+"prov": [
+{
+"bbox": [
+136.8,
+91.85700199999997,
+316.44727,
+100.18200000000002
+],
+"page": 7,
+"span": [
+0,
+43
+],
+"__ref_s3_data": null
+}
+],
+"text": "Figure 1-2 Existing row and column controls",
+"type": "caption",
+"payload": null,
+"name": "Caption",
+"font": null
+},
{
"prov": [
{
@ -2375,6 +2398,29 @@
"type": "figure",
"$ref": "#/figures/9"
},
+{
+"prov": [
+{
+"bbox": [
+136.8,
+369.53699,
+341.97659,
+377.862
+],
+"page": 10,
+"span": [
+0,
+42
+],
+"__ref_s3_data": null
+}
+],
+"text": "Figure 3-1 CREATE PERMISSION SQL statement",
+"type": "caption",
+"payload": null,
+"name": "Caption",
+"font": null
+},
{
"prov": [
{
@ -2615,6 +2661,29 @@
"type": "figure",
"$ref": "#/figures/10"
},
+{
+"prov": [
+{
+"bbox": [
+136.8,
+186.95709,
+341.25662,
+195.2821
+],
+"page": 11,
+"span": [
+0,
+50
+],
+"__ref_s3_data": null
+}
+],
+"text": "Figure 3-5 Special registers and adopted authority",
+"type": "caption",
+"payload": null,
+"name": "Caption",
+"font": null
+},
{
"prov": [
{
@ -3200,6 +3269,29 @@
"type": "figure",
"$ref": "#/figures/11"
},
+{
+"prov": [
+{
+"bbox": [
+64.800003,
+610.13702,
+293.13809,
+618.46198
+],
+"page": 14,
+"span": [
+0,
+52
+],
+"__ref_s3_data": null
+}
+],
+"text": "Figure 3-10 Column masks shown in System i Navigator",
+"type": "caption",
+"payload": null,
+"name": "Caption",
+"font": null
+},
{
"prov": [
{
@ -3458,6 +3550,29 @@
"type": "figure",
"$ref": "#/figures/12"
},
+{
+"prov": [
+{
+"bbox": [
+64.800003,
+134.63710000000003,
+347.43054,
+142.96210999999994
+],
+"page": 14,
+"span": [
+0,
+65
+],
+"__ref_s3_data": null
+}
+],
+"text": "Figure 3-11 Selecting the EMPLOYEES table from System i Navigator",
+"type": "caption",
+"payload": null,
+"name": "Caption",
+"font": null
+},
{
"prov": [
{
@ -3509,11 +3624,57 @@
"type": "figure",
"$ref": "#/figures/13"
},
+{
+"prov": [
+{
+"bbox": [
+136.8,
+303.117,
+327.09329,
+311.44202
+],
+"page": 15,
+"span": [
+0,
+44
+],
+"__ref_s3_data": null
+}
+],
+"text": "Figure 4-68 Visual Explain with RCAC enabled",
+"type": "caption",
+"payload": null,
+"name": "Caption",
+"font": null
+},
{
"name": "Picture",
"type": "figure",
"$ref": "#/figures/14"
},
+{
+"prov": [
+{
+"bbox": [
+64.800003,
+116.15710000000001,
+227.10149,
+124.48209999999995
+],
+"page": 15,
+"span": [
+0,
+37
+],
+"__ref_s3_data": null
+}
+],
+"text": "Figure 4-69 Index advice with no RCAC",
+"type": "caption",
+"payload": null,
+"name": "Caption",
+"font": null
+},
{
"prov": [
{
@ -336,8 +336,8 @@
{
"page_no": 1,
"bbox": {
-"l": 139.66741943359375,
+"l": 139.6674041748047,
-"t": 454.45458984375,
+"t": 454.4546203613281,
"r": 475.00927734375,
"b": 322.5054626464844,
"coord_origin": "BOTTOMLEFT"
@ -2646,7 +2646,7 @@
"b": 102.78223000000003,
"coord_origin": "TOPLEFT"
},
-"confidence": 0.9373533725738525,
+"confidence": 0.9373533129692078,
"cells": [
{
"index": 0,
@ -2726,7 +2726,7 @@
"b": 152.90697999999998,
"coord_origin": "TOPLEFT"
},
-"confidence": 0.9806433916091919,
+"confidence": 0.9806435108184814,
"cells": [
{
"index": 2,
@ -2881,7 +2881,7 @@
"b": 255.42400999999995,
"coord_origin": "TOPLEFT"
},
-"confidence": 0.98504239320755,
+"confidence": 0.9850425124168396,
"cells": [
{
"index": 7,
@ -3096,7 +3096,7 @@
"b": 327.98218,
"coord_origin": "TOPLEFT"
},
-"confidence": 0.9591909050941467,
+"confidence": 0.9591907262802124,
"cells": [
{
"index": 15,
@ -3280,8 +3280,8 @@
"id": 0,
"label": "table",
"bbox": {
-"l": 139.66741943359375,
+"l": 139.6674041748047,
-"t": 337.54541015625,
+"t": 337.5453796386719,
"r": 475.00927734375,
"b": 469.4945373535156,
"coord_origin": "TOPLEFT"
@ -7787,7 +7787,7 @@
"b": 518.17419,
"coord_origin": "TOPLEFT"
},
-"confidence": 0.9589294195175171,
+"confidence": 0.9589295387268066,
"cells": [
{
"index": 91,
@ -7852,7 +7852,7 @@
"b": 618.3,
"coord_origin": "TOPLEFT"
},
-"confidence": 0.9849975109100342,
+"confidence": 0.9849976301193237,
"cells": [
{
"index": 93,
@ -8184,8 +8184,8 @@
"id": 0,
"label": "table",
"bbox": {
-"l": 139.66741943359375,
+"l": 139.6674041748047,
-"t": 337.54541015625,
+"t": 337.5453796386719,
"r": 475.00927734375,
"b": 469.4945373535156,
"coord_origin": "TOPLEFT"
@ -13582,7 +13582,7 @@
"b": 102.78223000000003,
"coord_origin": "TOPLEFT"
},
-"confidence": 0.9373533725738525,
+"confidence": 0.9373533129692078,
"cells": [
{
"index": 0,
@ -13674,7 +13674,7 @@
"b": 152.90697999999998,
"coord_origin": "TOPLEFT"
},
-"confidence": 0.9806433916091919,
+"confidence": 0.9806435108184814,
"cells": [
{
"index": 2,
@ -13841,7 +13841,7 @@
"b": 255.42400999999995,
"coord_origin": "TOPLEFT"
},
-"confidence": 0.98504239320755,
+"confidence": 0.9850425124168396,
"cells": [
{
"index": 7,
@ -14062,7 +14062,7 @@
"b": 327.98218,
"coord_origin": "TOPLEFT"
},
-"confidence": 0.9591909050941467,
+"confidence": 0.9591907262802124,
"cells": [
{
"index": 15,
@ -14252,8 +14252,8 @@
"id": 0,
"label": "table",
"bbox": {
-"l": 139.66741943359375,
+"l": 139.6674041748047,
-"t": 337.54541015625,
+"t": 337.5453796386719,
"r": 475.00927734375,
"b": 469.4945373535156,
"coord_origin": "TOPLEFT"
@ -19642,7 +19642,7 @@
"b": 518.17419,
"coord_origin": "TOPLEFT"
},
-"confidence": 0.9589294195175171,
+"confidence": 0.9589295387268066,
"cells": [
{
"index": 91,
@ -19713,7 +19713,7 @@
"b": 618.3,
"coord_origin": "TOPLEFT"
},
-"confidence": 0.9849975109100342,
+"confidence": 0.9849976301193237,
"cells": [
{
"index": 93,
@ -20057,7 +20057,7 @@
"b": 152.90697999999998,
"coord_origin": "TOPLEFT"
},
-"confidence": 0.9806433916091919,
+"confidence": 0.9806435108184814,
"cells": [
{
"index": 2,
@ -20224,7 +20224,7 @@
"b": 255.42400999999995,
"coord_origin": "TOPLEFT"
},
-"confidence": 0.98504239320755,
+"confidence": 0.9850425124168396,
"cells": [
{
"index": 7,
@ -20445,7 +20445,7 @@
"b": 327.98218,
"coord_origin": "TOPLEFT"
},
-"confidence": 0.9591909050941467,
+"confidence": 0.9591907262802124,
"cells": [
{
"index": 15,
@ -20635,8 +20635,8 @@
"id": 0,
"label": "table",
"bbox": {
-"l": 139.66741943359375,
+"l": 139.6674041748047,
-"t": 337.54541015625,
+"t": 337.5453796386719,
"r": 475.00927734375,
"b": 469.4945373535156,
"coord_origin": "TOPLEFT"
@ -26025,7 +26025,7 @@
"b": 518.17419,
"coord_origin": "TOPLEFT"
},
-"confidence": 0.9589294195175171,
+"confidence": 0.9589295387268066,
"cells": [
{
"index": 91,
@ -26096,7 +26096,7 @@
"b": 618.3,
"coord_origin": "TOPLEFT"
},
-"confidence": 0.9849975109100342,
+"confidence": 0.9849976301193237,
"cells": [
{
"index": 93,
@ -26440,7 +26440,7 @@
"b": 102.78223000000003,
"coord_origin": "TOPLEFT"
},
-"confidence": 0.9373533725738525,
+"confidence": 0.9373533129692078,
"cells": [
{
"index": 0,
@ -0,0 +1,2 @@
<doctag><text><loc_58><loc_44><loc_426><loc_91>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
</doctag>
1 tests/data/webp/groundtruth/docling_v2/webp-test.json Normal file
@ -0,0 +1 @@
{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 69.0, "t": 767.2550252278646, "r": 506.6666666666667, "b": 688.5883585611979, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
1 tests/data/webp/groundtruth/docling_v2/webp-test.md Normal file
@ -0,0 +1 @@
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package
File diff suppressed because one or more lines are too long
BIN tests/data/webp/webp-test.webp Normal file
Binary file not shown. (Size: 29 KiB)
82 tests/test_backend_webp.py Normal file
@ -0,0 +1,82 @@
import sys
from pathlib import Path
from typing import List

from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult, DoclingDocument
from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
    OcrMacOptions,
    OcrOptions,
    RapidOcrOptions,
    TesseractCliOcrOptions,
    TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, ImageFormatOption
from tests.verify_utils import verify_conversion_result_v2

from .test_data_gen_flag import GEN_TEST_DATA

GENERATE = GEN_TEST_DATA


def get_webp_paths():
    # Define the directory you want to search
    directory = Path("./tests/data/webp/")

    # List all WEBP files in the directory and its subdirectories
    webp_files = sorted(directory.rglob("*.webp"))
    return webp_files


def get_converter(ocr_options: OcrOptions):
    image_format_option = ImageFormatOption()
    image_format_option.pipeline_options.ocr_options = ocr_options

    converter = DocumentConverter(
        format_options={InputFormat.IMAGE: image_format_option},
        allowed_formats=[InputFormat.IMAGE],
    )

    return converter


def test_e2e_webp_conversions():
    webp_paths = get_webp_paths()

    engines: List[OcrOptions] = [
        EasyOcrOptions(),
        TesseractOcrOptions(),
        TesseractCliOcrOptions(),
        EasyOcrOptions(force_full_page_ocr=True),
        TesseractOcrOptions(force_full_page_ocr=True),
        TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]),
        TesseractCliOcrOptions(force_full_page_ocr=True),
        TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]),
    ]

    # rapidocr is only available for Python >=3.6,<3.13
    if sys.version_info < (3, 13):
        engines.append(RapidOcrOptions())
        engines.append(RapidOcrOptions(force_full_page_ocr=True))

    # only works on mac
    if "darwin" == sys.platform:
        engines.append(OcrMacOptions())
        engines.append(OcrMacOptions(force_full_page_ocr=True))

    for ocr_options in engines:
        print(
            f"Converting with ocr_engine: {ocr_options.kind}, language: {ocr_options.lang}"
        )
        converter = get_converter(ocr_options=ocr_options)
        for webp_path in webp_paths:
            print(f"converting {webp_path}")

            doc_result: ConversionResult = converter.convert(webp_path)

            verify_conversion_result_v2(
                input_path=webp_path,
                doc_result=doc_result,
                generate=GENERATE,
                fuzzy=True,
            )
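For quick manual checks outside this test harness, a minimal usage sketch (not part of the commit; it assumes the default DocumentConverter already allows the IMAGE format, as the new image/webp MIME-type entry suggests):

# Minimal sketch: convert a single WEBP image with a default converter.
from docling.document_converter import DocumentConverter

converter = DocumentConverter()  # assumption: IMAGE is among the default allowed formats
result = converter.convert("tests/data/webp/webp-test.webp")
print(result.document.export_to_markdown())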
29 tests/test_settings_load.py Normal file
@ -0,0 +1,29 @@
import os


def _setup_env():
    os.environ["DOCLING_PERF_PAGE_BATCH_SIZE"] = "12"
    os.environ["DOCLING_DEBUG_VISUALIZE_RAW_LAYOUT"] = "True"
    os.environ["DOCLING_ARTIFACTS_PATH"] = "/path/to/artifacts"


def test_settings():
    _setup_env()

    import importlib

    import docling.datamodel.settings as m

    # Reinitialize settings module
    importlib.reload(m)

    # Check top level setting
    assert str(m.settings.artifacts_path) == "/path/to/artifacts"

    # Check nested set via environment variables
    assert m.settings.perf.page_batch_size == 12
    assert m.settings.debug.visualize_raw_layout is True

    # Check nested defaults
    assert m.settings.perf.doc_batch_size == 2
    assert m.settings.debug.visualize_ocr is False
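The test above exercises docling's own mapping of single-underscore variables such as DOCLING_PERF_PAGE_BATCH_SIZE onto nested settings, which this release fixes. As a rough illustration of the general mechanism only (a sketch using stock pydantic-settings with its canonical double-underscore delimiter, not docling's actual settings classes; all names and defaults below are assumptions):

import os

from pydantic import BaseModel
from pydantic_settings import BaseSettings, SettingsConfigDict


class PerfSettings(BaseModel):
    page_batch_size: int = 4  # hypothetical default
    doc_batch_size: int = 2   # hypothetical default


class AppSettings(BaseSettings):
    # With this delimiter, env vars look like DOCLING_PERF__PAGE_BATCH_SIZE.
    model_config = SettingsConfigDict(env_prefix="DOCLING_", env_nested_delimiter="__")

    perf: PerfSettings = PerfSettings()


os.environ["DOCLING_PERF__PAGE_BATCH_SIZE"] = "12"
assert AppSettings().perf.page_batch_size == 12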
@ -462,7 +462,7 @@ def verify_conversion_result_v2(
def verify_document(pred_doc: DoclingDocument, gtfile: str, generate: bool = False):
    if not os.path.exists(gtfile) or generate:
        with open(gtfile, "w") as fw:
-            json.dump(pred_doc.export_to_dict(), fw, indent=2)
+            json.dump(pred_doc.export_to_dict(), fw, ensure_ascii=False, indent=2)

        return True
    else:
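For context, ensure_ascii=False makes json.dump write non-ASCII characters verbatim instead of as \uXXXX escapes, which keeps the regenerated ground-truth files readable. A quick standard-library illustration:

import json

# Default ensure_ascii=True escapes non-ASCII characters:
print(json.dumps({"text": "café"}))                       # {"text": "caf\u00e9"}
# ensure_ascii=False writes them verbatim:
print(json.dumps({"text": "café"}, ensure_ascii=False))   # {"text": "café"}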