Merge pull request #556 from DS4SD/cau/layout-processing-improvement

feat: layout processing improvements and bugfixes
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-12-10 16:29:07 +01:00 committed by GitHub
parent 184eed4095
commit b66fb830c9
52 changed files with 401 additions and 739 deletions

View File

@ -1,3 +1,14 @@
## [v2.10.0](https://github.com/DS4SD/docling/releases/tag/v2.10.0) - 2024-12-09
### Feature
* Docling-parse v2 as default PDF backend ([#549](https://github.com/DS4SD/docling/issues/549)) ([`aca57f0`](https://github.com/DS4SD/docling/commit/aca57f0527dddcc027dc1ee840e2e492ab997170))
### Fix
* Call into docling-core for legacy document transform ([#551](https://github.com/DS4SD/docling/issues/551)) ([`7972d47`](https://github.com/DS4SD/docling/commit/7972d47f88604f02d6a32527116c4d78eb1005e2))
* Introduce Image format options in CLI. Silence the tqdm downloading messages. ([#544](https://github.com/DS4SD/docling/issues/544)) ([`78f61a8`](https://github.com/DS4SD/docling/commit/78f61a8522d3a19ecc1d605e8441fb543ca0fa96))
## [v2.9.0](https://github.com/DS4SD/docling/releases/tag/v2.9.0) - 2024-12-09
### Feature

View File

@ -29,8 +29,10 @@ from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
EasyOcrOptions,
OcrEngine,
OcrMacOptions,
OcrOptions,
PdfBackend,
PdfPipelineOptions,
RapidOcrOptions,
TableFormerMode,
@ -70,22 +72,6 @@ def version_callback(value: bool):
raise typer.Exit()
# Define an enum for the backend options
class PdfBackend(str, Enum):
PYPDFIUM2 = "pypdfium2"
DLPARSE_V1 = "dlparse_v1"
DLPARSE_V2 = "dlparse_v2"
# Define an enum for the ocr engines
class OcrEngine(str, Enum):
EASYOCR = "easyocr"
TESSERACT_CLI = "tesseract_cli"
TESSERACT = "tesseract"
OCRMAC = "ocrmac"
RAPIDOCR = "rapidocr"
def export_documents(
conv_results: Iterable[ConversionResult],
output_dir: Path,

View File

@ -19,12 +19,12 @@ if TYPE_CHECKING:
class ConversionStatus(str, Enum):
PENDING = auto()
STARTED = auto()
FAILURE = auto()
SUCCESS = auto()
PARTIAL_SUCCESS = auto()
SKIPPED = auto()
PENDING = "pending"
STARTED = "started"
FAILURE = "failure"
SUCCESS = "success"
PARTIAL_SUCCESS = "partial_success"
SKIPPED = "skipped"
class InputFormat(str, Enum):
@ -89,15 +89,15 @@ MimeTypeToFormat = {
class DocInputType(str, Enum):
PATH = auto()
STREAM = auto()
PATH = "path"
STREAM = "stream"
class DoclingComponentType(str, Enum):
DOCUMENT_BACKEND = auto()
MODEL = auto()
DOC_ASSEMBLER = auto()
USER_INPUT = auto()
DOCUMENT_BACKEND = "document_backend"
MODEL = "model"
DOC_ASSEMBLER = "doc_assembler"
USER_INPUT = "user_input"
class ErrorItem(BaseModel):

View File

@ -33,6 +33,7 @@ from docling_core.types.legacy_doc.document import (
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
from docling_core.utils.file import resolve_source_to_stream
from docling_core.utils.legacy import docling_document_to_legacy
from pydantic import BaseModel
from typing_extensions import deprecated
@ -191,259 +192,7 @@ class ConversionResult(BaseModel):
@property
@deprecated("Use document instead.")
def legacy_document(self):
reverse_label_mapping = {
DocItemLabel.CAPTION.value: "Caption",
DocItemLabel.FOOTNOTE.value: "Footnote",
DocItemLabel.FORMULA.value: "Formula",
DocItemLabel.LIST_ITEM.value: "List-item",
DocItemLabel.PAGE_FOOTER.value: "Page-footer",
DocItemLabel.PAGE_HEADER.value: "Page-header",
DocItemLabel.PICTURE.value: "Picture", # low threshold adjust to capture chemical structures for examples.
DocItemLabel.SECTION_HEADER.value: "Section-header",
DocItemLabel.TABLE.value: "Table",
DocItemLabel.TEXT.value: "Text",
DocItemLabel.TITLE.value: "Title",
DocItemLabel.DOCUMENT_INDEX.value: "Document Index",
DocItemLabel.CODE.value: "Code",
DocItemLabel.CHECKBOX_SELECTED.value: "Checkbox-Selected",
DocItemLabel.CHECKBOX_UNSELECTED.value: "Checkbox-Unselected",
DocItemLabel.FORM.value: "Form",
DocItemLabel.KEY_VALUE_REGION.value: "Key-Value Region",
DocItemLabel.PARAGRAPH.value: "paragraph",
}
title = ""
desc = DsDocumentDescription(logs=[])
page_hashes = [
PageReference(
hash=create_hash(self.input.document_hash + ":" + str(p.page_no - 1)),
page=p.page_no,
model="default",
)
for p in self.document.pages.values()
]
file_info = DsFileInfoObject(
filename=self.input.file.name,
document_hash=self.input.document_hash,
num_pages=self.input.page_count,
page_hashes=page_hashes,
)
main_text = []
tables = []
figures = []
equations = []
footnotes = []
page_headers = []
page_footers = []
embedded_captions = set()
for ix, (item, level) in enumerate(
self.document.iterate_items(self.document.body)
):
if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0:
caption = item.caption_text(self.document)
if caption:
embedded_captions.add(caption)
for item, level in self.document.iterate_items():
if isinstance(item, DocItem):
item_type = item.label
if isinstance(item, (TextItem, ListItem, SectionHeaderItem)):
if isinstance(item, ListItem) and item.marker:
text = f"{item.marker} {item.text}"
else:
text = item.text
# Can be empty.
prov = [
Prov(
bbox=p.bbox.as_tuple(),
page=p.page_no,
span=[0, len(item.text)],
)
for p in item.prov
]
main_text.append(
BaseText(
text=text,
obj_type=layout_label_to_ds_type.get(item.label),
name=reverse_label_mapping[item.label],
prov=prov,
)
)
# skip captions of they are embedded in the actual
# floating object
if item_type == DocItemLabel.CAPTION and text in embedded_captions:
continue
elif isinstance(item, TableItem) and item.data:
index = len(tables)
ref_str = f"#/tables/{index}"
main_text.append(
Ref(
name=reverse_label_mapping[item.label],
obj_type=layout_label_to_ds_type.get(item.label),
ref=ref_str,
),
)
# Initialise empty table data grid (only empty cells)
table_data = [
[
TableCell(
text="",
# bbox=[0,0,0,0],
spans=[[i, j]],
obj_type="body",
)
for j in range(item.data.num_cols)
]
for i in range(item.data.num_rows)
]
# Overwrite cells in table data for which there is actual cell content.
for cell in item.data.table_cells:
for i in range(
min(cell.start_row_offset_idx, item.data.num_rows),
min(cell.end_row_offset_idx, item.data.num_rows),
):
for j in range(
min(cell.start_col_offset_idx, item.data.num_cols),
min(cell.end_col_offset_idx, item.data.num_cols),
):
celltype = "body"
if cell.column_header:
celltype = "col_header"
elif cell.row_header:
celltype = "row_header"
elif cell.row_section:
celltype = "row_section"
def make_spans(cell):
for rspan in range(
min(
cell.start_row_offset_idx,
item.data.num_rows,
),
min(
cell.end_row_offset_idx, item.data.num_rows
),
):
for cspan in range(
min(
cell.start_col_offset_idx,
item.data.num_cols,
),
min(
cell.end_col_offset_idx,
item.data.num_cols,
),
):
yield [rspan, cspan]
spans = list(make_spans(cell))
table_data[i][j] = GlmTableCell(
text=cell.text,
bbox=(
cell.bbox.as_tuple()
if cell.bbox is not None
else None
), # check if this is bottom-left
spans=spans,
obj_type=celltype,
col=j,
row=i,
row_header=cell.row_header,
row_section=cell.row_section,
col_header=cell.column_header,
row_span=[
cell.start_row_offset_idx,
cell.end_row_offset_idx,
],
col_span=[
cell.start_col_offset_idx,
cell.end_col_offset_idx,
],
)
# Compute the caption
caption = item.caption_text(self.document)
tables.append(
DsSchemaTable(
text=caption,
num_cols=item.data.num_cols,
num_rows=item.data.num_rows,
obj_type=layout_label_to_ds_type.get(item.label),
data=table_data,
prov=[
Prov(
bbox=p.bbox.as_tuple(),
page=p.page_no,
span=[0, 0],
)
for p in item.prov
],
)
)
elif isinstance(item, PictureItem):
index = len(figures)
ref_str = f"#/figures/{index}"
main_text.append(
Ref(
name=reverse_label_mapping[item.label],
obj_type=layout_label_to_ds_type.get(item.label),
ref=ref_str,
),
)
# Compute the caption
caption = item.caption_text(self.document)
figures.append(
Figure(
prov=[
Prov(
bbox=p.bbox.as_tuple(),
page=p.page_no,
span=[0, len(caption)],
)
for p in item.prov
],
obj_type=layout_label_to_ds_type.get(item.label),
text=caption,
# data=[[]],
)
)
page_dimensions = [
PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
for p in self.document.pages.values()
]
ds_doc = DsDocument(
name=title,
description=desc,
file_info=file_info,
main_text=main_text,
equations=equations,
footnotes=footnotes,
page_headers=page_headers,
page_footers=page_footers,
tables=tables,
figures=figures,
page_dimensions=page_dimensions,
)
return ds_doc
return docling_document_to_legacy(self.document)
class _DummyBackend(AbstractDocumentBackend):

View File

@ -190,6 +190,26 @@ class OcrMacOptions(OcrOptions):
)
# Selectable PDF parsing backends.
class PdfBackend(str, Enum):
    """Closed set of PDF backend identifiers.

    The class mixes in ``str``, so every member compares equal to its plain
    string value and ``PdfBackend("dlparse_v2")`` resolves a raw string to
    the matching member.
    """

    # Member values are the exact strings accepted for lookup by value.
    PYPDFIUM2 = "pypdfium2"
    DLPARSE_V1 = "dlparse_v1"
    DLPARSE_V2 = "dlparse_v2"
# Selectable OCR engines.
class OcrEngine(str, Enum):
    """Closed set of OCR engine identifiers.

    The class mixes in ``str``, so every member compares equal to its plain
    string value and ``OcrEngine("easyocr")`` resolves a raw string to the
    matching member.
    """

    # Member values are the exact strings accepted for lookup by value.
    EASYOCR = "easyocr"
    TESSERACT_CLI = "tesseract_cli"
    TESSERACT = "tesseract"
    OCRMAC = "ocrmac"
    RAPIDOCR = "rapidocr"
class PipelineOptions(BaseModel):
"""Base pipeline options."""

View File

@ -80,7 +80,7 @@ class LayoutModel(BasePageModel):
DocItemLabel.TITLE: (255, 153, 153), # Light Red (same as Section-Header)
DocItemLabel.FOOTNOTE: (200, 200, 255), # Light Blue
DocItemLabel.DOCUMENT_INDEX: (220, 220, 220), # Light Gray
DocItemLabel.CODE: (255, 223, 186), # Peach
DocItemLabel.CODE: (125, 125, 125), # Gray
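DocItemLabel.CHECKBOX_SELECTED: (255, 182, 193), # Light Pink (comment previously said "Pale Green", but this RGB equals CHECKBOX_UNSELECTED; TODO confirm intended color)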
DocItemLabel.CHECKBOX_UNSELECTED: (255, 182, 193), # Light Pink
DocItemLabel.FORM: (200, 255, 255), # Light Cyan

View File

@ -97,24 +97,25 @@ class RapidOcrModel(BaseOcrModel):
del high_res_image
del im
cells = [
OcrCell(
id=ix,
text=line[1],
confidence=line[2],
bbox=BoundingBox.from_tuple(
coord=(
(line[0][0][0] / self.scale) + ocr_rect.l,
(line[0][0][1] / self.scale) + ocr_rect.t,
(line[0][2][0] / self.scale) + ocr_rect.l,
(line[0][2][1] / self.scale) + ocr_rect.t,
if result is not None:
cells = [
OcrCell(
id=ix,
text=line[1],
confidence=line[2],
bbox=BoundingBox.from_tuple(
coord=(
(line[0][0][0] / self.scale) + ocr_rect.l,
(line[0][0][1] / self.scale) + ocr_rect.t,
(line[0][2][0] / self.scale) + ocr_rect.l,
(line[0][2][1] / self.scale) + ocr_rect.t,
),
origin=CoordOrigin.TOPLEFT,
),
origin=CoordOrigin.TOPLEFT,
),
)
for ix, line in enumerate(result)
]
all_ocr_cells.extend(cells)
)
for ix, line in enumerate(result)
]
all_ocr_cells.extend(cells)
# Post-process the cells
page.cells = self.post_process_cells(all_ocr_cells, page.cells)

View File

@ -71,6 +71,10 @@ class TableStructureModel(BasePageModel):
x0, y0, x1, y1 = table_element.cluster.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
for cell in table_element.cluster.cells:
x0, y0, x1, y1 = cell.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline="green")
for tc in table_element.table_cells:
if tc.bbox is not None:
x0, y0, x1, y1 = tc.bbox.as_tuple()
@ -84,7 +88,6 @@ class TableStructureModel(BasePageModel):
text=f"{tc.start_row_offset_idx}, {tc.start_col_offset_idx}",
fill="black",
)
if show:
image.show()
else:
@ -136,41 +139,33 @@ class TableStructureModel(BasePageModel):
yield page
continue
tokens = []
for c in page.cells:
for cluster, _ in in_tables:
if c.bbox.area() > 0:
if (
c.bbox.intersection_area_with(cluster.bbox)
/ c.bbox.area()
> 0.2
):
# Only allow non-empty strings (i.e. not just whitespace) into the cells of a table
if len(c.text.strip()) > 0:
new_cell = copy.deepcopy(c)
new_cell.bbox = new_cell.bbox.scaled(
scale=self.scale
)
tokens.append(new_cell.model_dump())
page_input = {
"tokens": tokens,
"width": page.size.width * self.scale,
"height": page.size.height * self.scale,
"image": numpy.asarray(page.get_image(scale=self.scale)),
}
page_input["image"] = numpy.asarray(
page.get_image(scale=self.scale)
)
table_clusters, table_bboxes = zip(*in_tables)
if len(table_bboxes):
tf_output = self.tf_predictor.multi_table_predict(
page_input, table_bboxes, do_matching=self.do_cell_matching
)
for table_cluster, tbl_box in in_tables:
for table_cluster, table_out in zip(table_clusters, tf_output):
tokens = []
for c in table_cluster.cells:
# Only allow non-empty strings (i.e. not just whitespace) into the cells of a table
if len(c.text.strip()) > 0:
new_cell = copy.deepcopy(c)
new_cell.bbox = new_cell.bbox.scaled(
scale=self.scale
)
tokens.append(new_cell.model_dump())
page_input["tokens"] = tokens
tf_output = self.tf_predictor.multi_table_predict(
page_input, [tbl_box], do_matching=self.do_cell_matching
)
table_out = tf_output[0]
table_cells = []
for element in table_out["tf_responses"]:

View File

@ -156,16 +156,16 @@ class LayoutPostprocessor:
SPECIAL_TYPES = WRAPPER_TYPES | {DocItemLabel.PICTURE}
CONFIDENCE_THRESHOLDS = {
DocItemLabel.CAPTION: 0.35,
DocItemLabel.FOOTNOTE: 0.35,
DocItemLabel.FORMULA: 0.35,
DocItemLabel.LIST_ITEM: 0.35,
DocItemLabel.PAGE_FOOTER: 0.35,
DocItemLabel.PAGE_HEADER: 0.35,
DocItemLabel.PICTURE: 0.1,
DocItemLabel.CAPTION: 0.5,
DocItemLabel.FOOTNOTE: 0.5,
DocItemLabel.FORMULA: 0.5,
DocItemLabel.LIST_ITEM: 0.5,
DocItemLabel.PAGE_FOOTER: 0.5,
DocItemLabel.PAGE_HEADER: 0.5,
DocItemLabel.PICTURE: 0.5,
DocItemLabel.SECTION_HEADER: 0.45,
DocItemLabel.TABLE: 0.35,
DocItemLabel.TEXT: 0.45,
DocItemLabel.TEXT: 0.55, # 0.45,
DocItemLabel.TITLE: 0.45,
DocItemLabel.CODE: 0.45,
DocItemLabel.CHECKBOX_SELECTED: 0.45,
@ -218,6 +218,12 @@ class LayoutPostprocessor:
final_clusters = self._sort_clusters(
self.regular_clusters + self.special_clusters
)
for cluster in final_clusters:
cluster.cells = self._sort_cells(cluster.cells)
# Also sort cells in children if any
for child in cluster.children:
child.cells = self._sort_cells(child.cells)
return final_clusters, self.cells
def _process_regular_clusters(self) -> List[Cluster]:
@ -273,6 +279,8 @@ class LayoutPostprocessor:
if c.confidence >= self.CONFIDENCE_THRESHOLDS[c.label]
]
special_clusters = self._handle_cross_type_overlaps(special_clusters)
for special in special_clusters:
contained = []
for cluster in self.regular_clusters:
@ -283,14 +291,17 @@ class LayoutPostprocessor:
contained.append(cluster)
if contained:
# Sort contained clusters by minimum cell ID
contained.sort(
key=lambda cluster: (
min(cell.id for cell in cluster.cells)
if cluster.cells
else sys.maxsize
)
)
# # Sort contained clusters by minimum cell ID:
# contained.sort(
# key=lambda cluster: (
# min(cell.id for cell in cluster.cells)
# if cluster.cells
# else sys.maxsize
# )
# )
# Sort contained clusters top-to-bottom, then left-to-right (see _sort_clusters)
contained = self._sort_clusters(contained)
special.children = contained
# Adjust bbox only for wrapper types
@ -318,6 +329,109 @@ class LayoutPostprocessor:
return picture_clusters + wrapper_clusters
def _handle_cross_type_overlaps(self, special_clusters) -> List[Cluster]:
"""Handle overlaps between regular and wrapper clusters before child assignment.
In particular, KEY_VALUE_REGION proposals that are almost identical to a TABLE
should be removed.
"""
wrappers_to_remove = set()
for wrapper in special_clusters:
if wrapper.label != DocItemLabel.KEY_VALUE_REGION:
continue # only treat KEY_VALUE_REGION for now.
for regular in self.regular_clusters:
if regular.label == DocItemLabel.TABLE:
# Calculate overlap
overlap = regular.bbox.intersection_area_with(wrapper.bbox)
wrapper_area = wrapper.bbox.area()
overlap_ratio = overlap / wrapper_area
# If wrapper is mostly overlapping with a TABLE, remove the wrapper
if overlap_ratio > 0.8: # 80% overlap threshold
wrappers_to_remove.add(wrapper.id)
break
# Filter out the identified wrappers
special_clusters = [
cluster
for cluster in special_clusters
if cluster.id not in wrappers_to_remove
]
return special_clusters
def _should_prefer_cluster(
    self, candidate: Cluster, other: Cluster, params: dict
) -> bool:
    """Decide whether ``candidate`` should win over ``other`` in an overlap group.

    Rules, checked in order:

    1. A LIST_ITEM candidate beats a TEXT cluster when their bbox areas are
       within 20% of each other.
    2. A CODE candidate beats any cluster that has more than 80% of its own
       area inside the candidate's bbox.
    3. Otherwise the candidate is rejected only when it is small relative to
       ``other`` (area ratio <= ``params["area_threshold"]``) *and* ``other``
       is more confident by more than ``params["conf_threshold"]``.

    Args:
        candidate: Cluster being evaluated.
        other: Competing cluster from the same overlap group.
        params: Mapping providing ``"area_threshold"`` and
            ``"conf_threshold"``.

    Returns:
        True if ``candidate`` should be preferred, False otherwise.
    """
    other_area = other.bbox.area()
    if other_area > 0:
        area_ratio = candidate.bbox.area() / other_area
    else:
        # A zero-area competitor makes every ratio below undefined (this
        # used to raise ZeroDivisionError). Treat the candidate as
        # infinitely larger so that no area-based rule can reject it.
        area_ratio = float("inf")

    # Rule 1: LIST_ITEM vs TEXT with similar areas (within 20% of each other)
    if (
        candidate.label == DocItemLabel.LIST_ITEM
        and other.label == DocItemLabel.TEXT
        and abs(1 - area_ratio) < 0.2
    ):
        return True

    # Rule 2: CODE vs others -- how much of `other` lies inside the CODE bbox
    if candidate.label == DocItemLabel.CODE and other_area > 0:
        overlap = other.bbox.intersection_area_with(candidate.bbox)
        if overlap / other_area > 0.8:  # other is 80% contained within CODE
            return True

    # No label-based rule matched: fall back to area/confidence thresholds.
    conf_diff = other.confidence - candidate.confidence
    rejected = (
        area_ratio <= params["area_threshold"]
        and conf_diff > params["conf_threshold"]
    )
    # Default to keeping the candidate if no rule triggered rejection.
    return not rejected
def _select_best_cluster_from_group(
self,
group_clusters: List[Cluster],
params: dict,
) -> Cluster:
"""Select best cluster from a group of overlapping clusters based on all rules."""
current_best = None
for candidate in group_clusters:
should_select = True
for other in group_clusters:
if other == candidate:
continue
if not self._should_prefer_cluster(candidate, other, params):
should_select = False
break
if should_select:
if current_best is None:
current_best = candidate
else:
# If both clusters pass rules, prefer the larger one unless confidence differs significantly
if (
candidate.bbox.area() > current_best.bbox.area()
and current_best.confidence - candidate.confidence
<= params["conf_threshold"]
):
current_best = candidate
return current_best if current_best else group_clusters[0]
def _remove_overlapping_clusters(
self,
clusters: List[Cluster],
@ -360,36 +474,15 @@ class LayoutPostprocessor:
continue
group_clusters = [valid_clusters[cid] for cid in group]
current_best = None
best = self._select_best_cluster_from_group(group_clusters, params)
for candidate in group_clusters:
should_select = True
for other in group_clusters:
if other == candidate:
continue
area_ratio = candidate.bbox.area() / other.bbox.area()
conf_diff = other.confidence - candidate.confidence
if (
area_ratio <= params["area_threshold"]
and conf_diff > params["conf_threshold"]
):
should_select = False
break
if should_select:
if current_best is None or (
candidate.bbox.area() > current_best.bbox.area()
and current_best.confidence - candidate.confidence
<= params["conf_threshold"]
):
current_best = candidate
best = current_best if current_best else group_clusters[0]
# Simple cell merging - no special cases
for cluster in group_clusters:
if cluster != best:
best.cells.extend(cluster.cells)
best.cells = self._deduplicate_cells(best.cells)
best.cells = self._sort_cells(best.cells)
result.append(best)
return result
@ -424,6 +517,16 @@ class LayoutPostprocessor:
return current_best if current_best else clusters[0]
def _deduplicate_cells(self, cells: List[Cell]) -> List[Cell]:
"""Ensure each cell appears only once, maintaining order of first appearance."""
seen_ids = set()
unique_cells = []
for cell in cells:
if cell.id not in seen_ids:
seen_ids.add(cell.id)
unique_cells.append(cell)
return unique_cells
def _assign_cells_to_clusters(
self, clusters: List[Cluster], min_overlap: float = 0.2
) -> List[Cluster]:
@ -452,6 +555,10 @@ class LayoutPostprocessor:
if best_cluster is not None:
best_cluster.cells.append(cell)
# Deduplicate cells in each cluster after assignment
for cluster in clusters:
cluster.cells = self._deduplicate_cells(cluster.cells)
return clusters
def _find_unassigned_cells(self, clusters: List[Cluster]) -> List[Cell]:
@ -487,13 +594,10 @@ class LayoutPostprocessor:
return clusters
def _sort_cells(self, cells: List[Cell]) -> List[Cell]:
"""Sort cells in native reading order."""
return sorted(cells, key=lambda c: (c.id))
def _sort_clusters(self, clusters: List[Cluster]) -> List[Cluster]:
"""Sort clusters in reading order (top-to-bottom, left-to-right)."""
def reading_order_key(cluster: Cluster) -> Tuple[float, float]:
if cluster.cells and cluster.label != DocItemLabel.PICTURE:
first_cell = min(cluster.cells, key=lambda c: (c.bbox.t, c.bbox.l))
return (first_cell.bbox.t, first_cell.bbox.l)
return (cluster.bbox.t, cluster.bbox.l)
return sorted(clusters, key=reading_order_key)
return sorted(clusters, key=lambda cluster: (cluster.bbox.t, cluster.bbox.l))

View File

@ -10,7 +10,7 @@ For each document format, the *document converter* knows which format-specific *
The *conversion result* contains the [*Docling document*](./docling_document.md), Docling's fundamental document representation.
Some typical scenarios for using a Docling document include directly calling its *export methods*, such as for markdown, dictionary etc., or having it chunked by a *chunker*.
Some typical scenarios for using a Docling document include directly calling its *export methods*, such as for markdown, dictionary etc., or having it chunked by a [*chunker*](./chunking.md).
For more details on Docling's architecture, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).

View File

@ -1,4 +1,4 @@
# CLI Reference
# CLI reference
This page provides documentation for our command line tools.

View File

@ -22,9 +22,7 @@ A simple example would look like this:
docling https://arxiv.org/pdf/2206.01062
```
To see all available options (export formats etc.) run `docling --help`. More details in the [CLI reference page](./cli.md).
To see all available options (export formats etc.) run `docling --help`. More details in the [CLI reference page](./reference/cli.md).
### Advanced options
@ -130,29 +128,37 @@ You can limit the CPU threads used by Docling by setting the environment variabl
## Chunking
You can perform a hierarchy-aware chunking of a Docling document as follows:
You can chunk a Docling document using a [chunker](concepts/chunking.md), such as a
`HybridChunker`, as shown below (for more details check out
[this example](examples/hybrid_chunking.ipynb)):
```python
from docling.document_converter import DocumentConverter
from docling_core.transforms.chunker import HierarchicalChunker
from docling.chunking import HybridChunker
conv_res = DocumentConverter().convert("https://arxiv.org/pdf/2206.01062")
doc = conv_res.document
chunks = list(HierarchicalChunker().chunk(doc))
print(chunks[30])
chunker = HybridChunker(tokenizer="BAAI/bge-small-en-v1.5") # set tokenizer as needed
chunk_iter = chunker.chunk(doc)
```
An example chunk would look like this:
```python
print(list(chunk_iter)[11])
# {
# "text": "Lately, new types of ML models for document-layout analysis have emerged [...]",
# "text": "In this paper, we present the DocLayNet dataset. [...]",
# "meta": {
# "doc_items": [{
# "self_ref": "#/texts/40",
# "self_ref": "#/texts/28",
# "label": "text",
# "prov": [{
# "page_no": 2,
# "bbox": {"l": 317.06, "t": 325.81, "r": 559.18, "b": 239.97, ...},
# }]
# }],
# "headings": ["2 RELATED WORK"],
# "bbox": {"l": 53.29, "t": 287.14, "r": 295.56, "b": 212.37, ...},
# }], ...,
# }, ...],
# "headings": ["1 INTRODUCTION"],
# }
# }
```

View File

@ -56,7 +56,6 @@ nav:
- "Docling": index.md
- Installation: installation.md
- Usage: usage.md
- CLI: cli.md
- FAQ: faq.md
- Docling v2: v2.md
- Concepts:
@ -77,15 +76,12 @@ nav:
- "Multimodal export": examples/export_multimodal.py
- "Force full page OCR": examples/full_page_ocr.py
- "Accelerator options": examples/run_with_acclerators.py
- Chunking:
- "Hybrid chunking": examples/hybrid_chunking.ipynb
- RAG / QA:
- "RAG with LlamaIndex 🦙": examples/rag_llamaindex.ipynb
- "RAG with LangChain 🦜🔗": examples/rag_langchain.ipynb
- "Hybrid RAG with Qdrant": examples/hybrid_rag_qdrant.ipynb
- Chunking:
- "Hybrid chunking": examples/hybrid_chunking.ipynb
# - Chunking: examples/chunking.md
# - CLI:
# - CLI: examples/cli.md
- Integrations:
- Integrations: integrations/index.md
- "🐝 Bee": integrations/bee.md
@ -100,10 +96,13 @@ nav:
- "spaCy": integrations/spacy.md
- "txtai": integrations/txtai.md
# - "LangChain 🦜🔗": integrations/langchain.md
- API reference:
- Document Converter: api_reference/document_converter.md
- Pipeline options: api_reference/pipeline_options.md
- Docling Document: api_reference/docling_document.md
- Reference:
- Python API:
- Document Converter: reference/document_converter.md
- Pipeline options: reference/pipeline_options.md
- Docling Document: reference/docling_document.md
- CLI:
- CLI reference: reference/cli.md
markdown_extensions:
- pymdownx.superfences

73
poetry.lock generated
View File

@ -922,27 +922,29 @@ name = "docling-core"
version = "2.9.0"
description = "A python library to define and validate data types in Docling."
optional = false
python-versions = "<4.0,>=3.9"
files = [
{file = "docling_core-2.9.0-py3-none-any.whl", hash = "sha256:b44b077db5d2ac8a900f30a15abe329c165b1f2eb7f1c90d1275c423c1c3d668"},
{file = "docling_core-2.9.0.tar.gz", hash = "sha256:1bf12fe67ee4852330e9bac33fe62b45598ff885481e03a88fa8e1bf48252424"},
]
python-versions = "^3.9"
files = []
develop = false
[package.dependencies]
jsonref = ">=1.1.0,<2.0.0"
jsonschema = ">=4.16.0,<5.0.0"
pandas = ">=2.1.4,<3.0.0"
pillow = ">=10.3.0,<11.0.0"
pydantic = ">=2.6.0,<2.10.0 || >2.10.0,<2.10.1 || >2.10.1,<2.10.2 || >2.10.2,<3.0.0"
jsonref = "^1.1.0"
jsonschema = "^4.16.0"
pandas = "^2.1.4"
pillow = "^10.3.0"
pydantic = ">=2.6.0,<3.0.0,!=2.10.0,!=2.10.1,!=2.10.2"
pyyaml = ">=5.1,<7.0.0"
semchunk = {version = ">=2.2.0,<3.0.0", optional = true, markers = "extra == \"chunking\""}
tabulate = ">=0.9.0,<0.10.0"
transformers = {version = ">=4.34.0,<5.0.0", optional = true, markers = "extra == \"chunking\""}
typing-extensions = ">=4.12.2,<5.0.0"
tabulate = "^0.9.0"
typing-extensions = "^4.12.2"
[package.extras]
chunking = ["semchunk (>=2.2.0,<3.0.0)", "transformers (>=4.34.0,<5.0.0)"]
[package.source]
type = "git"
url = "ssh://git@github.com/DS4SD/docling-core.git"
reference = "cau/include-picture-contents"
resolved_reference = "012f8ac38a2ba7e77110b3f7ad57af2a984232e5"
[[package]]
name = "docling-ibm-models"
version = "2.0.7"
@ -2855,32 +2857,6 @@ files = [
{file = "more_itertools-10.5.0-py3-none-any.whl", hash = "sha256:037b0d3203ce90cca8ab1defbbdac29d5f993fc20131f3664dc8d6acfa872aef"},
]
[[package]]
name = "mpire"
version = "2.10.2"
description = "A Python package for easy multiprocessing, but faster than multiprocessing"
optional = false
python-versions = "*"
files = [
{file = "mpire-2.10.2-py3-none-any.whl", hash = "sha256:d627707f7a8d02aa4c7f7d59de399dec5290945ddf7fbd36cbb1d6ebb37a51fb"},
{file = "mpire-2.10.2.tar.gz", hash = "sha256:f66a321e93fadff34585a4bfa05e95bd946cf714b442f51c529038eb45773d97"},
]
[package.dependencies]
multiprocess = [
{version = "*", optional = true, markers = "python_version < \"3.11\" and extra == \"dill\""},
{version = ">=0.70.15", optional = true, markers = "python_version >= \"3.11\" and extra == \"dill\""},
]
pygments = ">=2.0"
pywin32 = {version = ">=301", markers = "platform_system == \"Windows\""}
tqdm = ">=4.27"
[package.extras]
dashboard = ["flask"]
dill = ["multiprocess", "multiprocess (>=0.70.15)"]
docs = ["docutils (==0.17.1)", "sphinx (==3.2.1)", "sphinx-autodoc-typehints (==1.11.0)", "sphinx-rtd-theme (==0.5.0)", "sphinx-versions (==1.0.1)", "sphinxcontrib-images (==0.9.2)"]
testing = ["ipywidgets", "multiprocess", "multiprocess (>=0.70.15)", "numpy", "pywin32 (>=301)", "rich"]
[[package]]
name = "mpmath"
version = "1.3.0"
@ -6170,21 +6146,6 @@ files = [
cryptography = ">=2.0"
jeepney = ">=0.6"
[[package]]
name = "semchunk"
version = "2.2.0"
description = "A fast and lightweight Python library for splitting text into semantically meaningful chunks."
optional = false
python-versions = ">=3.9"
files = [
{file = "semchunk-2.2.0-py3-none-any.whl", hash = "sha256:7db19ca90ddb48f99265e789e07a7bb111ae25185f9cc3d44b94e1e61b9067fc"},
{file = "semchunk-2.2.0.tar.gz", hash = "sha256:4de761ce614036fa3bea61adbe47e3ade7c96ac9b062f223b3ac353dbfd26743"},
]
[package.dependencies]
mpire = {version = "*", extras = ["dill"]}
tqdm = "*"
[[package]]
name = "semver"
version = "2.13.0"
@ -7723,4 +7684,4 @@ tesserocr = ["tesserocr"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "6917af8d76aa1f85a159f0ab9546478b4bef194ae726c79196bac087c7368fef"
content-hash = "c991515ef231d9eeead33cc876e8cb93fe31e949a5ab92918a4b77257d2700a3"

View File

@ -1,6 +1,6 @@
[tool.poetry]
name = "docling"
version = "2.9.0" # DO NOT EDIT, updated automatically
version = "2.10.0" # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
license = "MIT"
@ -28,7 +28,8 @@ python = "^3.9"
docling-ibm-models = { git = "ssh://git@github.com/DS4SD/docling-ibm-models.git", branch = "nli/performance" }
deepsearch-glm = "^1.0.0"
docling-parse = "^3.0.0"
docling-core = { version = "^2.9.0", extras = ["chunking"] }
#docling-core = { version = "^2.9.0", extras = ["chunking"] }
docling-core = { git = "ssh://git@github.com/DS4SD/docling-core.git", branch = "cau/include-picture-contents" }
pydantic = "^2.0.0"
filetype = "^1.2.0"
pypdfium2 = "^4.30.0"

View File

@ -153,41 +153,20 @@
</table>
<paragraph><location><page_8><loc_9><loc_89><loc_10><loc_90></location>- a.</paragraph>
<paragraph><location><page_8><loc_11><loc_89><loc_82><loc_90></location>- Red - PDF cells, Green - predicted bounding boxes, Blue - post-processed predictions matched to PDF cells</paragraph>
<paragraph><location><page_8><loc_9><loc_87><loc_46><loc_88></location>Japanese language (previously unseen by TableFormer):</paragraph>
<paragraph><location><page_8><loc_50><loc_87><loc_70><loc_88></location>Example table from FinTabNet:</paragraph>
<figure>
<location><page_8><loc_8><loc_76><loc_49><loc_87></location>
</figure>
<caption><location><page_8><loc_9><loc_87><loc_70><loc_88></location>Japanese language (previously unseen by TableFormer): Example table from FinTabNet:</caption>
<caption><location><page_8><loc_9><loc_73><loc_63><loc_74></location>b. Structure predicted by TableFormer, with superimposed matched PDF cell text:</caption>
<figure>
<location><page_8><loc_50><loc_77><loc_91><loc_88></location>
<caption>b. Structure predicted by TableFormer, with superimposed matched PDF cell text:</caption>
<location><page_8><loc_8><loc_76><loc_49><loc_87></location>
<caption>Japanese language (previously unseen by TableFormer): Example table from FinTabNet:b. Structure predicted by TableFormer, with superimposed matched PDF cell text:</caption>
</figure>
<table>
<figure>
<location><page_8><loc_9><loc_63><loc_49><loc_72></location>
<row_0><col_0><body></col_0><col_1><body></col_1><col_2><col_header>論文ファイル</col_2><col_3><col_header>論文ファイル</col_3><col_4><col_header>参考文献</col_4><col_5><col_header>参考文献</col_5></row_0>
<row_1><col_0><col_header>出典</col_0><col_1><col_header>ファイル 数</col_1><col_2><col_header>英語</col_2><col_3><col_header>日本語</col_3><col_4><col_header>英語</col_4><col_5><col_header>日本語</col_5></row_1>
<row_2><col_0><row_header>Association for Computational Linguistics(ACL2003)</col_0><col_1><body>65</col_1><col_2><body>65</col_2><col_3><body>0</col_3><col_4><body>150</col_4><col_5><body>0</col_5></row_2>
<row_3><col_0><row_header>Computational Linguistics(COLING2002)</col_0><col_1><body>140</col_1><col_2><body>140</col_2><col_3><body>0</col_3><col_4><body>150</col_4><col_5><body>0</col_5></row_3>
<row_4><col_0><row_header>電気情報通信学会 2003 年総合大会</col_0><col_1><body>150</col_1><col_2><body>8</col_2><col_3><body>142</col_3><col_4><body>223</col_4><col_5><body>147</col_5></row_4>
<row_5><col_0><row_header>情報処理学会第 65 回全国大会 (2003)</col_0><col_1><body>177</col_1><col_2><body>1</col_2><col_3><body>176</col_3><col_4><body>150</col_4><col_5><body>236</col_5></row_5>
<row_6><col_0><row_header>第 17 回人工知能学会全国大会 (2003)</col_0><col_1><body>208</col_1><col_2><body>5</col_2><col_3><body>203</col_3><col_4><body>152</col_4><col_5><body>244</col_5></row_6>
<row_7><col_0><row_header>自然言語処理研究会第 146 〜 155 回</col_0><col_1><body>98</col_1><col_2><body>2</col_2><col_3><body>96</col_3><col_4><body>150</col_4><col_5><body>232</col_5></row_7>
<row_8><col_0><row_header>WWW から収集した論文</col_0><col_1><body>107</col_1><col_2><body>73</col_2><col_3><body>34</col_3><col_4><body>147</col_4><col_5><body>96</col_5></row_8>
<row_9><col_0><body></col_0><col_1><body>945</col_1><col_2><body>294</col_2><col_3><body>651</col_3><col_4><body>1122</col_4><col_5><body>955</col_5></row_9>
</table>
</figure>
<caption><location><page_8><loc_62><loc_62><loc_90><loc_63></location>Text is aligned to match original for ease of viewing</caption>
<table>
<figure>
<location><page_8><loc_50><loc_64><loc_90><loc_72></location>
<caption>Text is aligned to match original for ease of viewing</caption>
<row_0><col_0><body></col_0><col_1><col_header>Shares (in millions)</col_1><col_2><col_header>Shares (in millions)</col_2><col_3><col_header>Weighted Average Grant Date Fair Value</col_3><col_4><col_header>Weighted Average Grant Date Fair Value</col_4></row_0>
<row_1><col_0><body></col_0><col_1><col_header>RS U s</col_1><col_2><col_header>PSUs</col_2><col_3><col_header>RSUs</col_3><col_4><col_header>PSUs</col_4></row_1>
<row_2><col_0><row_header>Nonvested on Janua ry 1</col_0><col_1><body>1. 1</col_1><col_2><body>0.3</col_2><col_3><body>90.10 $</col_3><col_4><body>$ 91.19</col_4></row_2>
<row_3><col_0><row_header>Granted</col_0><col_1><body>0. 5</col_1><col_2><body>0.1</col_2><col_3><body>117.44</col_3><col_4><body>122.41</col_4></row_3>
<row_4><col_0><row_header>Vested</col_0><col_1><body>(0. 5 )</col_1><col_2><body>(0.1)</col_2><col_3><body>87.08</col_3><col_4><body>81.14</col_4></row_4>
<row_5><col_0><row_header>Canceled or forfeited</col_0><col_1><body>(0. 1 )</col_1><col_2><body>-</col_2><col_3><body>102.01</col_3><col_4><body>92.18</col_4></row_5>
<row_6><col_0><row_header>Nonvested on December 31</col_0><col_1><body>1.0</col_1><col_2><body>0.3</col_2><col_3><body>104.85 $</col_3><col_4><body>$ 104.51</col_4></row_6>
</table>
</figure>
<caption><location><page_8><loc_8><loc_54><loc_89><loc_59></location>Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.</caption>
<figure>
<location><page_8><loc_8><loc_44><loc_35><loc_52></location>
@ -296,7 +275,7 @@
<paragraph><location><page_13><loc_10><loc_35><loc_45><loc_37></location>Figure 8: Example of a table with multi-line header.</paragraph>
<caption><location><page_13><loc_50><loc_59><loc_89><loc_61></location>Figure 9: Example of a table with big empty distance between cells.</caption>
<figure>
<location><page_13><loc_51><loc_63><loc_70><loc_68></location>
<location><page_13><loc_51><loc_63><loc_91><loc_87></location>
<caption>Figure 9: Example of a table with big empty distance between cells.</caption>
</figure>
<caption><location><page_13><loc_51><loc_13><loc_89><loc_14></location>Figure 10: Example of a complex table with empty cells.</caption>
@ -319,7 +298,11 @@
<location><page_14><loc_52><loc_55><loc_87><loc_89></location>
<caption>Figure 13: Table predictions example on colorful table.</caption>
</figure>
<paragraph><location><page_14><loc_56><loc_13><loc_83><loc_14></location>Figure 14: Example with multi-line text.</paragraph>
<caption><location><page_14><loc_56><loc_13><loc_83><loc_14></location>Figure 14: Example with multi-line text.</caption>
<figure>
<location><page_14><loc_52><loc_25><loc_85><loc_31></location>
<caption>Figure 14: Example with multi-line text.</caption>
</figure>
<figure>
<location><page_15><loc_9><loc_69><loc_46><loc_83></location>
</figure>
@ -335,6 +318,9 @@
<caption>Figure 15: Example with triangular table.</caption>
</figure>
<figure>
<location><page_15><loc_53><loc_72><loc_86><loc_85></location>
</figure>
<figure>
<location><page_15><loc_53><loc_41><loc_86><loc_54></location>
</figure>
<caption><location><page_15><loc_50><loc_15><loc_89><loc_18></location>Figure 16: Example of how post-processing helps to restore mis-aligned bounding boxes prediction artifact.</caption>

File diff suppressed because one or more lines are too long

View File

@ -219,40 +219,18 @@ Table 4: Results of structure with content retrieved using cell detection on Pub
- Red - PDF cells, Green - predicted bounding boxes, Blue - post-processed predictions matched to PDF cells
Japanese language (previously unseen by TableFormer):
Example table from FinTabNet:
<!-- image -->
Japanese language (previously unseen by TableFormer): Example table from FinTabNet:
b. Structure predicted by TableFormer, with superimposed matched PDF cell text:
Japanese language (previously unseen by TableFormer): Example table from FinTabNet:b. Structure predicted by TableFormer, with superimposed matched PDF cell text:
<!-- image -->
| | | 論文ファイル | 論文ファイル | 参考文献 | 参考文献 |
|----------------------------------------------------|-------------|----------------|----------------|------------|------------|
| 出典 | ファイル 数 | 英語 | 日本語 | 英語 | 日本語 |
| Association for Computational Linguistics(ACL2003) | 65 | 65 | 0 | 150 | 0 |
| Computational Linguistics(COLING2002) | 140 | 140 | 0 | 150 | 0 |
| 電気情報通信学会 2003 年総合大会 | 150 | 8 | 142 | 223 | 147 |
| 情報処理学会第 65 回全国大会 (2003) | 177 | 1 | 176 | 150 | 236 |
| 第 17 回人工知能学会全国大会 (2003) | 208 | 5 | 203 | 152 | 244 |
| 自然言語処理研究会第 146 〜 155 回 | 98 | 2 | 96 | 150 | 232 |
| WWW から収集した論文 | 107 | 73 | 34 | 147 | 96 |
| | 945 | 294 | 651 | 1122 | 955 |
<!-- image -->
Text is aligned to match original for ease of viewing
| | Shares (in millions) | Shares (in millions) | Weighted Average Grant Date Fair Value | Weighted Average Grant Date Fair Value |
|--------------------------|------------------------|------------------------|------------------------------------------|------------------------------------------|
| | RS U s | PSUs | RSUs | PSUs |
| Nonvested on Janua ry 1 | 1. 1 | 0.3 | 90.10 $ | $ 91.19 |
| Granted | 0. 5 | 0.1 | 117.44 | 122.41 |
| Vested | (0. 5 ) | (0.1) | 87.08 | 81.14 |
| Canceled or forfeited | (0. 1 ) | - | 102.01 | 92.18 |
| Nonvested on December 31 | 1.0 | 0.3 | 104.85 $ | $ 104.51 |
<!-- image -->
Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.
<!-- image -->
@ -458,6 +436,7 @@ Figure 13: Table predictions example on colorful table.
<!-- image -->
Figure 14: Example with multi-line text.
<!-- image -->
<!-- image -->
@ -472,6 +451,9 @@ Figure 15: Example with triangular table.
<!-- image -->
<!-- image -->
<!-- image -->
Figure 16: Example of how post-processing helps to restore mis-aligned bounding boxes prediction artifact.

File diff suppressed because one or more lines are too long

View File

@ -3,17 +3,16 @@
<paragraph><location><page_1><loc_15><loc_77><loc_32><loc_83></location>Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com</paragraph>
<paragraph><location><page_1><loc_42><loc_77><loc_58><loc_83></location>Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com</paragraph>
<paragraph><location><page_1><loc_69><loc_77><loc_85><loc_83></location>Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com</paragraph>
<paragraph><location><page_1><loc_28><loc_70><loc_45><loc_76></location>Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com</paragraph>
<paragraph><location><page_1><loc_55><loc_70><loc_72><loc_76></location>Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com</paragraph>
<paragraph><location><page_1><loc_28><loc_71><loc_45><loc_76></location>Ahmed S. Nassar IBM Research Rueschlikon, Switzerland</paragraph>
<paragraph><location><page_1><loc_29><loc_70><loc_44><loc_71></location>ahn@zurich.ibm.com</paragraph>
<subtitle-level-1><location><page_1><loc_9><loc_67><loc_18><loc_69></location>ABSTRACT</subtitle-level-1>
<paragraph><location><page_1><loc_9><loc_33><loc_48><loc_67></location>Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large ground-truth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis.</paragraph>
<subtitle-level-1><location><page_1><loc_9><loc_29><loc_22><loc_30></location>CCS CONCEPTS</subtitle-level-1>
<paragraph><location><page_1><loc_9><loc_25><loc_49><loc_29></location>· Information systems → Document structure ; · Applied computing → Document analysis ; · Computing methodologies → Machine learning ; Computer vision ; Object detection ;</paragraph>
<paragraph><location><page_1><loc_9><loc_15><loc_48><loc_20></location>Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s).</paragraph>
<paragraph><location><page_1><loc_9><loc_14><loc_32><loc_15></location>KDD '22, August 14-18, 2022, Washington, DC, USA</paragraph>
<paragraph><location><page_1><loc_9><loc_13><loc_31><loc_14></location>© 2022 Copyright held by the owner/author(s).</paragraph>
<paragraph><location><page_1><loc_9><loc_12><loc_26><loc_13></location>ACM ISBN 978-1-4503-9385-0/22/08.</paragraph>
<paragraph><location><page_1><loc_9><loc_12><loc_32><loc_15></location>KDD '22, August 14-18, 2022, Washington, DC, USA © 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08.</paragraph>
<paragraph><location><page_1><loc_9><loc_11><loc_27><loc_12></location>https://doi.org/10.1145/3534678.3539043</paragraph>
<paragraph><location><page_1><loc_55><loc_70><loc_72><loc_76></location>Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com</paragraph>
<caption><location><page_1><loc_52><loc_29><loc_91><loc_32></location>Figure 1: Four examples of complex page layouts across different document categories</caption>
<figure>
<location><page_1><loc_53><loc_34><loc_90><loc_68></location>

File diff suppressed because one or more lines are too long

View File

@ -6,9 +6,9 @@ Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com
Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com
Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com
Ahmed S. Nassar IBM Research Rueschlikon, Switzerland
Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com
ahn@zurich.ibm.com
## ABSTRACT
@ -20,14 +20,12 @@ Accurate document layout analysis is a key requirement for highquality PDF docum
Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s).
KDD '22, August 14-18, 2022, Washington, DC, USA
© 2022 Copyright held by the owner/author(s).
ACM ISBN 978-1-4503-9385-0/22/08.
KDD '22, August 14-18, 2022, Washington, DC, USA © 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08.
https://doi.org/10.1145/3534678.3539043
Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com
Figure 1: Four examples of complex page layouts across different document categories
<!-- image -->

File diff suppressed because one or more lines are too long

View File

@ -1,6 +1,6 @@
<document>
<subtitle-level-1><location><page_1><loc_22><loc_82><loc_79><loc_85></location>Optimized Table Tokenization for Table Structure Recognition</subtitle-level-1>
<paragraph><location><page_1><loc_23><loc_75><loc_78><loc_79></location>Maksym Lysak [0000 - 0002 - 3723 - $^{6960]}$, Ahmed Nassar[0000 - 0002 - 9468 - $^{0822]}$, Nikolaos Livathinos [0000 - 0001 - 8513 - $^{3491]}$, Christoph Auer[0000 - 0001 - 5761 - $^{0422]}$, [0000 - 0002 - 8088 - 0823]</paragraph>
<paragraph><location><page_1><loc_23><loc_75><loc_78><loc_79></location>Maksym Lysak [0000 0002 3723 $^{6960]}$, Ahmed Nassar[0000 0002 9468 $^{0822]}$, Nikolaos Livathinos [0000 0001 8513 $^{3491]}$, Christoph Auer[0000 0001 5761 $^{0422]}$, [0000 0002 8088 0823]</paragraph>
<paragraph><location><page_1><loc_38><loc_74><loc_49><loc_75></location>and Peter Staar</paragraph>
<paragraph><location><page_1><loc_46><loc_72><loc_55><loc_73></location>IBM Research</paragraph>
<paragraph><location><page_1><loc_36><loc_70><loc_64><loc_71></location>{mly,ahn,nli,cau,taa}@zurich.ibm.com</paragraph>

File diff suppressed because one or more lines are too long

View File

@ -1,6 +1,6 @@
## Optimized Table Tokenization for Table Structure Recognition
Maksym Lysak [0000 - 0002 - 3723 - $^{6960]}$, Ahmed Nassar[0000 - 0002 - 9468 - $^{0822]}$, Nikolaos Livathinos [0000 - 0001 - 8513 - $^{3491]}$, Christoph Auer[0000 - 0001 - 5761 - $^{0422]}$, [0000 - 0002 - 8088 - 0823]
Maksym Lysak [0000 0002 3723 $^{6960]}$, Ahmed Nassar[0000 0002 9468 $^{0822]}$, Nikolaos Livathinos [0000 0001 8513 $^{3491]}$, Christoph Auer[0000 0001 5761 $^{0422]}$, [0000 0002 8088 0823]
and Peter Staar

File diff suppressed because one or more lines are too long

View File

@ -5,10 +5,7 @@
</figure>
<subtitle-level-1><location><page_1><loc_6><loc_79><loc_96><loc_89></location>Row and Column Access Control Support in IBM DB2 for i</subtitle-level-1>
<figure>
<location><page_1><loc_5><loc_11><loc_96><loc_63></location>
</figure>
<figure>
<location><page_1><loc_52><loc_2><loc_95><loc_10></location>
<location><page_1><loc_3><loc_1><loc_96><loc_64></location>
</figure>
<subtitle-level-1><location><page_2><loc_11><loc_88><loc_28><loc_91></location>Contents</subtitle-level-1>
<table>
@ -105,7 +102,9 @@
<location><page_5><loc_5><loc_70><loc_39><loc_91></location>
</figure>
<paragraph><location><page_5><loc_13><loc_65><loc_19><loc_66></location>Chapter 1.</paragraph>
<paragraph><location><page_5><loc_82><loc_84><loc_85><loc_88></location>1</paragraph>
<figure>
<location><page_5><loc_78><loc_82><loc_89><loc_91></location>
</figure>
<subtitle-level-1><location><page_5><loc_22><loc_61><loc_89><loc_68></location>Securing and protecting IBM DB2 data</subtitle-level-1>
<paragraph><location><page_5><loc_22><loc_46><loc_89><loc_56></location>Recent news headlines are filled with reports of data breaches and cyber-attacks impacting global businesses of all sizes. The Identity Theft Resource Center$^{1}$ reports that almost 5000 data breaches have occurred since 2005, exposing over 600 million records of data. The financial cost of these data breaches is skyrocketing. Studies from the Ponemon Institute$^{2}$ revealed that the average cost of a data breach increased in 2013 by 15% globally and resulted in a brand equity loss of $9.4 million per attack. The average cost that is incurred for each lost record containing sensitive information increased more than 9% to $145 per record.</paragraph>
<paragraph><location><page_5><loc_22><loc_38><loc_86><loc_44></location>Businesses must make a serious effort to secure their data and recognize that securing information assets is a cost of doing business. In many parts of the world and in many industries, securing the data is required by law and subject to audits. Data security is no longer an option; it is a requirement.</paragraph>
@ -155,17 +154,7 @@
</table>
<paragraph><location><page_8><loc_22><loc_40><loc_89><loc_43></location>To discover who has authorization to define and manage RCAC, you can use the query that is shown in Example 2-1.</paragraph>
<paragraph><location><page_8><loc_22><loc_38><loc_76><loc_39></location>Example 2-1 Query to determine who has authority to define and manage RCAC</paragraph>
<paragraph><location><page_8><loc_22><loc_35><loc_28><loc_36></location>SELECT</paragraph>
<paragraph><location><page_8><loc_30><loc_35><loc_41><loc_36></location>function_id,</paragraph>
<paragraph><location><page_8><loc_27><loc_34><loc_39><loc_35></location>user_name,</paragraph>
<paragraph><location><page_8><loc_28><loc_32><loc_36><loc_33></location>usage,</paragraph>
<paragraph><location><page_8><loc_27><loc_31><loc_39><loc_32></location>user_type</paragraph>
<paragraph><location><page_8><loc_22><loc_29><loc_26><loc_30></location>FROM</paragraph>
<paragraph><location><page_8><loc_29><loc_29><loc_43><loc_30></location>function_usage</paragraph>
<paragraph><location><page_8><loc_22><loc_28><loc_27><loc_29></location>WHERE</paragraph>
<paragraph><location><page_8><loc_29><loc_28><loc_54><loc_29></location>function_id=QIBM_DB_SECADM</paragraph>
<paragraph><location><page_8><loc_22><loc_26><loc_29><loc_27></location>ORDER BY</paragraph>
<paragraph><location><page_8><loc_31><loc_26><loc_39><loc_27></location>user_name;</paragraph>
<table><location><page_8><loc_22><loc_26><loc_89><loc_37></location>SELECT function_id, user_name, usage, user_type FROM function_usage WHERE function_id=QIBM_DB_SECADM ORDER BY user_name;</table>
<subtitle-level-1><location><page_8><loc_11><loc_20><loc_41><loc_22></location>2.2 Separation of duties</subtitle-level-1>
<paragraph><location><page_8><loc_22><loc_10><loc_89><loc_18></location>Separation of duties helps businesses comply with industry regulations or organizational requirements and simplifies the management of authorities. Separation of duties is commonly used to prevent fraudulent activities or errors by a single person. It provides the ability for administrative functions to be divided across individuals without overlapping responsibilities, so that one user does not possess unlimited authority, such as with the *ALLOBJ authority.</paragraph>
<paragraph><location><page_9><loc_22><loc_82><loc_89><loc_91></location>For example, assume that a business has assigned the duty to manage security on IBM i to Theresa. Before release IBM i 7.2, to grant privileges, Theresa had to have the same privileges Theresa was granting to others. Therefore, to grant *USE privileges to the PAYROLL table, Theresa had to have *OBJMGT and *USE authority (or a higher level of authority, such as *ALLOBJ). This requirement allowed Theresa to access the data in the PAYROLL table even though Theresa's job description was only to manage its security.</paragraph>
@ -247,7 +236,7 @@
<paragraph><location><page_12><loc_22><loc_34><loc_66><loc_35></location>- 1. There are user profiles for MGR, JANE, JUDY, and TONY.</paragraph>
<paragraph><location><page_12><loc_22><loc_32><loc_65><loc_33></location>- 2. The user profile JANE specifies a group profile of MGR.</paragraph>
<paragraph><location><page_12><loc_22><loc_28><loc_88><loc_31></location>- 3. If a user is connected to the server using user profile JANE, all of the following function invocations return a value of 1:</paragraph>
<paragraph><location><page_12><loc_25><loc_19><loc_74><loc_27></location>VERIFY_GROUP_FOR_USER (CURRENT_USER, 'MGR') VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JANE', 'MGR') The following function invocation returns a value of 0: VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JUDY', 'TONY') VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JANE', 'MGR', 'STEVE')</paragraph>
<paragraph><location><page_12><loc_25><loc_19><loc_74><loc_27></location>VERIFY_GROUP_FOR_USER (CURRENT_USER, 'MGR') VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JANE', 'MGR') VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JANE', 'MGR', 'STEVE') The following function invocation returns a value of 0: VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JUDY', 'TONY')</paragraph>
<paragraph><location><page_13><loc_22><loc_90><loc_27><loc_91></location>RETURN</paragraph>
<paragraph><location><page_13><loc_22><loc_88><loc_26><loc_89></location>CASE</paragraph>
<paragraph><location><page_13><loc_22><loc_67><loc_85><loc_88></location>WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'HR', 'EMP' ) = 1 THEN EMPLOYEES . DATE_OF_BIRTH WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'MGR' ) = 1 AND SESSION_USER = EMPLOYEES . USER_ID THEN EMPLOYEES . DATE_OF_BIRTH WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'MGR' ) = 1 AND SESSION_USER <> EMPLOYEES . USER_ID THEN ( 9999 || '-' || MONTH ( EMPLOYEES . DATE_OF_BIRTH ) || '-' || DAY (EMPLOYEES.DATE_OF_BIRTH )) ELSE NULL END ENABLE ;</paragraph>
@ -269,12 +258,7 @@
<paragraph><location><page_14><loc_22><loc_67><loc_89><loc_71></location>Now that you have created the row permission and the two column masks, RCAC must be activated. The row permission and the two column masks are enabled (last clause in the scripts), but now you must activate RCAC on the table. To do so, complete the following steps:</paragraph>
<paragraph><location><page_14><loc_22><loc_65><loc_67><loc_66></location>- 1. Run the SQL statements that are shown in Example 3-10.</paragraph>
<subtitle-level-1><location><page_14><loc_22><loc_62><loc_61><loc_63></location>Example 3-10 Activating RCAC on the EMPLOYEES table</subtitle-level-1>
<paragraph><location><page_14><loc_22><loc_60><loc_62><loc_61></location>- /* Active Row Access Control (permissions) */</paragraph>
<paragraph><location><page_14><loc_22><loc_58><loc_58><loc_60></location>- /* Active Column Access Control (masks)</paragraph>
<paragraph><location><page_14><loc_60><loc_58><loc_62><loc_60></location>*/</paragraph>
<paragraph><location><page_14><loc_22><loc_57><loc_48><loc_58></location>ALTER TABLE HR_SCHEMA.EMPLOYEES</paragraph>
<paragraph><location><page_14><loc_22><loc_55><loc_44><loc_56></location>ACTIVATE ROW ACCESS CONTROL</paragraph>
<paragraph><location><page_14><loc_22><loc_54><loc_48><loc_55></location>ACTIVATE COLUMN ACCESS CONTROL;</paragraph>
<paragraph><location><page_14><loc_22><loc_54><loc_62><loc_61></location>- /* Active Row Access Control (permissions) */ /* Active Column Access Control (masks) */ ALTER TABLE HR_SCHEMA.EMPLOYEES ACTIVATE ROW ACCESS CONTROL ACTIVATE COLUMN ACCESS CONTROL;</paragraph>
<paragraph><location><page_14><loc_22><loc_48><loc_88><loc_52></location>- 2. Look at the definition of the EMPLOYEE table, as shown in Figure 3-11. To do this, from the main navigation pane of System i Navigator, click Schemas  HR_SCHEMA  Tables , right-click the EMPLOYEES table, and click Definition .</paragraph>
<caption><location><page_14><loc_11><loc_17><loc_57><loc_18></location>Figure 3-11 Selecting the EMPLOYEES table from System i Navigator</caption>
<figure>

File diff suppressed because one or more lines are too long

View File

@ -6,9 +6,6 @@ Front cover
## Row and Column Access Control Support in IBM DB2 for i
<!-- image -->
<!-- image -->
## Contents
@ -141,7 +138,8 @@ Hernando Bedoya is a Senior IT Specialist at STG Lab Services and Training in Ro
Chapter 1.
1
<!-- image -->
## Securing and protecting IBM DB2 data
@ -223,27 +221,7 @@ To discover who has authorization to define and manage RCAC, you can use the que
Example 2-1 Query to determine who has authority to define and manage RCAC
SELECT
function_id,
user_name,
usage,
user_type
FROM
function_usage
WHERE
function_id=QIBM_DB_SECADM
ORDER BY
user_name;
SELECT function_id, user_name, usage, user_type FROM function_usage WHERE function_id=QIBM_DB_SECADM ORDER BY user_name;
## 2.2 Separation of duties
@ -350,7 +328,7 @@ Here is an example of using the VERIFY_GROUP_FOR_USER function:
- 3. If a user is connected to the server using user profile JANE, all of the following function invocations return a value of 1:
VERIFY_GROUP_FOR_USER (CURRENT_USER, 'MGR') VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JANE', 'MGR') The following function invocation returns a value of 0: VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JUDY', 'TONY') VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JANE', 'MGR', 'STEVE')
VERIFY_GROUP_FOR_USER (CURRENT_USER, 'MGR') VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JANE', 'MGR') VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JANE', 'MGR', 'STEVE') The following function invocation returns a value of 0: VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JUDY', 'TONY')
RETURN
@ -387,17 +365,7 @@ Now that you have created the row permission and the two column masks, RCAC must
## Example 3-10 Activating RCAC on the EMPLOYEES table
- /* Active Row Access Control (permissions) */
- /* Active Column Access Control (masks)
*/
ALTER TABLE HR_SCHEMA.EMPLOYEES
ACTIVATE ROW ACCESS CONTROL
ACTIVATE COLUMN ACCESS CONTROL;
- /* Active Row Access Control (permissions) */ /* Active Column Access Control (masks) */ ALTER TABLE HR_SCHEMA.EMPLOYEES ACTIVATE ROW ACCESS CONTROL ACTIVATE COLUMN ACCESS CONTROL;
- 2. Look at the definition of the EMPLOYEE table, as shown in Figure 3-11. To do this, from the main navigation pane of System i Navigator, click Schemas  HR_SCHEMA  Tables , right-click the EMPLOYEES table, and click Definition .

File diff suppressed because one or more lines are too long

View File

@ -156,39 +156,17 @@
<list_item><location><page_8><loc_9><loc_89><loc_10><loc_90></location>a.</list_item>
<list_item><location><page_8><loc_11><loc_89><loc_82><loc_90></location>Red - PDF cells, Green - predicted bounding boxes, Blue - post-processed predictions matched to PDF cells</list_item>
</unordered_list>
<text><location><page_8><loc_9><loc_87><loc_46><loc_88></location>Japanese language (previously unseen by TableFormer):</text>
<text><location><page_8><loc_50><loc_87><loc_70><loc_88></location>Example table from FinTabNet:</text>
<figure>
<location><page_8><loc_8><loc_76><loc_49><loc_87></location>
<caption>Japanese language (previously unseen by TableFormer): Example table from FinTabNet:b. Structure predicted by TableFormer, with superimposed matched PDF cell text:</caption>
</figure>
<figure>
<location><page_8><loc_50><loc_77><loc_91><loc_88></location>
<caption>b. Structure predicted by TableFormer, with superimposed matched PDF cell text:</caption>
</figure>
<table>
<location><page_8><loc_9><loc_63><loc_49><loc_72></location>
<row_0><col_0><body></col_0><col_1><body></col_1><col_2><col_header>論文ファイル</col_2><col_3><col_header>論文ファイル</col_3><col_4><col_header>参考文献</col_4><col_5><col_header>参考文献</col_5></row_0>
<row_1><col_0><col_header>出典</col_0><col_1><col_header>ファイル 数</col_1><col_2><col_header>英語</col_2><col_3><col_header>日本語</col_3><col_4><col_header>英語</col_4><col_5><col_header>日本語</col_5></row_1>
<row_2><col_0><row_header>Association for Computational Linguistics(ACL2003)</col_0><col_1><body>65</col_1><col_2><body>65</col_2><col_3><body>0</col_3><col_4><body>150</col_4><col_5><body>0</col_5></row_2>
<row_3><col_0><row_header>Computational Linguistics(COLING2002)</col_0><col_1><body>140</col_1><col_2><body>140</col_2><col_3><body>0</col_3><col_4><body>150</col_4><col_5><body>0</col_5></row_3>
<row_4><col_0><row_header>電気情報通信学会 2003 年総合大会</col_0><col_1><body>150</col_1><col_2><body>8</col_2><col_3><body>142</col_3><col_4><body>223</col_4><col_5><body>147</col_5></row_4>
<row_5><col_0><row_header>情報処理学会第 65 回全国大会 (2003)</col_0><col_1><body>177</col_1><col_2><body>1</col_2><col_3><body>176</col_3><col_4><body>150</col_4><col_5><body>236</col_5></row_5>
<row_6><col_0><row_header>第 17 回人工知能学会全国大会 (2003)</col_0><col_1><body>208</col_1><col_2><body>5</col_2><col_3><body>203</col_3><col_4><body>152</col_4><col_5><body>244</col_5></row_6>
<row_7><col_0><row_header>自然言語処理研究会第 146 〜 155 回</col_0><col_1><body>98</col_1><col_2><body>2</col_2><col_3><body>96</col_3><col_4><body>150</col_4><col_5><body>232</col_5></row_7>
<row_8><col_0><row_header>WWW から収集した論文</col_0><col_1><body>107</col_1><col_2><body>73</col_2><col_3><body>34</col_3><col_4><body>147</col_4><col_5><body>96</col_5></row_8>
<row_9><col_0><body></col_0><col_1><body>945</col_1><col_2><body>294</col_2><col_3><body>651</col_3><col_4><body>1122</col_4><col_5><body>955</col_5></row_9>
</table>
<table>
</figure>
<figure>
<location><page_8><loc_50><loc_64><loc_90><loc_72></location>
<caption>Text is aligned to match original for ease of viewing</caption>
<row_0><col_0><body></col_0><col_1><col_header>Shares (in millions)</col_1><col_2><col_header>Shares (in millions)</col_2><col_3><col_header>Weighted Average Grant Date Fair Value</col_3><col_4><col_header>Weighted Average Grant Date Fair Value</col_4></row_0>
<row_1><col_0><body></col_0><col_1><col_header>RS U s</col_1><col_2><col_header>PSUs</col_2><col_3><col_header>RSUs</col_3><col_4><col_header>PSUs</col_4></row_1>
<row_2><col_0><row_header>Nonvested on Janua ry 1</col_0><col_1><body>1. 1</col_1><col_2><body>0.3</col_2><col_3><body>90.10 $</col_3><col_4><body>$ 91.19</col_4></row_2>
<row_3><col_0><row_header>Granted</col_0><col_1><body>0. 5</col_1><col_2><body>0.1</col_2><col_3><body>117.44</col_3><col_4><body>122.41</col_4></row_3>
<row_4><col_0><row_header>Vested</col_0><col_1><body>(0. 5 )</col_1><col_2><body>(0.1)</col_2><col_3><body>87.08</col_3><col_4><body>81.14</col_4></row_4>
<row_5><col_0><row_header>Canceled or forfeited</col_0><col_1><body>(0. 1 )</col_1><col_2><body>-</col_2><col_3><body>102.01</col_3><col_4><body>92.18</col_4></row_5>
<row_6><col_0><row_header>Nonvested on December 31</col_0><col_1><body>1.0</col_1><col_2><body>0.3</col_2><col_3><body>104.85 $</col_3><col_4><body>$ 104.51</col_4></row_6>
</table>
</figure>
<figure>
<location><page_8><loc_8><loc_44><loc_35><loc_52></location>
<caption>Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.</caption>
@ -316,7 +294,7 @@
<text><location><page_13><loc_8><loc_83><loc_47><loc_86></location>Aditional images with examples of TableFormer predictions and post-processing can be found below.</text>
<paragraph><location><page_13><loc_10><loc_35><loc_45><loc_37></location>Figure 8: Example of a table with multi-line header.</paragraph>
<figure>
<location><page_13><loc_51><loc_63><loc_70><loc_68></location>
<location><page_13><loc_51><loc_63><loc_91><loc_87></location>
<caption>Figure 9: Example of a table with big empty distance between cells.</caption>
</figure>
<figure>
@ -335,7 +313,10 @@
<location><page_14><loc_52><loc_55><loc_87><loc_89></location>
<caption>Figure 13: Table predictions example on colorful table.</caption>
</figure>
<paragraph><location><page_14><loc_56><loc_13><loc_83><loc_14></location>Figure 14: Example with multi-line text.</paragraph>
<figure>
<location><page_14><loc_52><loc_25><loc_85><loc_31></location>
<caption>Figure 14: Example with multi-line text.</caption>
</figure>
<figure>
<location><page_15><loc_9><loc_69><loc_46><loc_83></location>
</figure>
@ -350,6 +331,9 @@
<caption>Figure 15: Example with triangular table.</caption>
</figure>
<figure>
<location><page_15><loc_53><loc_72><loc_86><loc_85></location>
</figure>
<figure>
<location><page_15><loc_53><loc_41><loc_86><loc_54></location>
</figure>
<figure>

File diff suppressed because one or more lines are too long

View File

@ -223,38 +223,15 @@ Table 4: Results of structure with content retrieved using cell detection on Pub
- a.
- Red - PDF cells, Green - predicted bounding boxes, Blue - post-processed predictions matched to PDF cells
Japanese language (previously unseen by TableFormer):
Example table from FinTabNet:
Japanese language (previously unseen by TableFormer): Example table from FinTabNet:b. Structure predicted by TableFormer, with superimposed matched PDF cell text:
<!-- image -->
b. Structure predicted by TableFormer, with superimposed matched PDF cell text:
<!-- image -->
| | | 論文ファイル | 論文ファイル | 参考文献 | 参考文献 |
|----------------------------------------------------|-------------|----------------|----------------|------------|------------|
| 出典 | ファイル 数 | 英語 | 日本語 | 英語 | 日本語 |
| Association for Computational Linguistics(ACL2003) | 65 | 65 | 0 | 150 | 0 |
| Computational Linguistics(COLING2002) | 140 | 140 | 0 | 150 | 0 |
| 電気情報通信学会 2003 年総合大会 | 150 | 8 | 142 | 223 | 147 |
| 情報処理学会第 65 回全国大会 (2003) | 177 | 1 | 176 | 150 | 236 |
| 第 17 回人工知能学会全国大会 (2003) | 208 | 5 | 203 | 152 | 244 |
| 自然言語処理研究会第 146 〜 155 回 | 98 | 2 | 96 | 150 | 232 |
| WWW から収集した論文 | 107 | 73 | 34 | 147 | 96 |
| | 945 | 294 | 651 | 1122 | 955 |
Text is aligned to match original for ease of viewing
| | Shares (in millions) | Shares (in millions) | Weighted Average Grant Date Fair Value | Weighted Average Grant Date Fair Value |
|--------------------------|------------------------|------------------------|------------------------------------------|------------------------------------------|
| | RS U s | PSUs | RSUs | PSUs |
| Nonvested on Janua ry 1 | 1. 1 | 0.3 | 90.10 $ | $ 91.19 |
| Granted | 0. 5 | 0.1 | 117.44 | 122.41 |
| Vested | (0. 5 ) | (0.1) | 87.08 | 81.14 |
| Canceled or forfeited | (0. 1 ) | - | 102.01 | 92.18 |
| Nonvested on December 31 | 1.0 | 0.3 | 104.85 $ | $ 104.51 |
<!-- image -->
Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.
@ -426,12 +403,16 @@ Figure 14: Example with multi-line text.
<!-- image -->
<!-- image -->
Figure 15: Example with triangular table.
<!-- image -->
<!-- image -->
<!-- image -->
Figure 16: Example of how post-processing helps to restore mis-aligned bounding boxes prediction artifact.
<!-- image -->

File diff suppressed because one or more lines are too long

View File

@ -3,17 +3,16 @@
<text><location><page_1><loc_15><loc_77><loc_32><loc_83></location>Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com</text>
<text><location><page_1><loc_42><loc_77><loc_58><loc_83></location>Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com</text>
<text><location><page_1><loc_69><loc_77><loc_85><loc_83></location>Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com</text>
<text><location><page_1><loc_28><loc_70><loc_45><loc_76></location>Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com</text>
<text><location><page_1><loc_55><loc_70><loc_72><loc_76></location>Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com</text>
<text><location><page_1><loc_28><loc_71><loc_45><loc_76></location>Ahmed S. Nassar IBM Research Rueschlikon, Switzerland</text>
<text><location><page_1><loc_29><loc_70><loc_44><loc_71></location>ahn@zurich.ibm.com</text>
<section_header_level_1><location><page_1><loc_9><loc_67><loc_18><loc_69></location>ABSTRACT</section_header_level_1>
<text><location><page_1><loc_9><loc_33><loc_48><loc_67></location>Accurate document layout analysis is a key requirement for high-quality PDF document conversion. With the recent availability of public, large ground-truth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet, a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNet-trained models are more robust and thus the preferred choice for general-purpose document-layout analysis.</text>
<section_header_level_1><location><page_1><loc_9><loc_29><loc_22><loc_30></location>CCS CONCEPTS</section_header_level_1>
<text><location><page_1><loc_9><loc_25><loc_49><loc_29></location>· Information systems → Document structure ; · Applied computing → Document analysis ; · Computing methodologies → Machine learning ; Computer vision ; Object detection ;</text>
<text><location><page_1><loc_9><loc_15><loc_48><loc_20></location>Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s).</text>
<text><location><page_1><loc_9><loc_14><loc_32><loc_15></location>KDD '22, August 14-18, 2022, Washington, DC, USA</text>
<text><location><page_1><loc_9><loc_13><loc_31><loc_14></location>© 2022 Copyright held by the owner/author(s).</text>
<text><location><page_1><loc_9><loc_12><loc_26><loc_13></location>ACM ISBN 978-1-4503-9385-0/22/08.</text>
<text><location><page_1><loc_9><loc_12><loc_32><loc_15></location>KDD '22, August 14-18, 2022, Washington, DC, USA © 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08.</text>
<text><location><page_1><loc_9><loc_11><loc_27><loc_12></location>https://doi.org/10.1145/3534678.3539043</text>
<text><location><page_1><loc_55><loc_70><loc_72><loc_76></location>Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com</text>
<figure>
<location><page_1><loc_53><loc_34><loc_90><loc_68></location>
<caption>Figure 1: Four examples of complex page layouts across different document categories</caption>

File diff suppressed because one or more lines are too long

View File

@ -6,9 +6,9 @@ Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com
Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com
Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com
Ahmed S. Nassar IBM Research Rueschlikon, Switzerland
Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com
ahn@zurich.ibm.com
## ABSTRACT
@ -20,14 +20,12 @@ Accurate document layout analysis is a key requirement for highquality PDF docum
Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s).
KDD '22, August 14-18, 2022, Washington, DC, USA
© 2022 Copyright held by the owner/author(s).
ACM ISBN 978-1-4503-9385-0/22/08.
KDD '22, August 14-18, 2022, Washington, DC, USA © 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08.
https://doi.org/10.1145/3534678.3539043
Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com
Figure 1: Four examples of complex page layouts across different document categories
<!-- image -->

File diff suppressed because one or more lines are too long

View File

@ -1,6 +1,6 @@
<document>
<section_header_level_1><location><page_1><loc_22><loc_82><loc_79><loc_85></location>Optimized Table Tokenization for Table Structure Recognition</section_header_level_1>
<text><location><page_1><loc_23><loc_75><loc_78><loc_79></location>Maksym Lysak [0000 - 0002 - 3723 - $^{6960]}$, Ahmed Nassar[0000 - 0002 - 9468 - $^{0822]}$, Nikolaos Livathinos [0000 - 0001 - 8513 - $^{3491]}$, Christoph Auer[0000 - 0001 - 5761 - $^{0422]}$, [0000 - 0002 - 8088 - 0823]</text>
<text><location><page_1><loc_23><loc_75><loc_78><loc_79></location>Maksym Lysak [0000-0002-3723-6960], Ahmed Nassar [0000-0002-9468-0822], Nikolaos Livathinos [0000-0001-8513-3491], Christoph Auer [0000-0001-5761-0422], [0000-0002-8088-0823]</text>
<text><location><page_1><loc_38><loc_74><loc_49><loc_75></location>and Peter Staar</text>
<text><location><page_1><loc_46><loc_72><loc_55><loc_73></location>IBM Research</text>
<text><location><page_1><loc_36><loc_70><loc_64><loc_71></location>{mly,ahn,nli,cau,taa}@zurich.ibm.com</text>

File diff suppressed because one or more lines are too long

View File

@ -1,6 +1,6 @@
## Optimized Table Tokenization for Table Structure Recognition
Maksym Lysak [0000 - 0002 - 3723 - $^{6960]}$, Ahmed Nassar[0000 - 0002 - 9468 - $^{0822]}$, Nikolaos Livathinos [0000 - 0001 - 8513 - $^{3491]}$, Christoph Auer[0000 - 0001 - 5761 - $^{0422]}$, [0000 - 0002 - 8088 - 0823]
Maksym Lysak [0000-0002-3723-6960], Ahmed Nassar [0000-0002-9468-0822], Nikolaos Livathinos [0000-0001-8513-3491], Christoph Auer [0000-0001-5761-0422], [0000-0002-8088-0823]
and Peter Staar

File diff suppressed because one or more lines are too long

View File

@ -5,10 +5,7 @@
</figure>
<section_header_level_1><location><page_1><loc_6><loc_79><loc_96><loc_89></location>Row and Column Access Control Support in IBM DB2 for i</section_header_level_1>
<figure>
<location><page_1><loc_5><loc_11><loc_96><loc_63></location>
</figure>
<figure>
<location><page_1><loc_52><loc_2><loc_95><loc_10></location>
<location><page_1><loc_3><loc_1><loc_96><loc_64></location>
</figure>
<section_header_level_1><location><page_2><loc_11><loc_88><loc_28><loc_91></location>Contents</section_header_level_1>
<table>
@ -109,7 +106,9 @@
<location><page_5><loc_5><loc_70><loc_39><loc_91></location>
</figure>
<text><location><page_5><loc_13><loc_65><loc_19><loc_66></location>Chapter 1.</text>
<text><location><page_5><loc_82><loc_84><loc_85><loc_88></location>1</text>
<figure>
<location><page_5><loc_78><loc_82><loc_89><loc_91></location>
</figure>
<section_header_level_1><location><page_5><loc_22><loc_61><loc_89><loc_68></location>Securing and protecting IBM DB2 data</section_header_level_1>
<text><location><page_5><loc_22><loc_46><loc_89><loc_56></location>Recent news headlines are filled with reports of data breaches and cyber-attacks impacting global businesses of all sizes. The Identity Theft Resource Center$^{1}$ reports that almost 5000 data breaches have occurred since 2005, exposing over 600 million records of data. The financial cost of these data breaches is skyrocketing. Studies from the Ponemon Institute$^{2}$ revealed that the average cost of a data breach increased in 2013 by 15% globally and resulted in a brand equity loss of $9.4 million per attack. The average cost that is incurred for each lost record containing sensitive information increased more than 9% to $145 per record.</text>
<text><location><page_5><loc_22><loc_38><loc_86><loc_44></location>Businesses must make a serious effort to secure their data and recognize that securing information assets is a cost of doing business. In many parts of the world and in many industries, securing the data is required by law and subject to audits. Data security is no longer an option; it is a requirement.</text>
@ -165,17 +164,7 @@
</table>
<text><location><page_8><loc_22><loc_40><loc_89><loc_43></location>To discover who has authorization to define and manage RCAC, you can use the query that is shown in Example 2-1.</text>
<paragraph><location><page_8><loc_22><loc_38><loc_76><loc_39></location>Example 2-1 Query to determine who has authority to define and manage RCAC</paragraph>
<text><location><page_8><loc_22><loc_35><loc_28><loc_36></location>SELECT</text>
<text><location><page_8><loc_30><loc_35><loc_41><loc_36></location>function_id,</text>
<text><location><page_8><loc_27><loc_34><loc_39><loc_35></location>user_name,</text>
<text><location><page_8><loc_28><loc_32><loc_36><loc_33></location>usage,</text>
<text><location><page_8><loc_27><loc_31><loc_39><loc_32></location>user_type</text>
<text><location><page_8><loc_22><loc_29><loc_26><loc_30></location>FROM</text>
<text><location><page_8><loc_29><loc_29><loc_43><loc_30></location>function_usage</text>
<text><location><page_8><loc_22><loc_28><loc_27><loc_29></location>WHERE</text>
<text><location><page_8><loc_29><loc_28><loc_54><loc_29></location>function_id=QIBM_DB_SECADM</text>
<text><location><page_8><loc_22><loc_26><loc_29><loc_27></location>ORDER BY</text>
<text><location><page_8><loc_31><loc_26><loc_39><loc_27></location>user_name;</text>
<table><location><page_8><loc_22><loc_26><loc_89><loc_37></location>SELECT function_id, user_name, usage, user_type FROM function_usage WHERE function_id=QIBM_DB_SECADM ORDER BY user_name;</table>
<section_header_level_1><location><page_8><loc_11><loc_20><loc_41><loc_22></location>2.2 Separation of duties</section_header_level_1>
<text><location><page_8><loc_22><loc_10><loc_89><loc_18></location>Separation of duties helps businesses comply with industry regulations or organizational requirements and simplifies the management of authorities. Separation of duties is commonly used to prevent fraudulent activities or errors by a single person. It provides the ability for administrative functions to be divided across individuals without overlapping responsibilities, so that one user does not possess unlimited authority, such as with the *ALLOBJ authority.</text>
<text><location><page_9><loc_22><loc_82><loc_89><loc_91></location>For example, assume that a business has assigned the duty to manage security on IBM i to Theresa. Before release IBM i 7.2, to grant privileges, Theresa had to have the same privileges Theresa was granting to others. Therefore, to grant *USE privileges to the PAYROLL table, Theresa had to have *OBJMGT and *USE authority (or a higher level of authority, such as *ALLOBJ). This requirement allowed Theresa to access the data in the PAYROLL table even though Theresa's job description was only to manage its security.</text>
@ -255,7 +244,7 @@
<list_item><location><page_12><loc_22><loc_32><loc_65><loc_33></location>2. The user profile JANE specifies a group profile of MGR.</list_item>
<list_item><location><page_12><loc_22><loc_28><loc_88><loc_31></location>3. If a user is connected to the server using user profile JANE, all of the following function invocations return a value of 1:</list_item>
</unordered_list>
<code><location><page_12><loc_25><loc_19><loc_74><loc_27></location>VERIFY_GROUP_FOR_USER (CURRENT_USER, 'MGR') VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JANE', 'MGR') The following function invocation returns a value of 0: VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JUDY', 'TONY') VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JANE', 'MGR', 'STEVE')</code>
<code><location><page_12><loc_25><loc_19><loc_74><loc_27></location>VERIFY_GROUP_FOR_USER (CURRENT_USER, 'MGR') VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JANE', 'MGR') VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JANE', 'MGR', 'STEVE') The following function invocation returns a value of 0: VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JUDY', 'TONY')</code>
<text><location><page_13><loc_22><loc_90><loc_27><loc_91></location>RETURN</text>
<text><location><page_13><loc_22><loc_88><loc_26><loc_89></location>CASE</text>
<code><location><page_13><loc_22><loc_67><loc_85><loc_88></location>WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'HR', 'EMP' ) = 1 THEN EMPLOYEES . DATE_OF_BIRTH WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'MGR' ) = 1 AND SESSION_USER = EMPLOYEES . USER_ID THEN EMPLOYEES . DATE_OF_BIRTH WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'MGR' ) = 1 AND SESSION_USER <> EMPLOYEES . USER_ID THEN ( 9999 || '-' || MONTH ( EMPLOYEES . DATE_OF_BIRTH ) || '-' || DAY (EMPLOYEES.DATE_OF_BIRTH )) ELSE NULL END ENABLE ;</code>
@ -283,14 +272,7 @@
</unordered_list>
<section_header_level_1><location><page_14><loc_22><loc_62><loc_61><loc_63></location>Example 3-10 Activating RCAC on the EMPLOYEES table</section_header_level_1>
<unordered_list>
<list_item><location><page_14><loc_22><loc_60><loc_62><loc_61></location>/* Active Row Access Control (permissions) */</list_item>
<list_item><location><page_14><loc_22><loc_58><loc_58><loc_60></location>/* Active Column Access Control (masks)</list_item>
</unordered_list>
<text><location><page_14><loc_60><loc_58><loc_62><loc_60></location>*/</text>
<text><location><page_14><loc_22><loc_57><loc_48><loc_58></location>ALTER TABLE HR_SCHEMA.EMPLOYEES</text>
<text><location><page_14><loc_22><loc_55><loc_44><loc_56></location>ACTIVATE ROW ACCESS CONTROL</text>
<text><location><page_14><loc_22><loc_54><loc_48><loc_55></location>ACTIVATE COLUMN ACCESS CONTROL;</text>
<unordered_list>
<list_item><location><page_14><loc_22><loc_54><loc_62><loc_61></location>/* Active Row Access Control (permissions) */ /* Active Column Access Control (masks) */ ALTER TABLE HR_SCHEMA.EMPLOYEES ACTIVATE ROW ACCESS CONTROL ACTIVATE COLUMN ACCESS CONTROL;</list_item>
<list_item><location><page_14><loc_22><loc_48><loc_88><loc_52></location>2. Look at the definition of the EMPLOYEE table, as shown in Figure 3-11. To do this, from the main navigation pane of System i Navigator, click Schemas → HR_SCHEMA → Tables, right-click the EMPLOYEES table, and click Definition.</list_item>
</unordered_list>
<figure>

File diff suppressed because one or more lines are too long

View File

@ -6,8 +6,6 @@ Front cover
<!-- image -->
<!-- image -->
## Contents
| Notices | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . vii |
@ -120,7 +118,7 @@ Hernando Bedoya is a Senior IT Specialist at STG Lab Services and Training in Ro
Chapter 1.
1
<!-- image -->
## Securing and protecting IBM DB2 data
@ -198,27 +196,7 @@ To discover who has authorization to define and manage RCAC, you can use the que
Example 2-1 Query to determine who has authority to define and manage RCAC
SELECT
function\_id,
user\_name,
usage,
user\_type
FROM
function\_usage
WHERE
function\_id=QIBM\_DB\_SECADM
ORDER BY
user\_name;
SELECT function\_id, user\_name, usage, user\_type FROM function\_usage WHERE function\_id=QIBM\_DB\_SECADM ORDER BY user\_name;
## 2.2 Separation of duties
@ -318,7 +296,7 @@ Here is an example of using the VERIFY\_GROUP\_FOR\_USER function:
- 3. If a user is connected to the server using user profile JANE, all of the following function invocations return a value of 1:
```
VERIFY\_GROUP\_FOR\_USER (CURRENT\_USER, 'MGR') VERIFY\_GROUP\_FOR\_USER (CURRENT\_USER, 'JANE', 'MGR') The following function invocation returns a value of 0: VERIFY\_GROUP\_FOR\_USER (CURRENT\_USER, 'JUDY', 'TONY') VERIFY\_GROUP\_FOR\_USER (CURRENT\_USER, 'JANE', 'MGR', 'STEVE')
VERIFY\_GROUP\_FOR\_USER (CURRENT\_USER, 'MGR') VERIFY\_GROUP\_FOR\_USER (CURRENT\_USER, 'JANE', 'MGR') VERIFY\_GROUP\_FOR\_USER (CURRENT\_USER, 'JANE', 'MGR', 'STEVE') The following function invocation returns a value of 0: VERIFY\_GROUP\_FOR\_USER (CURRENT\_USER, 'JUDY', 'TONY')
```
RETURN
@ -356,17 +334,7 @@ Now that you have created the row permission and the two column masks, RCAC must
## Example 3-10 Activating RCAC on the EMPLOYEES table
- /* Active Row Access Control (permissions) */
- /* Active Column Access Control (masks)
*/
ALTER TABLE HR\_SCHEMA.EMPLOYEES
ACTIVATE ROW ACCESS CONTROL
ACTIVATE COLUMN ACCESS CONTROL;
- /* Active Row Access Control (permissions) */ /* Active Column Access Control (masks) */ ALTER TABLE HR\_SCHEMA.EMPLOYEES ACTIVATE ROW ACCESS CONTROL ACTIVATE COLUMN ACCESS CONTROL;
- 2. Look at the definition of the EMPLOYEE table, as shown in Figure 3-11. To do this, from the main navigation pane of System i Navigator, click Schemas → HR\_SCHEMA → Tables, right-click the EMPLOYEES table, and click Definition.
Figure 3-11 Selecting the EMPLOYEES table from System i Navigator

File diff suppressed because one or more lines are too long

View File

@ -8,8 +8,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
GENERATE_V1 = False
GENERATE_V2 = False
GENERATE_V1 = True
GENERATE_V2 = True
def get_pdf_paths():

View File

@ -18,8 +18,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
GENERATE_V1 = False
GENERATE_V2 = False
GENERATE_V1 = True
GENERATE_V2 = True
def get_pdf_paths():