Big refactoring for legacy_document support

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Christoph Auer 2024-10-14 16:36:11 +02:00
parent 08ab628e75
commit 497ddb34a8
54 changed files with 1198 additions and 590 deletions

View File

@ -146,8 +146,8 @@ from docling.document_converter import DocumentConverter
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
converter = DocumentConverter()
result = converter.convert_single(source)
print(result.output.export_to_markdown()) # output: "## Docling Technical Report[...]"
print(result.output.export_to_document_tokens()) # output: "<document><title><page_1><loc_20>..."
print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
print(result.document.export_to_document_tokens()) # output: "<document><title><page_1><loc_20>..."
```
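The rename in this hunk is mechanical: the `ConversionResult` attribute moves from `output` to `document`, while the export methods keep their names. A minimal before/after sketch based on the lines above:

```python
from docling.document_converter import DocumentConverter

result = DocumentConverter().convert_single("https://arxiv.org/pdf/2408.09869")
print(result.document.export_to_markdown())         # was: result.output.export_to_markdown()
print(result.document.export_to_document_tokens())  # was: result.output.export_to_document_tokens()
```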
### Convert a batch of documents
@ -289,7 +289,7 @@ You can perform a hierarchy-aware chunking of a Docling document as follows:
from docling.document_converter import DocumentConverter
from docling_core.transforms.chunker import HierarchicalChunker
doc = DocumentConverter().convert_single("https://arxiv.org/pdf/2206.01062").legacy_output
doc = DocumentConverter().convert_single("https://arxiv.org/pdf/2206.01062").legacy_document
chunks = list(HierarchicalChunker().chunk(doc))
print(chunks[0])
# ChunkWithMetadata(
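A hedged sketch of consuming all chunks rather than printing only the first; `chunk.text` is an assumed field of the chunk model whose repr is truncated above:

```python
from docling.document_converter import DocumentConverter
from docling_core.transforms.chunker import HierarchicalChunker

doc = DocumentConverter().convert_single("https://arxiv.org/pdf/2206.01062").legacy_document
for chunk in HierarchicalChunker().chunk(doc):
    print(chunk.text)  # assumed field; chunks also carry hierarchy/provenance metadata
```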

View File

@ -1,12 +1,13 @@
from abc import ABC, abstractmethod
from io import BytesIO
from pathlib import Path
from typing import Set, Union
from typing import TYPE_CHECKING, Set, Union
# from docling.datamodel.document import InputDocument
from docling_core.types.experimental import DoclingDocument
if TYPE_CHECKING:
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
class AbstractDocumentBackend(ABC):
@ -34,7 +35,7 @@ class AbstractDocumentBackend(ABC):
@classmethod
@abstractmethod
def supported_formats(cls) -> Set[InputFormat]:
def supported_formats(cls) -> Set["InputFormat"]:
pass
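This is the standard pattern for breaking a runtime import cycle: the import only executes under static type checking, and the annotation becomes a quoted forward reference that is never resolved at import time. A self-contained sketch (the class name is illustrative):

```python
from typing import TYPE_CHECKING, Set

if TYPE_CHECKING:
    # Evaluated by type checkers only; avoids the circular import at runtime.
    from docling.datamodel.base_models import InputFormat

class ExampleBackend:
    @classmethod
    def supported_formats(cls) -> Set["InputFormat"]:
        # Quoted forward reference: stored as a string, resolved lazily.
        return set()
```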

View File

@ -102,7 +102,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
shape_bbox = BoundingBox.from_tuple(shape_bbox, origin=CoordOrigin.BOTTOMLEFT)
# prov = [{"bbox": shape_bbox, "page": parent_slide, "span": [0, len(text)]}]
prov = ProvenanceItem(
page_no=slide_ind, charspan=[0, len(text)], bbox=shape_bbox
page_no=slide_ind + 1, charspan=[0, len(text)], bbox=shape_bbox
)
return prov
@ -292,7 +292,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
)
size = Size(width=slide_width, height=slide_height)
parent_page = doc.add_page(page_no=slide_ind, size=size)
parent_page = doc.add_page(page_no=slide_ind + 1, size=size)
# parent_page = doc.add_page(page_no=slide_ind, size=size, hash=hash)
# Loop through each shape in the slide
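The `+ 1` offsets reconcile python-pptx's 0-based slide indices with the 1-based page numbers expected by `DoclingDocument`. A hedged sketch, assuming python-pptx and a local `deck.pptx`:

```python
from pptx import Presentation  # python-pptx

prs = Presentation("deck.pptx")
for slide_ind, slide in enumerate(prs.slides):
    page_no = slide_ind + 1  # slide index 0 becomes page 1, as in doc.add_page above
    print(page_no, slide.slide_id)
```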

View File

@ -14,25 +14,25 @@ from docling.datamodel.document import InputDocument
class PdfPageBackend(ABC):
@abstractmethod
def get_text_in_rect(self, bbox: "BoundingBox") -> str:
def get_text_in_rect(self, bbox: BoundingBox) -> str:
pass
@abstractmethod
def get_text_cells(self) -> Iterable["Cell"]:
def get_text_cells(self) -> Iterable[Cell]:
pass
@abstractmethod
def get_bitmap_rects(self, float: int = 1) -> Iterable["BoundingBox"]:
def get_bitmap_rects(self, float: int = 1) -> Iterable[BoundingBox]:
pass
@abstractmethod
def get_page_image(
self, scale: float = 1, cropbox: Optional["BoundingBox"] = None
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
) -> Image.Image:
pass
@abstractmethod
def get_size(self) -> "Size":
def get_size(self) -> Size:
pass
@abstractmethod
@ -46,7 +46,7 @@ class PdfPageBackend(ABC):
class PdfDocumentBackend(PaginatedDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
if self.input_format is not InputFormat.PDF:

View File

@ -87,28 +87,28 @@ def export_documents(
fname = output_dir / f"{doc_filename}.json"
with fname.open("w") as fp:
_log.info(f"writing JSON output to {fname}")
fp.write(json.dumps(conv_res.render_as_dict()))
fp.write(json.dumps(conv_res.document.export_to_dict()))
# Export Text format:
if export_txt:
fname = output_dir / f"{doc_filename}.txt"
with fname.open("w") as fp:
_log.info(f"writing Text output to {fname}")
fp.write(conv_res.render_as_text())
fp.write(conv_res.document.export_to_markdown(strict_text=True))
# Export Markdown format:
if export_md:
fname = output_dir / f"{doc_filename}.md"
with fname.open("w") as fp:
_log.info(f"writing Markdown output to {fname}")
fp.write(conv_res.render_as_markdown())
fp.write(conv_res.document.export_to_markdown())
# Export Document Tags format:
if export_doctags:
fname = output_dir / f"{doc_filename}.doctags"
with fname.open("w") as fp:
_log.info(f"writing Doc Tags output to {fname}")
fp.write(conv_res.render_as_doctags())
fp.write(conv_res.document.export_to_doctags())
else:
_log.warning(f"Document {conv_res.input.file} failed to convert.")
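Taken together, the old `render_as_*` helpers map one-to-one onto the `DoclingDocument` export surface. A condensed sketch of the mapping (assuming a successful `ConversionResult` named `result`):

```python
import json
from pathlib import Path

def export_all(result, output_dir: Path, doc_filename: str) -> None:
    # Old render_as_* helpers mapped onto the new document.export_* methods.
    (output_dir / f"{doc_filename}.json").write_text(
        json.dumps(result.document.export_to_dict())  # was: render_as_dict()
    )
    (output_dir / f"{doc_filename}.txt").write_text(
        result.document.export_to_markdown(strict_text=True)  # was: render_as_text()
    )
    (output_dir / f"{doc_filename}.md").write_text(
        result.document.export_to_markdown()  # was: render_as_markdown()
    )
    (output_dir / f"{doc_filename}.doctags").write_text(
        result.document.export_to_doctags()  # was: render_as_doctags()
    )
```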

View File

@ -1,6 +1,6 @@
from enum import Enum, auto
from io import BytesIO
from typing import Dict, List, Optional, Union
from typing import TYPE_CHECKING, Dict, List, Optional, Union
from docling_core.types.experimental import BoundingBox, Size
from docling_core.types.experimental.document import PictureData, TableCell
@ -8,6 +8,9 @@ from docling_core.types.experimental.labels import DocItemLabel
from PIL.Image import Image
from pydantic import BaseModel, ConfigDict
if TYPE_CHECKING:
from docling.backend.pdf_backend import PdfPageBackend
class ConversionStatus(str, Enum):
PENDING = auto()
@ -27,10 +30,13 @@ class InputFormat(str, Enum):
FormatToMimeType = {
InputFormat.DOCX: {
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.openxmlformats-officedocument.wordprocessingml.template",
},
InputFormat.PPTX: {
"application/vnd.openxmlformats-officedocument.presentationml.presentation"
"application/vnd.openxmlformats-officedocument.presentationml.template",
"application/vnd.openxmlformats-officedocument.presentationml.slideshow",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
},
InputFormat.HTML: {"text/html", "application/xhtml+xml"},
InputFormat.IMAGE: {

View File

@ -3,7 +3,7 @@ import re
from enum import Enum
from io import BytesIO
from pathlib import Path, PurePath
from typing import Dict, Iterable, List, Optional, Tuple, Union
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union
import filetype
from docling_core.types import BaseText
@ -13,12 +13,18 @@ from docling_core.types import FileInfoObject as DsFileInfoObject
from docling_core.types import PageDimensions, PageReference, Prov, Ref
from docling_core.types import Table as DsSchemaTable
from docling_core.types.doc.base import BoundingBox as DsBoundingBox
from docling_core.types.doc.base import Figure, TableCell
from docling_core.types.doc.base import Figure, GlmTableCell, TableCell
from docling_core.types.experimental import (
DescriptionItem,
DocItem,
DocItemLabel,
DoclingDocument,
PictureItem,
SectionHeaderItem,
TableItem,
TextItem,
)
from docling_core.types.experimental.document import ListItem
from docling_core.utils.file import resolve_file_source
from pydantic import BaseModel
from typing_extensions import deprecated
@ -40,6 +46,9 @@ from docling.datamodel.base_models import (
from docling.datamodel.settings import DocumentLimits
from docling.utils.utils import create_file_hash, create_hash
if TYPE_CHECKING:
from docling.document_converter import FormatOption
_log = logging.getLogger(__name__)
layout_label_to_ds_type = {
@ -58,6 +67,7 @@ layout_label_to_ds_type = {
DocItemLabel.CODE: "paragraph",
DocItemLabel.PICTURE: "figure",
DocItemLabel.TEXT: "paragraph",
DocItemLabel.PARAGRAPH: "paragraph",
}
_EMPTY_LEGACY_DOC = DsDocument(
@ -166,20 +176,42 @@ class ConversionResult(BaseModel):
pages: List[Page] = []
assembled: AssembledUnit = AssembledUnit()
legacy_output: Optional[DsDocument] = None # _EMPTY_LEGACY_DOC
output: DoclingDocument = _EMPTY_DOCLING_DOC
document: DoclingDocument = _EMPTY_DOCLING_DOC
@property
@deprecated("Use document instead.")
def legacy_document(self):
reverse_label_mapping = {
DocItemLabel.CAPTION.value: "Caption",
DocItemLabel.FOOTNOTE.value: "Footnote",
DocItemLabel.FORMULA.value: "Formula",
DocItemLabel.LIST_ITEM.value: "List-item",
DocItemLabel.PAGE_FOOTER.value: "Page-footer",
DocItemLabel.PAGE_HEADER.value: "Page-header",
DocItemLabel.PICTURE.value: "Picture",  # low threshold, adjusted to also capture e.g. chemical structures
DocItemLabel.SECTION_HEADER.value: "Section-header",
DocItemLabel.TABLE.value: "Table",
DocItemLabel.TEXT.value: "Text",
DocItemLabel.TITLE.value: "Title",
DocItemLabel.DOCUMENT_INDEX.value: "Document Index",
DocItemLabel.CODE.value: "Code",
DocItemLabel.CHECKBOX_SELECTED.value: "Checkbox-Selected",
DocItemLabel.CHECKBOX_UNSELECTED.value: "Checkbox-Unselected",
DocItemLabel.FORM.value: "Form",
DocItemLabel.KEY_VALUE_REGION.value: "Key-Value Region",
DocItemLabel.PARAGRAPH.value: "paragraph",
}
def _to_legacy_document(self) -> DsDocument:
title = ""
desc = DsDocumentDescription(logs=[])
page_hashes = [
PageReference(
hash=create_hash(self.input.document_hash + ":" + str(p.page_no)),
page=p.page_no + 1,
hash=create_hash(self.input.document_hash + ":" + str(p.page_no - 1)),
page=p.page_no,
model="default",
)
for p in self.pages
for p in self.document.pages.values()
]
file_info = DsFileInfoObject(
@ -192,39 +224,62 @@ class ConversionResult(BaseModel):
main_text = []
tables = []
figures = []
equations = []
footnotes = []
page_headers = []
page_footers = []
page_no_to_page = {p.page_no: p for p in self.pages}
embedded_captions = set()
for ix, (item, level) in enumerate(
self.document.iterate_items(self.document.body)
):
for element in self.assembled.elements:
# Convert bboxes to lower-left origin.
target_bbox = DsBoundingBox(
element.cluster.bbox.to_bottom_left_origin(
page_no_to_page[element.page_no].size.height
).as_tuple()
)
if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0:
caption = item.caption_text(self.document)
if caption:
embedded_captions.add(caption)
if isinstance(element, TextElement):
main_text.append(
BaseText(
text=element.text,
obj_type=layout_label_to_ds_type.get(element.label),
name=element.label,
for item, level in self.document.iterate_items():
if isinstance(item, DocItem):
item_type = item.label
if isinstance(item, (TextItem, ListItem, SectionHeaderItem)):
if isinstance(item, ListItem) and item.marker:
text = f"{item.marker} {item.text}"
else:
text = item.text
# Can be empty.
prov = [
Prov(
bbox=target_bbox,
page=element.page_no + 1,
span=[0, len(element.text)],
bbox=p.bbox.as_tuple(),
page=p.page_no,
span=[0, len(item.text)],
)
],
for p in item.prov
]
main_text.append(
BaseText(
text=text,
obj_type=layout_label_to_ds_type.get(item.label),
name=reverse_label_mapping[item.label],
prov=prov,
)
)
elif isinstance(element, Table):
# skip captions if they are embedded in the actual
# floating object
if item_type == DocItemLabel.CAPTION and text in embedded_captions:
continue
elif isinstance(item, TableItem) and item.data:
index = len(tables)
ref_str = f"#/tables/{index}"
main_text.append(
Ref(
name=element.label,
obj_type=layout_label_to_ds_type.get(element.label),
name=reverse_label_mapping[item.label],
obj_type=layout_label_to_ds_type.get(item.label),
ref=ref_str,
),
)
@ -238,20 +293,20 @@ class ConversionResult(BaseModel):
spans=[[i, j]],
obj_type="body",
)
for j in range(element.num_cols)
for j in range(item.data.num_cols)
]
for i in range(element.num_rows)
for i in range(item.data.num_rows)
]
# Overwrite cells in table data for which there is actual cell content.
for cell in element.table_cells:
for cell in item.data.table_cells:
for i in range(
min(cell.start_row_offset_idx, element.num_rows),
min(cell.end_row_offset_idx, element.num_rows),
min(cell.start_row_offset_idx, item.data.num_rows),
min(cell.end_row_offset_idx, item.data.num_rows),
):
for j in range(
min(cell.start_col_offset_idx, element.num_cols),
min(cell.end_col_offset_idx, element.num_cols),
min(cell.start_col_offset_idx, item.data.num_cols),
min(cell.end_col_offset_idx, item.data.num_cols),
):
celltype = "body"
if cell.column_header:
@ -263,74 +318,105 @@ class ConversionResult(BaseModel):
def make_spans(cell):
for rspan in range(
min(cell.start_row_offset_idx, element.num_rows),
min(cell.end_row_offset_idx, element.num_rows),
min(
cell.start_row_offset_idx,
item.data.num_rows,
),
min(
cell.end_row_offset_idx, item.data.num_rows
),
):
for cspan in range(
min(
cell.start_col_offset_idx, element.num_cols
cell.start_col_offset_idx,
item.data.num_cols,
),
min(
cell.end_col_offset_idx,
item.data.num_cols,
),
min(cell.end_col_offset_idx, element.num_cols),
):
yield [rspan, cspan]
spans = list(make_spans(cell))
table_data[i][j] = TableCell(
table_data[i][j] = GlmTableCell(
text=cell.text,
bbox=cell.bbox.to_bottom_left_origin(
page_no_to_page[element.page_no].size.height
).as_tuple(),
# col=j,
# row=i,
bbox=(
cell.bbox.as_tuple()
if cell.bbox is not None
else None
), # check if this is bottom-left
spans=spans,
obj_type=celltype,
# col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx],
# row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx]
col=j,
row=i,
row_header=cell.row_header,
row_section=cell.row_section,
col_header=cell.column_header,
row_span=[
cell.start_row_offset_idx,
cell.end_row_offset_idx,
],
col_span=[
cell.start_col_offset_idx,
cell.end_col_offset_idx,
],
)
# Compute the caption
caption = item.caption_text(self.document)
tables.append(
DsSchemaTable(
num_cols=element.num_cols,
num_rows=element.num_rows,
obj_type=layout_label_to_ds_type.get(element.label),
text=caption,
num_cols=item.data.num_cols,
num_rows=item.data.num_rows,
obj_type=layout_label_to_ds_type.get(item.label),
data=table_data,
prov=[
Prov(
bbox=target_bbox,
page=element.page_no + 1,
bbox=p.bbox.as_tuple(),
page=p.page_no,
span=[0, 0],
)
for p in item.prov
],
)
)
elif isinstance(element, FigureElement):
elif isinstance(item, PictureItem):
index = len(figures)
ref_str = f"#/figures/{index}"
main_text.append(
Ref(
name=element.label,
obj_type=layout_label_to_ds_type.get(element.label),
name=reverse_label_mapping[item.label],
obj_type=layout_label_to_ds_type.get(item.label),
ref=ref_str,
),
)
# Compute the caption
caption = item.caption_text(self.document)
figures.append(
Figure(
prov=[
Prov(
bbox=target_bbox,
page=element.page_no + 1,
span=[0, 0],
bbox=p.bbox.as_tuple(),
page=p.page_no,
span=[0, len(caption)],
)
for p in item.prov
],
obj_type=layout_label_to_ds_type.get(element.label),
obj_type=layout_label_to_ds_type.get(item.label),
text=caption,
# data=[[]],
)
)
page_dimensions = [
PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
for p in self.pages
PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
for p in self.document.pages.values()
]
ds_doc = DsDocument(
@ -338,6 +424,10 @@ class ConversionResult(BaseModel):
description=desc,
file_info=file_info,
main_text=main_text,
equations=equations,
footnotes=footnotes,
page_headers=page_headers,
page_footers=page_footers,
tables=tables,
figures=figures,
page_dimensions=page_dimensions,
@ -345,115 +435,6 @@ class ConversionResult(BaseModel):
return ds_doc
@deprecated("Use output.export_to_dict() instead.")
def render_as_dict(self):
return self.legacy_output.model_dump(by_alias=True, exclude_none=True)
@deprecated("Use output.export_to_markdown() instead.")
def render_as_markdown(
self,
delim: str = "\n\n",
main_text_start: int = 0,
main_text_stop: Optional[int] = None,
main_text_labels: list[str] = [
"title",
"subtitle-level-1",
"paragraph",
"caption",
"table",
"figure",
],
strict_text: bool = False,
image_placeholder: str = "<!-- image -->",
) -> str:
if self.legacy_output is None:
raise RuntimeError(
"No legacy output was produced, can not export as markdown. "
"Please use output.export_to_markdown() instead."
)
else:
return self.legacy_output.export_to_markdown(
delim=delim,
main_text_start=main_text_start,
main_text_stop=main_text_stop,
main_text_labels=main_text_labels,
strict_text=strict_text,
image_placeholder=image_placeholder,
)
@deprecated("Use output.export_to_text() instead.")
def render_as_text(
self,
delim: str = "\n\n",
main_text_start: int = 0,
main_text_stop: Optional[int] = None,
main_text_labels: list[str] = [
"title",
"subtitle-level-1",
"paragraph",
"caption",
],
) -> str:
if self.legacy_output is None:
raise RuntimeError(
"No legacy output was produced, can not export as text. "
"Please use output.export_to_markdown() instead."
)
else:
return self.legacy_output.export_to_markdown(
delim=delim,
main_text_start=main_text_start,
main_text_stop=main_text_stop,
main_text_labels=main_text_labels,
strict_text=True,
)
@deprecated("Use output.export_to_document_tokens() instead.")
def render_as_doctags(
self,
delim: str = "\n\n",
main_text_start: int = 0,
main_text_stop: Optional[int] = None,
main_text_labels: list[str] = [
"title",
"subtitle-level-1",
"paragraph",
"caption",
"table",
"figure",
],
xsize: int = 100,
ysize: int = 100,
add_location: bool = True,
add_content: bool = True,
add_page_index: bool = True,
# table specific flags
add_table_cell_location: bool = False,
add_table_cell_label: bool = True,
add_table_cell_text: bool = True,
) -> str:
if self.legacy_output is None:
raise RuntimeError(
"No legacy output was produced, can not export as doctags. "
"Please use output.export_to_markdown() instead."
)
else:
return self.legacy_output.export_to_document_tokens(
delim=delim,
main_text_start=main_text_start,
main_text_stop=main_text_stop,
main_text_labels=main_text_labels,
xsize=xsize,
ysize=ysize,
add_location=add_location,
add_content=add_content,
add_page_index=add_page_index,
# table specific flags
add_table_cell_location=add_table_cell_location,
add_table_cell_label=add_table_cell_label,
add_table_cell_text=add_table_cell_text,
)
def render_element_images(
self, element_types: Tuple[PageElement] = (FigureElement,)
):
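The net effect in this file: `document` becomes the primary output, the legacy `DsDocument` is derived on demand through the deprecated `legacy_document` property, and the `render_as_*` helpers are removed. A hedged consumption sketch:

```python
import warnings
from docling.document_converter import DocumentConverter

result = DocumentConverter().convert_single("https://arxiv.org/pdf/2408.09869")
with warnings.catch_warnings():
    warnings.simplefilter("always", DeprecationWarning)
    legacy = result.legacy_document  # deprecated property; runs _to_legacy_document()
print(result.document.export_to_markdown())  # the supported path going forward
```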

View File

@ -98,14 +98,14 @@ class DocumentConverter:
self.format_to_options = format_options
if self.allowed_formats is None:
if self.format_to_options is not None:
self.allowed_formats = self.format_to_options.keys()
else:
# if self.format_to_options is not None:
# self.allowed_formats = self.format_to_options.keys()
# else:
self.allowed_formats = [e for e in InputFormat] # all formats
if self.format_to_options is None:
self.format_to_options = _format_to_default_options
else:
for f in self.allowed_formats:
if f not in self.format_to_options.keys():
_log.info(f"Requested format {f} will use default options.")
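With this change, `allowed_formats` defaults to every `InputFormat` regardless of which `format_options` were passed, and any allowed format without an explicit option falls back to its default. A small sketch of the resulting behavior, assuming the attributes shown above are accessible:

```python
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter

converter = DocumentConverter()  # no arguments: all formats allowed, defaults applied
assert set(converter.allowed_formats) == set(InputFormat)
```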

View File

@ -9,48 +9,229 @@ from deepsearch_glm.utils.doc_utils import (
)
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
from docling_core.types import BaseText
from docling_core.types import Document as DsDocument
from docling_core.types import Document as DsLegacyDocument
from docling_core.types import Ref
from docling_core.types import DocumentDescription as DsDocumentDescription
from docling_core.types import FileInfoObject as DsFileInfoObject
from docling_core.types import PageDimensions, PageReference, Prov, Ref
from docling_core.types import Table as DsSchemaTable
from docling_core.types.doc.base import BoundingBox as DsBoundingBox
from docling_core.types.doc.base import Figure, TableCell
from docling_core.types.experimental import BoundingBox, CoordOrigin
from docling_core.types.experimental.document import DoclingDocument
from PIL import ImageDraw
from pydantic import BaseModel, ConfigDict
from docling.datamodel.base_models import Cluster
from docling.datamodel.document import ConversionResult
from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
from docling.utils.utils import create_hash
class GlmOptions(BaseModel):
model_config = ConfigDict(protected_namespaces=())
create_legacy_output: bool = True
model_names: str = "" # e.g. "language;term;reference"
class GlmModel:
def __init__(self, options: GlmOptions):
self.options = options
self.create_legacy_output = self.options.create_legacy_output
load_pretrained_nlp_models()
self.model = init_nlp_model(model_names=self.options.model_names)
def _to_legacy_document(self, conv_res) -> DsDocument:
title = ""
desc = DsDocumentDescription(logs=[])
page_hashes = [
PageReference(
hash=create_hash(conv_res.input.document_hash + ":" + str(p.page_no)),
page=p.page_no + 1,
model="default",
)
for p in conv_res.pages
]
file_info = DsFileInfoObject(
filename=conv_res.input.file.name,
document_hash=conv_res.input.document_hash,
num_pages=conv_res.input.page_count,
page_hashes=page_hashes,
)
main_text = []
tables = []
figures = []
page_no_to_page = {p.page_no: p for p in conv_res.pages}
for element in conv_res.assembled.elements:
# Convert bboxes to lower-left origin.
target_bbox = DsBoundingBox(
element.cluster.bbox.to_bottom_left_origin(
page_no_to_page[element.page_no].size.height
).as_tuple()
)
if isinstance(element, TextElement):
main_text.append(
BaseText(
text=element.text,
obj_type=layout_label_to_ds_type.get(element.label),
name=element.label,
prov=[
Prov(
bbox=target_bbox,
page=element.page_no + 1,
span=[0, len(element.text)],
)
],
)
)
elif isinstance(element, Table):
index = len(tables)
ref_str = f"#/tables/{index}"
main_text.append(
Ref(
name=element.label,
obj_type=layout_label_to_ds_type.get(element.label),
ref=ref_str,
),
)
# Initialise empty table data grid (only empty cells)
table_data = [
[
TableCell(
text="",
# bbox=[0,0,0,0],
spans=[[i, j]],
obj_type="body",
)
for j in range(element.num_cols)
]
for i in range(element.num_rows)
]
# Overwrite cells in table data for which there is actual cell content.
for cell in element.table_cells:
for i in range(
min(cell.start_row_offset_idx, element.num_rows),
min(cell.end_row_offset_idx, element.num_rows),
):
for j in range(
min(cell.start_col_offset_idx, element.num_cols),
min(cell.end_col_offset_idx, element.num_cols),
):
celltype = "body"
if cell.column_header:
celltype = "col_header"
elif cell.row_header:
celltype = "row_header"
elif cell.row_section:
celltype = "row_section"
def make_spans(cell):
for rspan in range(
min(cell.start_row_offset_idx, element.num_rows),
min(cell.end_row_offset_idx, element.num_rows),
):
for cspan in range(
min(
cell.start_col_offset_idx, element.num_cols
),
min(cell.end_col_offset_idx, element.num_cols),
):
yield [rspan, cspan]
spans = list(make_spans(cell))
table_data[i][j] = TableCell(
text=cell.text,
bbox=cell.bbox.to_bottom_left_origin(
page_no_to_page[element.page_no].size.height
).as_tuple(),
# col=j,
# row=i,
spans=spans,
obj_type=celltype,
# col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx],
# row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx]
)
tables.append(
DsSchemaTable(
num_cols=element.num_cols,
num_rows=element.num_rows,
obj_type=layout_label_to_ds_type.get(element.label),
data=table_data,
prov=[
Prov(
bbox=target_bbox,
page=element.page_no + 1,
span=[0, 0],
)
],
)
)
elif isinstance(element, FigureElement):
index = len(figures)
ref_str = f"#/figures/{index}"
main_text.append(
Ref(
name=element.label,
obj_type=layout_label_to_ds_type.get(element.label),
ref=ref_str,
),
)
figures.append(
Figure(
prov=[
Prov(
bbox=target_bbox,
page=element.page_no + 1,
span=[0, 0],
)
],
obj_type=layout_label_to_ds_type.get(element.label),
# data=[[]],
)
)
page_dimensions = [
PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
for p in conv_res.pages
]
ds_doc = DsDocument(
name=title,
description=desc,
file_info=file_info,
main_text=main_text,
tables=tables,
figures=figures,
page_dimensions=page_dimensions,
)
return ds_doc
def __call__(
self, conv_res: ConversionResult
) -> Tuple[DsLegacyDocument, DoclingDocument]:
ds_doc = conv_res._to_legacy_document()
ds_doc = self._to_legacy_document(conv_res)
ds_doc_dict = ds_doc.model_dump(by_alias=True)
glm_doc = self.model.apply_on_doc(ds_doc_dict)
ds_doc_dict = to_legacy_document_format(
glm_doc, ds_doc_dict, update_name_label=True
)
# ds_doc_dict = to_legacy_document_format(
# glm_doc, ds_doc_dict, update_name_label=True
# )
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
legacy_doc: DsLegacyDocument = None
# legacy_doc: DsLegacyDocument = None
if self.create_legacy_output:
legacy_doc = DsLegacyDocument.model_validate(ds_doc_dict)
# if self.create_legacy_output:
# legacy_doc = DsLegacyDocument.model_validate(ds_doc_dict)
# DEBUG code:
def draw_clusters_and_cells(ds_document, page_no):
@ -100,4 +281,4 @@ class GlmModel:
# draw_clusters_and_cells(ds_doc, 0)
# draw_clusters_and_cells(exported_doc, 0)
return (docling_doc, legacy_doc)
return docling_doc
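`GlmModel` now builds the intermediate legacy document itself and returns a single `DoclingDocument`; the tuple return and the `create_legacy_output` flag are gone. A hedged sketch of the new call contract (the import path is an assumption):

```python
from docling.datamodel.document import ConversionResult
from docling.models.ds_glm_model import GlmModel, GlmOptions  # assumed module path

def assemble(conv_res: ConversionResult) -> ConversionResult:
    glm_model = GlmModel(options=GlmOptions())  # create_legacy_output no longer exists
    conv_res.document = glm_model(conv_res)     # single DoclingDocument return value
    return conv_res
```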

View File

@ -79,13 +79,13 @@ class BasePipeline(ABC):
for model in self.enrichment_pipe:
for element_batch in chunkify(
_filter_elements(conv_res.output, model),
_filter_elements(conv_res.document, model),
settings.perf.elements_batch_size,
):
# TODO: currently we assume the element itself is modified, because
# we don't have an interface to save the element back to the document
for element in model(
doc=conv_res.output, element_batch=element_batch
doc=conv_res.document, element_batch=element_batch
): # Must exhaust!
pass
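`chunkify` batches the filtered elements so each enrichment model sees fixed-size groups. Docling ships its own helper; a compatible hedged sketch of what it does:

```python
from itertools import islice
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar("T")

def chunkify(iterator: Iterable[T], chunk_size: int) -> Iterator[List[T]]:
    # Yield successive lists of at most chunk_size items.
    it = iter(iterator)
    while batch := list(islice(it, chunk_size)):
        yield batch
```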

View File

@ -39,7 +39,7 @@ class SimplePipeline(BasePipeline):
# the backend is expected to be of type DeclarativeDocumentBackend, which can output
# a DoclingDocument straight.
conv_res.output = in_doc._backend.convert()
conv_res.document = in_doc._backend.convert()
return conv_res
def _determine_status(

View File

@ -45,11 +45,7 @@ class StandardPdfPipeline(PaginatedPipeline):
else:
self.artifacts_path = Path(pipeline_options.artifacts_path)
self.glm_model = GlmModel(
options=GlmOptions(
create_legacy_output=pipeline_options.create_legacy_output
)
)
self.glm_model = GlmModel(options=GlmOptions())
if (ocr_model := self.get_ocr_model()) is None:
raise RuntimeError(
@ -152,7 +148,7 @@ class StandardPdfPipeline(PaginatedPipeline):
elements=all_elements, headers=all_headers, body=all_body
)
conv_res.output, conv_res.legacy_output = self.glm_model(conv_res)
conv_res.document = self.glm_model(conv_res)
return conv_res

View File

@ -41,7 +41,7 @@ def generate_multimodal_pages(
end_ix = 0
doc_items: List[Tuple[int, Union[BaseCell, BaseText]]] = []
doc = doc_result.legacy_output
doc = doc_result.legacy_document
def _process_page_segments(doc_items: list[Tuple[int, BaseCell]], page: Page):
segments = []

View File

@ -31,37 +31,12 @@ def export_documents(
success_count += 1
doc_filename = conv_res.input.file.stem
if USE_LEGACY:
# Export Deep Search document JSON format:
with (output_dir / f"{doc_filename}.legacy.json").open(
"w", encoding="utf-8"
) as fp:
fp.write(json.dumps(conv_res.render_as_dict()))
# Export Text format:
with (output_dir / f"{doc_filename}.legacy.txt").open(
"w", encoding="utf-8"
) as fp:
fp.write(conv_res.render_as_text())
# Export Markdown format:
with (output_dir / f"{doc_filename}.legacy.md").open(
"w", encoding="utf-8"
) as fp:
fp.write(conv_res.render_as_markdown())
# Export Document Tags format:
with (output_dir / f"{doc_filename}.legacy.doctags.txt").open(
"w", encoding="utf-8"
) as fp:
fp.write(conv_res.render_as_doctags())
if USE_V2:
# Export Docling document format to JSON (experimental):
with (output_dir / f"{doc_filename}.json").open("w") as fp:
fp.write(
json.dumps(
conv_res.output.model_dump(
conv_res.document.model_dump(
mode="json", by_alias=True, exclude_none=True
)
)
@ -71,7 +46,7 @@ def export_documents(
with (output_dir / f"{doc_filename}.yaml").open("w") as fp:
fp.write(
yaml.safe_dump(
conv_res.output.model_dump(
conv_res.document.model_dump(
mode="json", by_alias=True, exclude_none=True
)
)
@ -79,15 +54,42 @@ def export_documents(
# Export Docling document format to doctags (experimental):
with (output_dir / f"{doc_filename}.doctags.txt").open("w") as fp:
fp.write(conv_res.output.export_to_document_tokens())
fp.write(conv_res.document.export_to_document_tokens())
# Export Docling document format to markdown (experimental):
with (output_dir / f"{doc_filename}.md").open("w") as fp:
fp.write(conv_res.output.export_to_markdown())
fp.write(conv_res.document.export_to_markdown())
# Export Docling document format to text (experimental):
with (output_dir / f"{doc_filename}.txt").open("w") as fp:
fp.write(conv_res.output.export_to_markdown(strict_text=True))
fp.write(conv_res.document.export_to_markdown(strict_text=True))
if USE_LEGACY:
# Export Deep Search document JSON format:
with (output_dir / f"{doc_filename}.legacy.json").open(
"w", encoding="utf-8"
) as fp:
fp.write(json.dumps(conv_res.legacy_document.export_to_dict()))
# Export Text format:
with (output_dir / f"{doc_filename}.legacy.txt").open(
"w", encoding="utf-8"
) as fp:
fp.write(
conv_res.legacy_document.export_to_markdown(strict_text=True)
)
# Export Markdown format:
with (output_dir / f"{doc_filename}.legacy.md").open(
"w", encoding="utf-8"
) as fp:
fp.write(conv_res.legacy_document.export_to_markdown())
# Export Document Tags format:
with (output_dir / f"{doc_filename}.legacy.doctags.txt").open(
"w", encoding="utf-8"
) as fp:
fp.write(conv_res.legacy_document.export_to_doctags())
elif conv_res.status == ConversionStatus.PARTIAL_SUCCESS:
_log.info(

View File

@ -119,19 +119,19 @@ def main():
# Export Deep Search document JSON format:
with (output_dir / f"{doc_filename}.json").open("w", encoding="utf-8") as fp:
fp.write(json.dumps(conv_result.output.export_to_dict()))
fp.write(json.dumps(conv_result.document.export_to_dict()))
# Export Text format:
with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
fp.write(conv_result.output.export_to_text())
fp.write(conv_result.document.export_to_text())
# Export Markdown format:
with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
fp.write(conv_result.output.export_to_markdown())
fp.write(conv_result.document.export_to_markdown())
# Export Document Tags format:
with (output_dir / f"{doc_filename}.doctags").open("w", encoding="utf-8") as fp:
fp.write(conv_result.output.export_to_document_tokens())
fp.write(conv_result.document.export_to_document_tokens())
if __name__ == "__main__":

View File

@ -26,7 +26,7 @@ def main():
doc_filename = conv_res.input.file.stem
# Export tables
for table_ix, table in enumerate(conv_res.legacy_output.tables):
for table_ix, table in enumerate(conv_res.document.tables):
table_df: pd.DataFrame = table.export_to_dataframe()
print(f"## Table {table_ix}")
print(table_df.to_markdown())
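A natural continuation of the loop above, persisting each table as CSV; `output_dir` is an assumed `Path`, the other names come from the snippet:

```python
# Hypothetical continuation inside the loop; output_dir is assumed to exist.
table_df.to_csv(output_dir / f"{doc_filename}-table-{table_ix + 1}.csv", index=False)
```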

View File

@ -3,6 +3,8 @@ from docling.document_converter import DocumentConverter
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
converter = DocumentConverter()
result = converter.convert(source)
print(result.output.export_to_markdown()) # output: ## Docling Technical Report [...]"
print(
result.document.export_to_markdown()
) # output: ## Docling Technical Report [...]"
# if the legacy output is needed, use this version
# print(result.render_as_markdown_v1()) # output: ## Docling Technical Report [...]"
# print(result.legacy_output.export_to_markdown()) # output: ## Docling Technical Report [...]"

View File

@ -61,7 +61,7 @@ for res in conv_results:
# print(res.experimental.export_to_markdown())
# Export Docling document format to markdown (experimental):
with (out_path / f"{res.input.file.name}.md").open("w") as fp:
fp.write(res.output.export_to_markdown())
fp.write(res.document.export_to_markdown())
with (out_path / f"{res.input.file.name}.json").open("w") as fp:
fp.write(json.dumps(res.output.export_to_dict()))
fp.write(json.dumps(res.document.export_to_dict()))

View File

@ -6,7 +6,7 @@
<subtitle-level-1><location><page_1><loc_52><loc_71><loc_67><loc_73></location>a. Picture of a table:</subtitle-level-1>
<subtitle-level-1><location><page_1><loc_8><loc_30><loc_21><loc_32></location>1. Introduction</subtitle-level-1>
<paragraph><location><page_1><loc_8><loc_10><loc_47><loc_29></location>The occurrence of tables in documents is ubiquitous. They often summarise quantitative or factual data, which is cumbersome to describe in verbose text but nevertheless extremely valuable. Unfortunately, this compact representation is often not easy to parse by machines. There are many implicit conventions used to obtain a compact table representation. For example, tables often have complex columnand row-headers in order to reduce duplicated cell content. Lines of different shapes and sizes are leveraged to separate content or indicate a tree structure. Additionally, tables can also have empty/missing table-entries or multi-row textual table-entries. Fig. 1 shows a table which presents all these issues.</paragraph>
<paragraph><location><page_1><loc_8><loc_35><loc_47><loc_70></location>Tables organize valuable content in a concise and compact representation. This content is extremely valuable for systems such as search engines, Knowledge Graph's, etc, since they enhance their predictive capabilities. Unfortunately, tables come in a large variety of shapes and sizes. Furthermore, they can have complex column/row-header configurations, multiline rows, different variety of separation lines, missing entries, etc. As such, the correct identification of the table-structure from an image is a nontrivial task. In this paper, we present a new table-structure identification model. The latter improves the latest end-toend deep learning model (i.e. encoder-dual-decoder from PubTabNet) in two significant ways. First, we introduce a new object detection decoder for table-cells. In this way, we can obtain the content of the table-cells from programmatic PDF's directly from the PDF source and avoid the training of the custom OCR decoders. This architectural change leads to more accurate table-content extraction and allows us to tackle non-english tables. Second, we replace the LSTM decoders with transformer based decoders. This upgrade improves significantly the previous state-of-the-art tree-editing-distance-score (TEDS) from 91% to 98.5% on simple tables and from 88.7% to 95% on complex tables.</paragraph>
<caption><location><page_1><loc_8><loc_35><loc_47><loc_70></location>Tables organize valuable content in a concise and compact representation. This content is extremely valuable for systems such as search engines, Knowledge Graph's, etc, since they enhance their predictive capabilities. Unfortunately, tables come in a large variety of shapes and sizes. Furthermore, they can have complex column/row-header configurations, multiline rows, different variety of separation lines, missing entries, etc. As such, the correct identification of the table-structure from an image is a nontrivial task. In this paper, we present a new table-structure identification model. The latter improves the latest end-toend deep learning model (i.e. encoder-dual-decoder from PubTabNet) in two significant ways. First, we introduce a new object detection decoder for table-cells. In this way, we can obtain the content of the table-cells from programmatic PDF's directly from the PDF source and avoid the training of the custom OCR decoders. This architectural change leads to more accurate table-content extraction and allows us to tackle non-english tables. Second, we replace the LSTM decoders with transformer based decoders. This upgrade improves significantly the previous state-of-the-art tree-editing-distance-score (TEDS) from 91% to 98.5% on simple tables and from 88.7% to 95% on complex tables.</caption>
<table>
<location><page_1><loc_52><loc_62><loc_88><loc_71></location>
<caption>Tables organize valuable content in a concise and compact representation. This content is extremely valuable for systems such as search engines, Knowledge Graph's, etc, since they enhance their predictive capabilities. Unfortunately, tables come in a large variety of shapes and sizes. Furthermore, they can have complex column/row-header configurations, multiline rows, different variety of separation lines, missing entries, etc. As such, the correct identification of the table-structure from an image is a nontrivial task. In this paper, we present a new table-structure identification model. The latter improves the latest end-toend deep learning model (i.e. encoder-dual-decoder from PubTabNet) in two significant ways. First, we introduce a new object detection decoder for table-cells. In this way, we can obtain the content of the table-cells from programmatic PDF's directly from the PDF source and avoid the training of the custom OCR decoders. This architectural change leads to more accurate table-content extraction and allows us to tackle non-english tables. Second, we replace the LSTM decoders with transformer based decoders. This upgrade improves significantly the previous state-of-the-art tree-editing-distance-score (TEDS) from 91% to 98.5% on simple tables and from 88.7% to 95% on complex tables.</caption>
@ -118,7 +118,7 @@
<paragraph><location><page_7><loc_8><loc_73><loc_47><loc_77></location>where T$_{a}$ and T$_{b}$ represent tables in tree structure HTML format. EditDist denotes the tree-edit distance, and | T | represents the number of nodes in T .</paragraph>
<subtitle-level-1><location><page_7><loc_8><loc_70><loc_28><loc_72></location>5.4. Quantitative Analysis</subtitle-level-1>
<paragraph><location><page_7><loc_8><loc_50><loc_47><loc_69></location>Structure. As shown in Tab. 2, TableFormer outperforms all SOTA methods across different datasets by a large margin for predicting the table structure from an image. All the more, our model outperforms pre-trained methods. During the evaluation we do not apply any table filtering. We also provide our baseline results on the SynthTabNet dataset. It has been observed that large tables (e.g. tables that occupy half of the page or more) yield poor predictions. We attribute this issue to the image resizing during the preprocessing step, that produces downsampled images with indistinguishable features. This problem can be addressed by treating such big tables with a separate model which accepts a large input image size.</paragraph>
<paragraph><location><page_7><loc_8><loc_23><loc_47><loc_25></location>Table 2: Structure results on PubTabNet (PTN), FinTabNet (FTN), TableBank (TB) and SynthTabNet (STN).</paragraph>
<caption><location><page_7><loc_8><loc_23><loc_47><loc_25></location>Table 2: Structure results on PubTabNet (PTN), FinTabNet (FTN), TableBank (TB) and SynthTabNet (STN).</caption>
<table>
<location><page_7><loc_9><loc_26><loc_46><loc_48></location>
<caption>Table 2: Structure results on PubTabNet (PTN), FinTabNet (FTN), TableBank (TB) and SynthTabNet (STN).</caption>

File diff suppressed because one or more lines are too long

View File

@ -14,7 +14,6 @@ The occurrence of tables in documents is ubiquitous. They often summarise quanti
Tables organize valuable content in a concise and compact representation. This content is extremely valuable for systems such as search engines, Knowledge Graph's, etc, since they enhance their predictive capabilities. Unfortunately, tables come in a large variety of shapes and sizes. Furthermore, they can have complex column/row-header configurations, multiline rows, different variety of separation lines, missing entries, etc. As such, the correct identification of the table-structure from an image is a nontrivial task. In this paper, we present a new table-structure identification model. The latter improves the latest end-toend deep learning model (i.e. encoder-dual-decoder from PubTabNet) in two significant ways. First, we introduce a new object detection decoder for table-cells. In this way, we can obtain the content of the table-cells from programmatic PDF's directly from the PDF source and avoid the training of the custom OCR decoders. This architectural change leads to more accurate table-content extraction and allows us to tackle non-english tables. Second, we replace the LSTM decoders with transformer based decoders. This upgrade improves significantly the previous state-of-the-art tree-editing-distance-score (TEDS) from 91% to 98.5% on simple tables and from 88.7% to 95% on complex tables.
Tables organize valuable content in a concise and compact representation. This content is extremely valuable for systems such as search engines, Knowledge Graph's, etc, since they enhance their predictive capabilities. Unfortunately, tables come in a large variety of shapes and sizes. Furthermore, they can have complex column/row-header configurations, multiline rows, different variety of separation lines, missing entries, etc. As such, the correct identification of the table-structure from an image is a nontrivial task. In this paper, we present a new table-structure identification model. The latter improves the latest end-toend deep learning model (i.e. encoder-dual-decoder from PubTabNet) in two significant ways. First, we introduce a new object detection decoder for table-cells. In this way, we can obtain the content of the table-cells from programmatic PDF's directly from the PDF source and avoid the training of the custom OCR decoders. This architectural change leads to more accurate table-content extraction and allows us to tackle non-english tables. Second, we replace the LSTM decoders with transformer based decoders. This upgrade improves significantly the previous state-of-the-art tree-editing-distance-score (TEDS) from 91% to 98.5% on simple tables and from 88.7% to 95% on complex tables.
b. Red-annotation of bounding boxes, Blue-predictions by TableFormer
@ -27,6 +26,7 @@ c.
Structure predicted by TableFormer:
Figure 1: Picture of a table with subtle, complex features such as (1) multi-column headers, (2) cell with multi-row text and (3) cells with no content. Image from PubTabNet evaluation set, filename: 'PMC2944238 004 02'.
| 0 | 1 | 1 | 2 1 | 2 1 | |
|-----|-----|-----|-------|-------|----|
| 3 | 4 | 5 3 | 6 | 7 | |
@ -98,6 +98,7 @@ Motivated by those observations we aimed at generating a synthetic table dataset
In this regard, we have prepared four synthetic datasets, each one containing 150k examples. The corpora to generate the table text consists of the most frequent terms appearing in PubTabNet and FinTabNet together with randomly generated text. The first two synthetic datasets have been fine-tuned to mimic the appearance of the original datasets but encompass more complicated table structures. The third
Table 1: Both "Combined-Tabnet" and "CombinedTabnet" are variations of the following: (*) The CombinedTabnet dataset is the processed combination of PubTabNet and Fintabnet. (**) The combined dataset is the processed combination of PubTabNet, Fintabnet and TableBank.
| | Tags | Bbox | Size | Format |
|--------------------|--------|--------|--------|----------|
| PubTabNet | 3 | 3 | 509k | PNG |
@ -181,7 +182,6 @@ Structure. As shown in Tab. 2, TableFormer outperforms all SOTA methods across d
Table 2: Structure results on PubTabNet (PTN), FinTabNet (FTN), TableBank (TB) and SynthTabNet (STN).
Table 2: Structure results on PubTabNet (PTN), FinTabNet (FTN), TableBank (TB) and SynthTabNet (STN).
| Model | Dataset | Simple | TEDS Complex | All |
|-------------|-----------|----------|----------------|-------|
| EDD | PTN | 91.1 | 88.7 | 89.9 |
@ -202,6 +202,7 @@ Cell Detection. Like any object detector, our Cell BBox Detector provides boundi
our Cell BBox Decoder accuracy for cells with a class label of 'content' only using the PASCAL VOC mAP metric for pre-processing and post-processing. Note that we do not have post-processing results for SynthTabNet as images are only provided. To compare the performance of our proposed approach, we've integrated TableFormer's Cell BBox Decoder into EDD architecture. As mentioned previously, the Structure Decoder provides the Cell BBox Decoder with the features needed to predict the bounding box predictions. Therefore, the accuracy of the Structure Decoder directly influences the accuracy of the Cell BBox Decoder . If the Structure Decoder predicts an extra column, this will result in an extra column of predicted bounding boxes.
Table 3: Cell Bounding Box detection results on PubTabNet, and FinTabNet. PP: Post-processing.
| Model | Dataset | mAP | mAP (PP) |
|-------------|-------------|-------|------------|
| EDD+BBox | PubTabNet | 79.2 | 82.7 |
@ -211,6 +212,7 @@ Table 3: Cell Bounding Box detection results on PubTabNet, and FinTabNet. PP: Po
Cell Content. In this section, we evaluate the entire pipeline of recovering a table with content. Here we put our approach to test by capitalizing on extracting content from the PDF cells rather than decoding from images. Tab. 4 shows the TEDs score of HTML code representing the structure of the table along with the content inserted in the data cell and compared with the ground-truth. Our method achieved a 5.3% increase over the state-of-the-art, and commercial solutions. We believe our scores would be higher if the HTML ground-truth matched the extracted PDF cell content. Unfortunately, there are small discrepancies such as spacings around words or special characters with various unicode representations.
Table 4: Results of structure with content retrieved using cell detection on PubTabNet. In all cases the input is PDF documents with cropped tables.
| Model | Simple | TEDS Complex | All |
|-------------|----------|----------------|-------|
| Tabula | 78 | 57.8 | 67.9 |
@ -234,6 +236,7 @@ b.
Structure predicted by TableFormer, with superimposed matched PDF cell text:
| | | 論文ファイル | 論文ファイル | 参考文献 | 参考文献 |
|----------------------------------------------------|-------------|----------------|----------------|------------|------------|
| 出典 | ファイル 数 | 英語 | 日本語 | 英語 | 日本語 |
@ -247,6 +250,7 @@ Structure predicted by TableFormer, with superimposed matched PDF cell text:
| | 945 | 294 | 651 | 1122 | 955 |
Text is aligned to match original for ease of viewing
| | Shares (in millions) | Shares (in millions) | Weighted Average Grant Date Fair Value | Weighted Average Grant Date Fair Value |
|--------------------------|------------------------|------------------------|------------------------------------------|------------------------------------------|
| | RS U s | PSUs | RSUs | PSUs |

View File

@ -111,7 +111,7 @@
</figure>
<paragraph><location><page_5><loc_52><loc_31><loc_91><loc_34></location>were carried out over a timeframe of 12 weeks, after which 8 of the 40 initially allocated annotators did not pass the bar.</paragraph>
<paragraph><location><page_5><loc_52><loc_10><loc_91><loc_31></location>Phase 4: Production annotation. The previously selected 80K pages were annotated with the defined 11 class labels by 32 annotators. This production phase took around three months to complete. All annotations were created online through CCS, which visualises the programmatic PDF text-cells as an overlay on the page. The page annotation are obtained by drawing rectangular bounding-boxes, as shown in Figure 3. With regard to the annotation practices, we implemented a few constraints and capabilities on the tooling level. First, we only allow non-overlapping, vertically oriented, rectangular boxes. For the large majority of documents, this constraint was sufficient and it speeds up the annotation considerably in comparison with arbitrary segmentation shapes. Second, annotator staff were not able to see each other's annotations. This was enforced by design to avoid any bias in the annotation, which could skew the numbers of the inter-annotator agreement (see Table 1). We wanted</paragraph>
<paragraph><location><page_6><loc_9><loc_77><loc_48><loc_89></location>Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.</paragraph>
<caption><location><page_6><loc_9><loc_77><loc_48><loc_89></location>Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.</caption>
<table>
<location><page_6><loc_10><loc_56><loc_47><loc_75></location>
<caption>Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.</caption>
@ -143,7 +143,7 @@
<subtitle-level-1><location><page_6><loc_52><loc_36><loc_76><loc_37></location>Baselines for Object Detection</subtitle-level-1>
<paragraph><location><page_6><loc_52><loc_11><loc_91><loc_35></location>In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 × 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . This is not entirely surprising, as Text , Table and Picture are abundant and the most visually distinctive in a document.</paragraph>
<paragraph><location><page_7><loc_9><loc_84><loc_48><loc_89></location>Table 3: Performance of a Mask R-CNN R50 network in mAP@0.5-0.95 scores trained on DocLayNet with different class label sets. The reduced label sets were obtained by either down-mapping or dropping labels.</paragraph>
<paragraph><location><page_7><loc_52><loc_84><loc_91><loc_89></location>Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wise split for different label sets. Naive page-wise split will result in GLYPH<tildelow> 10% point improvement.</paragraph>
<caption><location><page_7><loc_52><loc_84><loc_91><loc_89></location>Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wise split for different label sets. Naive page-wise split will result in GLYPH<tildelow> 10% point improvement.</caption>
<table>
<location><page_7><loc_13><loc_63><loc_44><loc_81></location>
<caption>Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wise split for different label sets. Naive page-wise split will result in GLYPH<tildelow> 10% point improvement.</caption>
@ -187,7 +187,7 @@
<paragraph><location><page_7><loc_52><loc_25><loc_91><loc_44></location>Many documents in DocLayNet have a unique styling. In order to avoid overfitting on a particular style, we have split the train-, test- and validation-sets of DocLayNet on document boundaries, i.e. every document contributes pages to only one set. To the best of our knowledge, this was not considered in PubLayNet or DocBank. To quantify how this affects model performance, we trained and evaluated a Mask R-CNN R50 model on a modified dataset version. Here, the train-, test- and validation-sets were obtained by a randomised draw over the individual pages. As can be seen in Table 4, the difference in model performance is surprisingly large: pagewise splitting gains ˜ 10% in mAP over the document-wise splitting. Thus, random page-wise splitting of DocLayNet can easily lead to accidental overestimation of model performance and should be avoided.</paragraph>
<subtitle-level-1><location><page_7><loc_52><loc_22><loc_68><loc_23></location>Dataset Comparison</subtitle-level-1>
<paragraph><location><page_7><loc_52><loc_11><loc_91><loc_21></location>Throughout this paper, we claim that DocLayNet's wider variety of document layouts leads to more robust layout detection models. In Table 5, we provide evidence for that. We trained models on each of the available datasets (PubLayNet, DocBank and DocLayNet) and evaluated them on the test sets of the other datasets. Due to the different label sets and annotation styles, a direct comparison is not possible. Hence, we focussed on the common labels among the datasets. Between PubLayNet and DocLayNet, these are Picture ,</paragraph>
<paragraph><location><page_8><loc_9><loc_81><loc_48><loc_89></location>Table 5: Prediction Performance (mAP@0.5-0.95) of a Mask R-CNN R50 network across the PubLayNet, DocBank & DocLayNet data-sets. By evaluating on common label classes of each dataset, we observe that the DocLayNet-trained model has much less pronounced variations in performance across all datasets.</paragraph>
<caption><location><page_8><loc_9><loc_81><loc_48><loc_89></location>Table 5: Prediction Performance (mAP@0.5-0.95) of a Mask R-CNN R50 network across the PubLayNet, DocBank & DocLayNet data-sets. By evaluating on common label classes of each dataset, we observe that the DocLayNet-trained model has much less pronounced variations in performance across all datasets.</caption>
<table>
<location><page_8><loc_12><loc_57><loc_45><loc_78></location>
<caption>Table 5: Prediction Performance (mAP@0.5-0.95) of a Mask R-CNN R50 network across the PubLayNet, DocBank & DocLayNet data-sets. By evaluating on common label classes of each dataset, we observe that the DocLayNet-trained model has much less pronounced variations in performance across all datasets.</caption>

File diff suppressed because one or more lines are too long

View File

@ -109,6 +109,7 @@ Despite being cost-intense and far less scalable than automation, human annotati
The annotation campaign was carried out in four phases. In phase one, we identified and prepared the data sources for annotation. In phase two, we determined the class labels and how annotations should be done on the documents in order to obtain maximum consistency. The latter was guided by a detailed requirement analysis and exhaustive experiments. In phase three, we trained the annotation staff and performed exams for quality assurance. In phase four,
Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row "Total") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.
| | | % of Total | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |
|----------------|---------|--------------|--------------|--------------|--------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|
| class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten |
@ -169,7 +170,6 @@ Phase 4: Production annotation. The previously selected 80K pages were annotated
Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.
Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.
| | human | MRCNN | MRCNN | FRCNN | YOLO |
|----------------|---------|---------|---------|---------|--------|
| | human | R50 | R101 | R101 | v5x6 |
@@ -207,7 +207,6 @@ Table 3: Performance of a Mask R-CNN R50 network in mAP@0.5-0.95 scores trained
Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wise split for different label sets. Naive page-wise split will result in ~10% point improvement.
Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wise split for different label sets. Naive page-wise split will result in ~10% point improvement.
| Class-count | 11 | 6 | 5 | 4 |
|----------------|------|---------|---------|---------|
| Caption | 68 | Text | Text | Text |
@@ -232,6 +231,7 @@ One of the fundamental questions related to any dataset is if it is "large enoug
The choice and number of labels can have a significant effect on the overall model performance. Since PubLayNet, DocBank and DocLayNet all have different label sets, it is of particular interest to understand and quantify this influence of the label set on the model performance. We investigate this by either down-mapping labels into more common ones (e.g. Caption → Text ) or excluding them from the annotations entirely. Furthermore, it must be stressed that all mappings and exclusions were performed on the data before model training. In Table 3, we present the mAP scores for a Mask R-CNN R50 network on different label sets. Where a label is down-mapped, we show its corresponding label, otherwise it was excluded. We present three different label sets, with 6, 5 and 4 different labels respectively. The set of 5 labels contains the same labels as PubLayNet. However, due to the different definition of
| Class-count | 11 | 11 | 5 | 5 |
|----------------|------|------|-----|------|
| Split | Doc | Page | Doc | Page |
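The down-mapping and exclusion of labels described above can be pictured in a few lines of Python; the mapping table and function below are illustrative assumptions, not the preprocessing code actually used:

```python
# Hypothetical label down-mapping applied to annotations before training.
# A value of None marks a label that is excluded rather than remapped.
DOWN_MAP = {"Caption": "Text", "Footnote": "Text", "Page-header": None}

def remap_labels(labels):
    remapped = []
    for label in labels:
        label = DOWN_MAP.get(label, label)  # labels without a mapping are kept
        if label is not None:               # excluded labels are dropped
            remapped.append(label)
    return remapped

print(remap_labels(["Caption", "Table", "Page-header"]))  # ['Text', 'Table']
```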
@@ -260,7 +260,6 @@ Throughout this paper, we claim that DocLayNet's wider variety of document layou
Table 5: Prediction Performance (mAP@0.5-0.95) of a Mask R-CNN R50 network across the PubLayNet, DocBank & DocLayNet data-sets. By evaluating on common label classes of each dataset, we observe that the DocLayNet-trained model has much less pronounced variations in performance across all datasets.
Table 5: Prediction Performance (mAP@0.5-0.95) of a Mask R-CNN R50 network across the PubLayNet, DocBank & DocLayNet data-sets. By evaluating on common label classes of each dataset, we observe that the DocLayNet-trained model has much less pronounced variations in performance across all datasets.
| | | Testing on | Testing on | Testing on |
|-----------------|------------|--------------|--------------|--------------|
| Training on | labels | PLN | DB | DLN |

File diff suppressed because one or more lines are too long

View File

@@ -5,6 +5,7 @@ order to compute the TED score. Inference timing results for all experiments wer
We have chosen the PubTabNet data set to perform HPO, since it includes a highly diverse set of tables. We also report TED scores separately for simple and complex tables (tables with cell spans). Results are presented in Table 1. It is evident that with OTSL, our model achieves the same TED score and slightly better mAP scores in comparison to HTML. However, OTSL yields a 2x speed up in the inference runtime over HTML.
Table 1. HPO performed in OTSL and HTML representation on the same transformer-based TableFormer [9] architecture, trained only on PubTabNet [22]. Effects of reducing the # of layers in encoder and decoder stages of the model show that smaller models trained on OTSL perform better, especially in recognizing complex table structures, and maintain a much higher mAP score than the HTML counterpart.
| # | # | Language | TEDs | TEDs | TEDs | mAP | Inference |
|------------|------------|------------|-------------|-------------|-------------|-------------|-------------|
| enc-layers | dec-layers | Language | simple | complex | all | (0.75) | time (secs) |

File diff suppressed because one or more lines are too long

View File

@@ -125,6 +125,7 @@ order to compute the TED score. Inference timing results for all experiments wer
We have chosen the PubTabNet data set to perform HPO, since it includes a highly diverse set of tables. We also report TED scores separately for simple and complex tables (tables with cell spans). Results are presented in Table 1. It is evident that with OTSL, our model achieves the same TED score and slightly better mAP scores in comparison to HTML. However, OTSL yields a 2x speed up in the inference runtime over HTML.
Table 1. HPO performed in OTSL and HTML representation on the same transformer-based TableFormer [9] architecture, trained only on PubTabNet [22]. Effects of reducing the # of layers in encoder and decoder stages of the model show that smaller models trained on OTSL perform better, especially in recognizing complex table structures, and maintain a much higher mAP score than the HTML counterpart.
| # | # | Language | TEDs | TEDs | TEDs | mAP | Inference |
|------------|------------|------------|-------------|-------------|-------------|-------------|-------------|
| enc-layers | dec-layers | Language | simple | complex | all | (0.75) | time (secs) |
@@ -141,6 +142,7 @@ We picked the model parameter configuration that produced the best prediction qu
Additionally, the results show that OTSL has an advantage over HTML when applied on a bigger data set like PubTables-1M and achieves significantly improved scores. Finally, OTSL achieves faster inference due to fewer decoding steps which is a result of the reduced sequence representation.
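To make the reduced-sequence argument concrete, here is a toy comparison of decoded sequence lengths for a 2x2 table without spans; the token inventories are assumptions based on the OTSL description, not the exact vocabularies used in the experiments:

```python
# HTML structure tokens for a 2x2 table: open/close tags per row and cell.
html_seq = ["<tr>", "<td>", "</td>", "<td>", "</td>", "</tr>",
            "<tr>", "<td>", "</td>", "<td>", "</td>", "</tr>"]

# OTSL for the same table: one cell token ("C") per cell, "NL" per row end.
otsl_seq = ["C", "C", "NL",
            "C", "C", "NL"]

print(len(html_seq), len(otsl_seq))  # 12 vs 6: roughly half the decoding steps
```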
Table 2. TSR and cell detection results compared between OTSL and HTML on the PubTabNet [22], FinTabNet [21] and PubTables-1M [14] data sets using TableFormer [9] (with enc=6, dec=6, heads=8).
| | Language | TEDs | TEDs | TEDs | mAP(0.75) | Inference time (secs) |
|--------------|------------|--------|---------|--------|-------------|-------------------------|
| | Language | simple | complex | all | mAP(0.75) | Inference time (secs) |

View File

@@ -487,7 +487,7 @@
<caption><location><page_31><loc_22><loc_47><loc_56><loc_48></location>Figure 3-1 CREATE PERMISSION SQL statement</caption>
<figure>
<location><page_31><loc_22><loc_48><loc_89><loc_86></location>
<caption>Figure 3-1 CREATE PERMISSION SQL statement</caption>
<caption>The SQL CREATE PERMISSION statement that is shown in Figure 3-1 is used to define and initially enable or disable the row access rules.Figure 3-1 CREATE PERMISSION SQL statement</caption>
</figure>
<subtitle-level-1><location><page_31><loc_22><loc_43><loc_35><loc_45></location>Column mask</subtitle-level-1>
<paragraph><location><page_31><loc_22><loc_37><loc_89><loc_43></location>A column mask is a database object that manifests a column value access control rule for a specific column in a specific table. It uses a CASE expression that describes what you see when you access the column. For example, a teller can see only the last four digits of a tax identification number.</paragraph>
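As a plain-code illustration of the column-mask rule just described (in DB2 this is enforced declaratively with a CASE expression; the Python below is only a sketch of the visible effect, with hypothetical names and roles):

```python
def visible_tax_id(tax_id: str, role: str) -> str:
    # Column-mask effect: a teller sees only the last four digits.
    if role == "TELLER":
        return "X" * (len(tax_id) - 4) + tax_id[-4:]
    return tax_id  # other roles see the full value in this toy example

print(visible_tax_id("123-45-6789", "TELLER"))  # XXXXXXX6789
```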

File diff suppressed because one or more lines are too long

View File

@@ -35,6 +35,7 @@ Note to U.S. Government Users Restricted Rights -- Use, duplication or disclosur
## Contents
| Notices | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . vii |
|------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------|
| Trademarks | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . viii |
@@ -81,6 +82,7 @@ Note to U.S. Government Users Restricted Rights -- Use, duplication or disclosur
| 3.6.8 Demonstrating data access with a view and RCAC . . . . . . . . . . . . . . . . . . . . . . . | 32 |
| Chapter 4. Implementing Row and Column Access Control: Banking example . . . . . | 37 |
|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| 4.1 Business requirements for the RCAC banking scenario . . . . . . . . . . . . . . . . . . . . . . . . | 38 |
@@ -131,6 +133,7 @@ Note to U.S. Government Users Restricted Rights -- Use, duplication or disclosur
| Chapter 7. Row and Column Access Control management . . . . . . . . . . . . . . . . . . . . | Chapter 7. Row and Column Access Control management . . . . . . . . . . . . . . . . . . . . |
| 7.1 Managing row permissions and column masks. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | 114 |
|---------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------|
| 7.1.1 Source management. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | 114 |
@@ -601,6 +604,7 @@ CHGFCNUSG FCNID(QIBM_DB_SECADM) USER(HBEDOYA) USAGE(*ALLOWED)
The FUNCTION_USAGE view contains function usage configuration details. Table 2-1 describes the columns in the FUNCTION_USAGE view.
Table 2-1 FUNCTION_USAGE view
| Column name | Data type | Description |
|---------------|-------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| FUNCTION_ID | VARCHAR(30) | ID of the function. |
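As a hedged sketch of how this view could be read from Python (the driver choice, connection string, and the USER_NAME/USAGE columns are assumptions; Table 2-1 above only confirms FUNCTION_ID):

```python
import ibm_db  # IBM DB2 driver; availability on your system is an assumption

conn = ibm_db.connect("DATABASE=*LOCAL;UID=user;PWD=secret;", "", "")
stmt = ibm_db.exec_immediate(
    conn, "SELECT FUNCTION_ID, USER_NAME, USAGE FROM QSYS2.FUNCTION_USAGE")
row = ibm_db.fetch_assoc(stmt)
while row:
    print(row["FUNCTION_ID"], row["USER_NAME"], row["USAGE"])
    row = ibm_db.fetch_assoc(stmt)
```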
@@ -631,6 +635,7 @@ A preferred practice is that the RCAC administrator has the QIBM_DB_SECADM funct
Table 2-2 shows a comparison of the different function usage IDs and *JOBCTL authority to the different CL commands and DB2 for i tools.
Table 2-2 Comparison of the different function usage IDs and *JOBCTL authority
| User action | *JOBCTL | QIBM_DB_SECADM | QIBM_DB_SQLADM | QIBM_DB_SYSMON | No Authority |
|--------------------------------------------------------------------------------|-----------|------------------|------------------|------------------|----------------|
| SET CURRENT DEGREE (SQL statement) | X | | X | | |
@@ -647,6 +652,7 @@ Table 2-2 Comparison of the different function usage IDs and *JOBCTL authority
| CHANGE PLAN CACHE SIZE procedure (currently does not check authority) | X | | X | | |
| User action | *JOBCTL | QIBM_DB_SECADM | QIBM_DB_SQLADM | QIBM_DB_SYSMON | No Authority |
|--------------------------------------------------------------|-----------|------------------|------------------|------------------|----------------|
| START PLAN CACHE EVENT MONITOR procedure | X | | X | | |
@@ -725,6 +731,8 @@ A row permission is a database object that manifests a row access control rule f
The SQL CREATE PERMISSION statement that is shown in Figure 3-1 is used to define and initially enable or disable the row access rules.
Figure 3-1 CREATE PERMISSION SQL statement
The SQL CREATE PERMISSION statement that is shown in Figure 3-1 is used to define and initially enable or disable the row access rules.Figure 3-1 CREATE PERMISSION SQL statement
<!-- image -->
## Column mask
@@ -783,6 +791,7 @@ In addition to these four special registers, any of the DB2 special registers ca
Table 3-1 summarizes these special registers and their values.
Table 3-1 Special registers and their corresponding values
| Special register | Corresponding value |
|----------------------|---------------------------------------------------------------------------------------------------------------------------------------|
| USER or SESSION_USER | The effective user of the thread excluding adopted authority. |
@@ -813,6 +822,7 @@ IBM DB2 for i supports nine different built-in global variables that are read on
Table 3-2 lists the nine built-in global variables.
Table 3-2 Built-in global variables
| Global variable | Type | Description |
|-----------------------|--------------|----------------------------------------------------------------|
| CLIENT_HOST | VARCHAR(255) | Host name of the current client as returned by the system |
@@ -1242,6 +1252,7 @@ Figure 4-2 Rules for row and column access
The chart that is shown in Figure 4-3 shows the column access that is allowed by group and lists the column masks by table.
Figure 4-3 Column masks
| | | CUSTOMERS | ACCOUNTS |
|----------|----------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------|
| SECURITY | No Rows | CUSTOMER_DRIVERS_LICENSE_NUMBER CUSTOMER_EMAIL CUSTOMER_LOGIN_ID CUSTOMER_SECURITY_QUESTION CUSTOMER_SECURITY_QUESTION_ANSWER CUSTOMER_TAX_ID | ACCOUNT_NUMBER |
@@ -1968,6 +1979,7 @@ CREDIT_CARD_NUMBER;
## With RCAC Masking
| CREDIT CARD NUMBER _ _ | TOTAL |
|--------------------------|---------------|
| 3785 0000 0000 1234 | 233.50 |
@@ -1982,6 +1994,7 @@ CREDIT_CARD_NUMBER;
| 6011 9999 9999 0001 | 10.00 |
Figure 6-1 Timing of column masking
| CREDIT CARD NUMBER _ _ | TOTAL |
|---------------------------|---------------|
| **** **** **** 1234 | 233.50 |

View File

@@ -6,10 +6,11 @@
<section_header><location><page_1><loc_52><loc_71><loc_67><loc_73></location>a. Picture of a table:</section_header>
<section_header><location><page_1><loc_8><loc_30><loc_21><loc_32></location>1. Introduction</section_header>
<text><location><page_1><loc_8><loc_10><loc_47><loc_29></location>The occurrence of tables in documents is ubiquitous. They often summarise quantitative or factual data, which is cumbersome to describe in verbose text but nevertheless extremely valuable. Unfortunately, this compact representation is often not easy to parse by machines. There are many implicit conventions used to obtain a compact table representation. For example, tables often have complex columnand row-headers in order to reduce duplicated cell content. Lines of different shapes and sizes are leveraged to separate content or indicate a tree structure. Additionally, tables can also have empty/missing table-entries or multi-row textual table-entries. Fig. 1 shows a table which presents all these issues.</text>
<caption><location><page_1><loc_52><loc_62><loc_88><loc_71></location>Tables organize valuable content in a concise and compact representation. This content is extremely valuable for systems such as search engines, Knowledge Graph's, etc, since they enhance their predictive capabilities. Unfortunately, tables come in a large variety of shapes and sizes. Furthermore, they can have complex column/row-header configurations, multiline rows, different variety of separation lines, missing entries, etc. As such, the correct identification of the table-structure from an image is a nontrivial task. In this paper, we present a new table-structure identification model. The latter improves the latest end-toend deep learning model (i.e. encoder-dual-decoder from PubTabNet) in two significant ways. First, we introduce a new object detection decoder for table-cells. In this way, we can obtain the content of the table-cells from programmatic PDF's directly from the PDF source and avoid the training of the custom OCR decoders. This architectural change leads to more accurate table-content extraction and allows us to tackle non-english tables. Second, we replace the LSTM decoders with transformer based decoders. This upgrade improves significantly the previous state-of-the-art tree-editing-distance-score (TEDS) from 91% to 98.5% on simple tables and from 88.7% to 95% on complex tables.</caption>
<caption><location><page_1><loc_8><loc_35><loc_47><loc_70></location>Tables organize valuable content in a concise and compact representation. This content is extremely valuable for systems such as search engines, Knowledge Graph's, etc, since they enhance their predictive capabilities. Unfortunately, tables come in a large variety of shapes and sizes. Furthermore, they can have complex column/row-header configurations, multiline rows, different variety of separation lines, missing entries, etc. As such, the correct identification of the table-structure from an image is a nontrivial task. In this paper, we present a new table-structure identification model. The latter improves the latest end-toend deep learning model (i.e. encoder-dual-decoder from PubTabNet) in two significant ways. First, we introduce a new object detection decoder for table-cells. In this way, we can obtain the content of the table-cells from programmatic PDF's directly from the PDF source and avoid the training of the custom OCR decoders. This architectural change leads to more accurate table-content extraction and allows us to tackle non-english tables. Second, we replace the LSTM decoders with transformer based decoders. This upgrade improves significantly the previous state-of-the-art tree-editing-distance-score (TEDS) from 91% to 98.5% on simple tables and from 88.7% to 95% on complex tables.</caption>
<table>
<location><page_1><loc_52><loc_62><loc_88><loc_71></location>
<row_0><col_0><body>3</col_0><col_1><body>1</col_1></row_0>
<caption>Tables organize valuable content in a concise and compact representation. This content is extremely valuable for systems such as search engines, Knowledge Graph's, etc, since they enhance their predictive capabilities. Unfortunately, tables come in a large variety of shapes and sizes. Furthermore, they can have complex column/row-header configurations, multiline rows, different variety of separation lines, missing entries, etc. As such, the correct identification of the table-structure from an image is a nontrivial task. In this paper, we present a new table-structure identification model. The latter improves the latest end-toend deep learning model (i.e. encoder-dual-decoder from PubTabNet) in two significant ways. First, we introduce a new object detection decoder for table-cells. In this way, we can obtain the content of the table-cells from programmatic PDF's directly from the PDF source and avoid the training of the custom OCR decoders. This architectural change leads to more accurate table-content extraction and allows us to tackle non-english tables. Second, we replace the LSTM decoders with transformer based decoders. This upgrade improves significantly the previous state-of-the-art tree-editing-distance-score (TEDS) from 91% to 98.5% on simple tables and from 88.7% to 95% on complex tables.</caption>
<row_0><col_0><col_header>3</col_0><col_1><col_header>1</col_1></row_0>
</table>
<text><location><page_1><loc_52><loc_58><loc_79><loc_60></location>b. Red-annotation of bounding boxes, Blue-predictions by TableFormer</text>
<figure>
@@ -17,10 +18,11 @@
</figure>
<text><location><page_1><loc_52><loc_46><loc_53><loc_47></location>c.</text>
<text><location><page_1><loc_54><loc_46><loc_80><loc_47></location>Structure predicted by TableFormer:</text>
<caption><location><page_1><loc_52><loc_37><loc_88><loc_45></location>Figure 1: Picture of a table with subtle, complex features such as (1) multi-column headers, (2) cell with multi-row text and (3) cells with no content. Image from PubTabNet evaluation set, filename: 'PMC2944238 004 02'.</caption>
<caption><location><page_1><loc_50><loc_29><loc_89><loc_35></location>Figure 1: Picture of a table with subtle, complex features such as (1) multi-column headers, (2) cell with multi-row text and (3) cells with no content. Image from PubTabNet evaluation set, filename: 'PMC2944238 004 02'.</caption>
<table>
<location><page_1><loc_52><loc_37><loc_88><loc_45></location>
<row_0><col_0><body>0</col_0><col_1><body>1</col_1><col_2><body>1</col_2><col_3><body>2 1</col_3><col_4><body>2 1</col_4><col_5><body></col_5></row_0>
<caption>Figure 1: Picture of a table with subtle, complex features such as (1) multi-column headers, (2) cell with multi-row text and (3) cells with no content. Image from PubTabNet evaluation set, filename: 'PMC2944238 004 02'.</caption>
<row_0><col_0><col_header>0</col_0><col_1><col_header>1</col_1><col_2><col_header>1</col_2><col_3><col_header>2 1</col_3><col_4><col_header>2 1</col_4><col_5><body></col_5></row_0>
<row_1><col_0><body>3</col_0><col_1><body>4</col_1><col_2><body>5 3</col_2><col_3><body>6</col_3><col_4><body>7</col_4><col_5><body></col_5></row_1>
<row_2><col_0><body>8</col_0><col_1><body>9</col_1><col_2><body>10</col_2><col_3><body>11</col_3><col_4><body>12</col_4><col_5><body>2</col_5></row_2>
<row_3><col_0><body></col_0><col_1><body>13</col_1><col_2><body>14</col_2><col_3><body>15</col_3><col_4><body>16</col_4><col_5><body>2</col_5></row_3>
@@ -51,6 +53,7 @@
<caption><location><page_3><loc_50><loc_64><loc_89><loc_66></location>Figure 2: Distribution of the tables across different table dimensions in PubTabNet + FinTabNet datasets</caption>
<figure>
<location><page_3><loc_51><loc_68><loc_90><loc_90></location>
<caption>Figure 2: Distribution of the tables across different table dimensions in PubTabNet + FinTabNet datasets</caption>
</figure>
<text><location><page_3><loc_50><loc_59><loc_71><loc_60></location>balance in the previous datasets.</text>
<text><location><page_3><loc_50><loc_21><loc_89><loc_58></location>The PubTabNet dataset contains 509k tables delivered as annotated PNG images. The annotations consist of the table structure represented in HTML format, the tokenized text and its bounding boxes per table cell. Fig. 1 shows the appearance style of PubTabNet. Depending on its complexity, a table is characterized as "simple" when it does not contain row spans or column spans, otherwise it is "complex". The dataset is divided into Train and Val splits (roughly 98% and 2%). The Train split consists of 54% simple and 46% complex tables and the Val split of 51% and 49% respectively. The FinTabNet dataset contains 112k tables delivered as single-page PDF documents with mixed table structures and text content. Similarly to the PubTabNet, the annotations of FinTabNet include the table structure in HTML, the tokenized text and the bounding boxes on a table cell basis. The dataset is divided into Train, Test and Val splits (81%, 9.5%, 9.5%), and each one is almost equally divided into simple and complex tables (Train: 48% simple, 52% complex, Test: 48% simple, 52% complex, Val: 53% simple, 47% complex). Finally, the TableBank dataset consists of 145k tables provided as JPEG images. The latter has annotations for the table structure, but only a few with bounding boxes of the table cells. The entire dataset consists of simple tables and it is divided into 90% Train, 3% Test and 7% Val splits.</text>
@@ -60,16 +63,17 @@
<text><location><page_4><loc_8><loc_45><loc_47><loc_60></location>As it is illustrated in Fig. 2, the table distributions from all datasets are skewed towards simpler structures with fewer number of rows/columns. Additionally, there is very limited variance in the table styles, which in case of PubTabNet and FinTabNet means one styling format for the majority of the tables. Similar limitations appear also in the type of table content, which in some cases (e.g. FinTabNet) is restricted to a certain domain. Ultimately, the lack of diversity in the training dataset damages the ability of the models to generalize well on unseen data.</text>
<text><location><page_4><loc_8><loc_21><loc_47><loc_45></location>Motivated by those observations we aimed at generating a synthetic table dataset named SynthTabNet . This approach offers control over: 1) the size of the dataset, 2) the table structure, 3) the table style and 4) the type of content. The complexity of the table structure is described by the size of the table header and the table body, as well as the percentage of the table cells covered by row spans and column spans. A set of carefully designed styling templates provides the basis to build a wide range of table appearances. Lastly, the table content is generated out of a curated collection of text corpora. By controlling the size and scope of the synthetic datasets we are able to train and evaluate our models in a variety of different conditions. For example, we can first generate a highly diverse dataset to train our models and then evaluate their performance on other synthetic datasets which are focused on a specific domain.</text>
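The generation knobs listed in this paragraph (dataset size, structure, style, content) can be pictured as a configuration object; everything below is a hypothetical illustration, not the actual SynthTabNet generator interface:

```python
# Hypothetical SynthTabNet-style generation config (all names are assumptions).
config = {
    "num_examples": 150_000,
    "body_rows": (4, 12), "body_cols": (3, 8),   # table body size ranges
    "header_rows": (1, 2),                       # table header size range
    "row_span_pct": 0.10, "col_span_pct": 0.10,  # share of cells under spans
    "style_template": "finance-like",            # one of the styling templates
    "text_corpus": "pubtabnet_frequent_terms",   # source of cell content
}
```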
<text><location><page_4><loc_8><loc_10><loc_47><loc_20></location>In this regard, we have prepared four synthetic datasets, each one containing 150k examples. The corpora to generate the table text consists of the most frequent terms appearing in PubTabNet and FinTabNet together with randomly generated text. The first two synthetic datasets have been fine-tuned to mimic the appearance of the original datasets but encompass more complicated table structures. The third</text>
<caption><location><page_4><loc_51><loc_80><loc_89><loc_91></location>Table 1: Both "Combined-Tabnet" and "CombinedTabnet" are variations of the following: (*) The CombinedTabnet dataset is the processed combination of PubTabNet and Fintabnet. (**) The combined dataset is the processed combination of PubTabNet, Fintabnet and TableBank.</caption>
<caption><location><page_4><loc_50><loc_72><loc_89><loc_79></location>Table 1: Both "Combined-Tabnet" and "CombinedTabnet" are variations of the following: (*) The CombinedTabnet dataset is the processed combination of PubTabNet and Fintabnet. (**) The combined dataset is the processed combination of PubTabNet, Fintabnet and TableBank.</caption>
<table>
<location><page_4><loc_51><loc_80><loc_89><loc_91></location>
<row_0><col_0><body></col_0><col_1><body>Tags</col_1><col_2><body>Bbox</col_2><col_3><body>Size</col_3><col_4><body>Format</col_4></row_0>
<row_1><col_0><body>PubTabNet</col_0><col_1><body>3</col_1><col_2><body>3</col_2><col_3><body>509k</col_3><col_4><body>PNG</col_4></row_1>
<row_2><col_0><body>FinTabNet</col_0><col_1><body>3</col_1><col_2><body>3</col_2><col_3><body>112k</col_3><col_4><body>PDF</col_4></row_2>
<row_3><col_0><body>TableBank</col_0><col_1><body>3</col_1><col_2><body>7</col_2><col_3><body>145k</col_3><col_4><body>JPEG</col_4></row_3>
<row_4><col_0><body>Combined-Tabnet(*)</col_0><col_1><body>3</col_1><col_2><body>3</col_2><col_3><body>400k</col_3><col_4><body>PNG</col_4></row_4>
<row_5><col_0><body>Combined(**)</col_0><col_1><body>3</col_1><col_2><body>3</col_2><col_3><body>500k</col_3><col_4><body>PNG</col_4></row_5>
<row_6><col_0><body>SynthTabNet</col_0><col_1><body>3</col_1><col_2><body>3</col_2><col_3><body>600k</col_3><col_4><body>PNG</col_4></row_6>
<caption>Table 1: Both "Combined-Tabnet" and "CombinedTabnet" are variations of the following: (*) The CombinedTabnet dataset is the processed combination of PubTabNet and Fintabnet. (**) The combined dataset is the processed combination of PubTabNet, Fintabnet and TableBank.</caption>
<row_0><col_0><body></col_0><col_1><col_header>Tags</col_1><col_2><col_header>Bbox</col_2><col_3><col_header>Size</col_3><col_4><col_header>Format</col_4></row_0>
<row_1><col_0><row_header>PubTabNet</col_0><col_1><body>3</col_1><col_2><body>3</col_2><col_3><body>509k</col_3><col_4><body>PNG</col_4></row_1>
<row_2><col_0><row_header>FinTabNet</col_0><col_1><body>3</col_1><col_2><body>3</col_2><col_3><body>112k</col_3><col_4><body>PDF</col_4></row_2>
<row_3><col_0><row_header>TableBank</col_0><col_1><body>3</col_1><col_2><body>7</col_2><col_3><body>145k</col_3><col_4><body>JPEG</col_4></row_3>
<row_4><col_0><row_header>Combined-Tabnet(*)</col_0><col_1><body>3</col_1><col_2><body>3</col_2><col_3><body>400k</col_3><col_4><body>PNG</col_4></row_4>
<row_5><col_0><row_header>Combined(**)</col_0><col_1><body>3</col_1><col_2><body>3</col_2><col_3><body>500k</col_3><col_4><body>PNG</col_4></row_5>
<row_6><col_0><row_header>SynthTabNet</col_0><col_1><body>3</col_1><col_2><body>3</col_2><col_3><body>600k</col_3><col_4><body>PNG</col_4></row_6>
</table>
<text><location><page_4><loc_50><loc_63><loc_89><loc_69></location>one adopts a colorful appearance with high contrast and the last one contains tables with sparse content. Lastly, we have combined all synthetic datasets into one big unified synthetic dataset of 600k examples.</text>
<text><location><page_4><loc_52><loc_61><loc_89><loc_62></location>Tab. 1 summarizes the various attributes of the datasets.</text>
@@ -81,10 +85,12 @@
<caption><location><page_5><loc_8><loc_72><loc_89><loc_74></location>Figure 3: TableFormer takes in an image of the PDF and creates bounding box and HTML structure predictions that are synchronized. The bounding boxes grabs the content from the PDF and inserts it in the structure.</caption>
<figure>
<location><page_5><loc_12><loc_77><loc_85><loc_90></location>
<caption>Figure 3: TableFormer takes in an image of the PDF and creates bounding box and HTML structure predictions that are synchronized. The bounding boxes grabs the content from the PDF and inserts it in the structure.</caption>
</figure>
<caption><location><page_5><loc_8><loc_14><loc_47><loc_33></location>Figure 4: Given an input image of a table, the Encoder produces fixed-length features that represent the input image. The features are then passed to both the Structure Decoder and Cell BBox Decoder . During training, the Structure Decoder receives 'tokenized tags' of the HTML code that represent the table structure. Afterwards, a transformer encoder and decoder architecture is employed to produce features that are received by a linear layer, and the Cell BBox Decoder. The linear layer is applied to the features to predict the tags. Simultaneously, the Cell BBox Decoder selects features referring to the data cells (' < td > ', ' < ') and passes them through an attention network, an MLP, and a linear layer to predict the bounding boxes.</caption>
<figure>
<location><page_5><loc_9><loc_36><loc_47><loc_67></location>
<caption>Figure 4: Given an input image of a table, the Encoder produces fixed-length features that represent the input image. The features are then passed to both the Structure Decoder and Cell BBox Decoder . During training, the Structure Decoder receives 'tokenized tags' of the HTML code that represent the table structure. Afterwards, a transformer encoder and decoder architecture is employed to produce features that are received by a linear layer, and the Cell BBox Decoder. The linear layer is applied to the features to predict the tags. Simultaneously, the Cell BBox Decoder selects features referring to the data cells (' < td > ', ' < ') and passes them through an attention network, an MLP, and a linear layer to predict the bounding boxes.</caption>
</figure>
<text><location><page_5><loc_50><loc_63><loc_89><loc_69></location>forming classification, and adding an adaptive pooling layer of size 28*28. ResNet by default downsamples the image resolution by 32 and then the encoded image is provided to both the Structure Decoder , and Cell BBox Decoder .</text>
<text><location><page_5><loc_50><loc_48><loc_89><loc_63></location>Structure Decoder. The transformer architecture of this component is based on the work proposed in [31]. After extensive experimentation, the Structure Decoder is modeled as a transformer encoder with two encoder layers and a transformer decoder made from a stack of 4 decoder layers that comprise mainly of multi-head attention and feed forward layers. This configuration uses fewer layers and heads in comparison to networks applied to other problems (e.g. "Scene Understanding", "Image Captioning"), something which we relate to the simplicity of table images.</text>
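A minimal sketch of the Structure Decoder configuration this paragraph describes (two transformer encoder layers, four decoder layers); d_model, the head count, and the batch-first layout are assumptions, since the text only fixes the layer counts:

```python
import torch
import torch.nn as nn

d_model, nhead = 512, 4  # assumed; the paragraph does not fix these

encoder = nn.TransformerEncoder(
    nn.TransformerEncoderLayer(d_model, nhead, batch_first=True), num_layers=2)
decoder = nn.TransformerDecoder(
    nn.TransformerDecoderLayer(d_model, nhead, batch_first=True), num_layers=4)

memory = encoder(torch.randn(1, 28 * 28, d_model))   # 28x28 pooled image grid
tags = decoder(torch.randn(1, 10, d_model), memory)  # 10 structure-tag queries
print(tags.shape)  # torch.Size([1, 10, 512])
```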
@@ -115,43 +121,46 @@
<text><location><page_7><loc_8><loc_73><loc_47><loc_77></location>where T$_{a}$ and T$_{b}$ represent tables in tree structure HTML format. EditDist denotes the tree-edit distance, and | T | represents the number of nodes in T .</text>
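The score these symbols belong to is the standard TEDS definition, reconstructed here from the surrounding text (the exact typesetting in the paper may differ):

```latex
\mathrm{TEDS}(T_a, T_b) \;=\; 1 - \frac{\mathrm{EditDist}(T_a, T_b)}{\max\left(|T_a|, |T_b|\right)}
```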
<section_header><location><page_7><loc_8><loc_70><loc_28><loc_72></location>5.4. Quantitative Analysis</section_header>
<text><location><page_7><loc_8><loc_50><loc_47><loc_69></location>Structure. As shown in Tab. 2, TableFormer outperforms all SOTA methods across different datasets by a large margin for predicting the table structure from an image. All the more, our model outperforms pre-trained methods. During the evaluation we do not apply any table filtering. We also provide our baseline results on the SynthTabNet dataset. It has been observed that large tables (e.g. tables that occupy half of the page or more) yield poor predictions. We attribute this issue to the image resizing during the preprocessing step, that produces downsampled images with indistinguishable features. This problem can be addressed by treating such big tables with a separate model which accepts a large input image size.</text>
<caption><location><page_7><loc_9><loc_26><loc_46><loc_48></location>Table 2: Structure results on PubTabNet (PTN), FinTabNet (FTN), TableBank (TB) and SynthTabNet (STN).</caption>
<caption><location><page_7><loc_8><loc_23><loc_47><loc_25></location>Table 2: Structure results on PubTabNet (PTN), FinTabNet (FTN), TableBank (TB) and SynthTabNet (STN).</caption>
<table>
<location><page_7><loc_9><loc_26><loc_46><loc_48></location>
<row_0><col_0><body>Model</col_0><col_1><body>Dataset</col_1><col_2><body>Simple</col_2><col_3><body>TEDS Complex</col_3><col_4><body>All</col_4></row_0>
<row_1><col_0><body>EDD</col_0><col_1><body>PTN</col_1><col_2><body>91.1</col_2><col_3><body>88.7</col_3><col_4><body>89.9</col_4></row_1>
<row_2><col_0><body>GTE</col_0><col_1><body>PTN</col_1><col_2><body>-</col_2><col_3><body>-</col_3><col_4><body>93.01</col_4></row_2>
<row_3><col_0><body>TableFormer</col_0><col_1><body>PTN</col_1><col_2><body>98.5</col_2><col_3><body>95.0</col_3><col_4><body>96.75</col_4></row_3>
<row_4><col_0><body>EDD</col_0><col_1><body>FTN</col_1><col_2><body>88.4</col_2><col_3><body>92.08</col_3><col_4><body>90.6</col_4></row_4>
<row_5><col_0><body>GTE</col_0><col_1><body>FTN</col_1><col_2><body>-</col_2><col_3><body>-</col_3><col_4><body>87.14</col_4></row_5>
<row_6><col_0><body>GTE (FT)</col_0><col_1><body>FTN</col_1><col_2><body>-</col_2><col_3><body>-</col_3><col_4><body>91.02</col_4></row_6>
<row_7><col_0><body>TableFormer</col_0><col_1><body>FTN</col_1><col_2><body>97.5</col_2><col_3><body>96.0</col_3><col_4><body>96.8</col_4></row_7>
<row_8><col_0><body>EDD</col_0><col_1><body>TB</col_1><col_2><body>86.0</col_2><col_3><body>-</col_3><col_4><body>86.0</col_4></row_8>
<row_9><col_0><body>TableFormer</col_0><col_1><body>TB</col_1><col_2><body>89.6</col_2><col_3><body>-</col_3><col_4><body>89.6</col_4></row_9>
<row_10><col_0><body>TableFormer</col_0><col_1><body>STN</col_1><col_2><body>96.9</col_2><col_3><body>95.7</col_3><col_4><body>96.7</col_4></row_10>
<caption>Table 2: Structure results on PubTabNet (PTN), FinTabNet (FTN), TableBank (TB) and SynthTabNet (STN).</caption>
<row_0><col_0><col_header>Model</col_0><col_1><col_header>Dataset</col_1><col_2><col_header>Simple</col_2><col_3><col_header>TEDS Complex</col_3><col_4><col_header>All</col_4></row_0>
<row_1><col_0><row_header>EDD</col_0><col_1><body>PTN</col_1><col_2><body>91.1</col_2><col_3><body>88.7</col_3><col_4><body>89.9</col_4></row_1>
<row_2><col_0><row_header>GTE</col_0><col_1><body>PTN</col_1><col_2><body>-</col_2><col_3><body>-</col_3><col_4><body>93.01</col_4></row_2>
<row_3><col_0><row_header>TableFormer</col_0><col_1><body>PTN</col_1><col_2><body>98.5</col_2><col_3><body>95.0</col_3><col_4><body>96.75</col_4></row_3>
<row_4><col_0><row_header>EDD</col_0><col_1><body>FTN</col_1><col_2><body>88.4</col_2><col_3><body>92.08</col_3><col_4><body>90.6</col_4></row_4>
<row_5><col_0><row_header>GTE</col_0><col_1><body>FTN</col_1><col_2><body>-</col_2><col_3><body>-</col_3><col_4><body>87.14</col_4></row_5>
<row_6><col_0><row_header>GTE (FT)</col_0><col_1><body>FTN</col_1><col_2><body>-</col_2><col_3><body>-</col_3><col_4><body>91.02</col_4></row_6>
<row_7><col_0><row_header>TableFormer</col_0><col_1><body>FTN</col_1><col_2><body>97.5</col_2><col_3><body>96.0</col_3><col_4><body>96.8</col_4></row_7>
<row_8><col_0><row_header>EDD</col_0><col_1><body>TB</col_1><col_2><body>86.0</col_2><col_3><body>-</col_3><col_4><body>86.0</col_4></row_8>
<row_9><col_0><row_header>TableFormer</col_0><col_1><body>TB</col_1><col_2><body>89.6</col_2><col_3><body>-</col_3><col_4><body>89.6</col_4></row_9>
<row_10><col_0><row_header>TableFormer</col_0><col_1><body>STN</col_1><col_2><body>96.9</col_2><col_3><body>95.7</col_3><col_4><body>96.7</col_4></row_10>
</table>
<text><location><page_7><loc_8><loc_21><loc_43><loc_22></location>FT: Model was trained on PubTabNet then finetuned.</text>
<text><location><page_7><loc_8><loc_10><loc_47><loc_19></location>Cell Detection. Like any object detector, our Cell BBox Detector provides bounding boxes that can be improved with post-processing during inference. We make use of the grid-like structure of tables to refine the predictions. A detailed explanation on the post-processing is available in the supplementary material. As shown in Tab. 3, we evaluate</text>
<text><location><page_7><loc_50><loc_71><loc_89><loc_91></location>our Cell BBox Decoder accuracy for cells with a class label of 'content' only using the PASCAL VOC mAP metric for pre-processing and post-processing. Note that we do not have post-processing results for SynthTabNet as images are only provided. To compare the performance of our proposed approach, we've integrated TableFormer's Cell BBox Decoder into EDD architecture. As mentioned previously, the Structure Decoder provides the Cell BBox Decoder with the features needed to predict the bounding box predictions. Therefore, the accuracy of the Structure Decoder directly influences the accuracy of the Cell BBox Decoder . If the Structure Decoder predicts an extra column, this will result in an extra column of predicted bounding boxes.</text>
<caption><location><page_7><loc_50><loc_62><loc_87><loc_69></location>Table 3: Cell Bounding Box detection results on PubTabNet, and FinTabNet. PP: Post-processing.</caption>
<caption><location><page_7><loc_50><loc_57><loc_89><loc_60></location>Table 3: Cell Bounding Box detection results on PubTabNet, and FinTabNet. PP: Post-processing.</caption>
<table>
<location><page_7><loc_50><loc_62><loc_87><loc_69></location>
<row_0><col_0><body>Model</col_0><col_1><body>Dataset</col_1><col_2><body>mAP</col_2><col_3><body>mAP (PP)</col_3></row_0>
<caption>Table 3: Cell Bounding Box detection results on PubTabNet, and FinTabNet. PP: Post-processing.</caption>
<row_0><col_0><col_header>Model</col_0><col_1><col_header>Dataset</col_1><col_2><col_header>mAP</col_2><col_3><col_header>mAP (PP)</col_3></row_0>
<row_1><col_0><body>EDD+BBox</col_0><col_1><body>PubTabNet</col_1><col_2><body>79.2</col_2><col_3><body>82.7</col_3></row_1>
<row_2><col_0><body>TableFormer</col_0><col_1><body>PubTabNet</col_1><col_2><body>82.1</col_2><col_3><body>86.8</col_3></row_2>
<row_3><col_0><body>TableFormer</col_0><col_1><body>SynthTabNet</col_1><col_2><body>87.7</col_2><col_3><body>-</col_3></row_3>
</table>
<text><location><page_7><loc_50><loc_34><loc_89><loc_54></location>Cell Content. In this section, we evaluate the entire pipeline of recovering a table with content. Here we put our approach to the test by capitalizing on extracting content from the PDF cells rather than decoding from images. Tab. 4 shows the TEDS score of the HTML code representing the structure of the table along with the content inserted in the data cells, compared with the ground-truth. Our method achieved a 5.3% increase over the state-of-the-art and commercial solutions. We believe our scores would be higher if the HTML ground-truth matched the extracted PDF cell content. Unfortunately, there are small discrepancies such as spacings around words or special characters with various Unicode representations.</text>
<caption><location><page_7><loc_54><loc_19><loc_85><loc_32></location>Table 4: Results of structure with content retrieved using cell detection on PubTabNet. In all cases the input is PDF documents with cropped tables.</caption>
<caption><location><page_7><loc_50><loc_13><loc_89><loc_17></location>Table 4: Results of structure with content retrieved using cell detection on PubTabNet. In all cases the input is PDF documents with cropped tables.</caption>
<table>
<location><page_7><loc_54><loc_19><loc_85><loc_32></location>
<row_0><col_0><body>Model</col_0><col_1><body>Simple</col_1><col_2><body>TEDS Complex</col_2><col_3><body>All</col_3></row_0>
<row_1><col_0><body>Tabula</col_0><col_1><body>78.0</col_1><col_2><body>57.8</col_2><col_3><body>67.9</col_3></row_1>
<row_2><col_0><body>Traprange</col_0><col_1><body>60.8</col_1><col_2><body>49.9</col_2><col_3><body>55.4</col_3></row_2>
<row_3><col_0><body>Camelot</col_0><col_1><body>80.0</col_1><col_2><body>66.0</col_2><col_3><body>73.0</col_3></row_3>
<row_4><col_0><body>Acrobat Pro</col_0><col_1><body>68.9</col_1><col_2><body>61.8</col_2><col_3><body>65.3</col_3></row_4>
<row_5><col_0><body>EDD</col_0><col_1><body>91.2</col_1><col_2><body>85.4</col_2><col_3><body>88.3</col_3></row_5>
<row_6><col_0><body>TableFormer</col_0><col_1><body>95.4</col_1><col_2><body>90.1</col_2><col_3><body>93.6</col_3></row_6>
<caption>Table 4: Results of structure with content retrieved using cell detection on PubTabNet. In all cases the input is PDF documents with cropped tables.</caption>
<row_0><col_0><body>Model</col_0><col_1><col_header>Simple</col_1><col_2><col_header>TEDS Complex</col_2><col_3><col_header>All</col_3></row_0>
<row_1><col_0><row_header>Tabula</col_0><col_1><body>78.0</col_1><col_2><body>57.8</col_2><col_3><body>67.9</col_3></row_1>
<row_2><col_0><row_header>Traprange</col_0><col_1><body>60.8</col_1><col_2><body>49.9</col_2><col_3><body>55.4</col_3></row_2>
<row_3><col_0><row_header>Camelot</col_0><col_1><body>80.0</col_1><col_2><body>66.0</col_2><col_3><body>73.0</col_3></row_3>
<row_4><col_0><row_header>Acrobat Pro</col_0><col_1><body>68.9</col_1><col_2><body>61.8</col_2><col_3><body>65.3</col_3></row_4>
<row_5><col_0><row_header>EDD</col_0><col_1><body>91.2</col_1><col_2><body>85.4</col_2><col_3><body>88.3</col_3></row_5>
<row_6><col_0><row_header>TableFormer</col_0><col_1><body>95.4</col_1><col_2><body>90.1</col_2><col_3><body>93.6</col_3></row_6>
</table>
<text><location><page_8><loc_9><loc_89><loc_10><loc_90></location>a.</text>
<text><location><page_8><loc_11><loc_89><loc_82><loc_90></location>Red - PDF cells, Green - predicted bounding boxes, Blue - post-processed predictions matched to PDF cells</text>
@@ -163,31 +172,33 @@
<text><location><page_8><loc_11><loc_73><loc_63><loc_74></location>Structure predicted by TableFormer, with superimposed matched PDF cell text:</text>
<table>
<location><page_8><loc_9><loc_63><loc_49><loc_72></location>
<row_0><col_0><body></col_0><col_1><body></col_1><col_2><body>論文ファイル</col_2><col_3><body>論文ファイル</col_3><col_4><body>参考文献</col_4><col_5><body>参考文献</col_5></row_0>
<row_1><col_0><body>出典</col_0><col_1><body>ファイル 数</col_1><col_2><body>英語</col_2><col_3><body>日本語</col_3><col_4><body>英語</col_4><col_5><body>日本語</col_5></row_1>
<row_2><col_0><body>Association for Computational Linguistics(ACL2003)</col_0><col_1><body>65</col_1><col_2><body>65</col_2><col_3><body>0</col_3><col_4><body>150</col_4><col_5><body>0</col_5></row_2>
<row_3><col_0><body>Computational Linguistics(COLING2002)</col_0><col_1><body>140</col_1><col_2><body>140</col_2><col_3><body>0</col_3><col_4><body>150</col_4><col_5><body>0</col_5></row_3>
<row_4><col_0><body>電気情報通信学会 2003 年総合大会</col_0><col_1><body>150</col_1><col_2><body>8</col_2><col_3><body>142</col_3><col_4><body>223</col_4><col_5><body>147</col_5></row_4>
<row_5><col_0><body>情報処理学会第 65 回全国大会 (2003)</col_0><col_1><body>177</col_1><col_2><body>1</col_2><col_3><body>176</col_3><col_4><body>150</col_4><col_5><body>236</col_5></row_5>
<row_6><col_0><body>第 17 回人工知能学会全国大会 (2003)</col_0><col_1><body>208</col_1><col_2><body>5</col_2><col_3><body>203</col_3><col_4><body>152</col_4><col_5><body>244</col_5></row_6>
<row_7><col_0><body>自然言語処理研究会第 146 〜 155 回</col_0><col_1><body>98</col_1><col_2><body>2</col_2><col_3><body>96</col_3><col_4><body>150</col_4><col_5><body>232</col_5></row_7>
<row_8><col_0><body>WWW から収集した論文</col_0><col_1><body>107</col_1><col_2><body>73</col_2><col_3><body>34</col_3><col_4><body>147</col_4><col_5><body>96</col_5></row_8>
<row_0><col_0><body></col_0><col_1><body></col_1><col_2><col_header>論文ファイル</col_2><col_3><col_header>論文ファイル</col_3><col_4><col_header>参考文献</col_4><col_5><col_header>参考文献</col_5></row_0>
<row_1><col_0><col_header>出典</col_0><col_1><col_header>ファイル 数</col_1><col_2><col_header>英語</col_2><col_3><col_header>日本語</col_3><col_4><col_header>英語</col_4><col_5><col_header>日本語</col_5></row_1>
<row_2><col_0><row_header>Association for Computational Linguistics(ACL2003)</col_0><col_1><body>65</col_1><col_2><body>65</col_2><col_3><body>0</col_3><col_4><body>150</col_4><col_5><body>0</col_5></row_2>
<row_3><col_0><row_header>Computational Linguistics(COLING2002)</col_0><col_1><body>140</col_1><col_2><body>140</col_2><col_3><body>0</col_3><col_4><body>150</col_4><col_5><body>0</col_5></row_3>
<row_4><col_0><row_header>電気情報通信学会 2003 年総合大会</col_0><col_1><body>150</col_1><col_2><body>8</col_2><col_3><body>142</col_3><col_4><body>223</col_4><col_5><body>147</col_5></row_4>
<row_5><col_0><row_header>情報処理学会第 65 回全国大会 (2003)</col_0><col_1><body>177</col_1><col_2><body>1</col_2><col_3><body>176</col_3><col_4><body>150</col_4><col_5><body>236</col_5></row_5>
<row_6><col_0><row_header>第 17 回人工知能学会全国大会 (2003)</col_0><col_1><body>208</col_1><col_2><body>5</col_2><col_3><body>203</col_3><col_4><body>152</col_4><col_5><body>244</col_5></row_6>
<row_7><col_0><row_header>自然言語処理研究会第 146 〜 155 回</col_0><col_1><body>98</col_1><col_2><body>2</col_2><col_3><body>96</col_3><col_4><body>150</col_4><col_5><body>232</col_5></row_7>
<row_8><col_0><row_header>WWW から収集した論文</col_0><col_1><body>107</col_1><col_2><body>73</col_2><col_3><body>34</col_3><col_4><body>147</col_4><col_5><body>96</col_5></row_8>
<row_9><col_0><body></col_0><col_1><body>945</col_1><col_2><body>294</col_2><col_3><body>651</col_3><col_4><body>1122</col_4><col_5><body>955</col_5></row_9>
</table>
<caption><location><page_8><loc_50><loc_64><loc_90><loc_72></location>Text is aligned to match original for ease of viewing</caption>
<caption><location><page_8><loc_62><loc_62><loc_90><loc_63></location>Text is aligned to match original for ease of viewing</caption>
<table>
<location><page_8><loc_50><loc_64><loc_90><loc_72></location>
<row_0><col_0><body></col_0><col_1><body>Shares (in millions)</col_1><col_2><body>Shares (in millions)</col_2><col_3><body>Weighted Average Grant Date Fair Value</col_3><col_4><body>Weighted Average Grant Date Fair Value</col_4></row_0>
<row_1><col_0><body></col_0><col_1><body>RS U s</col_1><col_2><body>PSUs</col_2><col_3><body>RSUs</col_3><col_4><body>PSUs</col_4></row_1>
<row_2><col_0><body>Nonvested on Janua ry 1</col_0><col_1><body>1. 1</col_1><col_2><body>0.3</col_2><col_3><body>90.10 $</col_3><col_4><body>$ 91.19</col_4></row_2>
<row_3><col_0><body>Granted</col_0><col_1><body>0. 5</col_1><col_2><body>0.1</col_2><col_3><body>117.44</col_3><col_4><body>122.41</col_4></row_3>
<row_4><col_0><body>Vested</col_0><col_1><body>(0. 5 )</col_1><col_2><body>(0.1)</col_2><col_3><body>87.08</col_3><col_4><body>81.14</col_4></row_4>
<row_5><col_0><body>Canceled or forfeited</col_0><col_1><body>(0. 1 )</col_1><col_2><body>-</col_2><col_3><body>102.01</col_3><col_4><body>92.18</col_4></row_5>
<row_6><col_0><body>Nonvested on December 31</col_0><col_1><body>1.0</col_1><col_2><body>0.3</col_2><col_3><body>104.85 $</col_3><col_4><body>$ 104.51</col_4></row_6>
<caption>Text is aligned to match original for ease of viewing</caption>
<row_0><col_0><body></col_0><col_1><col_header>Shares (in millions)</col_1><col_2><col_header>Shares (in millions)</col_2><col_3><col_header>Weighted Average Grant Date Fair Value</col_3><col_4><col_header>Weighted Average Grant Date Fair Value</col_4></row_0>
<row_1><col_0><body></col_0><col_1><col_header>RS U s</col_1><col_2><col_header>PSUs</col_2><col_3><col_header>RSUs</col_3><col_4><col_header>PSUs</col_4></row_1>
<row_2><col_0><row_header>Nonvested on Janua ry 1</col_0><col_1><body>1. 1</col_1><col_2><body>0.3</col_2><col_3><body>90.10 $</col_3><col_4><body>$ 91.19</col_4></row_2>
<row_3><col_0><row_header>Granted</col_0><col_1><body>0. 5</col_1><col_2><body>0.1</col_2><col_3><body>117.44</col_3><col_4><body>122.41</col_4></row_3>
<row_4><col_0><row_header>Vested</col_0><col_1><body>(0. 5 )</col_1><col_2><body>(0.1)</col_2><col_3><body>87.08</col_3><col_4><body>81.14</col_4></row_4>
<row_5><col_0><row_header>Canceled or forfeited</col_0><col_1><body>(0. 1 )</col_1><col_2><body>-</col_2><col_3><body>102.01</col_3><col_4><body>92.18</col_4></row_5>
<row_6><col_0><row_header>Nonvested on December 31</col_0><col_1><body>1.0</col_1><col_2><body>0.3</col_2><col_3><body>104.85 $</col_3><col_4><body>$ 104.51</col_4></row_6>
</table>
<caption><location><page_8><loc_8><loc_54><loc_89><loc_60></location>Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.</caption>
<figure>
<location><page_8><loc_8><loc_44><loc_35><loc_52></location>
<caption>Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.</caption>
</figure>
<figure>
<location><page_8><loc_63><loc_44><loc_89><loc_52></location>
@@ -195,6 +206,7 @@
<caption><location><page_8><loc_10><loc_41><loc_87><loc_42></location>Figure 6: An example of TableFormer predictions (bounding boxes and structure) from generated SynthTabNet table.</caption>
<figure>
<location><page_8><loc_35><loc_44><loc_61><loc_52></location>
<caption>Figure 6: An example of TableFormer predictions (bounding boxes and structure) from generated SynthTabNet table.</caption>
</figure>
<section_header><location><page_8><loc_8><loc_37><loc_27><loc_38></location>5.5. Qualitative Analysis</section_header>
<text><location><page_8><loc_8><loc_10><loc_47><loc_32></location>We showcase several visualizations for the different components of our network on various "complex" tables within datasets presented in this work in Fig. 5 and Fig. 6. As shown, our model is able to predict bounding boxes for all table cells, even for the empty ones. Additionally, our post-processing techniques can extract the cell content by matching the predicted bounding boxes to the PDF cells based on their overlap and spatial proximity. The left part of Fig. 5 also demonstrates the adaptability of our method to any language, as it can successfully extract Japanese text, although the training set contains only English content. We provide more visualizations, including the intermediate steps, in the supplementary material. Overall, these illustrations demonstrate the versatility of our method across a diverse range of table appearances and content types.</text>
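The overlap-and-proximity matching mentioned here can be sketched as a greedy IoU assignment; the function names and the 0.5 threshold are assumptions for illustration, not the actual post-processing code:

```python
def iou(a, b):
    """Intersection-over-union of two (x0, y0, x1, y1) boxes."""
    x0, y0 = max(a[0], b[0]), max(a[1], b[1])
    x1, y1 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0.0, x1 - x0) * max(0.0, y1 - y0)
    union = ((a[2] - a[0]) * (a[3] - a[1])
             + (b[2] - b[0]) * (b[3] - b[1]) - inter)
    return inter / union if union else 0.0

def match_cells(predicted, pdf_cells, threshold=0.5):
    """Greedily assign each predicted box its best-overlapping PDF cell."""
    matches = {}
    if not pdf_cells:
        return matches
    for i, pred in enumerate(predicted):
        best = max(range(len(pdf_cells)), key=lambda j: iou(pred, pdf_cells[j]))
        if iou(pred, pdf_cells[best]) >= threshold:
            matches[i] = best
    return matches
```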
@@ -265,6 +277,7 @@
<caption><location><page_12><loc_8><loc_76><loc_89><loc_79></location>Figure 7: Distribution of the tables across different dimensions per dataset. Simple vs complex tables per dataset and split, strict vs non strict html structures per dataset and table complexity, missing bboxes per dataset and table complexity.</caption>
<figure>
<location><page_12><loc_9><loc_81><loc_89><loc_91></location>
<caption>Figure 7: Distribution of the tables across different dimensions per dataset. Simple vs complex tables per dataset and split, strict vs non strict html structures per dataset and table complexity, missing bboxes per dataset and table complexity.</caption>
</figure>
<list_item><location><page_12><loc_10><loc_71><loc_47><loc_73></location>· TableFormer output does not include the table cell content.</list_item>
<list_item><location><page_12><loc_10><loc_67><loc_47><loc_69></location>· There are occasional inaccuracies in the predictions of the bounding boxes.</list_item>
@@ -295,22 +308,27 @@
<caption><location><page_13><loc_50><loc_59><loc_89><loc_61></location>Figure 9: Example of a table with big empty distance between cells.</caption>
<figure>
<location><page_13><loc_51><loc_63><loc_70><loc_68></location>
<caption>Figure 9: Example of a table with big empty distance between cells.</caption>
</figure>
<caption><location><page_13><loc_51><loc_13><loc_89><loc_14></location>Figure 10: Example of a complex table with empty cells.</caption>
<figure>
<location><page_13><loc_55><loc_16><loc_85><loc_25></location>
<caption>Figure 10: Example of a complex table with empty cells.</caption>
</figure>
<caption><location><page_14><loc_56><loc_13><loc_83><loc_14></location>Figure 14: Example with multi-line text.</caption>
<figure>
<location><page_14><loc_9><loc_81><loc_27><loc_86></location>
<caption>Figure 14: Example with multi-line text.</caption>
</figure>
<caption><location><page_14><loc_8><loc_52><loc_47><loc_55></location>Figure 11: Simple table with different style and empty cells.</caption>
<figure>
<location><page_14><loc_9><loc_68><loc_27><loc_73></location>
<caption>Figure 11: Simple table with different style and empty cells.</caption>
</figure>
<caption><location><page_14><loc_9><loc_14><loc_46><loc_15></location>Figure 12: Simple table predictions and post processing.</caption>
<figure>
<location><page_14><loc_8><loc_17><loc_29><loc_23></location>
<caption>Figure 12: Simple table predictions and post processing.</caption>
</figure>
<figure>
<location><page_14><loc_52><loc_81><loc_87><loc_88></location>
@@ -321,10 +339,12 @@
<caption><location><page_14><loc_52><loc_52><loc_88><loc_53></location>Figure 13: Table predictions example on colorful table.</caption>
<figure>
<location><page_14><loc_54><loc_55><loc_86><loc_64></location>
<caption>Figure 13: Table predictions example on colorful table.</caption>
</figure>
<caption><location><page_15><loc_50><loc_15><loc_89><loc_18></location>Figure 16: Example of how post-processing helps to restore mis-aligned bounding boxes prediction artifact.</caption>
<figure>
<location><page_15><loc_9><loc_69><loc_46><loc_83></location>
<caption>Figure 16: Example of how post-processing helps to restore mis-aligned bounding boxes prediction artifact.</caption>
</figure>
<figure>
<location><page_15><loc_9><loc_37><loc_46><loc_51></location>
@@ -332,9 +352,11 @@
<caption><location><page_15><loc_14><loc_17><loc_41><loc_19></location>Figure 15: Example with triangular table.</caption>
<figure>
<location><page_15><loc_8><loc_20><loc_52><loc_36></location>
<caption>Figure 15: Example with triangular table.</caption>
</figure>
<caption><location><page_16><loc_8><loc_33><loc_89><loc_36></location>Figure 17: Example of long table. End-to-end example from initial PDF cells to prediction of bounding boxes, post processing and prediction of structure.</caption>
<figure>
<location><page_16><loc_11><loc_37><loc_86><loc_68></location>
<caption>Figure 17: Example of long table. End-to-end example from initial PDF cells to prediction of bounding boxes, post processing and prediction of structure.</caption>
</figure>
</document>

File diff suppressed because one or more lines are too long

View File

@@ -14,6 +14,8 @@ The occurrence of tables in documents is ubiquitous. They often summarise quanti
Tables organize valuable content in a concise and compact representation. This content is extremely valuable for systems such as search engines, Knowledge Graph's, etc, since they enhance their predictive capabilities. Unfortunately, tables come in a large variety of shapes and sizes. Furthermore, they can have complex column/row-header configurations, multiline rows, different variety of separation lines, missing entries, etc. As such, the correct identification of the table-structure from an image is a nontrivial task. In this paper, we present a new table-structure identification model. The latter improves the latest end-toend deep learning model (i.e. encoder-dual-decoder from PubTabNet) in two significant ways. First, we introduce a new object detection decoder for table-cells. In this way, we can obtain the content of the table-cells from programmatic PDF's directly from the PDF source and avoid the training of the custom OCR decoders. This architectural change leads to more accurate table-content extraction and allows us to tackle non-english tables. Second, we replace the LSTM decoders with transformer based decoders. This upgrade improves significantly the previous state-of-the-art tree-editing-distance-score (TEDS) from 91% to 98.5% on simple tables and from 88.7% to 95% on complex tables.
b. Red-annotation of bounding boxes, Blue-predictions by TableFormer
<!-- image -->
@@ -24,6 +26,7 @@ Structure predicted by TableFormer:
Figure 1: Picture of a table with subtle, complex features such as (1) multi-column headers, (2) cell with multi-row text and (3) cells with no content. Image from PubTabNet evaluation set, filename: 'PMC2944238 004 02'.
| 0 | 1 | 1 | 2 1 | 2 1 | |
|-----|-----|-----|-------|-------|----|
| 3 | 4 | 5 3 | 6 | 7 | |
@@ -77,6 +80,7 @@ We rely on large-scale datasets such as PubTabNet [37], FinTabNet [36], and Tabl
Figure 2: Distribution of the tables across different table dimensions in PubTabNet + FinTabNet datasets
<!-- image -->
balance in the previous datasets.
@@ -97,6 +101,7 @@ In this regard, we have prepared four synthetic datasets, each one containing 15
Table 1: Both "Combined-Tabnet" and "CombinedTabnet" are variations of the following: (*) The CombinedTabnet dataset is the processed combination of PubTabNet and Fintabnet. (**) The combined dataset is the processed combination of PubTabNet, Fintabnet and TableBank.
| | Tags | Bbox | Size | Format |
|--------------------|--------|--------|--------|----------|
| PubTabNet | 3 | 3 | 509k | PNG |
@@ -122,10 +127,12 @@ CNN Backbone Network. A ResNet-18 CNN is the backbone that receives the table im
Figure 3: TableFormer takes in an image of the PDF and creates bounding box and HTML structure predictions that are synchronized. The bounding boxes grabs the content from the PDF and inserts it in the structure.
<!-- image -->
Figure 4: Given an input image of a table, the Encoder produces fixed-length features that represent the input image. The features are then passed to both the Structure Decoder and Cell BBox Decoder . During training, the Structure Decoder receives 'tokenized tags' of the HTML code that represent the table structure. Afterwards, a transformer encoder and decoder architecture is employed to produce features that are received by a linear layer, and the Cell BBox Decoder. The linear layer is applied to the features to predict the tags. Simultaneously, the Cell BBox Decoder selects features referring to the data cells (' < td > ', ' < ') and passes them through an attention network, an MLP, and a linear layer to predict the bounding boxes.
<!-- image -->
forming classification, and adding an adaptive pooling layer of size 28*28. ResNet by default downsamples the image resolution by 32 and then the encoded image is provided to both the Structure Decoder , and Cell BBox Decoder .
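The backbone modification this fragment describes (classification head removed, a 28x28 adaptive pooling layer added) can be sketched in a few lines of PyTorch. This is an assumption-based reconstruction from the text, not the authors' code; the layer selection and input size are illustrative.

```python
# Hedged sketch of the described backbone: ResNet-18 without its
# classification head, followed by 28x28 adaptive pooling.
import torch
import torch.nn as nn
from torchvision.models import resnet18

backbone = nn.Sequential(
    *list(resnet18(weights=None).children())[:-2],  # drop avgpool and fc
    nn.AdaptiveAvgPool2d((28, 28)),  # fixed 28x28 feature map for the decoders
)
features = backbone(torch.randn(1, 3, 896, 896))
print(features.shape)  # torch.Size([1, 512, 28, 28]); 896 / 32 = 28
```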
@@ -188,6 +195,7 @@ Structure. As shown in Tab. 2, TableFormer outperforms all SOTA methods across d
Table 2: Structure results on PubTabNet (PTN), FinTabNet (FTN), TableBank (TB) and SynthTabNet (STN).
| Model | Dataset | Simple | TEDS Complex | All |
|-------------|-----------|----------|----------------|-------|
| EDD | PTN | 91.1 | 88.7 | 89.9 |
@@ -209,6 +217,7 @@ our Cell BBox Decoder accuracy for cells with a class label of 'content' only us
Table 3: Cell Bounding Box detection results on PubTabNet, and FinTabNet. PP: Post-processing.
| Model | Dataset | mAP | mAP (PP) |
|-------------|-------------|-------|------------|
| EDD+BBox | PubTabNet | 79.2 | 82.7 |
@@ -219,6 +228,7 @@ Cell Content. In this section, we evaluate the entire pipeline of recovering a t
Table 4: Results of structure with content retrieved using cell detection on PubTabNet. In all cases the input is PDF documents with cropped tables.
| Model | Simple | TEDS Complex | All |
|-------------|----------|----------------|-------|
| Tabula | 78 | 57.8 | 67.9 |
@@ -254,6 +264,7 @@ Structure predicted by TableFormer, with superimposed matched PDF cell text:
Text is aligned to match original for ease of viewing
| | Shares (in millions) | Shares (in millions) | Weighted Average Grant Date Fair Value | Weighted Average Grant Date Fair Value |
|--------------------------|------------------------|------------------------|------------------------------------------|------------------------------------------|
| | RS U s | PSUs | RSUs | PSUs |
@@ -265,12 +276,14 @@ Text is aligned to match original for ease of viewing
Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.
<!-- image -->
<!-- image -->
Figure 6: An example of TableFormer predictions (bounding boxes and structure) from generated SynthTabNet table.
<!-- image -->
## 5.5. Qualitative Analysis
@@ -403,6 +416,7 @@ Although TableFormer can predict the table structure and the bounding boxes for
Figure 7: Distribution of the tables across different dimensions per dataset. Simple vs complex tables per dataset and split, strict vs non strict html structures per dataset and table complexity, missing bboxes per dataset and table complexity.
<!-- image -->
· TableFormer output does not include the table cell content.
@@ -459,22 +473,27 @@ Figure 8: Example of a table with multi-line header.
Figure 9: Example of a table with big empty distance between cells.
<!-- image -->
Figure 10: Example of a complex table with empty cells.
<!-- image -->
Figure 14: Example with multi-line text.
<!-- image -->
Figure 11: Simple table with different style and empty cells.
<!-- image -->
Figure 12: Simple table predictions and post processing.
<!-- image -->
<!-- image -->
@@ -483,18 +502,22 @@ Figure 12: Simple table predictions and post processing.
Figure 13: Table predictions example on colorful table.
<!-- image -->
Figure 16: Example of how post-processing helps to restore mis-aligned bounding boxes prediction artifact.
<!-- image -->
<!-- image -->
Figure 15: Example with triangular table.
<!-- image -->
Figure 17: Example of long table. End-to-end example from initial PDF cells to prediction of bounding boxes, post processing and prediction of structure.
<!-- image -->

View File

@@ -15,6 +15,7 @@
<caption><location><page_1><loc_52><loc_29><loc_91><loc_32></location>Figure 1: Four examples of complex page layouts across different document categories</caption>
<figure>
<location><page_1><loc_52><loc_33><loc_72><loc_53></location>
<caption>Figure 1: Four examples of complex page layouts across different document categories</caption>
</figure>
<figure>
<location><page_1><loc_65><loc_56><loc_75><loc_68></location>
@@ -52,6 +53,7 @@
<caption><location><page_3><loc_9><loc_68><loc_48><loc_70></location>Figure 2: Distribution of DocLayNet pages across document categories.</caption>
<figure>
<location><page_3><loc_14><loc_72><loc_43><loc_88></location>
<caption>Figure 2: Distribution of DocLayNet pages across document categories.</caption>
</figure>
<text><location><page_3><loc_9><loc_54><loc_48><loc_64></location>to a minimum, since they introduce difficulties in annotation (see Section 4). As a second condition, we focussed on medium to large documents ( > 10 pages) with technical content, dense in complex tables, figures, plots and captions. Such documents carry a lot of information value, but are often hard to analyse with high accuracy due to their challenging layouts. Counterexamples of documents not included in the dataset are receipts, invoices, hand-written documents or photographs showing "text in the wild".</text>
<text><location><page_3><loc_9><loc_36><loc_48><loc_53></location>The pages in DocLayNet can be grouped into six distinct categories, namely Financial Reports , Manuals , Scientific Articles , Laws & Regulations , Patents and Government Tenders . Each document category was sourced from various repositories. For example, Financial Reports contain both free-style format annual reports 2 which expose company-specific, artistic layouts as well as the more formal SEC filings. The two largest categories ( Financial Reports and Manuals ) contain a large amount of free-style layouts in order to obtain maximum variability. In the other four categories, we boosted the variability by mixing documents from independent providers, such as different government websites or publishers. In Figure 2, we show the document categories contained in DocLayNet with their respective sizes.</text>
@@ -62,27 +64,29 @@
<text><location><page_3><loc_52><loc_26><loc_91><loc_66></location>Despite being cost-intense and far less scalable than automation, human annotation has several benefits over automated groundtruth generation. The first and most obvious reason to leverage human annotations is the freedom to annotate any type of document without requiring a programmatic source. For most PDF documents, the original source document is not available. The latter is not a hard constraint with human annotation, but it is for automated methods. A second reason to use human annotations is that the latter usually provide a more natural interpretation of the page layout. The human-interpreted layout can significantly deviate from the programmatic layout used in typesetting. For example, "invisible" tables might be used solely for aligning text paragraphs on columns. Such typesetting tricks might be interpreted by automated methods incorrectly as an actual table, while the human annotation will interpret it correctly as Text or other styles. The same applies to multi-line text elements, when authors decided to space them as "invisible" list elements without bullet symbols. A third reason to gather ground-truth through human annotation is to estimate a "natural" upper bound on the segmentation accuracy. As we will show in Section 4, certain documents featuring complex layouts can have different but equally acceptable layout interpretations. This natural upper bound for segmentation accuracy can be found by annotating the same pages multiple times by different people and evaluating the inter-annotator agreement. Such a baseline consistency evaluation is very useful to define expectations for a good target accuracy in trained deep neural network models and avoid overfitting (see Table 1). On the flip side, achieving high annotation consistency proved to be a key challenge in human annotation, as we outline in Section 4.</text>
<section_header><location><page_3><loc_52><loc_22><loc_77><loc_23></location>4 ANNOTATION CAMPAIGN</section_header>
<text><location><page_3><loc_52><loc_11><loc_91><loc_20></location>The annotation campaign was carried out in four phases. In phase one, we identified and prepared the data sources for annotation. In phase two, we determined the class labels and how annotations should be done on the documents in order to obtain maximum consistency. The latter was guided by a detailed requirement analysis and exhaustive experiments. In phase three, we trained the annotation staff and performed exams for quality assurance. In phase four,</text>
<caption><location><page_4><loc_16><loc_63><loc_84><loc_83></location>Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row "Total") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.</caption>
<caption><location><page_4><loc_9><loc_85><loc_91><loc_89></location>Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row "Total") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.</caption>
<table>
<location><page_4><loc_16><loc_63><loc_84><loc_83></location>
<row_0><col_0><body></col_0><col_1><body></col_1><col_2><body>% of Total</col_2><col_3><body>% of Total</col_3><col_4><body>% of Total</col_4><col_5><body>% of Total</col_5><col_6><body>triple inter-annotator mAP @ 0.5-0.95 (%)</col_6><col_7><body>triple inter-annotator mAP @ 0.5-0.95 (%)</col_7><col_8><body>triple inter-annotator mAP @ 0.5-0.95 (%)</col_8><col_9><body>triple inter-annotator mAP @ 0.5-0.95 (%)</col_9><col_10><body>triple inter-annotator mAP @ 0.5-0.95 (%)</col_10><col_11><body>triple inter-annotator mAP @ 0.5-0.95 (%)</col_11></row_0>
<row_1><col_0><body>class label</col_0><col_1><body>Count</col_1><col_2><body>Train</col_2><col_3><body>Test</col_3><col_4><body>Val</col_4><col_5><body>All</col_5><col_6><body>Fin</col_6><col_7><body>Man</col_7><col_8><body>Sci</col_8><col_9><body>Law</col_9><col_10><body>Pat</col_10><col_11><body>Ten</col_11></row_1>
<row_2><col_0><body>Caption</col_0><col_1><body>22524</col_1><col_2><body>2.04</col_2><col_3><body>1.77</col_3><col_4><body>2.32</col_4><col_5><body>84-89</col_5><col_6><body>40-61</col_6><col_7><body>86-92</col_7><col_8><body>94-99</col_8><col_9><body>95-99</col_9><col_10><body>69-78</col_10><col_11><body>n/a</col_11></row_2>
<row_3><col_0><body>Footnote</col_0><col_1><body>6318</col_1><col_2><body>0.60</col_2><col_3><body>0.31</col_3><col_4><body>0.58</col_4><col_5><body>83-91</col_5><col_6><body>n/a</col_6><col_7><body>100</col_7><col_8><body>62-88</col_8><col_9><body>85-94</col_9><col_10><body>n/a</col_10><col_11><body>82-97</col_11></row_3>
<row_4><col_0><body>Formula</col_0><col_1><body>25027</col_1><col_2><body>2.25</col_2><col_3><body>1.90</col_3><col_4><body>2.96</col_4><col_5><body>83-85</col_5><col_6><body>n/a</col_6><col_7><body>n/a</col_7><col_8><body>84-87</col_8><col_9><body>86-96</col_9><col_10><body>n/a</col_10><col_11><body>n/a</col_11></row_4>
<row_5><col_0><body>List-item</col_0><col_1><body>185660</col_1><col_2><body>17.19</col_2><col_3><body>13.34</col_3><col_4><body>15.82</col_4><col_5><body>87-88</col_5><col_6><body>74-83</col_6><col_7><body>90-92</col_7><col_8><body>97-97</col_8><col_9><body>81-85</col_9><col_10><body>75-88</col_10><col_11><body>93-95</col_11></row_5>
<row_6><col_0><body>Page-footer</col_0><col_1><body>70878</col_1><col_2><body>6.51</col_2><col_3><body>5.58</col_3><col_4><body>6.00</col_4><col_5><body>93-94</col_5><col_6><body>88-90</col_6><col_7><body>95-96</col_7><col_8><body>100</col_8><col_9><body>92-97</col_9><col_10><body>100</col_10><col_11><body>96-98</col_11></row_6>
<row_7><col_0><body>Page-header</col_0><col_1><body>58022</col_1><col_2><body>5.10</col_2><col_3><body>6.70</col_3><col_4><body>5.06</col_4><col_5><body>85-89</col_5><col_6><body>66-76</col_6><col_7><body>90-94</col_7><col_8><body>98-100</col_8><col_9><body>91-92</col_9><col_10><body>97-99</col_10><col_11><body>81-86</col_11></row_7>
<row_8><col_0><body>Picture</col_0><col_1><body>45976</col_1><col_2><body>4.21</col_2><col_3><body>2.78</col_3><col_4><body>5.31</col_4><col_5><body>69-71</col_5><col_6><body>56-59</col_6><col_7><body>82-86</col_7><col_8><body>69-82</col_8><col_9><body>80-95</col_9><col_10><body>66-71</col_10><col_11><body>59-76</col_11></row_8>
<row_9><col_0><body>Section-header</col_0><col_1><body>142884</col_1><col_2><body>12.60</col_2><col_3><body>15.77</col_3><col_4><body>12.85</col_4><col_5><body>83-84</col_5><col_6><body>76-81</col_6><col_7><body>90-92</col_7><col_8><body>94-95</col_8><col_9><body>87-94</col_9><col_10><body>69-73</col_10><col_11><body>78-86</col_11></row_9>
<row_10><col_0><body>Table</col_0><col_1><body>34733</col_1><col_2><body>3.20</col_2><col_3><body>2.27</col_3><col_4><body>3.60</col_4><col_5><body>77-81</col_5><col_6><body>75-80</col_6><col_7><body>83-86</col_7><col_8><body>98-99</col_8><col_9><body>58-80</col_9><col_10><body>79-84</col_10><col_11><body>70-85</col_11></row_10>
<row_11><col_0><body>Text</col_0><col_1><body>510377</col_1><col_2><body>45.82</col_2><col_3><body>49.28</col_3><col_4><body>45.00</col_4><col_5><body>84-86</col_5><col_6><body>81-86</col_6><col_7><body>88-93</col_7><col_8><body>89-93</col_8><col_9><body>87-92</col_9><col_10><body>71-79</col_10><col_11><body>87-95</col_11></row_11>
<row_12><col_0><body>Title</col_0><col_1><body>5071</col_1><col_2><body>0.47</col_2><col_3><body>0.30</col_3><col_4><body>0.50</col_4><col_5><body>60-72</col_5><col_6><body>24-63</col_6><col_7><body>50-63</col_7><col_8><body>94-100</col_8><col_9><body>82-96</col_9><col_10><body>68-79</col_10><col_11><body>24-56</col_11></row_12>
<row_13><col_0><body>Total</col_0><col_1><body>1107470</col_1><col_2><body>941123</col_2><col_3><body>99816</col_3><col_4><body>66531</col_4><col_5><body>82-83</col_5><col_6><body>71-74</col_6><col_7><body>79-81</col_7><col_8><body>89-94</col_8><col_9><body>86-91</col_9><col_10><body>71-76</col_10><col_11><body>68-85</col_11></row_13>
<caption>Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row "Total") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.</caption>
<row_0><col_0><body></col_0><col_1><body></col_1><col_2><col_header>% of Total</col_2><col_3><col_header>% of Total</col_3><col_4><col_header>% of Total</col_4><col_5><col_header>% of Total</col_5><col_6><col_header>triple inter-annotator mAP @ 0.5-0.95 (%)</col_6><col_7><col_header>triple inter-annotator mAP @ 0.5-0.95 (%)</col_7><col_8><col_header>triple inter-annotator mAP @ 0.5-0.95 (%)</col_8><col_9><col_header>triple inter-annotator mAP @ 0.5-0.95 (%)</col_9><col_10><col_header>triple inter-annotator mAP @ 0.5-0.95 (%)</col_10><col_11><col_header>triple inter-annotator mAP @ 0.5-0.95 (%)</col_11></row_0>
<row_1><col_0><col_header>class label</col_0><col_1><col_header>Count</col_1><col_2><col_header>Train</col_2><col_3><col_header>Test</col_3><col_4><col_header>Val</col_4><col_5><col_header>All</col_5><col_6><col_header>Fin</col_6><col_7><col_header>Man</col_7><col_8><col_header>Sci</col_8><col_9><col_header>Law</col_9><col_10><col_header>Pat</col_10><col_11><col_header>Ten</col_11></row_1>
<row_2><col_0><row_header>Caption</col_0><col_1><body>22524</col_1><col_2><body>2.04</col_2><col_3><body>1.77</col_3><col_4><body>2.32</col_4><col_5><body>84-89</col_5><col_6><body>40-61</col_6><col_7><body>86-92</col_7><col_8><body>94-99</col_8><col_9><body>95-99</col_9><col_10><body>69-78</col_10><col_11><body>n/a</col_11></row_2>
<row_3><col_0><row_header>Footnote</col_0><col_1><body>6318</col_1><col_2><body>0.60</col_2><col_3><body>0.31</col_3><col_4><body>0.58</col_4><col_5><body>83-91</col_5><col_6><body>n/a</col_6><col_7><body>100</col_7><col_8><body>62-88</col_8><col_9><body>85-94</col_9><col_10><body>n/a</col_10><col_11><body>82-97</col_11></row_3>
<row_4><col_0><row_header>Formula</col_0><col_1><body>25027</col_1><col_2><body>2.25</col_2><col_3><body>1.90</col_3><col_4><body>2.96</col_4><col_5><body>83-85</col_5><col_6><body>n/a</col_6><col_7><body>n/a</col_7><col_8><body>84-87</col_8><col_9><body>86-96</col_9><col_10><body>n/a</col_10><col_11><body>n/a</col_11></row_4>
<row_5><col_0><row_header>List-item</col_0><col_1><body>185660</col_1><col_2><body>17.19</col_2><col_3><body>13.34</col_3><col_4><body>15.82</col_4><col_5><body>87-88</col_5><col_6><body>74-83</col_6><col_7><body>90-92</col_7><col_8><body>97-97</col_8><col_9><body>81-85</col_9><col_10><body>75-88</col_10><col_11><body>93-95</col_11></row_5>
<row_6><col_0><row_header>Page-footer</col_0><col_1><body>70878</col_1><col_2><body>6.51</col_2><col_3><body>5.58</col_3><col_4><body>6.00</col_4><col_5><body>93-94</col_5><col_6><body>88-90</col_6><col_7><body>95-96</col_7><col_8><body>100</col_8><col_9><body>92-97</col_9><col_10><body>100</col_10><col_11><body>96-98</col_11></row_6>
<row_7><col_0><row_header>Page-header</col_0><col_1><body>58022</col_1><col_2><body>5.10</col_2><col_3><body>6.70</col_3><col_4><body>5.06</col_4><col_5><body>85-89</col_5><col_6><body>66-76</col_6><col_7><body>90-94</col_7><col_8><body>98-100</col_8><col_9><body>91-92</col_9><col_10><body>97-99</col_10><col_11><body>81-86</col_11></row_7>
<row_8><col_0><row_header>Picture</col_0><col_1><body>45976</col_1><col_2><body>4.21</col_2><col_3><body>2.78</col_3><col_4><body>5.31</col_4><col_5><body>69-71</col_5><col_6><body>56-59</col_6><col_7><body>82-86</col_7><col_8><body>69-82</col_8><col_9><body>80-95</col_9><col_10><body>66-71</col_10><col_11><body>59-76</col_11></row_8>
<row_9><col_0><row_header>Section-header</col_0><col_1><body>142884</col_1><col_2><body>12.60</col_2><col_3><body>15.77</col_3><col_4><body>12.85</col_4><col_5><body>83-84</col_5><col_6><body>76-81</col_6><col_7><body>90-92</col_7><col_8><body>94-95</col_8><col_9><body>87-94</col_9><col_10><body>69-73</col_10><col_11><body>78-86</col_11></row_9>
<row_10><col_0><row_header>Table</col_0><col_1><body>34733</col_1><col_2><body>3.20</col_2><col_3><body>2.27</col_3><col_4><body>3.60</col_4><col_5><body>77-81</col_5><col_6><body>75-80</col_6><col_7><body>83-86</col_7><col_8><body>98-99</col_8><col_9><body>58-80</col_9><col_10><body>79-84</col_10><col_11><body>70-85</col_11></row_10>
<row_11><col_0><row_header>Text</col_0><col_1><body>510377</col_1><col_2><body>45.82</col_2><col_3><body>49.28</col_3><col_4><body>45.00</col_4><col_5><body>84-86</col_5><col_6><body>81-86</col_6><col_7><body>88-93</col_7><col_8><body>89-93</col_8><col_9><body>87-92</col_9><col_10><body>71-79</col_10><col_11><body>87-95</col_11></row_11>
<row_12><col_0><row_header>Title</col_0><col_1><body>5071</col_1><col_2><body>0.47</col_2><col_3><body>0.30</col_3><col_4><body>0.50</col_4><col_5><body>60-72</col_5><col_6><body>24-63</col_6><col_7><body>50-63</col_7><col_8><body>94-100</col_8><col_9><body>82-96</col_9><col_10><body>68-79</col_10><col_11><body>24-56</col_11></row_12>
<row_13><col_0><row_header>Total</col_0><col_1><body>1107470</col_1><col_2><body>941123</col_2><col_3><body>99816</col_3><col_4><body>66531</col_4><col_5><body>82-83</col_5><col_6><body>71-74</col_6><col_7><body>79-81</col_7><col_8><body>89-94</col_8><col_9><body>86-91</col_9><col_10><body>71-76</col_10><col_11><body>68-85</col_11></row_13>
</table>
<caption><location><page_4><loc_9><loc_23><loc_48><loc_30></location>Figure 3: Corpus Conversion Service annotation user interface. The PDF page is shown in the background, with overlaid text-cells (in darker shades). The annotation boxes can be drawn by dragging a rectangle over each segment with the respective label from the palette on the right.</caption>
<figure>
<location><page_4><loc_9><loc_32><loc_48><loc_61></location>
<caption>Figure 3: Corpus Conversion Service annotation user interface. The PDF page is shown in the background, with overlaid text-cells (in darker shades). The annotation boxes can be drawn by dragging a rectangle over each segment with the respective label from the palette on the right.</caption>
</figure>
<text><location><page_4><loc_9><loc_15><loc_48><loc_20></location>we distributed the annotation workload and performed continuous quality controls. Phase one and two required a small team of experts only. For phases three and four, a group of 40 dedicated annotators were assembled and supervised.</text>
<text><location><page_4><loc_9><loc_11><loc_48><loc_14></location>Phase 1: Data selection and preparation. Our inclusion criteria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. The data sources</text>
@@ -103,26 +107,28 @@
<caption><location><page_5><loc_52><loc_36><loc_91><loc_40></location>Figure 4: Examples of plausible annotation alternatives for the same page. Criteria in our annotation guideline can resolve cases A to C, while the case D remains ambiguous.</caption>
<figure>
<location><page_5><loc_52><loc_42><loc_91><loc_89></location>
<caption>Figure 4: Examples of plausible annotation alternatives for the same page. Criteria in our annotation guideline can resolve cases A to C, while the case D remains ambiguous.</caption>
</figure>
<text><location><page_5><loc_52><loc_31><loc_91><loc_34></location>were carried out over a timeframe of 12 weeks, after which 8 of the 40 initially allocated annotators did not pass the bar.</text>
<text><location><page_5><loc_52><loc_10><loc_91><loc_31></location>Phase 4: Production annotation. The previously selected 80K pages were annotated with the defined 11 class labels by 32 annotators. This production phase took around three months to complete. All annotations were created online through CCS, which visualises the programmatic PDF text-cells as an overlay on the page. The page annotations are obtained by drawing rectangular bounding-boxes, as shown in Figure 3. With regard to the annotation practices, we implemented a few constraints and capabilities on the tooling level. First, we only allow non-overlapping, vertically oriented, rectangular boxes. For the large majority of documents, this constraint was sufficient and it speeds up the annotation considerably in comparison with arbitrary segmentation shapes. Second, annotator staff were not able to see each other's annotations. This was enforced by design to avoid any bias in the annotation, which could skew the numbers of the inter-annotator agreement (see Table 1). We wanted</text>
<caption><location><page_6><loc_10><loc_56><loc_47><loc_75></location>Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.</caption>
<caption><location><page_6><loc_9><loc_77><loc_48><loc_89></location>Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.</caption>
<table>
<location><page_6><loc_10><loc_56><loc_47><loc_75></location>
<row_0><col_0><body></col_0><col_1><body>human</col_1><col_2><body>MRCNN</col_2><col_3><body>MRCNN</col_3><col_4><body>FRCNN</col_4><col_5><body>YOLO</col_5></row_0>
<row_1><col_0><body></col_0><col_1><body>human</col_1><col_2><body>R50</col_2><col_3><body>R101</col_3><col_4><body>R101</col_4><col_5><body>v5x6</col_5></row_1>
<row_2><col_0><body>Caption</col_0><col_1><body>84-89</col_1><col_2><body>68.4</col_2><col_3><body>71.5</col_3><col_4><body>70.1</col_4><col_5><body>77.7</col_5></row_2>
<row_3><col_0><body>Footnote</col_0><col_1><body>83-91</col_1><col_2><body>70.9</col_2><col_3><body>71.8</col_3><col_4><body>73.7</col_4><col_5><body>77.2</col_5></row_3>
<row_4><col_0><body>Formula</col_0><col_1><body>83-85</col_1><col_2><body>60.1</col_2><col_3><body>63.4</col_3><col_4><body>63.5</col_4><col_5><body>66.2</col_5></row_4>
<row_5><col_0><body>List-item</col_0><col_1><body>87-88</col_1><col_2><body>81.2</col_2><col_3><body>80.8</col_3><col_4><body>81.0</col_4><col_5><body>86.2</col_5></row_5>
<row_6><col_0><body>Page-footer</col_0><col_1><body>93-94</col_1><col_2><body>61.6</col_2><col_3><body>59.3</col_3><col_4><body>58.9</col_4><col_5><body>61.1</col_5></row_6>
<row_7><col_0><body>Page-header</col_0><col_1><body>85-89</col_1><col_2><body>71.9</col_2><col_3><body>70.0</col_3><col_4><body>72.0</col_4><col_5><body>67.9</col_5></row_7>
<row_8><col_0><body>Picture</col_0><col_1><body>69-71</col_1><col_2><body>71.7</col_2><col_3><body>72.7</col_3><col_4><body>72.0</col_4><col_5><body>77.1</col_5></row_8>
<row_9><col_0><body>Section-header</col_0><col_1><body>83-84</col_1><col_2><body>67.6</col_2><col_3><body>69.3</col_3><col_4><body>68.4</col_4><col_5><body>74.6</col_5></row_9>
<row_10><col_0><body>Table</col_0><col_1><body>77-81</col_1><col_2><body>82.2</col_2><col_3><body>82.9</col_3><col_4><body>82.2</col_4><col_5><body>86.3</col_5></row_10>
<row_11><col_0><body>Text</col_0><col_1><body>84-86</col_1><col_2><body>84.6</col_2><col_3><body>85.8</col_3><col_4><body>85.4</col_4><col_5><body>88.1</col_5></row_11>
<row_12><col_0><body>Title</col_0><col_1><body>60-72</col_1><col_2><body>76.7</col_2><col_3><body>80.4</col_3><col_4><body>79.9</col_4><col_5><body>82.7</col_5></row_12>
<row_13><col_0><body>All</col_0><col_1><body>82-83</col_1><col_2><body>72.4</col_2><col_3><body>73.5</col_3><col_4><body>73.4</col_4><col_5><body>76.8</col_5></row_13>
<caption>Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.</caption>
<row_0><col_0><body></col_0><col_1><col_header>human</col_1><col_2><col_header>MRCNN</col_2><col_3><col_header>MRCNN</col_3><col_4><col_header>FRCNN</col_4><col_5><col_header>YOLO</col_5></row_0>
<row_1><col_0><body></col_0><col_1><col_header>human</col_1><col_2><col_header>R50</col_2><col_3><col_header>R101</col_3><col_4><col_header>R101</col_4><col_5><col_header>v5x6</col_5></row_1>
<row_2><col_0><row_header>Caption</col_0><col_1><body>84-89</col_1><col_2><body>68.4</col_2><col_3><body>71.5</col_3><col_4><body>70.1</col_4><col_5><body>77.7</col_5></row_2>
<row_3><col_0><row_header>Footnote</col_0><col_1><body>83-91</col_1><col_2><body>70.9</col_2><col_3><body>71.8</col_3><col_4><body>73.7</col_4><col_5><body>77.2</col_5></row_3>
<row_4><col_0><row_header>Formula</col_0><col_1><body>83-85</col_1><col_2><body>60.1</col_2><col_3><body>63.4</col_3><col_4><body>63.5</col_4><col_5><body>66.2</col_5></row_4>
<row_5><col_0><row_header>List-item</col_0><col_1><body>87-88</col_1><col_2><body>81.2</col_2><col_3><body>80.8</col_3><col_4><body>81.0</col_4><col_5><body>86.2</col_5></row_5>
<row_6><col_0><row_header>Page-footer</col_0><col_1><body>93-94</col_1><col_2><body>61.6</col_2><col_3><body>59.3</col_3><col_4><body>58.9</col_4><col_5><body>61.1</col_5></row_6>
<row_7><col_0><row_header>Page-header</col_0><col_1><body>85-89</col_1><col_2><body>71.9</col_2><col_3><body>70.0</col_3><col_4><body>72.0</col_4><col_5><body>67.9</col_5></row_7>
<row_8><col_0><row_header>Picture</col_0><col_1><body>69-71</col_1><col_2><body>71.7</col_2><col_3><body>72.7</col_3><col_4><body>72.0</col_4><col_5><body>77.1</col_5></row_8>
<row_9><col_0><row_header>Section-header</col_0><col_1><body>83-84</col_1><col_2><body>67.6</col_2><col_3><body>69.3</col_3><col_4><body>68.4</col_4><col_5><body>74.6</col_5></row_9>
<row_10><col_0><row_header>Table</col_0><col_1><body>77-81</col_1><col_2><body>82.2</col_2><col_3><body>82.9</col_3><col_4><body>82.2</col_4><col_5><body>86.3</col_5></row_10>
<row_11><col_0><row_header>Text</col_0><col_1><body>84-86</col_1><col_2><body>84.6</col_2><col_3><body>85.8</col_3><col_4><body>85.4</col_4><col_5><body>88.1</col_5></row_11>
<row_12><col_0><row_header>Title</col_0><col_1><body>60-72</col_1><col_2><body>76.7</col_2><col_3><body>80.4</col_3><col_4><body>79.9</col_4><col_5><body>82.7</col_5></row_12>
<row_13><col_0><row_header>All</col_0><col_1><body>82-83</col_1><col_2><body>72.4</col_2><col_3><body>73.5</col_3><col_4><body>73.4</col_4><col_5><body>76.8</col_5></row_13>
</table>
<text><location><page_6><loc_9><loc_27><loc_48><loc_53></location>to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity.</text>
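The box-snapping behaviour described in this passage reduces, in essence, to shrinking a user-drawn rectangle to the minimal bounding box of the text-cells it fully encloses. A small sketch of that idea follows; the cell representation and function name are illustrative, not taken from the CCS tool.

```python
# Hedged sketch: shrink a drawn box to the minimum bounding box of the
# text-cells it fully encloses, as in the snapping feature described above.
from typing import List, Tuple

Box = Tuple[float, float, float, float]  # (x0, y0, x1, y1), top-left origin

def snap_to_cells(drawn: Box, cells: List[Box]) -> Box:
    x0, y0, x1, y1 = drawn
    enclosed = [c for c in cells
                if c[0] >= x0 and c[1] >= y0 and c[2] <= x1 and c[3] <= y1]
    if not enclosed:  # nothing to snap to; keep the drawn box
        return drawn
    return (min(c[0] for c in enclosed), min(c[1] for c in enclosed),
            max(c[2] for c in enclosed), max(c[3] for c in enclosed))
```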
<section_header><location><page_6><loc_9><loc_24><loc_24><loc_26></location>5 EXPERIMENTS</section_header>
@@ -130,28 +136,30 @@
<caption><location><page_6><loc_52><loc_57><loc_91><loc_65></location>Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNN network with ResNet50 backbone trained on increasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.</caption>
<figure>
<location><page_6><loc_53><loc_67><loc_90><loc_89></location>
<caption>Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNN network with ResNet50 backbone trained on increasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.</caption>
</figure>
<text><location><page_6><loc_52><loc_49><loc_91><loc_52></location>paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.</text>
<text><location><page_6><loc_52><loc_39><loc_91><loc_49></location>In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are computed by leveraging the evaluation code provided by the COCO API [16].</text>
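For reference, mAP@0.5-0.95 scores of this kind can be computed with the COCO API roughly as follows; the file names below are placeholders, and both ground truth and predictions are assumed to already be in COCO JSON format.

```python
# Minimal sketch of mAP@0.5-0.95 evaluation via pycocotools.
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

coco_gt = COCO("doclaynet_test_gt.json")             # ground-truth annotations
coco_dt = coco_gt.loadRes("model_predictions.json")  # detection results

evaluator = COCOeval(coco_gt, coco_dt, iouType="bbox")
evaluator.evaluate()
evaluator.accumulate()
evaluator.summarize()  # first printed line is mAP averaged over IoU 0.50:0.95
```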
<section_header><location><page_6><loc_52><loc_36><loc_76><loc_37></location>Baselines for Object Detection</section_header>
<text><location><page_6><loc_52><loc_11><loc_91><loc_35></location>In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 × 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . This is not entirely surprising, as Text , Table and Picture are abundant and the most visually distinctive in a document.</text>
<text><location><page_7><loc_9><loc_84><loc_48><loc_89></location>Table 3: Performance of a Mask R-CNN R50 network in mAP@0.5-0.95 scores trained on DocLayNet with different class label sets. The reduced label sets were obtained by either down-mapping or dropping labels.</text>
<caption><location><page_7><loc_13><loc_63><loc_44><loc_81></location>Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wise split for different label sets. Naive page-wise split will result in ~ 10% point improvement.</caption>
<caption><location><page_7><loc_52><loc_84><loc_91><loc_89></location>Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wise split for different label sets. Naive page-wise split will result in ~ 10% point improvement.</caption>
<table>
<location><page_7><loc_13><loc_63><loc_44><loc_81></location>
<row_0><col_0><body>Class-count</col_0><col_1><body>11</col_1><col_2><body>6</col_2><col_3><body>5</col_3><col_4><body>4</col_4></row_0>
<row_1><col_0><body>Caption</col_0><col_1><body>68</col_1><col_2><body>Text</col_2><col_3><body>Text</col_3><col_4><body>Text</col_4></row_1>
<row_2><col_0><body>Footnote</col_0><col_1><body>71</col_1><col_2><body>Text</col_2><col_3><body>Text</col_3><col_4><body>Text</col_4></row_2>
<row_3><col_0><body>Formula</col_0><col_1><body>60</col_1><col_2><body>Text</col_2><col_3><body>Text</col_3><col_4><body>Text</col_4></row_3>
<row_4><col_0><body>List-item</col_0><col_1><body>81</col_1><col_2><body>Text</col_2><col_3><body>82</col_3><col_4><body>Text</col_4></row_4>
<row_5><col_0><body>Page-footer</col_0><col_1><body>62</col_1><col_2><body>62</col_2><col_3><body>-</col_3><col_4><body>-</col_4></row_5>
<row_6><col_0><body>Page-header</col_0><col_1><body>72</col_1><col_2><body>68</col_2><col_3><body>-</col_3><col_4><body>-</col_4></row_6>
<row_7><col_0><body>Picture</col_0><col_1><body>72</col_1><col_2><body>72</col_2><col_3><body>72</col_3><col_4><body>72</col_4></row_7>
<row_8><col_0><body>Section-header</col_0><col_1><body>68</col_1><col_2><body>67</col_2><col_3><body>69</col_3><col_4><body>68</col_4></row_8>
<row_9><col_0><body>Table</col_0><col_1><body>82</col_1><col_2><body>83</col_2><col_3><body>82</col_3><col_4><body>82</col_4></row_9>
<row_10><col_0><body>Text</col_0><col_1><body>85</col_1><col_2><body>84</col_2><col_3><body>84</col_3><col_4><body>84</col_4></row_10>
<row_11><col_0><body>Title</col_0><col_1><body>77</col_1><col_2><body>Sec.-h.</col_2><col_3><body>Sec.-h.</col_3><col_4><body>Sec.-h.</col_4></row_11>
<row_12><col_0><body>Overall</col_0><col_1><body>72</col_1><col_2><body>73</col_2><col_3><body>78</col_3><col_4><body>77</col_4></row_12>
<caption>Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wise split for different label sets. Naive page-wise split will result in ~ 10% point improvement.</caption>
<row_0><col_0><col_header>Class-count</col_0><col_1><col_header>11</col_1><col_2><col_header>6</col_2><col_3><col_header>5</col_3><col_4><col_header>4</col_4></row_0>
<row_1><col_0><row_header>Caption</col_0><col_1><body>68</col_1><col_2><body>Text</col_2><col_3><body>Text</col_3><col_4><body>Text</col_4></row_1>
<row_2><col_0><row_header>Footnote</col_0><col_1><body>71</col_1><col_2><body>Text</col_2><col_3><body>Text</col_3><col_4><body>Text</col_4></row_2>
<row_3><col_0><row_header>Formula</col_0><col_1><body>60</col_1><col_2><body>Text</col_2><col_3><body>Text</col_3><col_4><body>Text</col_4></row_3>
<row_4><col_0><row_header>List-item</col_0><col_1><body>81</col_1><col_2><body>Text</col_2><col_3><body>82</col_3><col_4><body>Text</col_4></row_4>
<row_5><col_0><row_header>Page-footer</col_0><col_1><body>62</col_1><col_2><body>62</col_2><col_3><body>-</col_3><col_4><body>-</col_4></row_5>
<row_6><col_0><row_header>Page-header</col_0><col_1><body>72</col_1><col_2><body>68</col_2><col_3><body>-</col_3><col_4><body>-</col_4></row_6>
<row_7><col_0><row_header>Picture</col_0><col_1><body>72</col_1><col_2><body>72</col_2><col_3><body>72</col_3><col_4><body>72</col_4></row_7>
<row_8><col_0><row_header>Section-header</col_0><col_1><body>68</col_1><col_2><body>67</col_2><col_3><body>69</col_3><col_4><body>68</col_4></row_8>
<row_9><col_0><row_header>Table</col_0><col_1><body>82</col_1><col_2><body>83</col_2><col_3><body>82</col_3><col_4><body>82</col_4></row_9>
<row_10><col_0><row_header>Text</col_0><col_1><body>85</col_1><col_2><body>84</col_2><col_3><body>84</col_3><col_4><body>84</col_4></row_10>
<row_11><col_0><row_header>Title</col_0><col_1><body>77</col_1><col_2><body>Sec.-h.</col_2><col_3><body>Sec.-h.</col_3><col_4><body>Sec.-h.</col_4></row_11>
<row_12><col_0><row_header>Overall</col_0><col_1><body>72</col_1><col_2><body>73</col_2><col_3><body>78</col_3><col_4><body>77</col_4></row_12>
</table>
<section_header><location><page_7><loc_9><loc_58><loc_21><loc_60></location>Learning Curve</section_header>
<text><location><page_7><loc_9><loc_33><loc_48><loc_58></location>One of the fundamental questions related to any dataset is if it is "large enough". To answer this question for DocLayNet, we performed a data ablation study in which we evaluated a Mask R-CNN model trained on increasing fractions of the DocLayNet dataset. As can be seen in Figure 5, the mAP score rises sharply in the beginning and eventually levels out. To estimate the error-bar on the metrics, we ran the training five times on the entire data-set. This resulted in a 1% error-bar, depicted by the shaded area in Figure 5. In the inset of Figure 5, we show the exact same data-points, but with a logarithmic scale on the x-axis. As is expected, the mAP score increases linearly as a function of the data-size in the inset. The curve ultimately flattens out between the 80% and 100% mark, with the 80% mark falling within the error-bars of the 100% mark. This provides a good indication that the model would not improve significantly by yet increasing the data size. Rather, it would probably benefit more from improved data consistency (as discussed in Section 3), data augmentation methods [23], or the addition of more document categories and styles.</text>
@@ -159,44 +167,45 @@
<text><location><page_7><loc_9><loc_11><loc_48><loc_30></location>The choice and number of labels can have a significant effect on the overall model performance. Since PubLayNet, DocBank and DocLayNet all have different label sets, it is of particular interest to understand and quantify this influence of the label set on the model performance. We investigate this by either down-mapping labels into more common ones (e.g. Caption → Text ) or excluding them from the annotations entirely. Furthermore, it must be stressed that all mappings and exclusions were performed on the data before model training. In Table 3, we present the mAP scores for a Mask R-CNN R50 network on different label sets. Where a label is down-mapped, we show its corresponding label, otherwise it was excluded. We present three different label sets, with 6, 5 and 4 different labels respectively. The set of 5 labels contains the same labels as PubLayNet. However, due to the different definition of</text>
<table>
<location><page_7><loc_58><loc_61><loc_85><loc_81></location>
<row_0><col_0><body>Class-count</col_0><col_1><body>11</col_1><col_2><body>11</col_2><col_3><body>5</col_3><col_4><body>5</col_4></row_0>
<row_1><col_0><body>Split</col_0><col_1><body>Doc</col_1><col_2><body>Page</col_2><col_3><body>Doc</col_3><col_4><body>Page</col_4></row_1>
<row_2><col_0><body>Caption</col_0><col_1><body>68</col_1><col_2><body>83</col_2><col_3><body></col_3><col_4><body></col_4></row_2>
<row_3><col_0><body>Footnote</col_0><col_1><body>71</col_1><col_2><body>84</col_2><col_3><body></col_3><col_4><body></col_4></row_3>
<row_4><col_0><body>Formula</col_0><col_1><body>60</col_1><col_2><body>66</col_2><col_3><body></col_3><col_4><body></col_4></row_4>
<row_5><col_0><body>List-item</col_0><col_1><body>81</col_1><col_2><body>88</col_2><col_3><body>82</col_3><col_4><body>88</col_4></row_5>
<row_6><col_0><body>Page-footer</col_0><col_1><body>62</col_1><col_2><body>89</col_2><col_3><body></col_3><col_4><body></col_4></row_6>
<row_7><col_0><body>Page-header</col_0><col_1><body>72</col_1><col_2><body>90</col_2><col_3><body></col_3><col_4><body></col_4></row_7>
<row_8><col_0><body>Picture</col_0><col_1><body>72</col_1><col_2><body>82</col_2><col_3><body>72</col_3><col_4><body>82</col_4></row_8>
<row_9><col_0><body>Section-header</col_0><col_1><body>68</col_1><col_2><body>83</col_2><col_3><body>69</col_3><col_4><body>83</col_4></row_9>
<row_10><col_0><body>Table</col_0><col_1><body>82</col_1><col_2><body>89</col_2><col_3><body>82</col_3><col_4><body>90</col_4></row_10>
<row_11><col_0><body>Text</col_0><col_1><body>85</col_1><col_2><body>91</col_2><col_3><body>84</col_3><col_4><body>90</col_4></row_11>
<row_12><col_0><body>Title</col_0><col_1><body>77</col_1><col_2><body>81</col_2><col_3><body></col_3><col_4><body></col_4></row_12>
<row_13><col_0><body>All</col_0><col_1><body>72</col_1><col_2><body>84</col_2><col_3><body>78</col_3><col_4><body>87</col_4></row_13>
<row_0><col_0><body>Class-count</col_0><col_1><col_header>11</col_1><col_2><col_header>11</col_2><col_3><col_header>5</col_3><col_4><col_header>5</col_4></row_0>
<row_1><col_0><body>Split</col_0><col_1><col_header>Doc</col_1><col_2><col_header>Page</col_2><col_3><col_header>Doc</col_3><col_4><col_header>Page</col_4></row_1>
<row_2><col_0><row_header>Caption</col_0><col_1><body>68</col_1><col_2><body>83</col_2><col_3><body></col_3><col_4><body></col_4></row_2>
<row_3><col_0><row_header>Footnote</col_0><col_1><body>71</col_1><col_2><body>84</col_2><col_3><body></col_3><col_4><body></col_4></row_3>
<row_4><col_0><row_header>Formula</col_0><col_1><body>60</col_1><col_2><body>66</col_2><col_3><body></col_3><col_4><body></col_4></row_4>
<row_5><col_0><row_header>List-item</col_0><col_1><body>81</col_1><col_2><body>88</col_2><col_3><body>82</col_3><col_4><body>88</col_4></row_5>
<row_6><col_0><row_header>Page-footer</col_0><col_1><body>62</col_1><col_2><body>89</col_2><col_3><body></col_3><col_4><body></col_4></row_6>
<row_7><col_0><row_header>Page-header</col_0><col_1><body>72</col_1><col_2><body>90</col_2><col_3><body></col_3><col_4><body></col_4></row_7>
<row_8><col_0><row_header>Picture</col_0><col_1><body>72</col_1><col_2><body>82</col_2><col_3><body>72</col_3><col_4><body>82</col_4></row_8>
<row_9><col_0><row_header>Section-header</col_0><col_1><body>68</col_1><col_2><body>83</col_2><col_3><body>69</col_3><col_4><body>83</col_4></row_9>
<row_10><col_0><row_header>Table</col_0><col_1><body>82</col_1><col_2><body>89</col_2><col_3><body>82</col_3><col_4><body>90</col_4></row_10>
<row_11><col_0><row_header>Text</col_0><col_1><body>85</col_1><col_2><body>91</col_2><col_3><body>84</col_3><col_4><body>90</col_4></row_11>
<row_12><col_0><row_header>Title</col_0><col_1><body>77</col_1><col_2><body>81</col_2><col_3><body></col_3><col_4><body></col_4></row_12>
<row_13><col_0><row_header>All</col_0><col_1><body>72</col_1><col_2><body>84</col_2><col_3><body>78</col_3><col_4><body>87</col_4></row_13>
</table>
<text><location><page_7><loc_52><loc_47><loc_91><loc_58></location>lists in PubLayNet (grouped list-items) versus DocLayNet (separate list-items), the label set of size 4 is the closest to PubLayNet, in the assumption that the List is down-mapped to Text in PubLayNet. The results in Table 3 show that the prediction accuracy on the remaining class labels does not change significantly when other classes are merged into them. The overall macro-average improves by around 5%, in particular when Page-footer and Page-header are excluded.</text>
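The down-mapping and exclusion described here amounts to a simple relabelling pass over the annotations before training. A hedged sketch follows; the mapping illustrates one reduced label set and is not the exact table from the paper.

```python
# Hedged sketch of label down-mapping/exclusion applied before training.
DOWN_MAP = {"Caption": "Text", "Footnote": "Text", "Formula": "Text",
            "List-item": "Text", "Title": "Section-header"}
EXCLUDE = {"Page-footer", "Page-header"}

def remap_labels(annotations):
    """Yield annotations with labels down-mapped or dropped entirely."""
    for ann in annotations:
        label = ann["label"]
        if label in EXCLUDE:
            continue  # excluded from the annotations entirely
        yield {**ann, "label": DOWN_MAP.get(label, label)}

print(list(remap_labels([{"label": "Caption"}, {"label": "Page-footer"}])))
# -> [{'label': 'Text'}]
```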
<section_header><location><page_7><loc_52><loc_44><loc_90><loc_46></location>Impact of Document Split in Train and Test Set</section_header>
<text><location><page_7><loc_52><loc_25><loc_91><loc_44></location>Many documents in DocLayNet have a unique styling. In order to avoid overfitting on a particular style, we have split the train-, test- and validation-sets of DocLayNet on document boundaries, i.e. every document contributes pages to only one set. To the best of our knowledge, this was not considered in PubLayNet or DocBank. To quantify how this affects model performance, we trained and evaluated a Mask R-CNN R50 model on a modified dataset version. Here, the train-, test- and validation-sets were obtained by a randomised draw over the individual pages. As can be seen in Table 4, the difference in model performance is surprisingly large: pagewise splitting gains ˜ 10% in mAP over the document-wise splitting. Thus, random page-wise splitting of DocLayNet can easily lead to accidental overestimation of model performance and should be avoided.</text>
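A document-wise split of this kind can be reproduced, for example, with scikit-learn's GroupShuffleSplit by grouping pages by their source document; the toy data below is illustrative.

```python
# Sketch of a document-wise train/test split: all pages of one document
# land in the same subset, avoiding style leakage across the split.
from sklearn.model_selection import GroupShuffleSplit

pages = [f"page_{i}" for i in range(12)]
doc_ids = [i // 4 for i in range(12)]  # 3 documents, 4 pages each

splitter = GroupShuffleSplit(n_splits=1, test_size=0.33, random_state=0)
train_idx, test_idx = next(splitter.split(pages, groups=doc_ids))
# No document contributes pages to both sets:
assert not {doc_ids[i] for i in train_idx} & {doc_ids[i] for i in test_idx}
```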
<section_header><location><page_7><loc_52><loc_22><loc_68><loc_23></location>Dataset Comparison</section_header>
<text><location><page_7><loc_52><loc_11><loc_91><loc_21></location>Throughout this paper, we claim that DocLayNet's wider variety of document layouts leads to more robust layout detection models. In Table 5, we provide evidence for that. We trained models on each of the available datasets (PubLayNet, DocBank and DocLayNet) and evaluated them on the test sets of the other datasets. Due to the different label sets and annotation styles, a direct comparison is not possible. Hence, we focussed on the common labels among the datasets. Between PubLayNet and DocLayNet, these are Picture ,</text>
<caption><location><page_8><loc_12><loc_57><loc_45><loc_78></location>Table 5: Prediction Performance (mAP@0.5-0.95) of a Mask R-CNN R50 network across the PubLayNet, DocBank & DocLayNet data-sets. By evaluating on common label classes of each dataset, we observe that the DocLayNet-trained model has much less pronounced variations in performance across all datasets.</caption>
<caption><location><page_8><loc_9><loc_81><loc_48><loc_89></location>Table 5: Prediction Performance (mAP@0.5-0.95) of a Mask R-CNN R50 network across the PubLayNet, DocBank & DocLayNet data-sets. By evaluating on common label classes of each dataset, we observe that the DocLayNet-trained model has much less pronounced variations in performance across all datasets.</caption>
<table>
<location><page_8><loc_12><loc_57><loc_45><loc_78></location>
<row_0><col_0><body></col_0><col_1><body></col_1><col_2><body>Testing on</col_2><col_3><body>Testing on</col_3><col_4><body>Testing on</col_4></row_0>
<row_1><col_0><body>Training on</col_0><col_1><body>labels</col_1><col_2><body>PLN</col_2><col_3><body>DB</col_3><col_4><body>DLN</col_4></row_1>
<row_2><col_0><body>PubLayNet (PLN)</col_0><col_1><body>Figure</col_1><col_2><body>96</col_2><col_3><body>43</col_3><col_4><body>23</col_4></row_2>
<row_3><col_0><body>PubLayNet (PLN)</col_0><col_1><body>Sec-header</col_1><col_2><body>87</col_2><col_3><body>-</col_3><col_4><body>32</col_4></row_3>
<row_4><col_0><body>PubLayNet (PLN)</col_0><col_1><body>Table</col_1><col_2><body>95</col_2><col_3><body>24</col_3><col_4><body>49</col_4></row_4>
<row_5><col_0><body>PubLayNet (PLN)</col_0><col_1><body>Text</col_1><col_2><body>96</col_2><col_3><body>-</col_3><col_4><body>42</col_4></row_5>
<row_6><col_0><body>PubLayNet (PLN)</col_0><col_1><body>total</col_1><col_2><body>93</col_2><col_3><body>34</col_3><col_4><body>30</col_4></row_6>
<row_7><col_0><body>DocBank (DB)</col_0><col_1><body>Figure</col_1><col_2><body>77</col_2><col_3><body>71</col_3><col_4><body>31</col_4></row_7>
<row_8><col_0><body>DocBank (DB)</col_0><col_1><body>Table</col_1><col_2><body>19</col_2><col_3><body>65</col_3><col_4><body>22</col_4></row_8>
<row_9><col_0><body>DocBank (DB)</col_0><col_1><body>total</col_1><col_2><body>48</col_2><col_3><body>68</col_3><col_4><body>27</col_4></row_9>
<row_10><col_0><body>DocLayNet (DLN)</col_0><col_1><body>Figure</col_1><col_2><body>67</col_2><col_3><body>51</col_3><col_4><body>72</col_4></row_10>
<row_11><col_0><body>DocLayNet (DLN)</col_0><col_1><body>Sec-header</col_1><col_2><body>53</col_2><col_3><body>-</col_3><col_4><body>68</col_4></row_11>
<row_12><col_0><body>DocLayNet (DLN)</col_0><col_1><body>Table</col_1><col_2><body>87</col_2><col_3><body>43</col_3><col_4><body>82</col_4></row_12>
<row_13><col_0><body>DocLayNet (DLN)</col_0><col_1><body>Text</col_1><col_2><body>77</col_2><col_3><body>-</col_3><col_4><body>84</col_4></row_13>
<row_14><col_0><body>DocLayNet (DLN)</col_0><col_1><body>total</col_1><col_2><body>59</col_2><col_3><body>47</col_3><col_4><body>78</col_4></row_14>
<caption>Table 5: Prediction Performance (mAP@0.5-0.95) of a Mask R-CNN R50 network across the PubLayNet, DocBank & DocLayNet data-sets. By evaluating on common label classes of each dataset, we observe that the DocLayNet-trained model has much less pronounced variations in performance across all datasets.</caption>
<row_0><col_0><body></col_0><col_1><body></col_1><col_2><col_header>Testing on</col_2><col_3><col_header>Testing on</col_3><col_4><col_header>Testing on</col_4></row_0>
<row_1><col_0><col_header>Training on</col_0><col_1><col_header>labels</col_1><col_2><col_header>PLN</col_2><col_3><col_header>DB</col_3><col_4><col_header>DLN</col_4></row_1>
<row_2><col_0><row_header>PubLayNet (PLN)</col_0><col_1><row_header>Figure</col_1><col_2><body>96</col_2><col_3><body>43</col_3><col_4><body>23</col_4></row_2>
<row_3><col_0><row_header>PubLayNet (PLN)</col_0><col_1><row_header>Sec-header</col_1><col_2><body>87</col_2><col_3><body>-</col_3><col_4><body>32</col_4></row_3>
<row_4><col_0><row_header>PubLayNet (PLN)</col_0><col_1><row_header>Table</col_1><col_2><body>95</col_2><col_3><body>24</col_3><col_4><body>49</col_4></row_4>
<row_5><col_0><row_header>PubLayNet (PLN)</col_0><col_1><row_header>Text</col_1><col_2><body>96</col_2><col_3><body>-</col_3><col_4><body>42</col_4></row_5>
<row_6><col_0><row_header>PubLayNet (PLN)</col_0><col_1><row_header>total</col_1><col_2><body>93</col_2><col_3><body>34</col_3><col_4><body>30</col_4></row_6>
<row_7><col_0><row_header>DocBank (DB)</col_0><col_1><row_header>Figure</col_1><col_2><body>77</col_2><col_3><body>71</col_3><col_4><body>31</col_4></row_7>
<row_8><col_0><row_header>DocBank (DB)</col_0><col_1><row_header>Table</col_1><col_2><body>19</col_2><col_3><body>65</col_3><col_4><body>22</col_4></row_8>
<row_9><col_0><row_header>DocBank (DB)</col_0><col_1><row_header>total</col_1><col_2><body>48</col_2><col_3><body>68</col_3><col_4><body>27</col_4></row_9>
<row_10><col_0><row_header>DocLayNet (DLN)</col_0><col_1><row_header>Figure</col_1><col_2><body>67</col_2><col_3><body>51</col_3><col_4><body>72</col_4></row_10>
<row_11><col_0><row_header>DocLayNet (DLN)</col_0><col_1><row_header>Sec-header</col_1><col_2><body>53</col_2><col_3><body>-</col_3><col_4><body>68</col_4></row_11>
<row_12><col_0><row_header>DocLayNet (DLN)</col_0><col_1><row_header>Table</col_1><col_2><body>87</col_2><col_3><body>43</col_3><col_4><body>82</col_4></row_12>
<row_13><col_0><row_header>DocLayNet (DLN)</col_0><col_1><row_header>Text</col_1><col_2><body>77</col_2><col_3><body>-</col_3><col_4><body>84</col_4></row_13>
<row_14><col_0><row_header>DocLayNet (DLN)</col_0><col_1><row_header>total</col_1><col_2><body>59</col_2><col_3><body>47</col_3><col_4><body>78</col_4></row_14>
</table>
<text><location><page_8><loc_9><loc_44><loc_48><loc_51></location>Section-header , Table and Text . Before training, we either mapped or excluded DocLayNet's other labels as specified in table 3, and also PubLayNet's List to Text . Note that the different clustering of lists (by list-element vs. whole list objects) naturally decreases the mAP score for Text .</text>
<text><location><page_8><loc_9><loc_26><loc_48><loc_44></location>For comparison of DocBank with DocLayNet, we trained only on Picture and Table clusters of each dataset. We had to exclude Text because successive paragraphs are often grouped together into a single object in DocBank. This paragraph grouping is incompatible with the individual paragraphs of DocLayNet. As can be seen in Table 5, DocLayNet trained models yield better performance compared to the previous datasets. It is noteworthy that the models trained on PubLayNet and DocBank perform very well on their own test set, but have a much lower performance on the foreign datasets. While this also applies to DocLayNet, the difference is far less pronounced. Thus we conclude that DocLayNet trained models are overall more robust and will produce better results for challenging, unseen layouts.</text>
@@ -223,6 +232,7 @@
<caption><location><page_9><loc_9><loc_43><loc_52><loc_44></location>Text Caption List-Item Formula Table Section-Header Picture Page-Header Page-Footer Title</caption>
<figure>
<location><page_9><loc_9><loc_44><loc_91><loc_89></location>
<caption>Text Caption List-Item Formula Table Section-Header Picture Page-Header Page-Footer Title</caption>
</figure>
<text><location><page_9><loc_9><loc_36><loc_91><loc_41></location>Figure 6: Example layout predictions on selected pages from the DocLayNet test-set. (A, D) exhibit favourable results on coloured backgrounds. (B, C) show accurate list-item and paragraph differentiation despite densely-spaced lines. (E) demonstrates good table and figure distinction. (F) shows predictions on a Chinese patent with multiple overlaps, label confusion and missing boxes.</text>
<text><location><page_9><loc_11><loc_31><loc_48><loc_34></location>Diaconu, Mai Thanh Minh, Marc, albinxavi, fatih, oleg, and wanghao yang. ultralytics/yolov5: v6.0 - yolov5n nano models, roboflow integration, tensorflow export, opencv dnn support, October 2021.</text>

File diff suppressed because one or more lines are too long

View File

@ -26,6 +26,7 @@ KDD '22, August 14-18, 2022, Washington, DC, USA © 2022 Copyright held by the o
Figure 1: Four examples of complex page layouts across different document categories
<!-- image -->
<!-- image -->
@ -88,6 +89,7 @@ In addition to open intellectual property constraints for the source documents,
Figure 2: Distribution of DocLayNet pages across document categories.
<!-- image -->
to a minimum, since they introduce difficulties in annotation (see Section 4). As a second condition, we focussed on medium to large documents ( > 10 pages) with technical content, dense in complex tables, figures, plots and captions. Such documents carry a lot of information value, but are often hard to analyse with high accuracy due to their challenging layouts. Counterexamples of documents not included in the dataset are receipts, invoices, hand-written documents or photographs showing "text in the wild".
@ -110,6 +112,7 @@ The annotation campaign was carried out in four phases. In phase one, we identif
Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row "Total") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.
| | | % of Total | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |
|----------------|---------|--------------|--------------|--------------|--------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|
| class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten |
@ -128,6 +131,7 @@ Table 1: DocLayNet dataset overview. Along with the frequency of each class labe
Figure 3: Corpus Conversion Service annotation user interface. The PDF page is shown in the background, with overlaid text-cells (in darker shades). The annotation boxes can be drawn by dragging a rectangle over each segment with the respective label from the palette on the right.
<!-- image -->
we distributed the annotation workload and performed continuous quality controls. Phases one and two required only a small team of experts. For phases three and four, a group of 40 dedicated annotators was assembled and supervised.
@ -164,6 +168,7 @@ Phase 3: Training. After a first trial with a small group of people, we realised
Figure 4: Examples of plausible annotation alternatives for the same page. Criteria in our annotation guideline can resolve cases A to C, while the case D remains ambiguous.
<!-- image -->
were carried out over a timeframe of 12 weeks, after which 8 of the 40 initially allocated annotators did not pass the bar.
@ -172,6 +177,7 @@ Phase 4: Production annotation. The previously selected 80K pages were annotated
Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.
| | human | MRCNN | MRCNN | FRCNN | YOLO |
|----------------|---------|---------|---------|---------|--------|
| | human | R50 | R101 | R101 | v5x6 |
@ -196,6 +202,7 @@ The primary goal of DocLayNet is to obtain high-quality ML models capable of acc
Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNN network with ResNet50 backbone trained on increasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.
<!-- image -->
paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.
@ -210,6 +217,7 @@ Table 3: Performance of a Mask R-CNN R50 network in mAP@0.5-0.95 scores trained
Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wise split for different label sets. Naive page-wise split will result in ~10% point improvement.
| Class-count | 11 | 6 | 5 | 4 |
|----------------|------|---------|---------|---------|
| Caption | 68 | Text | Text | Text |
@ -261,6 +269,7 @@ Throughout this paper, we claim that DocLayNet's wider variety of document layou
Table 5: Prediction Performance (mAP@0.5-0.95) of a Mask R-CNN R50 network across the PubLayNet, DocBank & DocLayNet data-sets. By evaluating on common label classes of each dataset, we observe that the DocLayNet-trained model has much less pronounced variations in performance across all datasets.
| | | Testing on | Testing on | Testing on |
|-----------------|------------|--------------|--------------|--------------|
| Training on | labels | PLN | DB | DLN |
@ -324,6 +333,7 @@ To date, there is still a significant gap between human and ML accuracy on the l
Text Caption List-Item Formula Table Section-Header Picture Page-Header Page-Footer Title
<!-- image -->
Figure 6: Example layout predictions on selected pages from the DocLayNet test-set. (A, D) exhibit favourable results on coloured backgrounds. (B, C) show accurate list-item and paragraph differentiation despite densely-spaced lines. (E) demonstrates good table and figure distinction. (F) shows predictions on a Chinese patent with multiple overlaps, label confusion and missing boxes.

View File

@ -2,11 +2,12 @@
<text><location><page_1><loc_22><loc_81><loc_79><loc_85></location>order to compute the TED score. Inference timing results for all experiments were obtained from the same machine on a single core with AMD EPYC 7763 CPU @2.45 GHz.</text>
<section_header><location><page_1><loc_22><loc_77><loc_52><loc_79></location>5.1 Hyper Parameter Optimization</section_header>
<text><location><page_1><loc_22><loc_68><loc_79><loc_77></location>We have chosen the PubTabNet data set to perform HPO, since it includes a highly diverse set of tables. We also report TED scores separately for simple and complex tables (tables with cell spans). Results are presented in Table 1. It is evident that with OTSL, our model achieves the same TED score and slightly better mAP scores in comparison to HTML. However, OTSL yields a 2x speed-up in the inference runtime over HTML.</text>
<caption><location><page_1><loc_23><loc_41><loc_78><loc_57></location>Table 1. HPO performed in OTSL and HTML representation on the same transformer-based TableFormer [9] architecture, trained only on PubTabNet [22]. Effects of reducing the # of layers in encoder and decoder stages of the model show that smaller models trained on OTSL perform better, especially in recognizing complex table structures, and maintain a much higher mAP score than the HTML counterpart.</caption>
<caption><location><page_1><loc_22><loc_59><loc_79><loc_66></location>Table 1. HPO performed in OTSL and HTML representation on the same transformer-based TableFormer [9] architecture, trained only on PubTabNet [22]. Effects of reducing the # of layers in encoder and decoder stages of the model show that smaller models trained on OTSL perform better, especially in recognizing complex table structures, and maintain a much higher mAP score than the HTML counterpart.</caption>
<table>
<location><page_1><loc_23><loc_41><loc_78><loc_57></location>
<row_0><col_0><body>#</col_0><col_1><body>#</col_1><col_2><body>Language</col_2><col_3><body>TEDs</col_3><col_4><body>TEDs</col_4><col_5><body>TEDs</col_5><col_6><body>mAP</col_6><col_7><body>Inference</col_7></row_0>
<row_1><col_0><body>enc-layers</col_0><col_1><body>dec-layers</col_1><col_2><body>Language</col_2><col_3><body>simple</col_3><col_4><body>complex</col_4><col_5><body>all</col_5><col_6><body>(0.75)</col_6><col_7><body>time (secs)</col_7></row_1>
<caption>Table 1. HPO performed in OTSL and HTML representation on the same transformer-based TableFormer [9] architecture, trained only on PubTabNet [22]. Effects of reducing the # of layers in encoder and decoder stages of the model show that smaller models trained on OTSL perform better, especially in recognizing complex table structures, and maintain a much higher mAP score than the HTML counterpart.</caption>
<row_0><col_0><col_header>#</col_0><col_1><col_header>#</col_1><col_2><col_header>Language</col_2><col_3><col_header>TEDs</col_3><col_4><col_header>TEDs</col_4><col_5><col_header>TEDs</col_5><col_6><col_header>mAP</col_6><col_7><col_header>Inference</col_7></row_0>
<row_1><col_0><col_header>enc-layers</col_0><col_1><col_header>dec-layers</col_1><col_2><col_header>Language</col_2><col_3><col_header>simple</col_3><col_4><col_header>complex</col_4><col_5><col_header>all</col_5><col_6><col_header>(0.75)</col_6><col_7><col_header>time (secs)</col_7></row_1>
<row_2><col_0><body>6</col_0><col_1><body>6</col_1><col_2><body>OTSL HTML</col_2><col_3><body>0.965 0.969</col_3><col_4><body>0.934 0.927</col_4><col_5><body>0.955 0.955</col_5><col_6><body>0.88 0.857</col_6><col_7><body>2.73 5.39</col_7></row_2>
<row_3><col_0><body>4</col_0><col_1><body>4</col_1><col_2><body>OTSL HTML</col_2><col_3><body>0.938</col_3><col_4><body>0.904</col_4><col_5><body>0.927</col_5><col_6><body>0.853</col_6><col_7><body>1.97</col_7></row_3>
<row_4><col_0><body></col_0><col_1><body></col_1><col_2><body>OTSL</col_2><col_3><body>0.952 0.923</col_3><col_4><body>0.909</col_4><col_5><body>0.938</col_5><col_6><body>0.843</col_6><col_7><body>3.77</col_7></row_4>

File diff suppressed because one or more lines are too long

View File

@ -6,6 +6,7 @@ We have chosen the PubTabNet data set to perform HPO, since it includes a highly
Table 1. HPO performed in OTSL and HTML representation on the same transformer-based TableFormer [9] architecture, trained only on PubTabNet [22]. Effects of reducing the # of layers in encoder and decoder stages of the model show that smaller models trained on OTSL perform better, especially in recognizing complex table structures, and maintain a much higher mAP score than the HTML counterpart.
| # | # | Language | TEDs | TEDs | TEDs | mAP | Inference |
|------------|------------|------------|-------------|-------------|-------------|-------------|-------------|
| enc-layers | dec-layers | Language | simple | complex | all | (0.75) | time (secs) |

View File

@ -10,6 +10,7 @@
<caption><location><page_2><loc_22><loc_75><loc_79><loc_84></location>Fig. 1. Comparison between HTML and OTSL table structure representation: (A) table-example with complex row and column headers, including a 2D empty span, (B) minimal graphical representation of table structure using rectangular layout, (C) HTML representation, (D) OTSL representation. This example demonstrates many of the key features of OTSL, namely its reduced vocabulary size (12 versus 5 in this case), its reduced sequence length (55 versus 30) and an enhanced internal structure (variable token sequence length per row in HTML versus a fixed length of rows in OTSL).</caption>
<figure>
<location><page_2><loc_24><loc_46><loc_76><loc_74></location>
<caption>Fig. 1. Comparison between HTML and OTSL table structure representation: (A) table-example with complex row and column headers, including a 2D empty span, (B) minimal graphical representation of table structure using rectangular layout, (C) HTML representation, (D) OTSL representation. This example demonstrates many of the key features of OTSL, namely its reduced vocabulary size (12 versus 5 in this case), its reduced sequence length (55 versus 30) and an enhanced internal structure (variable token sequence length per row in HTML versus a fixed length of rows in OTSL).</caption>
</figure>
<text><location><page_2><loc_22><loc_34><loc_79><loc_43></location>today, table detection in documents is a well understood problem, and the latest state-of-the-art (SOTA) object detection methods provide an accuracy comparable to human observers [7,8,10,14,23]. On the other hand, the problem of table structure recognition (TSR) is a lot more challenging and remains a very active area of research, in which many novel machine learning algorithms are being explored [3,4,5,9,11,12,13,14,17,18,21,22].</text>
<text><location><page_2><loc_22><loc_16><loc_79><loc_34></location>Recently emerging SOTA methods for table structure recognition employ transformer-based models, in which an image of the table is provided to the network in order to predict the structure of the table as a sequence of tokens. These image-to-sequence (Im2Seq) models are extremely powerful, since they allow for a purely data-driven solution. The tokens of the sequence typically belong to a markup language such as HTML, LaTeX or Markdown, which can describe table structure as rows, columns and spanning cells in various configurations. In Figure 1, we illustrate how HTML is used to represent the table-structure of a particular example table. Public table-structure data sets such as PubTabNet [22], and FinTabNet [21], which were created in a semi-automated way from paired PDF and HTML sources (e.g. PubMed Central), primarily popularized the use of HTML as the ground-truth representation format for TSR.</text>
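To make the sequence-length contrast concrete, here is a minimal sketch (the token lists are illustrative assumptions, not the tokenizer used in the paper) comparing an HTML and an OTSL encoding of the same 2x2 table with one horizontal span:

```python
# Illustrative token sequences for a 2x2 grid whose first row is one merged cell.
# HTML needs opening/closing tags per cell plus row delimiters.
html_tokens = [
    "<tr>", '<td colspan="2">', "</td>", "</tr>",
    "<tr>", "<td>", "</td>", "<td>", "</td>", "</tr>",
]

# OTSL spends exactly one token per grid cell plus a new-line token per row:
# C = new cell, L = merged with the cell to its left, NL = end of row.
otsl_tokens = ["C", "L", "NL", "C", "C", "NL"]

print(len(html_tokens), len(otsl_tokens))  # 10 vs. 6 decoding steps
```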
@ -27,6 +28,7 @@
<caption><location><page_5><loc_24><loc_71><loc_77><loc_72></location>Fig. 2. Frequency of tokens in HTML and OTSL as they appear in PubTabNet.</caption>
<figure>
<location><page_5><loc_22><loc_57><loc_78><loc_71></location>
<caption>Fig. 2. Frequency of tokens in HTML and OTSL as they appear in PubTabNet.</caption>
</figure>
<text><location><page_5><loc_22><loc_33><loc_79><loc_54></location>Obviously, HTML and other general-purpose markup languages were not designed for Im2Seq models. As such, they have some serious drawbacks. First, the token vocabulary needs to be artificially large in order to describe all plausible tabular structures. Since most Im2Seq models use an autoregressive approach, they generate the sequence token by token. Therefore, to reduce inference time, a shorter sequence length is critical. Every table-cell is represented by at least two tokens ( <td> and </td> ). Furthermore, when tokenizing the HTML structure, one needs to explicitly enumerate possible column-spans and row-spans as words. In practice, this ends up requiring 28 different HTML tokens (when including column- and row-spans up to 10 cells) just to describe every table in the PubTabNet dataset. Clearly, not every token is equally represented, as is depicted in Figure 2. This skewed distribution of tokens in combination with variable token row-length makes it challenging for models to learn the HTML structure.</text>
<text><location><page_5><loc_22><loc_27><loc_79><loc_32></location>Additionally, it would be desirable if the representation would easily allow an early detection of invalid sequences on-the-go, before the prediction of the entire table structure is completed. HTML is not well-suited for this purpose as the verification of incomplete sequences is non-trivial or even impossible.</text>
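Because every OTSL token only constrains its left and upper neighbours, a partial sequence can be rejected as soon as one rule is violated. Below is a minimal sketch of such an on-the-go check, assuming the five tokens C, L, U, X and NL and the neighbourhood rules of Section 4.2; the helper is illustrative, not the paper's implementation:

```python
def otsl_prefix_valid(tokens: list[str]) -> bool:
    """Incrementally validate an OTSL prefix; reject at the first violation."""
    rows: list[list[str]] = [[]]
    for t in tokens:
        if t == "NL":                      # end of the current row
            rows.append([])
            continue
        row, col = rows[-1], len(rows[-1])
        left = row[col - 1] if col > 0 else None
        up = rows[-2][col] if len(rows) > 1 and col < len(rows[-2]) else None
        if t == "L" and left not in ("C", "L"):
            return False                   # left-merge needs C or L on its left
        if t == "U" and up not in ("C", "U"):
            return False                   # up-merge needs C or U above it
        if t == "X" and (left not in ("L", "X") or up not in ("U", "X")):
            return False                   # 2D merge needs merges on both sides
        row.append(t)
    return True

print(otsl_prefix_valid(["C", "L", "NL", "C", "C"]))  # True
print(otsl_prefix_valid(["L"]))                       # False: nothing to its left
```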
@ -47,6 +49,7 @@
<caption><location><page_7><loc_22><loc_80><loc_79><loc_84></location>Fig. 3. OTSL description of table structure: A - table example; B - graphical representation of table structure; C - mapping structure on a grid; D - OTSL structure encoding; E - explanation on cell encoding</caption>
<figure>
<location><page_7><loc_27><loc_65><loc_73><loc_79></location>
<caption>Fig. 3. OTSL description of table structure: A - table example; B - graphical representation of table structure; C - mapping structure on a grid; D - OTSL structure encoding; E - explanation on cell encoding</caption>
</figure>
<section_header><location><page_7><loc_22><loc_60><loc_40><loc_62></location>4.2 Language Syntax</section_header>
<text><location><page_7><loc_22><loc_58><loc_59><loc_59></location>The OTSL representation follows these syntax rules:</text>
@ -67,16 +70,18 @@
<caption><location><page_8><loc_22><loc_36><loc_79><loc_39></location>Fig. 4. Architecture sketch of the TableFormer model, which is a representative for the Im2Seq approach.</caption>
<figure>
<location><page_8><loc_23><loc_25><loc_77><loc_36></location>
<caption>Fig. 4. Architecture sketch of the TableFormer model, which is a representative for the Im2Seq approach.</caption>
</figure>
<text><location><page_8><loc_22><loc_16><loc_79><loc_22></location>We rely on standard metrics such as Tree Edit Distance score (TEDs) for table structure prediction, and Mean Average Precision (mAP) with 0.75 Intersection Over Union (IOU) threshold for the bounding-box predictions of table cells. The predicted OTSL structures were converted back to HTML format in</text>
<text><location><page_9><loc_22><loc_81><loc_79><loc_85></location>order to compute the TED score. Inference timing results for all experiments were obtained from the same machine on a single core with AMD EPYC 7763 CPU @2.45 GHz.</text>
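For reference, the mAP metric above counts a predicted cell box as correct only when its IOU with the ground truth reaches the 0.75 threshold; a minimal sketch of the IOU computation for axis-aligned boxes (the (x0, y0, x1, y1) tuple convention is an assumption for illustration):

```python
def iou(a: tuple[float, float, float, float],
        b: tuple[float, float, float, float]) -> float:
    """Intersection-over-union of two axis-aligned boxes (x0, y0, x1, y1)."""
    ix0, iy0 = max(a[0], b[0]), max(a[1], b[1])
    ix1, iy1 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0.0, ix1 - ix0) * max(0.0, iy1 - iy0)
    area_a = (a[2] - a[0]) * (a[3] - a[1])
    area_b = (b[2] - b[0]) * (b[3] - b[1])
    return inter / (area_a + area_b - inter) if inter else 0.0

# A predicted cell box counts as a match only if iou(pred, gt) >= 0.75.
print(iou((0, 0, 10, 10), (0, 0, 10, 8)))  # 0.8 -> accepted at the 0.75 threshold
```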
<section_header><location><page_9><loc_22><loc_77><loc_52><loc_79></location>5.1 Hyper Parameter Optimization</section_header>
<text><location><page_9><loc_22><loc_68><loc_79><loc_77></location>We have chosen the PubTabNet data set to perform HPO, since it includes a highly diverse set of tables. We also report TED scores separately for simple and complex tables (tables with cell spans). Results are presented in Table 1. It is evident that with OTSL, our model achieves the same TED score and slightly better mAP scores in comparison to HTML. However, OTSL yields a 2x speed-up in the inference runtime over HTML.</text>
<caption><location><page_9><loc_23><loc_41><loc_78><loc_57></location>Table 1. HPO performed in OTSL and HTML representation on the same transformer-based TableFormer [9] architecture, trained only on PubTabNet [22]. Effects of reducing the # of layers in encoder and decoder stages of the model show that smaller models trained on OTSL perform better, especially in recognizing complex table structures, and maintain a much higher mAP score than the HTML counterpart.</caption>
<caption><location><page_9><loc_22><loc_59><loc_79><loc_65></location>Table 1. HPO performed in OTSL and HTML representation on the same transformer-based TableFormer [9] architecture, trained only on PubTabNet [22]. Effects of reducing the # of layers in encoder and decoder stages of the model show that smaller models trained on OTSL perform better, especially in recognizing complex table structures, and maintain a much higher mAP score than the HTML counterpart.</caption>
<table>
<location><page_9><loc_23><loc_41><loc_78><loc_57></location>
<row_0><col_0><body>#</col_0><col_1><body>#</col_1><col_2><body>Language</col_2><col_3><body>TEDs</col_3><col_4><body>TEDs</col_4><col_5><body>TEDs</col_5><col_6><body>mAP</col_6><col_7><body>Inference</col_7></row_0>
<row_1><col_0><body>enc-layers</col_0><col_1><body>dec-layers</col_1><col_2><body>Language</col_2><col_3><body>simple</col_3><col_4><body>complex</col_4><col_5><body>all</col_5><col_6><body>(0.75)</col_6><col_7><body>time (secs)</col_7></row_1>
<caption>Table 1. HPO performed in OTSL and HTML representation on the same transformer-based TableFormer [9] architecture, trained only on PubTabNet [22]. Effects of reducing the # of layers in encoder and decoder stages of the model show that smaller models trained on OTSL perform better, especially in recognizing complex table structures, and maintain a much higher mAP score than the HTML counterpart.</caption>
<row_0><col_0><col_header>#</col_0><col_1><col_header>#</col_1><col_2><col_header>Language</col_2><col_3><col_header>TEDs</col_3><col_4><col_header>TEDs</col_4><col_5><col_header>TEDs</col_5><col_6><col_header>mAP</col_6><col_7><col_header>Inference</col_7></row_0>
<row_1><col_0><col_header>enc-layers</col_0><col_1><col_header>dec-layers</col_1><col_2><col_header>Language</col_2><col_3><col_header>simple</col_3><col_4><col_header>complex</col_4><col_5><col_header>all</col_5><col_6><col_header>(0.75)</col_6><col_7><col_header>time (secs)</col_7></row_1>
<row_2><col_0><body>6</col_0><col_1><body>6</col_1><col_2><body>OTSL HTML</col_2><col_3><body>0.965 0.969</col_3><col_4><body>0.934 0.927</col_4><col_5><body>0.955 0.955</col_5><col_6><body>0.88 0.857</col_6><col_7><body>2.73 5.39</col_7></row_2>
<row_3><col_0><body>4</col_0><col_1><body>4</col_1><col_2><body>OTSL HTML</col_2><col_3><body>0.938 0.952</col_3><col_4><body>0.904</col_4><col_5><body>0.927</col_5><col_6><body>0.853</col_6><col_7><body>1.97</col_7></row_3>
<row_4><col_0><body>2</col_0><col_1><body>4</col_1><col_2><body>OTSL</col_2><col_3><body>0.923 0.945</col_3><col_4><body>0.909 0.897</col_4><col_5><body>0.938</col_5><col_6><body>0.843</col_6><col_7><body>3.77</col_7></row_4>
@ -86,29 +91,32 @@
<section_header><location><page_9><loc_22><loc_35><loc_43><loc_36></location>5.2 Quantitative Results</section_header>
<text><location><page_9><loc_22><loc_22><loc_79><loc_34></location>We picked the model parameter configuration that produced the best prediction quality (enc=6, dec=6, heads=8) with PubTabNet alone, then independently trained and evaluated it on three publicly available data sets: PubTabNet (395k samples), FinTabNet (113k samples) and PubTables-1M (about 1M samples). Performance results are presented in Table 2. It is clearly evident that the model trained on OTSL outperforms HTML across the board, keeping high TEDs and mAP scores even on difficult financial tables (FinTabNet) that contain sparse and large tables.</text>
<text><location><page_9><loc_22><loc_16><loc_79><loc_22></location>Additionally, the results show that OTSL has an advantage over HTML when applied on a bigger data set like PubTables-1M and achieves significantly improved scores. Finally, OTSL achieves faster inference due to fewer decoding steps which is a result of the reduced sequence representation.</text>
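Since autoregressive decoding runs one forward pass per generated token, inference time grows roughly linearly with sequence length; a back-of-the-envelope sketch using the 55- versus 30-token example from Figure 1 (illustrative numbers only):

```python
# Decoding cost scales roughly with the number of generated tokens.
html_len, otsl_len = 55, 30  # token counts from the Figure 1 example
print(f"OTSL needs ~{html_len / otsl_len:.2f}x fewer decoding steps")  # ~1.83x
```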
<caption><location><page_10><loc_23><loc_67><loc_77><loc_80></location>Table 2. TSR and cell detection results compared between OTSL and HTML on the PubTabNet [22], FinTabNet [21] and PubTables-1M [14] data sets using TableFormer [9] (with enc=6, dec=6, heads=8).</caption>
<caption><location><page_10><loc_22><loc_82><loc_79><loc_86></location>Table 2. TSR and cell detection results compared between OTSL and HTML on the PubTabNet [22], FinTabNet [21] and PubTables-1M [14] data sets using TableFormer [9] (with enc=6, dec=6, heads=8).</caption>
<table>
<location><page_10><loc_23><loc_67><loc_77><loc_80></location>
<row_0><col_0><body></col_0><col_1><body>Language</col_1><col_2><body>TEDs</col_2><col_3><body>TEDs</col_3><col_4><body>TEDs</col_4><col_5><body>mAP(0.75)</col_5><col_6><body>Inference time (secs)</col_6></row_0>
<row_1><col_0><body></col_0><col_1><body>Language</col_1><col_2><body>simple</col_2><col_3><body>complex</col_3><col_4><body>all</col_4><col_5><body>mAP(0.75)</col_5><col_6><body>Inference time (secs)</col_6></row_1>
<row_2><col_0><body>PubTabNet</col_0><col_1><body>OTSL</col_1><col_2><body>0.965</col_2><col_3><body>0.934</col_3><col_4><body>0.955</col_4><col_5><body>0.88</col_5><col_6><body>2.73</col_6></row_2>
<row_3><col_0><body>PubTabNet</col_0><col_1><body>HTML</col_1><col_2><body>0.969</col_2><col_3><body>0.927</col_3><col_4><body>0.955</col_4><col_5><body>0.857</col_5><col_6><body>5.39</col_6></row_3>
<row_4><col_0><body>FinTabNet</col_0><col_1><body>OTSL</col_1><col_2><body>0.955</col_2><col_3><body>0.961</col_3><col_4><body>0.959</col_4><col_5><body>0.862</col_5><col_6><body>1.85</col_6></row_4>
<row_5><col_0><body>FinTabNet</col_0><col_1><body>HTML</col_1><col_2><body>0.917</col_2><col_3><body>0.922</col_3><col_4><body>0.92</col_4><col_5><body>0.722</col_5><col_6><body>3.26</col_6></row_5>
<row_6><col_0><body>PubTables-1M</col_0><col_1><body>OTSL</col_1><col_2><body>0.987</col_2><col_3><body>0.964</col_3><col_4><body>0.977</col_4><col_5><body>0.896</col_5><col_6><body>1.79</col_6></row_6>
<row_7><col_0><body>PubTables-1M</col_0><col_1><body>HTML</col_1><col_2><body>0.983</col_2><col_3><body>0.944</col_3><col_4><body>0.966</col_4><col_5><body>0.889</col_5><col_6><body>3.26</col_6></row_7>
<caption>Table 2. TSR and cell detection results compared between OTSL and HTML on the PubTabNet [22], FinTabNet [21] and PubTables-1M [14] data sets using TableFormer [9] (with enc=6, dec=6, heads=8).</caption>
<row_0><col_0><body></col_0><col_1><col_header>Language</col_1><col_2><col_header>TEDs</col_2><col_3><col_header>TEDs</col_3><col_4><col_header>TEDs</col_4><col_5><col_header>mAP(0.75)</col_5><col_6><col_header>Inference time (secs)</col_6></row_0>
<row_1><col_0><body></col_0><col_1><col_header>Language</col_1><col_2><col_header>simple</col_2><col_3><col_header>complex</col_3><col_4><col_header>all</col_4><col_5><col_header>mAP(0.75)</col_5><col_6><col_header>Inference time (secs)</col_6></row_1>
<row_2><col_0><row_header>PubTabNet</col_0><col_1><row_header>OTSL</col_1><col_2><body>0.965</col_2><col_3><body>0.934</col_3><col_4><body>0.955</col_4><col_5><body>0.88</col_5><col_6><body>2.73</col_6></row_2>
<row_3><col_0><row_header>PubTabNet</col_0><col_1><row_header>HTML</col_1><col_2><body>0.969</col_2><col_3><body>0.927</col_3><col_4><body>0.955</col_4><col_5><body>0.857</col_5><col_6><body>5.39</col_6></row_3>
<row_4><col_0><row_header>FinTabNet</col_0><col_1><row_header>OTSL</col_1><col_2><body>0.955</col_2><col_3><body>0.961</col_3><col_4><body>0.959</col_4><col_5><body>0.862</col_5><col_6><body>1.85</col_6></row_4>
<row_5><col_0><row_header>FinTabNet</col_0><col_1><row_header>HTML</col_1><col_2><body>0.917</col_2><col_3><body>0.922</col_3><col_4><body>0.92</col_4><col_5><body>0.722</col_5><col_6><body>3.26</col_6></row_5>
<row_6><col_0><row_header>PubTables-1M</col_0><col_1><row_header>OTSL</col_1><col_2><body>0.987</col_2><col_3><body>0.964</col_3><col_4><body>0.977</col_4><col_5><body>0.896</col_5><col_6><body>1.79</col_6></row_6>
<row_7><col_0><row_header>PubTables-1M</col_0><col_1><row_header>HTML</col_1><col_2><body>0.983</col_2><col_3><body>0.944</col_3><col_4><body>0.966</col_4><col_5><body>0.889</col_5><col_6><body>3.26</col_6></row_7>
</table>
<section_header><location><page_10><loc_22><loc_62><loc_42><loc_64></location>5.3 Qualitative Results</section_header>
<text><location><page_10><loc_22><loc_54><loc_79><loc_61></location>To illustrate the qualitative differences between OTSL and HTML, Figure 5 demonstrates less overlap and more accurate bounding boxes with OTSL. In Figure 6, OTSL proves to be more effective in handling tables with longer token sequences, resulting in even more precise structure prediction and bounding boxes.</text>
<caption><location><page_10><loc_22><loc_44><loc_79><loc_50></location>Fig. 5. The OTSL model produces more accurate bounding boxes with less overlap (E) than the HTML model (D), when predicting the structure of a sparse table (A), at twice the inference speed because of shorter sequence length (B),(C). "PMC2807444_006_00.png" PubTabNet.</caption>
<figure>
<location><page_10><loc_27><loc_16><loc_74><loc_44></location>
<caption>Fig. 5. The OTSL model produces more accurate bounding boxes with less overlap (E) than the HTML model (D), when predicting the structure of a sparse table (A), at twice the inference speed because of shorter sequence length (B),(C). "PMC2807444_006_00.png" PubTabNet.</caption>
</figure>
<caption><location><page_11><loc_22><loc_77><loc_79><loc_84></location>Fig. 6. Visualization of predicted structure and detected bounding boxes on a complex table with many rows. The OTSL model (B) captured the repeating pattern of horizontally merged cells from the GT (A), unlike the HTML model (C). The HTML model also didn't complete the HTML sequence correctly and displayed a lot more drift and overlap of bounding boxes. "PMC5406406_003_01.png" PubTabNet.</caption>
<figure>
<location><page_11><loc_28><loc_20><loc_73><loc_77></location>
<caption>Fig. 6. Visualization of predicted structure and detected bounding boxes on a complex table with many rows. The OTSL model (B) captured the repeating pattern of horizontally merged cells from the GT (A), unlike the HTML model (C). The HTML model also didn't complete the HTML sequence correctly and displayed a lot more drift and overlap of bounding boxes. "PMC5406406_003_01.png" PubTabNet.</caption>
</figure>
<section_header><location><page_12><loc_22><loc_84><loc_36><loc_85></location>6 Conclusion</section_header>
<text><location><page_12><loc_22><loc_74><loc_79><loc_82></location>We demonstrated that representing tables in HTML for the task of table structure recognition with Im2Seq models is ill-suited and has serious limitations. Furthermore, we presented in this paper an Optimized Table Structure Language (OTSL) which, when compared to commonly used general purpose languages, has several key benefits.</text>

File diff suppressed because one or more lines are too long

View File

@ -16,6 +16,7 @@ In modern document understanding systems [1,15], table extraction is typically a
Fig. 1. Comparison between HTML and OTSL table structure representation: (A) table-example with complex row and column headers, including a 2D empty span, (B) minimal graphical representation of table structure using rectangular layout, (C) HTML representation, (D) OTSL representation. This example demonstrates many of the key features of OTSL, namely its reduced vocabulary size (12 versus 5 in this case), its reduced sequence length (55 versus 30) and an enhanced internal structure (variable token sequence length per row in HTML versus a fixed length of rows in OTSL).
<!-- image -->
today, table detection in documents is a well understood problem, and the latest state-of-the-art (SOTA) object detection methods provide an accuracy comparable to human observers [7,8,10,14,23]. On the other hand, the problem of table structure recognition (TSR) is a lot more challenging and remains a very active area of research, in which many novel machine learning algorithms are being explored [3,4,5,9,11,12,13,14,17,18,21,22].
@ -46,6 +47,7 @@ ulary and can be interpreted as a table structure. For example, with the HTML to
Fig. 2. Frequency of tokens in HTML and OTSL as they appear in PubTabNet.
<!-- image -->
Obviously, HTML and other general-purpose markup languages were not designed for Im2Seq models. As such, they have some serious drawbacks. First, the token vocabulary needs to be artificially large in order to describe all plausible tabular structures. Since most Im2Seq models use an autoregressive approach, they generate the sequence token by token. Therefore, to reduce inference time, a shorter sequence length is critical. Every table-cell is represented by at least two tokens ( <td> and </td> ). Furthermore, when tokenizing the HTML structure, one needs to explicitly enumerate possible column-spans and row-spans as words. In practice, this ends up requiring 28 different HTML tokens (when including column- and row-spans up to 10 cells) just to describe every table in the PubTabNet dataset. Clearly, not every token is equally represented, as is depicted in Figure 2. This skewed distribution of tokens in combination with variable token row-length makes it challenging for models to learn the HTML structure.
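To make the vocabulary-size argument concrete, here is a rough reconstruction of such a token set (the exact composition is an assumption made for illustration; the paper reports 28 tokens for PubTabNet with spans up to 10 cells):

```python
# Illustrative HTML structure vocabulary with explicit span tokens;
# the exact composition is an assumption, the count is merely indicative.
base = ["<table>", "</table>", "<tr>", "</tr>", "<td>", "</td>"]
spans = [f'<td rowspan="{n}">' for n in range(2, 11)] + \
        [f'<td colspan="{n}">' for n in range(2, 11)]
print(len(base + spans))   # 24 structural tokens in this illustrative set

otsl_vocab = ["C", "L", "U", "X", "NL"]
print(len(otsl_vocab))     # 5 tokens cover the same structures in OTSL
```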
@ -82,6 +84,7 @@ A notable attribute of OTSL is that it has the capability of achieving lossless
Fig. 3. OTSL description of table structure: A - table example; B - graphical representation of table structure; C - mapping structure on a grid; D - OTSL structure encoding; E - explanation on cell encoding
<!-- image -->
## 4.2 Language Syntax
@ -118,6 +121,7 @@ To evaluate the impact of OTSL on prediction accuracy and inference times, we co
Fig. 4. Architecture sketch of the TableFormer model, which is a representative for the Im2Seq approach.
<!-- image -->
We rely on standard metrics such as Tree Edit Distance score (TEDs) for table structure prediction, and Mean Average Precision (mAP) with 0.75 Intersection Over Union (IOU) threshold for the bounding-box predictions of table cells. The predicted OTSL structures were converted back to HTML format in
@ -130,6 +134,7 @@ We have chosen the PubTabNet data set to perform HPO, since it includes a highly
Table 1. HPO performed in OTSL and HTML representation on the same transformer-based TableFormer [9] architecture, trained only on PubTabNet [22]. Effects of reducing the # of layers in encoder and decoder stages of the model show that smaller models trained on OTSL perform better, especially in recognizing complex table structures, and maintain a much higher mAP score than the HTML counterpart.
| # | # | Language | TEDs | TEDs | TEDs | mAP | Inference |
|------------|------------|------------|-------------|-------------|-------------|-------------|-------------|
| enc-layers | dec-layers | Language | simple | complex | all | (0.75) | time (secs) |
@ -147,6 +152,7 @@ Additionally, the results show that OTSL has an advantage over HTML when applied
Table 2. TSR and cell detection results compared between OTSL and HTML on the PubTabNet [22], FinTabNet [21] and PubTables-1M [14] data sets using TableFormer [9] (with enc=6, dec=6, heads=8).
| | Language | TEDs | TEDs | TEDs | mAP(0.75) | Inference time (secs) |
|--------------|------------|--------|---------|--------|-------------|-------------------------|
| | Language | simple | complex | all | mAP(0.75) | Inference time (secs) |
@ -163,6 +169,7 @@ To illustrate the qualitative differences between OTSL and HTML, Figure 5 demons
Fig. 5. The OTSL model produces more accurate bounding boxes with less overlap (E) than the HTML model (D), when predicting the structure of a sparse table (A), at twice the inference speed because of shorter sequence length (B),(C). "PMC2807444_006_00.png" PubTabNet.
<!-- image -->
@ -171,6 +178,7 @@ Fig. 5. The OTSL model produces more accurate bounding boxes with less overlap (
Fig. 6. Visualization of predicted structure and detected bounding boxes on a complex table with many rows. The OTSL model (B) captured the repeating pattern of horizontally merged cells from the GT (A), unlike the HTML model (C). The HTML model also didn't complete the HTML sequence correctly and displayed a lot more drift and overlap of bounding boxes. "PMC5406406_003_01.png" PubTabNet.
<!-- image -->
## 6 Conclusion

View File

@ -68,53 +68,53 @@
</table>
<table>
<location><page_6><loc_22><loc_7><loc_89><loc_91></location>
<row_0><col_0><body>Chapter 4. Implementing Row and Column Access Control: Banking example . . . . .</col_0><col_1><body>37</col_1></row_0>
<row_1><col_0><body>4.1 Business requirements for the RCAC banking scenario . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>38</col_1></row_1>
<row_2><col_0><body>4.2 Description of the users roles and responsibilities</col_0><col_1><body>. . . . . . . . . . . . . . . . . . . . . . . . . . . . 39</col_1></row_2>
<row_3><col_0><body>4.3 Implementation of RCAC . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>42</col_1></row_3>
<row_4><col_0><body>4.3.1 Reviewing the tables that are used in this example</col_0><col_1><body>. . . . . . . . . . . . . . . . . . . . . . . 42</col_1></row_4>
<row_5><col_0><body>4.3.2 Assigning function ID QIBM_DB_SECADM to the Database Engineers group</col_0><col_1><body>. . 47</col_1></row_5>
<row_6><col_0><body>4.3.3 Creating group profiles for the users and their roles . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>50</col_1></row_6>
<row_7><col_0><body>4.3.4 Creating the CUSTOMER_LOGIN_ID global variable</col_0><col_1><body>. . . . . . . . . . . . . . . . . . . . . 52</col_1></row_7>
<row_8><col_0><body>4.3.5 Defining and creating row permissions . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>54</col_1></row_8>
<row_9><col_0><body>4.3.6 Defining and creating column masks</col_0><col_1><body>. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 58</col_1></row_9>
<row_10><col_0><body>4.3.7 Restricting the inserting and updating of masked data . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>60</col_1></row_10>
<row_11><col_0><body>4.3.9 Reviewing row permissions. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>64</col_1></row_11>
<row_12><col_0><body>4.3.10 Demonstrating data access with RCAC . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>66</col_1></row_12>
<row_13><col_0><body>4.3.11 Query implementation with RCAC activated . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>75</col_1></row_13>
<row_14><col_0><body>Chapter 5. RCAC and non-SQL interfaces . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>79</col_1></row_14>
<row_0><col_0><row_header>Chapter 4. Implementing Row and Column Access Control: Banking example . . . . .</col_0><col_1><body>37</col_1></row_0>
<row_1><col_0><row_header>4.1 Business requirements for the RCAC banking scenario . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>38</col_1></row_1>
<row_2><col_0><row_header>4.2 Description of the users roles and responsibilities</col_0><col_1><body>. . . . . . . . . . . . . . . . . . . . . . . . . . . . 39</col_1></row_2>
<row_3><col_0><row_header>4.3 Implementation of RCAC . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>42</col_1></row_3>
<row_4><col_0><row_header>4.3.1 Reviewing the tables that are used in this example</col_0><col_1><body>. . . . . . . . . . . . . . . . . . . . . . . 42</col_1></row_4>
<row_5><col_0><row_header>4.3.2 Assigning function ID QIBM_DB_SECADM to the Database Engineers group</col_0><col_1><body>. . 47</col_1></row_5>
<row_6><col_0><row_header>4.3.3 Creating group profiles for the users and their roles . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>50</col_1></row_6>
<row_7><col_0><row_header>4.3.4 Creating the CUSTOMER_LOGIN_ID global variable</col_0><col_1><body>. . . . . . . . . . . . . . . . . . . . . 52</col_1></row_7>
<row_8><col_0><row_header>4.3.5 Defining and creating row permissions . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>54</col_1></row_8>
<row_9><col_0><row_header>4.3.6 Defining and creating column masks</col_0><col_1><body>. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 58</col_1></row_9>
<row_10><col_0><row_header>4.3.7 Restricting the inserting and updating of masked data . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>60</col_1></row_10>
<row_11><col_0><row_header>4.3.9 Reviewing row permissions. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>64</col_1></row_11>
<row_12><col_0><row_header>4.3.10 Demonstrating data access with RCAC . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>66</col_1></row_12>
<row_13><col_0><row_header>4.3.11 Query implementation with RCAC activated . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>75</col_1></row_13>
<row_14><col_0><row_header>Chapter 5. RCAC and non-SQL interfaces . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>79</col_1></row_14>
<row_15><col_0><body></col_0><col_1><body>Unsupported interfaces . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 80</col_1></row_15>
<row_16><col_0><body>5.3 Accidental updates with masked values . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>81</col_1></row_16>
<row_17><col_0><body>5.4 System CL commands considerations . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>82</col_1></row_17>
<row_18><col_0><body>5.4.1 Create Duplicate Object (CRTDUPOBJ) command . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>82</col_1></row_18>
<row_16><col_0><row_header>5.3 Accidental updates with masked values . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>81</col_1></row_16>
<row_17><col_0><row_header>5.4 System CL commands considerations . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>82</col_1></row_17>
<row_18><col_0><row_header>5.4.1 Create Duplicate Object (CRTDUPOBJ) command . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>82</col_1></row_18>
<row_19><col_0><body></col_0><col_1><body>82</col_1></row_19>
<row_20><col_0><body>5.4.2 Copy File (CPYF) command . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 5.4.3 Copy Library (CPYLIB) command. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>100</col_1></row_20>
<row_20><col_0><row_header>5.4.2 Copy File (CPYF) command . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 5.4.3 Copy Library (CPYLIB) command. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>100</col_1></row_20>
<row_21><col_0><body></col_0><col_1><body>83</col_1></row_21>
<row_22><col_0><body>Chapter 6. Additional considerations . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>. . . . . . . . . . . . . . . . . . . . . . . . 89 90</col_1></row_22>
<row_23><col_0><body>6.2 RCAC effects on data movement . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>6.2 RCAC effects on data movement . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_1></row_23>
<row_24><col_0><body>6.2.1 Effects when RCAC is defined on the source table</col_0><col_1><body>6.2.1 Effects when RCAC is defined on the source table</col_1></row_24>
<row_22><col_0><row_header>Chapter 6. Additional considerations . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>. . . . . . . . . . . . . . . . . . . . . . . . 89 90</col_1></row_22>
<row_23><col_0><row_header>6.2 RCAC effects on data movement . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><row_header>6.2 RCAC effects on data movement . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_1></row_23>
<row_24><col_0><row_header>6.2.1 Effects when RCAC is defined on the source table</col_0><col_1><row_header>6.2.1 Effects when RCAC is defined on the source table</col_1></row_24>
<row_25><col_0><body></col_0><col_1><body>88</col_1></row_25>
<row_26><col_0><body>6.2.3 Effects when RCAC is defined on both source and target tables . . . . . . . . . . . . . 6.3 RCAC effects on joins . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>91</col_1></row_26>
<row_27><col_0><body>6.3.1 Inner joins . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 92</col_0><col_1><body>6.3.1 Inner joins . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 92</col_1></row_27>
<row_28><col_0><body>6.3.2 Outer joins. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 94</col_0><col_1><body>6.3.2 Outer joins. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 94</col_1></row_28>
<row_29><col_0><body>6.3.3 Exception joins . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>96</col_1></row_29>
<row_30><col_0><body>6.4 Monitoring, analyzing, and debugging with RCAC</col_0><col_1><body>97</col_1></row_30>
<row_31><col_0><body>. . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>Query monitoring and analysis tools . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 97</col_1></row_31>
<row_32><col_0><body>6.4.2 Index advisor. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>99</col_1></row_32>
<row_33><col_0><body>6.4.3 Metadata using catalogs . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>6.4.3 Metadata using catalogs . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_1></row_33>
<row_34><col_0><body>6.5 Views, materialized query tables, and query rewrite with RCAC . . . . . . . . . . . . . . . .</col_0><col_1><body>102</col_1></row_34>
<row_35><col_0><body>6.5.2 Materialized query tables . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>103</col_1></row_35>
<row_26><col_0><row_header>6.2.3 Effects when RCAC is defined on both source and target tables . . . . . . . . . . . . . 6.3 RCAC effects on joins . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>91</col_1></row_26>
<row_27><col_0><row_header>6.3.1 Inner joins . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 92</col_0><col_1><row_header>6.3.1 Inner joins . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 92</col_1></row_27>
<row_28><col_0><row_header>6.3.2 Outer joins. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 94</col_0><col_1><row_header>6.3.2 Outer joins. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 94</col_1></row_28>
<row_29><col_0><row_header>6.3.3 Exception joins . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>96</col_1></row_29>
<row_30><col_0><row_header>6.4 Monitoring, analyzing, and debugging with RCAC</col_0><col_1><body>97</col_1></row_30>
<row_31><col_0><row_header>. . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>Query monitoring and analysis tools . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 97</col_1></row_31>
<row_32><col_0><row_header>6.4.2 Index advisor. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>99</col_1></row_32>
<row_33><col_0><row_header>6.4.3 Metadata using catalogs . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><row_header>6.4.3 Metadata using catalogs . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_1></row_33>
<row_34><col_0><row_header>6.5 Views, materialized query tables, and query rewrite with RCAC . . . . . . . . . . . . . . . .</col_0><col_1><body>102</col_1></row_34>
<row_35><col_0><row_header>6.5.2 Materialized query tables . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>103</col_1></row_35>
<row_36><col_0><body></col_0><col_1><body>105</col_1></row_36>
<row_37><col_0><body>6.5.3 Query rewrite . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . RCAC effects on performance and scalability. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>6.5.3 Query rewrite . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . RCAC effects on performance and scalability. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_1></row_37>
<row_38><col_0><body>6.6</col_0><col_1><body>105</col_1></row_38>
<row_37><col_0><row_header>6.5.3 Query rewrite . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . RCAC effects on performance and scalability. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><row_header>6.5.3 Query rewrite . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . RCAC effects on performance and scalability. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_1></row_37>
<row_38><col_0><row_header>6.6</col_0><col_1><body>105</col_1></row_38>
<row_39><col_0><body></col_0><col_1><body>107</col_1></row_39>
<row_40><col_0><body>6.7 Exclusive lock to implement RCAC (availability issues) . . . . . . . . . . . . . . . . . . . . . . . 6.8 Avoiding propagation of masked data . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>108</col_1></row_40>
<row_40><col_0><row_header>6.7 Exclusive lock to implement RCAC (availability issues) . . . . . . . . . . . . . . . . . . . . . . . 6.8 Avoiding propagation of masked data . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>108</col_1></row_40>
<row_41><col_0><body></col_0><col_1><body>108</col_1></row_41>
<row_42><col_0><body></col_0><col_1><body>109</col_1></row_42>
<row_43><col_0><body></col_0><col_1><body>109</col_1></row_43>
<row_44><col_0><body></col_0><col_1><body>113</col_1></row_44>
<row_45><col_0><body></col_0><col_1><body>111</col_1></row_45>
<row_46><col_0><body>Chapter 7. Row and Column Access Control management . . . . . . . . . . . . . . . . . . . .</col_0><col_1><body>Chapter 7. Row and Column Access Control management . . . . . . . . . . . . . . . . . . . .</col_1></row_46>
<row_46><col_0><row_header>Chapter 7. Row and Column Access Control management . . . . . . . . . . . . . . . . . . . .</col_0><col_1><row_header>Chapter 7. Row and Column Access Control management . . . . . . . . . . . . . . . . . . . .</col_1></row_46>
</table>
<table>
<location><page_7><loc_22><loc_61><loc_90><loc_91></location>
@ -325,6 +325,7 @@
<caption><location><page_19><loc_22><loc_10><loc_58><loc_11></location>Figure 1-1 All-or-nothing access to the rows of a table</caption>
<figure>
<location><page_19><loc_22><loc_12><loc_80><loc_37></location>
<caption>Figure 1-1 All-or-nothing access to the rows of a table</caption>
</figure>
<text><location><page_20><loc_22><loc_81><loc_89><loc_91></location>Many businesses are trying to limit data access to a need-to-know basis. This security goal means that users should be given access only to the minimum set of data that is required to perform their job. Often, users with object-level access are given access to row and column values that are beyond what their business task requires because that object-level security provides an all-or-nothing solution. For example, object-level controls allow a manager to access data about all employees. Most security policies limit a manager to accessing data only for the employees that they manage.</text>
<section_header><location><page_20><loc_11><loc_77><loc_49><loc_78></location>1.3.1 Existing row and column control</section_header>
@ -334,6 +335,7 @@
<caption><location><page_20><loc_22><loc_12><loc_52><loc_13></location>Figure 1-2 Existing row and column controls</caption>
<figure>
<location><page_20><loc_22><loc_13><loc_89><loc_53></location>
<caption>Figure 1-2 Existing row and column controls</caption>
</figure>
<section_header><location><page_21><loc_11><loc_89><loc_64><loc_91></location>1.3.2 New controls: Row and Column Access Control</section_header>
<text><location><page_21><loc_22><loc_82><loc_88><loc_88></location>Based on the challenges that are associated with the existing technology available for controlling row and column access at a more granular level, IBM delivered new security support in the IBM i 7.2 release; this support is known as Row and Column Access Control (RCAC).</text>
@ -390,10 +392,11 @@
<text><location><page_26><loc_22><loc_75><loc_72><loc_76></location>CHGFCNUSG FCNID(QIBM_DB_SECADM) USER(HBEDOYA) USAGE(*ALLOWED)</text>
<section_header><location><page_26><loc_10><loc_71><loc_89><loc_72></location>2.1.7 Verifying function usage IDs for RCAC with the FUNCTION_USAGE view</section_header>
<text><location><page_26><loc_22><loc_66><loc_85><loc_69></location>The FUNCTION_USAGE view contains function usage configuration details. Table 2-1 describes the columns in the FUNCTION_USAGE view.</text>
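A quick way to inspect these settings is to query the view directly. The sketch below uses the ibm_db Python driver; the connection string, and qualifying the view with the QSYS2 schema, are assumptions made for illustration:

```python
import ibm_db  # IBM Db2 Python driver; assumed available on the client

# Hypothetical connection string; substitute your own system and credentials.
conn = ibm_db.connect(
    "DATABASE=*LOCAL;HOSTNAME=myibmi;UID=HBEDOYA;PWD=secret;", "", ""
)

# List every usage setting recorded for the QIBM_DB_SECADM function ID.
stmt = ibm_db.exec_immediate(
    conn,
    "SELECT FUNCTION_ID, USER_NAME, USAGE "
    "FROM QSYS2.FUNCTION_USAGE "
    "WHERE FUNCTION_ID = 'QIBM_DB_SECADM'",
)
row = ibm_db.fetch_assoc(stmt)
while row:
    print(row["USER_NAME"], row["USAGE"])  # e.g. HBEDOYA ALLOWED
    row = ibm_db.fetch_assoc(stmt)
```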
<caption><location><page_26><loc_22><loc_44><loc_89><loc_63></location>Table 2-1 FUNCTION_USAGE view</caption>
<caption><location><page_26><loc_22><loc_64><loc_47><loc_65></location>Table 2-1 FUNCTION_USAGE view</caption>
<table>
<location><page_26><loc_22><loc_44><loc_89><loc_63></location>
<row_0><col_0><body>Column name</col_0><col_1><body>Data type</col_1><col_2><body>Description</col_2></row_0>
<caption>Table 2-1 FUNCTION_USAGE view</caption>
<row_0><col_0><col_header>Column name</col_0><col_1><col_header>Data type</col_1><col_2><col_header>Description</col_2></row_0>
<row_1><col_0><body>FUNCTION_ID</col_0><col_1><body>VARCHAR(30)</col_1><col_2><body>ID of the function.</col_2></row_1>
<row_2><col_0><body>USER_NAME</col_0><col_1><body>VARCHAR(10)</col_1><col_2><body>Name of the user profile that has a usage setting for this function.</col_2></row_2>
<row_3><col_0><body>USAGE</col_0><col_1><body>VARCHAR(7)</col_1><col_2><body>Usage setting: • ALLOWED: The user profile is allowed to use the function. • DENIED: The user profile is not allowed to use the function.</col_2></row_3>
@ -410,47 +413,48 @@
<text><location><page_27><loc_22><loc_65><loc_89><loc_69></location>QIBM_DB_SECADM also is responsible for administering RCAC, which restricts which rows a user is allowed to access in a table and whether a user is allowed to see information in certain columns of a table.</text>
<text><location><page_27><loc_22><loc_57><loc_88><loc_63></location>A preferred practice is that the RCAC administrator has the QIBM_DB_SECADM function usage ID, but absolutely no other data privileges. The result is that the RCAC administrator can deploy and maintain the RCAC constructs, but cannot grant themselves unauthorized access to data itself.</text>
<text><location><page_27><loc_22><loc_53><loc_89><loc_56></location>Table 2-2 shows a comparison of the different function usage IDs and *JOBCTL authority to the different CL commands and DB2 for i tools.</text>
<caption><location><page_27><loc_11><loc_9><loc_89><loc_50></location>Table 2-2 Comparison of the different function usage IDs and *JOBCTL authority</caption>
<caption><location><page_27><loc_11><loc_50><loc_64><loc_52></location>Table 2-2 Comparison of the different function usage IDs and *JOBCTL authority</caption>
<table>
<location><page_27><loc_11><loc_9><loc_89><loc_50></location>
<row_0><col_0><body>User action</col_0><col_1><body>*JOBCTL</col_1><col_2><body>QIBM_DB_SECADM</col_2><col_3><body>QIBM_DB_SQLADM</col_3><col_4><body>QIBM_DB_SYSMON</col_4><col_5><body>No Authority</col_5></row_0>
<row_1><col_0><body>SET CURRENT DEGREE (SQL statement)</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body></col_4><col_5><body></col_5></row_1>
<row_2><col_0><body>CHGQRYA command targeting a different user's job</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body></col_4><col_5><body></col_5></row_2>
<row_3><col_0><body>STRDBMON or ENDDBMON commands targeting a different user's job</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body></col_4><col_5><body></col_5></row_3>
<row_4><col_0><body>STRDBMON or ENDDBMON commands targeting a job that matches the current user</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body>X</col_4><col_5><body>X</col_5></row_4>
<row_5><col_0><body>QUSRJOBI() API format 900 or System i Navigator's SQL Details for Job</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body>X</col_4><col_5><body></col_5></row_5>
<row_6><col_0><body>Visual Explain within Run SQL scripts</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body>X</col_4><col_5><body>X</col_5></row_6>
<row_7><col_0><body>Visual Explain outside of Run SQL scripts</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body></col_4><col_5><body></col_5></row_7>
<row_8><col_0><body>ANALYZE PLAN CACHE procedure</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body></col_4><col_5><body></col_5></row_8>
<row_9><col_0><body>DUMP PLAN CACHE procedure</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body></col_4><col_5><body></col_5></row_9>
<row_10><col_0><body>MODIFY PLAN CACHE procedure</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body></col_4><col_5><body></col_5></row_10>
<row_11><col_0><body>MODIFY PLAN CACHE PROPERTIES procedure (currently does not check authority)</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body></col_4><col_5><body></col_5></row_11>
<row_12><col_0><body>CHANGE PLAN CACHE SIZE procedure (currently does not check authority)</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body></col_4><col_5><body></col_5></row_12>
<caption>Table 2-2 Comparison of the different function usage IDs and *JOBCTL authority</caption>
<row_0><col_0><row_header>User action</col_0><col_1><body>*JOBCTL</col_1><col_2><body>QIBM_DB_SECADM</col_2><col_3><body>QIBM_DB_SQLADM</col_3><col_4><body>QIBM_DB_SYSMON</col_4><col_5><body>No Authority</col_5></row_0>
<row_1><col_0><row_header>SET CURRENT DEGREE (SQL statement)</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body></col_4><col_5><body></col_5></row_1>
<row_2><col_0><row_header>CHGQRYA command targeting a different user's job</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body></col_4><col_5><body></col_5></row_2>
<row_3><col_0><row_header>STRDBMON or ENDDBMON commands targeting a different user's job</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body></col_4><col_5><body></col_5></row_3>
<row_4><col_0><row_header>STRDBMON or ENDDBMON commands targeting a job that matches the current user</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body>X</col_4><col_5><body>X</col_5></row_4>
<row_5><col_0><row_header>QUSRJOBI() API format 900 or System i Navigator's SQL Details for Job</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body>X</col_4><col_5><body></col_5></row_5>
<row_6><col_0><row_header>Visual Explain within Run SQL scripts</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body>X</col_4><col_5><body>X</col_5></row_6>
<row_7><col_0><row_header>Visual Explain outside of Run SQL scripts</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body></col_4><col_5><body></col_5></row_7>
<row_8><col_0><row_header>ANALYZE PLAN CACHE procedure</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body></col_4><col_5><body></col_5></row_8>
<row_9><col_0><row_header>DUMP PLAN CACHE procedure</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body></col_4><col_5><body></col_5></row_9>
<row_10><col_0><row_header>MODIFY PLAN CACHE procedure</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body></col_4><col_5><body></col_5></row_10>
<row_11><col_0><row_header>MODIFY PLAN CACHE PROPERTIES procedure (currently does not check authority)</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body></col_4><col_5><body></col_5></row_11>
<row_12><col_0><row_header>CHANGE PLAN CACHE SIZE procedure (currently does not check authority)</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body></col_4><col_5><body></col_5></row_12>
</table>
<table>
<location><page_28><loc_10><loc_28><loc_89><loc_91></location>
<row_0><col_0><body>User action</col_0><col_1><body>*JOBCTL</col_1><col_2><body>QIBM_DB_SECADM</col_2><col_3><body>QIBM_DB_SQLADM</col_3><col_4><body>QIBM_DB_SYSMON</col_4><col_5><body>No Authority</col_5></row_0>
<row_1><col_0><body>START PLAN CACHE EVENT MONITOR procedure</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body></col_4><col_5><body></col_5></row_1>
<row_2><col_0><body>END PLAN CACHE EVENT MONITOR procedure</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body></col_4><col_5><body></col_5></row_2>
<row_3><col_0><body>END ALL PLAN CACHE EVENT MONITORS procedure</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body></col_4><col_5><body></col_5></row_3>
<row_4><col_0><body>Work with RCAC row permissions (Create, modify, or delete)</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_4>
<row_5><col_0><body>Work with RCAC column masks (Create, modify, or delete)</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_5>
<row_6><col_0><body>Change Object Owner ( CHGOBJOWN ) CL command</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_6>
<row_7><col_0><body>Change Object Primary Group ( CHGOBJPGP ) CL command</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_7>
<row_8><col_0><body>Grant Object Authority ( GRTOBJAUT ) CL command</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_8>
<row_9><col_0><body>Revoke Object Authority ( RVKOBJAUT ) CL command</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_9>
<row_10><col_0><body>Edit Object Authority ( EDTOBJAUT ) CL command</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_10>
<row_11><col_0><body>Display Object Authority ( DSPOBJAUT ) CL command</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_11>
<row_12><col_0><body>Work with Objects ( WRKOBJ ) CL command</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_12>
<row_13><col_0><body>Work with Libraries ( WRKLIB ) CL command</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_13>
<row_14><col_0><body>Add Authorization List Entry ( ADDAUTLE ) CL command</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_14>
<row_15><col_0><body>Change Authorization List Entry ( CHGAUTLE ) CL command</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_15>
<row_16><col_0><body>Remove Authorization List Entry ( RMVAUTLE ) CL command</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_16>
<row_17><col_0><body>Retrieve Authorization List Entry ( RTVAUTLE ) CL command</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_17>
<row_18><col_0><body>Display Authorization List ( DSPAUTL ) CL command</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_18>
<row_19><col_0><body>Display Authorization List Objects ( DSPAUTLOBJ ) CL command</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_19>
<row_20><col_0><body>Edit Authorization List ( EDTAUTL ) CL command</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_20>
<row_21><col_0><body>Work with Authorization Lists ( WRKAUTL ) CL command</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_21>
<row_0><col_0><body>User action</col_0><col_1><col_header>*JOBCTL</col_1><col_2><col_header>QIBM_DB_SECADM</col_2><col_3><col_header>QIBM_DB_SQLADM</col_3><col_4><col_header>QIBM_DB_SYSMON</col_4><col_5><col_header>No Authority</col_5></row_0>
<row_1><col_0><row_header>START PLAN CACHE EVENT MONITOR procedure</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body></col_4><col_5><body></col_5></row_1>
<row_2><col_0><row_header>END PLAN CACHE EVENT MONITOR procedure</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body></col_4><col_5><body></col_5></row_2>
<row_3><col_0><row_header>END ALL PLAN CACHE EVENT MONITORS procedure</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body></col_4><col_5><body></col_5></row_3>
<row_4><col_0><row_header>Work with RCAC row permissions (Create, modify, or delete)</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_4>
<row_5><col_0><row_header>Work with RCAC column masks (Create, modify, or delete)</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_5>
<row_6><col_0><row_header>Change Object Owner ( CHGOBJOWN ) CL command</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_6>
<row_7><col_0><row_header>Change Object Primary Group ( CHGOBJPGP ) CL command</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_7>
<row_8><col_0><row_header>Grant Object Authority ( GRTOBJAUT ) CL command</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_8>
<row_9><col_0><row_header>Revoke Object Authority ( RVKOBJAUT ) CL command</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_9>
<row_10><col_0><row_header>Edit Object Authority ( EDTOBJAUT ) CL command</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_10>
<row_11><col_0><row_header>Display Object Authority ( DSPOBJAUT ) CL command</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_11>
<row_12><col_0><row_header>Work with Objects ( WRKOBJ ) CL command</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_12>
<row_13><col_0><row_header>Work with Libraries ( WRKLIB ) CL command</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_13>
<row_14><col_0><row_header>Add Authorization List Entry ( ADDAUTLE ) CL command</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_14>
<row_15><col_0><row_header>Change Authorization List Entry ( CHGAUTLE ) CL command</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_15>
<row_16><col_0><row_header>Remove Authorization List Entry ( RMVAUTLE ) CL command</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_16>
<row_17><col_0><row_header>Retrieve Authorization List Entry ( RTVAUTLE ) CL command</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_17>
<row_18><col_0><row_header>Display Authorization List ( DSPAUTL ) CL command</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_18>
<row_19><col_0><row_header>Display Authorization List Objects ( DSPAUTLOBJ ) CL command</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_19>
<row_20><col_0><row_header>Edit Authorization List ( EDTAUTL ) CL command</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_20>
<row_21><col_0><row_header>Work with Authorization Lists ( WRKAUTL ) CL command</col_0><col_1><body></col_1><col_2><body>X</col_2><col_3><body></col_3><col_4><body></col_4><col_5><body></col_5></row_21>
</table>
<figure>
<location><page_29><loc_5><loc_70><loc_39><loc_91></location>
@ -483,6 +487,7 @@
<caption><location><page_31><loc_22><loc_47><loc_56><loc_48></location>Figure 3-1 CREATE PERMISSION SQL statement</caption>
<figure>
<location><page_31><loc_22><loc_48><loc_89><loc_86></location>
<caption>The SQL CREATE PERMISSION statement that is shown in Figure 3-1 is used to define and initially enable or disable the row access rules. Figure 3-1 CREATE PERMISSION SQL statement</caption>
</figure>
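For readers following along without the figure, a minimal sketch of the statement's general shape, assuming illustrative object names (HR_SCHEMA, EMPLOYEES, the HR group) rather than anything from the book's examples:

```sql
-- Sketch of a row permission per the DB2 for i 7.2 syntax; all object
-- and group names here are illustrative assumptions.
CREATE PERMISSION HR_SCHEMA.PERMISSION_ON_EMPLOYEES
    ON HR_SCHEMA.EMPLOYEES
    FOR ROWS WHERE QSYS2.VERIFY_GROUP_FOR_USER(SESSION_USER, 'HR') = 1
    ENFORCED FOR ALL ACCESS
    ENABLE;
```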
<section_header><location><page_31><loc_22><loc_43><loc_35><loc_45></location>Column mask</section_header>
<text><location><page_31><loc_22><loc_37><loc_89><loc_43></location>A column mask is a database object that manifests a column value access control rule for a specific column in a specific table. It uses a CASE expression that describes what you see when you access the column. For example, a teller can see only the last four digits of a tax identification number.</text>
@ -490,6 +495,7 @@
<caption><location><page_32><loc_22><loc_46><loc_51><loc_47></location>Figure 3-2 CREATE MASK SQL statement</caption>
<figure>
<location><page_32><loc_22><loc_48><loc_89><loc_85></location>
<caption>Figure 3-2 CREATE MASK SQL statement</caption>
</figure>
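A hedged sketch mirroring the teller example from the text (the column name, masked format, and SUBSTR positions are assumptions):

```sql
-- A teller sees only the last four digits of the tax identification
-- number; everyone else sees the full value (illustrative logic only).
CREATE MASK HR_SCHEMA.MASK_TAX_ID_ON_EMPLOYEES
    ON HR_SCHEMA.EMPLOYEES
    FOR COLUMN TAX_ID RETURN
    CASE
        WHEN QSYS2.VERIFY_GROUP_FOR_USER(SESSION_USER, 'TELLER') = 1
            THEN 'XXX-XX-' CONCAT SUBSTR(TAX_ID, 8, 4)
        ELSE TAX_ID
    END
    ENABLE;
```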
<section_header><location><page_32><loc_10><loc_42><loc_47><loc_44></location>3.1.2 Enabling and activating RCAC</section_header>
<text><location><page_32><loc_22><loc_36><loc_89><loc_40></location>You can enable, disable, or regenerate row permissions and column masks by using the SQL ALTER PERMISSION statement and the SQL ALTER MASK statement, as shown in Figure 3-3 on page 17.</text>
@ -498,12 +504,14 @@
<caption><location><page_33><loc_22><loc_55><loc_68><loc_56></location>Figure 3-3 ALTER PERMISSION and ALTER MASK SQL statements</caption>
<figure>
<location><page_33><loc_22><loc_56><loc_83><loc_90></location>
<caption>Figure 3-3 ALTER PERMISSION and ALTER MASK SQL statements</caption>
</figure>
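Textual equivalents of the alterations shown in Figure 3-3, reusing the illustrative object names from the sketches above:

```sql
-- Disable a row permission; regenerate a column mask after a change to
-- an object it references (ENABLE, DISABLE, and REGENERATE are the options).
ALTER PERMISSION HR_SCHEMA.PERMISSION_ON_EMPLOYEES DISABLE;
ALTER MASK HR_SCHEMA.MASK_TAX_ID_ON_EMPLOYEES REGENERATE;
```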
<text><location><page_33><loc_22><loc_45><loc_89><loc_53></location>You can activate and deactivate RCAC for new or existing tables by using the SQL ALTER TABLE statement (Figure 3-4). The ACTIVATE or DEACTIVATE clause must be the only option that is specified in the statement; no other alterations are permitted at the same time. Activating or deactivating effectively turns all RCAC processing for the table on or off. Only enabled row permissions and column masks take effect when RCAC is activated.</text>
<text><location><page_33><loc_23><loc_40><loc_87><loc_43></location>Note: An exclusive lock is required on the table object to perform the alter operation. All open cursors must be closed.</text>
<caption><location><page_33><loc_22><loc_7><loc_50><loc_9></location>Figure 3-4 ALTER TABLE SQL statement</caption>
<figure>
<location><page_33><loc_22><loc_9><loc_84><loc_37></location>
<caption>Figure 3-4 ALTER TABLE SQL statement</caption>
</figure>
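In SQL form, the activation shown in Figure 3-4 amounts to the following (illustrative table name; the ACTIVATE clauses are the only alterations allowed in the statement):

```sql
-- Turn on all RCAC processing for the table; only enabled permissions
-- and masks take effect once this runs.
ALTER TABLE HR_SCHEMA.EMPLOYEES
    ACTIVATE ROW ACCESS CONTROL
    ACTIVATE COLUMN ACCESS CONTROL;
```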
<text><location><page_34><loc_22><loc_81><loc_89><loc_91></location>When row access control is activated on a table, a default permission is established for that table. The name of this permission is QIBM_DEFAULT_<table-name>_<schema-name>. This default permission contains a simple piece of logic (0=1), which is never true. The default permission effectively denies access to every user unless there is a permission defined that allows access explicitly. If row access control is activated on a table, and there is no permission that is defined, no one has permission to any rows. All queries against the table produce an empty set.</text>
<text><location><page_34><loc_22><loc_73><loc_89><loc_79></location>It is possible to define, create, and enable multiple permissions on a table. Logically, all of the permissions are ORed together to form a comprehensive test of the user's ability to access the data. A column can have only one mask that is defined over it. From an implementation standpoint, it does not matter if you create the column masks first or the row permissions first.</text>
@ -519,10 +527,11 @@
<list_item><location><page_34><loc_22><loc_12><loc_89><loc_19></location>GLYPH<SM590000> SYSTEM_USER is the user profile that initiates the connection to the server. It is not used by RCAC, but is included here for completeness. Many jobs, including the QZDASOINIT prestarted jobs, initially connect to the server with a default user profile and then change to use some other user profile. SYSTEM_USER reports this value, typically QUSER for a QZDASOINIT job. It has a data type of VARCHAR(128).</list_item>
<text><location><page_34><loc_22><loc_7><loc_89><loc_10></location>In addition to these four special registers, any of the DB2 special registers can be referenced as part of the rule text.</text>
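A quick, hedged way to inspect the current values of the user-related special registers (SYSIBM.SYSDUMMY1 is the standard one-row dummy table):

```sql
-- Compare the three user special registers for the current connection.
SELECT USER, CURRENT_USER, SYSTEM_USER
    FROM SYSIBM.SYSDUMMY1;
```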
<paragraph><location><page_35><loc_22><loc_90><loc_67><loc_91></location>Table 3-1 summarizes these special registers and their values.</paragraph>
<caption><location><page_35><loc_22><loc_74><loc_89><loc_87></location>Table 3-1 Special registers and their corresponding values</caption>
<caption><location><page_35><loc_22><loc_87><loc_61><loc_88></location>Table 3-1 Special registers and their corresponding values</caption>
<table>
<location><page_35><loc_22><loc_74><loc_89><loc_87></location>
<row_0><col_0><body>Special register</col_0><col_1><body>Corresponding value</col_1></row_0>
<caption>Table 3-1 Special registers and their corresponding values</caption>
<row_0><col_0><col_header>Special register</col_0><col_1><col_header>Corresponding value</col_1></row_0>
<row_1><col_0><body>USER or SESSION_USER</col_0><col_1><body>The effective user of the thread excluding adopted authority.</col_1></row_1>
<row_2><col_0><body>CURRENT_USER</col_0><col_1><body>The effective user of the thread including adopted authority. When no adopted authority is present, this has the same value as USER.</col_1></row_2>
<row_3><col_0><body>SYSTEM_USER</col_0><col_1><body>The authorization ID that initiated the connection.</col_1></row_3>
@ -536,15 +545,17 @@
<caption><location><page_35><loc_22><loc_24><loc_56><loc_25></location>Figure 3-5 Special registers and adopted authority</caption>
<figure>
<location><page_35><loc_22><loc_25><loc_49><loc_51></location>
<caption>Figure 3-5 Special registers and adopted authority</caption>
</figure>
<section_header><location><page_35><loc_10><loc_19><loc_40><loc_21></location>3.2.2 Built-in global variables</section_header>
<text><location><page_35><loc_22><loc_15><loc_85><loc_18></location>Built-in global variables are provided with the database manager and are used in SQL statements to retrieve scalar values that are associated with the variables.</text>
<text><location><page_35><loc_22><loc_9><loc_87><loc_14></location>IBM DB2 for i supports nine different built-in global variables that are read only and maintained by the system. These global variables can be used to identify attributes of the database connection and used as part of the RCAC logic.</text>
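As a sketch, these variables can be read like ordinary expressions; the schema qualifiers shown (QSYS2 for JOB_NAME, SYSIBM for the client variables) are my assumption from the SQL reference:

```sql
-- Read two built-in global variables for the current connection.
SELECT QSYS2.JOB_NAME, SYSIBM.CLIENT_IPADDR
    FROM SYSIBM.SYSDUMMY1;
```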
<text><location><page_36><loc_22><loc_90><loc_56><loc_91></location>Table 3-2 lists the nine built-in global variables.</text>
<caption><location><page_36><loc_10><loc_63><loc_90><loc_87></location>Table 3-2 Built-in global variables</caption>
<caption><location><page_36><loc_11><loc_87><loc_33><loc_88></location>Table 3-2 Built-in global variables</caption>
<table>
<location><page_36><loc_10><loc_63><loc_90><loc_87></location>
<row_0><col_0><body>Global variable</col_0><col_1><body>Type</col_1><col_2><body>Description</col_2></row_0>
<caption>Table 3-2 Built-in global variables</caption>
<row_0><col_0><col_header>Global variable</col_0><col_1><col_header>Type</col_1><col_2><col_header>Description</col_2></row_0>
<row_1><col_0><body>CLIENT_HOST</col_0><col_1><body>VARCHAR(255)</col_1><col_2><body>Host name of the current client as returned by the system</col_2></row_1>
<row_2><col_0><body>CLIENT_IPADDR</col_0><col_1><body>VARCHAR(128)</col_1><col_2><body>IP address of the current client as returned by the system</col_2></row_2>
<row_3><col_0><body>CLIENT_PORT</col_0><col_1><body>INTEGER</col_1><col_2><body>Port used by the current client to communicate with the server</col_2></row_3>
@ -624,6 +635,7 @@
<caption><location><page_40><loc_22><loc_51><loc_45><loc_53></location>Figure 3-7 Number of employees</caption>
<figure>
<location><page_40><loc_22><loc_53><loc_37><loc_57></location>
<caption>Figure 3-7 Number of employees</caption>
</figure>
<list_item><location><page_40><loc_22><loc_46><loc_89><loc_49></location>2. Run a second SQL statement (shown in Example 3-6) that lists the employees. If you have read access to the table, you see all the rows no matter who you are.</list_item>
<paragraph><location><page_40><loc_22><loc_44><loc_60><loc_45></location>Example 3-6 Displaying the information of the Employees</paragraph>
@ -644,6 +656,7 @@
<caption><location><page_42><loc_11><loc_37><loc_54><loc_38></location>Figure 3-9 Row permissions that are shown in System i Navigator</caption>
<figure>
<location><page_42><loc_10><loc_38><loc_89><loc_62></location>
<caption>Figure 3-9 Row permissions that are shown in System i Navigator</caption>
</figure>
<section_header><location><page_42><loc_10><loc_33><loc_53><loc_34></location>3.6.5 Defining and creating column masks</section_header>
<text><location><page_42><loc_22><loc_28><loc_86><loc_31></location>Define the different masks for the columns that are sensitive by completing the following steps:</text>
@ -668,6 +681,7 @@
<caption><location><page_44><loc_10><loc_77><loc_48><loc_78></location>Figure 3-10 Column masks shown in System i Navigator</caption>
<figure>
<location><page_44><loc_10><loc_79><loc_89><loc_88></location>
<caption>Figure 3-10 Column masks shown in System i Navigator</caption>
</figure>
<section_header><location><page_44><loc_11><loc_73><loc_33><loc_75></location>3.6.6 Activating RCAC</section_header>
<text><location><page_44><loc_22><loc_67><loc_89><loc_71></location>Now that you have created the row permission and the two column masks, RCAC must be activated. The row permission and the two column masks are enabled (last clause in the scripts), but now you must activate RCAC on the table. To do so, complete the following steps:</text>
@ -680,11 +694,13 @@
<caption><location><page_44><loc_11><loc_17><loc_57><loc_18></location>Figure 3-11 Selecting the EMPLOYEES table from System i Navigator</caption>
<figure>
<location><page_44><loc_10><loc_18><loc_87><loc_46></location>
<caption>Figure 3-11 Selecting the EMPLOYEES table from System i Navigator</caption>
</figure>
<list_item><location><page_45><loc_22><loc_88><loc_87><loc_91></location>3. The EMPLOYEES table definition is displayed, as shown in Figure 3-12. Note that the Row access control and Column access control options are checked.</list_item>
<caption><location><page_45><loc_22><loc_56><loc_58><loc_57></location>Figure 3-12 RCAC enabled on the EMPLOYEES table</caption>
<figure>
<location><page_45><loc_22><loc_57><loc_89><loc_87></location>
<caption>Figure 3-12 RCAC enabled on the EMPLOYEES table</caption>
</figure>
<section_header><location><page_45><loc_10><loc_52><loc_56><loc_53></location>3.6.7 Demonstrating data access with RCAC</section_header>
<text><location><page_45><loc_22><loc_47><loc_89><loc_50></location>You are now ready to start testing RCAC with the four different users. Complete the following steps:</text>
@ -695,21 +711,25 @@
<caption><location><page_45><loc_22><loc_25><loc_51><loc_26></location>Figure 3-13 Count of EMPLOYEES by HR</caption>
<figure>
<location><page_45><loc_22><loc_26><loc_37><loc_31></location>
<caption>Figure 3-13 Count of EMPLOYEES by HR</caption>
</figure>
<list_item><location><page_45><loc_22><loc_18><loc_88><loc_23></location>3. The result of the same query for a user who is logged on as TQSPENCER (Manager) is shown in Figure 3-14. TQSPENCER has five employees in his department and he can also see his own row, which is why the count is 6.</list_item>
<caption><location><page_45><loc_22><loc_11><loc_56><loc_12></location>Figure 3-14 Count of EMPLOYEES by a manager</caption>
<figure>
<location><page_45><loc_22><loc_12><loc_36><loc_16></location>
<caption>Figure 3-14 Count of EMPLOYEES by a manager</caption>
</figure>
<list_item><location><page_46><loc_22><loc_88><loc_89><loc_91></location>4. The result of the same query that is run by an employee (DSSMITH) gives the result that is shown in Figure 3-15. Each employee can see only his or her own data (row).</list_item>
<caption><location><page_46><loc_22><loc_81><loc_57><loc_82></location>Figure 3-15 Count of EMPLOYEES by an employee</caption>
<figure>
<location><page_46><loc_22><loc_82><loc_37><loc_86></location>
<caption>Figure 3-15 Count of EMPLOYEES by an employee</caption>
</figure>
<list_item><location><page_46><loc_22><loc_74><loc_89><loc_78></location>5. The result of the same query that is run by the Consultant/DBE gives the result that is shown in Figure 3-16. The consultants/DBE can manage and implement RCAC, but they do not see any rows at all.</list_item>
<caption><location><page_46><loc_22><loc_66><loc_56><loc_68></location>Figure 3-16 Count of EMPLOYEES by a consultant</caption>
<figure>
<location><page_46><loc_22><loc_68><loc_37><loc_72></location>
<caption>Figure 3-16 Count of EMPLOYEES by a consultant</caption>
</figure>
<text><location><page_46><loc_25><loc_63><loc_75><loc_64></location>Does the result make sense? Yes, it does because RCAC is enabled.</text>
<list_item><location><page_46><loc_22><loc_58><loc_89><loc_62></location>6. Run queries against the EMPLOYEES table. The query that is used in this example is run with each of the four different user profiles; it is the same query that was run in 3.6.3, "Demonstrating data access without RCAC" on page 24. It is shown in Example 3-12.</list_item>
@ -777,6 +797,7 @@
<caption><location><page_55><loc_22><loc_37><loc_47><loc_38></location>Figure 4-1 Internet banking example</caption>
<figure>
<location><page_55><loc_22><loc_38><loc_81><loc_71></location>
<caption>Figure 4-1 Internet banking example</caption>
</figure>
<section_header><location><page_55><loc_11><loc_32><loc_78><loc_34></location>4.2 Description of the users roles and responsibilities</section_header>
<text><location><page_55><loc_22><loc_27><loc_87><loc_30></location>During the requirements gathering phase, the following groups of users are identified and codified:</text>
@ -790,12 +811,14 @@
<caption><location><page_56><loc_22><loc_47><loc_52><loc_48></location>Figure 4-2 Rules for row and column access</caption>
<figure>
<location><page_56><loc_22><loc_49><loc_86><loc_85></location>
<caption>Figure 4-2 Rules for row and column access</caption>
</figure>
<text><location><page_57><loc_22><loc_88><loc_89><loc_91></location>The chart that is shown in Figure 4-3 shows the column access that is allowed by group and lists the column masks by table.</text>
<caption><location><page_57><loc_22><loc_50><loc_86><loc_86></location>Figure 4-3 Column masks</caption>
<caption><location><page_57><loc_22><loc_48><loc_40><loc_49></location>Figure 4-3 Column masks</caption>
<table>
<location><page_57><loc_22><loc_50><loc_86><loc_86></location>
<row_0><col_0><body></col_0><col_1><body></col_1><col_2><body>CUSTOMERS</col_2><col_3><body>ACCOUNTS</col_3></row_0>
<caption>Figure 4-3 Column masks</caption>
<row_0><col_0><body></col_0><col_1><body></col_1><col_2><col_header>CUSTOMERS</col_2><col_3><col_header>ACCOUNTS</col_3></row_0>
<row_1><col_0><body>SECURITY</col_0><col_1><body>No Rows</col_1><col_2><body>CUSTOMER_DRIVERS_LICENSE_NUMBER CUSTOMER_EMAIL CUSTOMER_LOGIN_ID CUSTOMER_SECURITY_QUESTION CUSTOMER_SECURITY_QUESTION_ANSWER CUSTOMER_TAX_ID</col_2><col_3><body>ACCOUNT_NUMBER</col_3></row_1>
<row_2><col_0><body>DBE</col_0><col_1><body>All Rows</col_1><col_2><body>CUSTOMER_DRIVERS_LICENSE_NUMBER CUSTOMER_EMAIL CUSTOMER_LOGIN_ID CUSTOMER_SECURITY_QUESTION CUSTOMER_SECURITY_QUESTION_ANSWER CUSTOMER_TAX_ID</col_2><col_3><body>ACCOUNT_NUMBER</col_3></row_2>
<row_3><col_0><body>ADMIN</col_0><col_1><body>All Rows</col_1><col_2><body>None</col_2><col_3><body>None</col_3></row_3>
@ -816,6 +839,7 @@
<caption><location><page_58><loc_22><loc_60><loc_53><loc_62></location>Figure 4-4 Data model of the banking scenario</caption>
<figure>
<location><page_58><loc_22><loc_62><loc_74><loc_84></location>
<caption>Figure 4-4 Data model of the banking scenario</caption>
</figure>
<text><location><page_58><loc_22><loc_57><loc_51><loc_58></location>This section covers the following steps:</text>
<list_item><location><page_58><loc_22><loc_55><loc_61><loc_56></location>GLYPH<SM590000> Reviewing the tables that are used in this example</list_item>
@ -834,6 +858,7 @@
<caption><location><page_58><loc_22><loc_19><loc_59><loc_20></location>Figure 4-5 Tables that are used in the banking example</caption>
<figure>
<location><page_58><loc_22><loc_21><loc_68><loc_28></location>
<caption>Figure 4-5 Tables that are used in the banking example</caption>
</figure>
<text><location><page_58><loc_23><loc_11><loc_86><loc_16></location>Note: Appendix A, "Database definitions for the RCAC banking example" on page 121 provides a script that you can use to create all the database definitions or DDLs to re-create this RCAC example.</text>
<text><location><page_59><loc_22><loc_88><loc_86><loc_91></location>To review the attributes of each table that is used in this banking example, complete the following steps:</text>
@ -842,6 +867,7 @@
<caption><location><page_59><loc_22><loc_53><loc_50><loc_54></location>Figure 4-6 CUSTOMERS table attributes</caption>
<figure>
<location><page_59><loc_22><loc_54><loc_68><loc_78></location>
<caption>Figure 4-6 CUSTOMERS table attributes</caption>
</figure>
<list_item><location><page_59><loc_22><loc_48><loc_85><loc_51></location>3. Click the Columns tab to see the columns of the CUSTOMERS table, as shown in Figure 4-7.</list_item>
<paragraph><location><page_59><loc_11><loc_21><loc_48><loc_23></location>Figure 4-7 Column definitions of the CUSTOMERS table</paragraph>
@ -849,11 +875,13 @@
<caption><location><page_60><loc_22><loc_60><loc_65><loc_61></location>Figure 4-8 Reviewing the constraints on the CUSTOMERS table</caption>
<figure>
<location><page_60><loc_22><loc_61><loc_87><loc_83></location>
<caption>Figure 4-8 Reviewing the constraints on the CUSTOMERS table</caption>
</figure>
<list_item><location><page_60><loc_22><loc_55><loc_89><loc_58></location>5. Review the definition of the ACCOUNTS table. The definition of the ACCOUNTS table is shown in Figure 4-9. RCAC has not been defined for this table yet.</list_item>
<caption><location><page_60><loc_22><loc_28><loc_49><loc_29></location>Figure 4-9 ACCOUNTS table attributes</caption>
<figure>
<location><page_60><loc_22><loc_29><loc_75><loc_53></location>
<caption>Figure 4-9 ACCOUNTS table attributes</caption>
</figure>
<list_item><location><page_61><loc_22><loc_88><loc_83><loc_91></location>6. Click the Columns tab to see the columns of the ACCOUNTS table, as shown in Figure 4-10.</list_item>
<paragraph><location><page_61><loc_10><loc_69><loc_48><loc_70></location>Figure 4-10 Column definitions of the ACCOUNTS table</paragraph>
@ -861,11 +889,13 @@
<caption><location><page_61><loc_11><loc_37><loc_53><loc_38></location>Figure 4-11 Reviewing the constraints on the ACCOUNTS table</caption>
<figure>
<location><page_61><loc_10><loc_39><loc_89><loc_59></location>
<caption>Figure 4-11 Reviewing the constraints on the ACCOUNTS table</caption>
</figure>
<list_item><location><page_62><loc_22><loc_88><loc_89><loc_91></location>8. Review the definition of the TRANSACTIONS table. The definition of the TRANSACTIONS table is shown in Figure 4-12. RCAC is not defined for this table yet.</list_item>
<caption><location><page_62><loc_22><loc_61><loc_53><loc_62></location>Figure 4-12 TRANSACTIONS table attributes</caption>
<figure>
<location><page_62><loc_22><loc_63><loc_74><loc_86></location>
<caption>Figure 4-12 TRANSACTIONS table attributes</caption>
</figure>
<list_item><location><page_62><loc_22><loc_56><loc_87><loc_59></location>9. Click the Columns tab to see the columns of the TRANSACTIONS table, as shown in Figure 4-13.</list_item>
<paragraph><location><page_62><loc_22><loc_38><loc_63><loc_39></location>Figure 4-13 Column definitions of the TRANSACTIONS table</paragraph>
@ -873,6 +903,7 @@
<caption><location><page_62><loc_11><loc_7><loc_56><loc_9></location>Figure 4-14 Reviewing the constraints on the TRANSACTIONS table</caption>
<figure>
<location><page_62><loc_10><loc_9><loc_89><loc_28></location>
<caption>Figure 4-14 Reviewing the constraints on the TRANSACTIONS table</caption>
</figure>
<text><location><page_63><loc_22><loc_88><loc_86><loc_91></location>Now that you have reviewed the database model for this example, the following sections describe the steps that are required to implement RCAC in this banking scenario.</text>
<section_header><location><page_63><loc_10><loc_82><loc_87><loc_86></location>4.3.2 Assigning function ID QIBM_DB_SECADM to the Database Engineers group</section_header>
@ -882,31 +913,37 @@
<caption><location><page_63><loc_22><loc_37><loc_48><loc_38></location>Figure 4-15 Application administration</caption>
<figure>
<location><page_63><loc_22><loc_38><loc_56><loc_68></location>
<caption>Figure 4-15 Application administration</caption>
</figure>
<list_item><location><page_64><loc_22><loc_88><loc_87><loc_91></location>2. The Application Administration window opens, as shown in Figure 4-16. Click IBM i → Database and select the function usage ID of Database Security Administrator .</list_item>
<caption><location><page_64><loc_22><loc_49><loc_54><loc_51></location>Figure 4-16 Application administration for IBM i</caption>
<figure>
<location><page_64><loc_22><loc_51><loc_86><loc_86></location>
<caption>Figure 4-16 Application administration for IBM i</caption>
</figure>
<list_item><location><page_64><loc_22><loc_44><loc_89><loc_47></location>3. Click Customize for the function usage ID of Database Security Administrator, as shown in Figure 4-17.</list_item>
<caption><location><page_64><loc_22><loc_21><loc_75><loc_23></location>Figure 4-17 Customizing the Database Security Administrator function usage ID</caption>
<figure>
<location><page_64><loc_22><loc_23><loc_86><loc_43></location>
<caption>Figure 4-17 Customizing the Database Security Administrator function usage ID</caption>
</figure>
<list_item><location><page_65><loc_22><loc_87><loc_89><loc_91></location>4. The Customize Access window opens, as shown in Figure 4-18. Click the users that need to implement RCAC. For this example, HBEDOYA and MCAIN are selected. Click Add and then click OK .</list_item>
<caption><location><page_65><loc_22><loc_48><loc_49><loc_49></location>Figure 4-18 Customize Access window</caption>
<figure>
<location><page_65><loc_22><loc_49><loc_79><loc_85></location>
<caption>Figure 4-18 Customize Access window</caption>
</figure>
<list_item><location><page_65><loc_22><loc_41><loc_88><loc_46></location>5. The Application Administration window opens again. The function usage ID of Database Security Administrator now has an X in the Customized Access column, as shown in Figure 4-19.</list_item>
<caption><location><page_65><loc_22><loc_25><loc_73><loc_26></location>Figure 4-19 Function usage ID Database Security Administrator customized</caption>
<figure>
<location><page_65><loc_22><loc_26><loc_79><loc_41></location>
<caption>Figure 4-19 Function usage ID Database Security Administrator customized</caption>
</figure>
<list_item><location><page_66><loc_22><loc_88><loc_89><loc_91></location>6. Run an SQL query that shows which user profiles are enabled to define RCAC. The SQL query is shown in Figure 4-20.</list_item>
<caption><location><page_66><loc_22><loc_64><loc_72><loc_65></location>Figure 4-20 Query to display user profiles with function usage ID for RCAC</caption>
<figure>
<location><page_66><loc_22><loc_66><loc_73><loc_86></location>
<caption>Figure 4-20 Query to display user profiles with function usage ID for RCAC</caption>
</figure>
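A query of the kind shown in Figure 4-20 can be written against the FUNCTION_USAGE view from 2.1.7 (column names per Table 2-1; the exact query in the figure may differ):

```sql
-- Who is explicitly allowed or denied the RCAC administration function?
SELECT FUNCTION_ID, USER_NAME, USAGE
    FROM QSYS2.FUNCTION_USAGE
    WHERE FUNCTION_ID = 'QIBM_DB_SECADM'
    ORDER BY USER_NAME;
```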
<section_header><location><page_66><loc_10><loc_60><loc_68><loc_62></location>4.3.3 Creating group profiles for the users and their roles</section_header>
<text><location><page_66><loc_22><loc_52><loc_89><loc_58></location>The next step is to create the different group profiles (ADMIN, CUSTOMER, TELLER, and DBE) and assign the different user profiles to the different group profiles. For a description of the different groups and users for this example, see 4.2, "Description of the users roles and responsibilities" on page 39.</text>
@ -915,17 +952,20 @@
<caption><location><page_66><loc_22><loc_24><loc_46><loc_25></location>Figure 4-21 Creating group profiles</caption>
<figure>
<location><page_66><loc_22><loc_26><loc_59><loc_44></location>
<caption>Figure 4-21 Creating group profiles</caption>
</figure>
<list_item><location><page_67><loc_22><loc_87><loc_89><loc_91></location>2. The New Group window opens, as shown in Figure 4-22. For each new group, enter the Group name (ADMIN, CUSTOMER, TELLER, and DBE) and add the user profiles that are associated to this group by selecting the user profile and clicking Add .</list_item>
<text><location><page_67><loc_25><loc_85><loc_79><loc_86></location>Figure 4-22 shows adding user TQSPENCER to the TELLER group profile.</text>
<caption><location><page_67><loc_22><loc_53><loc_58><loc_54></location>Figure 4-22 Creating group profiles and adding users</caption>
<figure>
<location><page_67><loc_22><loc_54><loc_77><loc_83></location>
<caption>Figure 4-22 Creating group profiles and adding users</caption>
</figure>
<list_item><location><page_67><loc_22><loc_48><loc_88><loc_51></location>3. After you create all the group profiles, you should see them listed in System i Navigator under Users and Groups → Groups , as shown in Figure 4-23.</list_item>
<caption><location><page_67><loc_22><loc_29><loc_50><loc_30></location>Figure 4-23 Newly created group profiles</caption>
<figure>
<location><page_67><loc_22><loc_31><loc_44><loc_46></location>
<caption>Figure 4-23 Newly created group profiles</caption>
</figure>
<section_header><location><page_68><loc_10><loc_89><loc_69><loc_91></location>4.3.4 Creating the CUSTOMER_LOGIN_ID global variable</section_header>
<text><location><page_68><loc_22><loc_83><loc_89><loc_88></location>In this step, you create a global variable that is used to capture the Customer_Login_ID information, which is required to validate the permissions. For more information about global variables, see 3.2.2, "Built-in global variables" on page 19.</text>
@ -934,21 +974,25 @@
<caption><location><page_68><loc_22><loc_47><loc_48><loc_49></location>Figure 4-24 Creating a global variable</caption>
<figure>
<location><page_68><loc_22><loc_49><loc_74><loc_75></location>
<caption>Figure 4-24 Creating a global variable</caption>
</figure>
<list_item><location><page_68><loc_22><loc_39><loc_88><loc_45></location>2. The New Global Variable window opens, as shown in Figure 4-25. Enter the global variable name of CUSTOMER_LOGIN_ID, select the data type of VARCHAR, and leave the default value of NULL. This default value ensures that users that do not use the web interface do not have permission to access the data. Click OK .</list_item>
<caption><location><page_68><loc_11><loc_9><loc_57><loc_10></location>Figure 4-25 Creating a global variable called CUSTOMER_LOGIN_ID</caption>
<figure>
<location><page_68><loc_10><loc_11><loc_89><loc_38></location>
<caption>Figure 4-25 Creating a global variable called CUSTOMER_LOGIN_ID</caption>
</figure>
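The SQL equivalent of the GUI step in Figure 4-25 is a single CREATE VARIABLE; the VARCHAR length here is an assumption, but the NULL default is the point the text makes:

```sql
-- Users that do not come through the web interface never set this
-- variable, so it stays NULL and the row permissions deny them access.
CREATE VARIABLE BANK_SCHEMA.CUSTOMER_LOGIN_ID VARCHAR(30) DEFAULT NULL;
```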
<list_item><location><page_69><loc_22><loc_87><loc_89><loc_91></location>3. Now that the global variable is created, assign permissions to the variable so that it can be set by the program. Right-click the CUSTOMER_LOGIN_ID global variable and select Permissions , as shown in Figure 4-26.</list_item>
<caption><location><page_69><loc_22><loc_68><loc_75><loc_69></location>Figure 4-26 Setting permissions on the CUSTOMER_LOGIN_ID global variable</caption>
<figure>
<location><page_69><loc_22><loc_70><loc_57><loc_85></location>
<caption>Figure 4-26 Setting permissions on the CUSTOMER_LOGIN_ID global variable</caption>
</figure>
<list_item><location><page_69><loc_22><loc_63><loc_87><loc_66></location>4. The Permissions window opens, as shown in Figure 4-27. Select Change authority for Webuser so that the application can set this global variable.</list_item>
<caption><location><page_69><loc_22><loc_26><loc_88><loc_27></location>Figure 4-27 Setting change permissions for Webuser on the CUSTOMER_LOGIN_ID global variable</caption>
<figure>
<location><page_69><loc_22><loc_27><loc_89><loc_62></location>
<caption>Figure 4-27 Setting change permissions for Webuser on the CUSTOMER_LOGIN_ID global variable</caption>
</figure>
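One plausible SQL equivalent of granting Webuser the Change authority (a sketch; on IBM i a global variable is a *SRVPGM object, so the GRTOBJAUT CL command with AUT(*CHANGE) is another route to the same result):

```sql
-- Allow the web application's profile to SET the global variable.
GRANT ALTER ON VARIABLE BANK_SCHEMA.CUSTOMER_LOGIN_ID TO WEBUSER;
```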
<section_header><location><page_70><loc_10><loc_89><loc_56><loc_91></location>4.3.5 Defining and creating row permissions</section_header>
<text><location><page_70><loc_22><loc_86><loc_86><loc_88></location>You are now ready to define the row permissions of the tables. Complete the following steps:</text>
@ -956,6 +1000,7 @@
<caption><location><page_70><loc_22><loc_45><loc_51><loc_47></location>Figure 4-28 Selecting new row permissions</caption>
<figure>
<location><page_70><loc_22><loc_47><loc_76><loc_80></location>
<caption>Figure 4-28 Selecting new row permissions</caption>
</figure>
<list_item><location><page_71><loc_22><loc_87><loc_89><loc_91></location>2. The New Row Permission window opens, as shown in Figure 4-29. Enter the information regarding the row permissions on the CUSTOMERS table. This row permission defines what is established in the following policy:</list_item>
<list_item><location><page_71><loc_25><loc_83><loc_89><loc_86></location>-User profiles that belong to DBE, ADMIN, and TELLER group profiles can see all the rows.</list_item>
@ -965,6 +1010,7 @@
<caption><location><page_71><loc_22><loc_32><loc_63><loc_33></location>Figure 4-29 New row permissions on the CUSTOMERS table</caption>
<figure>
<location><page_71><loc_22><loc_34><loc_83><loc_67></location>
<caption>Figure 4-29 New row permissions on the CUSTOMERS table</caption>
</figure>
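A sketch of the permission behind Figure 4-29: the group names come from the policy above, while the CUSTOMER predicate (matching the login ID against the global variable from 4.3.4) is my reading of the rule, not the figure's exact text. The ACCOUNTS and TRANSACTIONS permissions in the next steps follow the same pattern:

```sql
CREATE PERMISSION BANK_SCHEMA.PERMISSION_ON_CUSTOMERS
    ON BANK_SCHEMA.CUSTOMERS
    FOR ROWS WHERE
        -- DBE, ADMIN, and TELLER group profiles see all the rows
        QSYS2.VERIFY_GROUP_FOR_USER(SESSION_USER, 'DBE', 'ADMIN', 'TELLER') = 1
        -- a CUSTOMER sees only the row matching the web login
        OR (QSYS2.VERIFY_GROUP_FOR_USER(SESSION_USER, 'CUSTOMER') = 1
            AND CUSTOMER_LOGIN_ID = BANK_SCHEMA.CUSTOMER_LOGIN_ID)
    ENFORCED FOR ALL ACCESS
    ENABLE;
```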
<list_item><location><page_72><loc_22><loc_85><loc_89><loc_91></location>3. Define the row permissions for the ACCOUNTS table. The New Row Permission window opens, as shown in Figure 4-30. Enter the information regarding the row permissions on the ACCOUNTS table. This row permission defines what is established in the following policy:</list_item>
<list_item><location><page_72><loc_25><loc_82><loc_88><loc_84></location>-User profiles that belong to DBE, ADMIN, and TELLER group profiles can see all the rows.</list_item>
@ -974,6 +1020,7 @@
<caption><location><page_72><loc_11><loc_25><loc_50><loc_26></location>Figure 4-30 New row permissions on the ACCOUNTS table</caption>
<figure>
<location><page_72><loc_10><loc_26><loc_88><loc_66></location>
<caption>Figure 4-30 New row permissions on the ACCOUNTS table</caption>
</figure>
<list_item><location><page_73><loc_22><loc_85><loc_87><loc_91></location>4. Define the row permissions on the TRANSACTIONS table. The New Row Permission window opens, as shown in Figure 4-31. Enter the information regarding the row permissions on the TRANSACTIONS table. This row permission defines what is established in the following policy:</list_item>
<list_item><location><page_73><loc_25><loc_82><loc_89><loc_84></location>-User profiles that belong to DBE, ADMIN, and TELLER group profiles can see all of the rows.</list_item>
@ -984,11 +1031,13 @@
<caption><location><page_73><loc_10><loc_9><loc_53><loc_10></location>Figure 4-31 New row permissions on the TRANSACTIONS table</caption>
<figure>
<location><page_73><loc_10><loc_10><loc_88><loc_56></location>
<caption>Figure 4-31 New row permissions on the TRANSACTIONS table</caption>
</figure>
<list_item><location><page_74><loc_22><loc_87><loc_85><loc_91></location>5. To verify that the row permissions are enabled, from System i Navigator, click Row Permissions , as shown in Figure 4-32. The three row permissions are created and enabled.</list_item>
<caption><location><page_74><loc_11><loc_64><loc_48><loc_65></location>Figure 4-32 List of row permissions on BANK_SCHEMA</caption>
<figure>
<location><page_74><loc_10><loc_66><loc_89><loc_85></location>
<caption>Figure 4-32 List of row permissions on BANK_SCHEMA</caption>
</figure>
<section_header><location><page_74><loc_10><loc_60><loc_53><loc_62></location>4.3.6 Defining and creating column masks</section_header>
<text><location><page_74><loc_22><loc_57><loc_78><loc_58></location>This section defines the masks on the columns. Complete the following steps:</text>
@ -996,6 +1045,7 @@
<caption><location><page_74><loc_22><loc_27><loc_47><loc_28></location>Figure 4-33 Creating a column mask</caption>
<figure>
<location><page_74><loc_22><loc_28><loc_75><loc_50></location>
<caption>Figure 4-33 Creating a column mask</caption>
</figure>
<list_item><location><page_75><loc_22><loc_88><loc_86><loc_91></location>2. In the New Column Mask window, which is shown in Figure 4-34, enter the following information:</list_item>
<list_item><location><page_75><loc_25><loc_86><loc_76><loc_87></location>-Select the CUSTOMERS table on which to create the column mask.</list_item>
@ -1005,6 +1055,7 @@
<caption><location><page_75><loc_22><loc_22><loc_65><loc_24></location>Figure 4-34 Defining a column mask on the CUSTOMERS table</caption>
<figure>
<location><page_75><loc_22><loc_24><loc_87><loc_76></location>
<caption>Figure 4-34 Defining a column mask on the CUSTOMERS table</caption>
</figure>
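A sketch of the kind of mask defined in Figure 4-34 (the ADMIN branch and the '****@****' masked form are assumptions, not the figure's exact rule):

```sql
CREATE MASK BANK_SCHEMA.MASK_EMAIL_ON_CUSTOMERS
    ON BANK_SCHEMA.CUSTOMERS
    FOR COLUMN CUSTOMER_EMAIL RETURN
    CASE
        -- ADMIN users see the real address; everyone else a masked form
        WHEN QSYS2.VERIFY_GROUP_FOR_USER(SESSION_USER, 'ADMIN') = 1
            THEN CUSTOMER_EMAIL
        ELSE '****@****'
    END
    ENABLE;
```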
<list_item><location><page_75><loc_22><loc_19><loc_85><loc_20></location>3. Repeat steps 1 on page 58 and 2 to create column masks for the following columns:</list_item>
<list_item><location><page_75><loc_25><loc_17><loc_62><loc_18></location>-MASK_DRIVERS_LICENSE_ON_CUSTOMERS</list_item>
@ -1017,6 +1068,7 @@
<caption><location><page_76><loc_10><loc_76><loc_47><loc_77></location>Figure 4-35 List of column masks on BANK_SCHEMA</caption>
<figure>
<location><page_76><loc_10><loc_77><loc_89><loc_86></location>
<caption>Figure 4-35 List of column masks on BANK_SCHEMA</caption>
</figure>
<section_header><location><page_76><loc_10><loc_72><loc_71><loc_73></location>4.3.7 Restricting the inserting and updating of masked data</section_header>
<text><location><page_76><loc_22><loc_64><loc_89><loc_70></location>This step defines the check constraints that support the column masks, ensuring that data is not written with a masked value on an INSERT or UPDATE. For more information about the propagation of masked data, see 6.8, "Avoiding propagation of masked data" on page 108.</text>
@ -1025,11 +1077,13 @@
<caption><location><page_76><loc_22><loc_39><loc_55><loc_40></location>Figure 4-36 Definition of the CUSTOMERS table</caption>
<figure>
<location><page_76><loc_22><loc_41><loc_89><loc_54></location>
<caption>Figure 4-36 Definition of the CUSTOMERS table</caption>
</figure>
<list_item><location><page_76><loc_22><loc_34><loc_89><loc_37></location>2. From the CUSTOMERS definition window, click the Check Constraints tab and click Add , as shown in Figure 4-37.</list_item>
<caption><location><page_76><loc_10><loc_23><loc_36><loc_24></location>Figure 4-37 Adding a check constraint</caption>
<figure>
<location><page_76><loc_10><loc_24><loc_89><loc_33></location>
<caption>Figure 4-37 Adding a check constraint</caption>
</figure>
<list_item><location><page_77><loc_22><loc_88><loc_84><loc_91></location>3. The New Check Constraint window opens, as shown in Figure 4-38. Complete the following steps:</list_item>
<list_item><location><page_77><loc_25><loc_86><loc_56><loc_87></location>a. Select the CUSTOMER_EMAIL column.</list_item>
@ -1038,16 +1092,19 @@
<caption><location><page_77><loc_11><loc_16><loc_59><loc_18></location>Figure 4-38 Specifying a new check constraint on the CUSTOMERS table</caption>
<figure>
<location><page_77><loc_10><loc_18><loc_89><loc_79></location>
<caption>Figure 4-38 Specifying a new check constraint on the CUSTOMERS table</caption>
</figure>
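In SQL, the constraint specified in Figure 4-38 could take this shape (a sketch: the masked literal must match the mask's output, and the ON UPDATE VIOLATION PRESERVE clause is, to my understanding, the IBM i 7.2 extension built for exactly this case):

```sql
-- Reject writes of the masked form and quietly preserve the existing
-- value when an update would write it back.
ALTER TABLE BANK_SCHEMA.CUSTOMERS
    ADD CONSTRAINT CHECK_CUSTOMER_EMAIL
    CHECK (CUSTOMER_EMAIL <> '****@****')
    ON UPDATE VIOLATION PRESERVE CUSTOMER_EMAIL;
```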
<list_item><location><page_78><loc_22><loc_88><loc_88><loc_91></location>4. Figure 4-39 shows that there is now a check constraint on the CUSTOMERS table that prevents any masked data from being updated to the CUSTOMER_EMAIL column.</list_item>
<caption><location><page_78><loc_10><loc_50><loc_48><loc_51></location>Figure 4-39 Check constraint on the CUSTOMERS table</caption>
<figure>
<location><page_78><loc_10><loc_52><loc_89><loc_86></location>
<caption>Figure 4-39 Check constraint on the CUSTOMERS table</caption>
</figure>
<list_item><location><page_78><loc_22><loc_44><loc_89><loc_48></location>5. Create all the other check constraints that are associated to each of the masks on the CUSTOMERS table. After this is done, these constraints should look like the ones that are shown in Figure 4-40.</list_item>
<caption><location><page_78><loc_10><loc_23><loc_53><loc_24></location>Figure 4-40 List of check constraints on the CUSTOMERS table</caption>
<figure>
<location><page_78><loc_10><loc_24><loc_89><loc_42></location>
<caption>Figure 4-40 List of check constraints on the CUSTOMERS table</caption>
</figure>
<section_header><location><page_79><loc_10><loc_89><loc_59><loc_91></location>4.3.8 Activating row and column access control</section_header>
<text><location><page_79><loc_22><loc_85><loc_84><loc_88></location>You are now ready to activate RCAC on all three tables in this example. Complete the following steps:</text>
@ -1055,16 +1112,19 @@
<caption><location><page_79><loc_22><loc_56><loc_59><loc_58></location>Figure 4-41 Enabling RCAC on the CUSTOMERS table</caption>
<figure>
<location><page_79><loc_22><loc_58><loc_89><loc_78></location>
<caption>Figure 4-41 Enabling RCAC on the CUSTOMERS table</caption>
</figure>
<list_item><location><page_79><loc_22><loc_50><loc_88><loc_54></location>2. Enable RCAC on the ACCOUNTS table. Right-click the ACCOUNTS table and select Definition . As shown in Figure 4-42, make sure that you select Row access control and Column access control . Click OK .</list_item>
<caption><location><page_79><loc_22><loc_26><loc_52><loc_27></location>Figure 4-42 Enabling RCAC on ACCOUNTS</caption>
<figure>
<location><page_79><loc_22><loc_28><loc_87><loc_48></location>
<caption>Figure 4-42 Enabling RCAC on ACCOUNTS</caption>
</figure>
<list_item><location><page_80><loc_22><loc_87><loc_89><loc_91></location>3. Enable RCAC on the TRANSACTIONS table. Right-click the TRANSACTIONS table and select Definition . As shown in Figure 4-43, make sure that you select Row access control . Click OK .</list_item>
<caption><location><page_80><loc_22><loc_63><loc_55><loc_65></location>Figure 4-43 Enabling RCAC on TRANSACTIONS</caption>
<figure>
<location><page_80><loc_22><loc_65><loc_89><loc_85></location>
<caption>Figure 4-43 Enabling RCAC on TRANSACTIONS</caption>
</figure>
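The SQL equivalents of steps 1 through 3 (Figures 4-41 to 4-43) are three ALTER TABLE statements; note that TRANSACTIONS gets row access control only, as the text specifies:

```sql
ALTER TABLE BANK_SCHEMA.CUSTOMERS
    ACTIVATE ROW ACCESS CONTROL ACTIVATE COLUMN ACCESS CONTROL;
ALTER TABLE BANK_SCHEMA.ACCOUNTS
    ACTIVATE ROW ACCESS CONTROL ACTIVATE COLUMN ACCESS CONTROL;
ALTER TABLE BANK_SCHEMA.TRANSACTIONS
    ACTIVATE ROW ACCESS CONTROL;   -- row access control only
```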
<section_header><location><page_80><loc_10><loc_59><loc_44><loc_61></location>4.3.9 Reviewing row permissions</section_header>
<text><location><page_80><loc_22><loc_55><loc_88><loc_58></location>This section displays all the row permissions after enabling RCAC. Complete the following steps:</text>
@ -1072,16 +1132,19 @@
<caption><location><page_80><loc_11><loc_25><loc_44><loc_26></location>Figure 4-44 Row permissions after enabling RCAC</caption>
<figure>
<location><page_80><loc_10><loc_27><loc_89><loc_48></location>
<caption>Figure 4-44 Row permissions after enabling RCAC</caption>
</figure>
<list_item><location><page_81><loc_22><loc_88><loc_89><loc_91></location>2. Look at one of the row permission definitions by right-clicking it and selecting Definition , as shown in Figure 4-45.</list_item>
<caption><location><page_81><loc_22><loc_68><loc_54><loc_70></location>Figure 4-45 Selecting row permission definition</caption>
<figure>
<location><page_81><loc_22><loc_70><loc_76><loc_86></location>
<caption>Figure 4-45 Selecting row permission definition</caption>
</figure>
<list_item><location><page_81><loc_22><loc_60><loc_89><loc_66></location>3. A window opens, as shown in Figure 4-46. Take note of the nonsensical search condition (0=1) of the QIBM_DEFAULT row permission. This permission is ORed with all of the others; it ensures that if a user meets none of the criteria of the other row permissions, only this always-false condition remains, so access is denied.</list_item>
<caption><location><page_81><loc_11><loc_23><loc_56><loc_24></location>Figure 4-46 Search condition of the QIBM_DEFAULT row permission</caption>
<figure>
<location><page_81><loc_10><loc_25><loc_82><loc_59></location>
<caption>Figure 4-46 Search condition of the QIBM_DEFAULT row permission</caption>
</figure>
<section_header><location><page_82><loc_10><loc_89><loc_57><loc_91></location>4.3.10 Demonstrating data access with RCAC</section_header>
<text><location><page_82><loc_22><loc_85><loc_89><loc_88></location>You are now ready to test the RCAC definitions. Run the following SQL statements with each type of user (DBE, SECURITY, TELLER, ADMIN, and WEBUSER):</text>
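The statements themselves fall outside this hunk; a set consistent with the figures that follow would be (an assumption, not the book's exact text):

```sql
SELECT CURRENT_USER, SYSTEM_USER
    FROM SYSIBM.SYSDUMMY1;                      -- session user (Figure 4-47)
SELECT COUNT(*) FROM BANK_SCHEMA.CUSTOMERS;     -- row count (Figure 4-48)
SELECT * FROM BANK_SCHEMA.CUSTOMERS;            -- masked columns (Figure 4-49)
```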
@ -1099,11 +1162,13 @@
<caption><location><page_82><loc_22><loc_43><loc_43><loc_44></location>Figure 4-47 DBE session user</caption>
<figure>
<location><page_82><loc_22><loc_45><loc_59><loc_57></location>
<caption>Figure 4-47 DBE session user</caption>
</figure>
<list_item><location><page_82><loc_22><loc_40><loc_82><loc_41></location>2. The number of rows that the DBE user MCAIN can see is shown in Figure 4-48.</list_item>
<caption><location><page_82><loc_22><loc_20><loc_74><loc_21></location>Figure 4-48 Number of rows that DBE user can see in the CUSTOMERS table</caption>
<figure>
<location><page_82><loc_22><loc_21><loc_64><loc_38></location>
<caption>Figure 4-48 Number of rows that DBE user can see in the CUSTOMERS table</caption>
</figure>
<list_item><location><page_83><loc_22><loc_87><loc_89><loc_91></location>3. The result of the third SQL statement is shown in Figure 4-49. Note the masked columns. User MCAIN can see all the rows in the CUSTOMERS table, but there are some columns where the result is masked.</list_item>
<paragraph><location><page_83><loc_11><loc_39><loc_62><loc_41></location>Figure 4-49 SQL statement that is run by the DBE user with masked columns</paragraph>
@ -1113,16 +1178,19 @@
<caption><location><page_83><loc_22><loc_12><loc_47><loc_13></location>Figure 4-50 SECURITY session user</caption>
<figure>
<location><page_83><loc_22><loc_13><loc_58><loc_29></location>
<caption>Figure 4-50 SECURITY session user</caption>
</figure>
<list_item><location><page_84><loc_22><loc_88><loc_89><loc_91></location>2. The number of rows in the CUSTOMERS table that the security officer can see is shown in Figure 4-51. The security officer cannot see any data at all.</list_item>
<caption><location><page_84><loc_22><loc_70><loc_80><loc_71></location>Figure 4-51 Number of rows that the security officer can see in the CUSTOMERS table</caption>
<figure>
<location><page_84><loc_22><loc_72><loc_63><loc_86></location>
<caption>Figure 4-51 Number of rows that the security officer can see in the CUSTOMERS table</caption>
</figure>
<list_item><location><page_84><loc_22><loc_65><loc_89><loc_68></location>3. The result of the third SQL statement is shown in Figure 4-52. Note the empty set that is returned to the security officer.</list_item>
<caption><location><page_84><loc_10><loc_43><loc_59><loc_44></location>Figure 4-52 SQL statement that is run by the SECURITY user - no results</caption>
<figure>
<location><page_84><loc_10><loc_45><loc_89><loc_63></location>
<caption>Figure 4-52 SQL statement that is run by the SECURITY user - no results</caption>
</figure>
<section_header><location><page_84><loc_22><loc_40><loc_60><loc_41></location>Data access for TELLER user with RCAC</section_header>
<text><location><page_84><loc_22><loc_38><loc_70><loc_39></location>To test a Teller (TQSPENCER) user, complete the following steps:</text>
@ -1130,16 +1198,19 @@
<caption><location><page_84><loc_22><loc_14><loc_45><loc_15></location>Figure 4-53 TELLER session user</caption>
<figure>
<location><page_84><loc_22><loc_15><loc_59><loc_32></location>
<caption>Figure 4-53 TELLER session user</caption>
</figure>
<list_item><location><page_85><loc_22><loc_88><loc_89><loc_91></location>2. The number of rows in the CUSTOMERS table that the TELLER user can see is shown in Figure 4-54. The TELLER user can see all the rows.</list_item>
<caption><location><page_85><loc_22><loc_71><loc_79><loc_72></location>Figure 4-54 Number of rows that the TELLER user can see in the CUSTOMERS table</caption>
<figure>
<location><page_85><loc_22><loc_72><loc_64><loc_86></location>
<caption>Figure 4-54 Number of rows that the TELLER user can see in the CUSTOMERS table</caption>
</figure>
<list_item><location><page_85><loc_22><loc_64><loc_89><loc_69></location>3. The result of the third SQL statement is shown in Figure 4-55. Note the masked columns. The TELLER user, TQSPENCER, can see all the rows, but there are some columns where the result is masked.</list_item>
<caption><location><page_85><loc_10><loc_14><loc_64><loc_15></location>Figure 4-55 SQL statement that is run by the TELLER user with masked columns</caption>
<figure>
<location><page_85><loc_11><loc_15><loc_90><loc_63></location>
<caption>Figure 4-55 SQL statement that is run by the TELLER user with masked columns</caption>
</figure>
<section_header><location><page_86><loc_22><loc_90><loc_59><loc_91></location>Data access for ADMIN user with RCAC</section_header>
<text><location><page_86><loc_22><loc_88><loc_73><loc_89></location>To test an ADMIN (VGLUCCHESS) user, complete the following steps:</text>
@ -1147,11 +1218,13 @@
<caption><location><page_86><loc_22><loc_70><loc_45><loc_72></location>Figure 4-56 ADMIN session user</caption>
<figure>
<location><page_86><loc_22><loc_72><loc_58><loc_82></location>
<caption>Figure 4-56 ADMIN session user</caption>
</figure>
<list_item><location><page_86><loc_22><loc_66><loc_88><loc_68></location>2. The number of rows that the ADMIN user can see is shown in Figure 4-57. The ADMIN user can see all the rows.</list_item>
<caption><location><page_86><loc_22><loc_52><loc_75><loc_53></location>Figure 4-57 Number of rows that the ADMIN can see in the CUSTOMERS table</caption>
<figure>
<location><page_86><loc_22><loc_53><loc_63><loc_64></location>
<caption>Figure 4-57 Number of rows that the ADMIN can see in the CUSTOMERS table</caption>
</figure>
<list_item><location><page_87><loc_22><loc_88><loc_86><loc_91></location>3. The result of the third SQL statement is shown in Figure 4-58. There are no masked columns.</list_item>
<paragraph><location><page_87><loc_11><loc_38><loc_63><loc_40></location>Figure 4-58 SQL statement that is run by the ADMIN user - no masked columns</paragraph>
@ -1161,26 +1234,31 @@
<caption><location><page_87><loc_22><loc_13><loc_47><loc_15></location>Figure 4-59 WEBUSER session user</caption>
<figure>
<location><page_87><loc_22><loc_15><loc_58><loc_26></location>
<caption>Figure 4-59 WEBUSER session user</caption>
</figure>
<list_item><location><page_88><loc_22><loc_87><loc_89><loc_91></location>2. A global variable (CUSTOMER_LOGIN_ID) is set by the web application and then is used to check the row permissions. Figure 4-60 shows setting the global variable by using the customer login ID.</list_item>
<caption><location><page_88><loc_22><loc_71><loc_64><loc_72></location>Figure 4-60 Setting the global variable CUSTOMER_LOGIN_ID</caption>
<figure>
<location><page_88><loc_22><loc_72><loc_89><loc_85></location>
<caption>Figure 4-60 Setting the global variable CUSTOMER_LOGIN_ID</caption>
</figure>
<list_item><location><page_88><loc_22><loc_66><loc_84><loc_69></location>3. Verify that the global variable was set with the correct value by clicking the Global Variable tab, as shown in Figure 4-61.</list_item>
<caption><location><page_88><loc_22><loc_33><loc_52><loc_34></location>Figure 4-61 Viewing the global variable value</caption>
<figure>
<location><page_88><loc_22><loc_34><loc_65><loc_64></location>
<caption>Figure 4-61 Viewing the global variable value</caption>
</figure>
<list_item><location><page_88><loc_22><loc_28><loc_88><loc_31></location>4. The number of rows that the WEBUSER can see is shown in Figure 4-62. This user can see only the one row that belongs to his web-based user ID.</list_item>
<caption><location><page_88><loc_22><loc_12><loc_77><loc_13></location>Figure 4-62 Number of rows that the WEBUSER can see in the CUSTOMERS table</caption>
<figure>
<location><page_88><loc_22><loc_14><loc_62><loc_26></location>
<caption>Figure 4-62 Number of rows that the WEBUSER can see in the CUSTOMERS table</caption>
</figure>
<list_item><location><page_89><loc_22><loc_88><loc_86><loc_91></location>5. The result of the third SQL statement is shown in Figure 4-63. There are no masked columns, and the user can see only one row, which is the user's own row.</list_item>
<caption><location><page_89><loc_10><loc_67><loc_60><loc_68></location>Figure 4-63 SQL statement that is run by WEBUSER - no masked columns</caption>
<figure>
<location><page_89><loc_10><loc_68><loc_89><loc_86></location>
<caption>Figure 4-63 SQL statement that is run by WEBUSER - no masked columns</caption>
</figure>
<section_header><location><page_89><loc_22><loc_63><loc_62><loc_65></location>Other examples of data access with RCAC</section_header>
<text><location><page_89><loc_22><loc_60><loc_84><loc_63></location>To run an SQL statement that lists all the accounts and current balance by customer, complete the following steps:</text>
@ -1197,21 +1275,25 @@
<caption><location><page_92><loc_22><loc_36><loc_56><loc_37></location>Figure 4-67 Visual Explain with no RCAC enabled</caption>
<figure>
<location><page_92><loc_22><loc_37><loc_65><loc_83></location>
<caption>Figure 4-67 Visual Explain with no RCAC enabled</caption>
</figure>
<list_item><location><page_93><loc_22><loc_87><loc_84><loc_91></location>2. Figure 4-68 shows the Visual Explain of the same SQL statement, but with RCAC enabled. It is clear that the implementation of the SQL statement is more complex because the row permission rule becomes part of the WHERE clause.</list_item>
<caption><location><page_93><loc_22><loc_38><loc_54><loc_39></location>Figure 4-68 Visual Explain with RCAC enabled</caption>
<figure>
<location><page_93><loc_22><loc_40><loc_89><loc_85></location>
<caption>Figure 4-68 Visual Explain with RCAC enabled</caption>
</figure>
<list_item><location><page_93><loc_22><loc_32><loc_89><loc_36></location>3. Compare the advised indexes that are provided by the Optimizer without RCAC and with RCAC enabled. Figure 4-69 shows the index advice for the SQL statement without RCAC enabled. The index being advised is for the ORDER BY clause.</list_item>
<caption><location><page_93><loc_11><loc_15><loc_37><loc_16></location>Figure 4-69 Index advice with no RCAC</caption>
<figure>
<location><page_93><loc_11><loc_16><loc_83><loc_30></location>
<caption>Figure 4-69 Index advice with no RCAC</caption>
</figure>
<list_item><location><page_94><loc_22><loc_87><loc_89><loc_91></location>4. Now, look at the advised indexes with RCAC enabled. As shown in Figure 4-70, there is an additional index being advised, which is basically for the row permission rule. For more information, see 6.4.2, "Index advisor" on page 99.</list_item>
<caption><location><page_94><loc_10><loc_69><loc_41><loc_70></location>Figure 4-70 Index advice with RCAC enabled</caption>
<figure>
<location><page_94><loc_10><loc_70><loc_83><loc_85></location>
<caption>Figure 4-70 Index advice with RCAC enabled</caption>
</figure>
<figure>
<location><page_95><loc_5><loc_70><loc_39><loc_91></location>
@ -1261,6 +1343,7 @@
<caption><location><page_97><loc_22><loc_8><loc_61><loc_9></location>Figure 5-1 Accidental update with masked values scenario</caption>
<figure>
<location><page_97><loc_22><loc_9><loc_86><loc_45></location>
<caption>Figure 5-1 Accidental update with masked values scenario</caption>
</figure>
<text><location><page_98><loc_22><loc_88><loc_89><loc_91></location>Obviously, careful planning and testing should be exercised to avoid accidental updates with masked values.</text>
<text><location><page_98><loc_22><loc_81><loc_89><loc_87></location>DB2 for i also enhanced its check constraint support in the IBM i 7.2 release with a new ON UPDATE clause that allows the existing value to be preserved when a masked value is detected by a check constraint. Details about how to employ this new check constraint support can be found in 6.8.1, "Check constraint solution" on page 108.</text>
@ -1316,7 +1399,7 @@
<section_header><location><page_102><loc_56><loc_61><loc_73><loc_62></location>With RCAC Masking</section_header>
<table>
<location><page_102><loc_23><loc_41><loc_49><loc_60></location>
<row_0><col_0><body>CREDIT CARD NUMBER</col_0><col_1><body>TOTAL</col_1></row_0>
<row_0><col_0><col_header>CREDIT CARD NUMBER</col_0><col_1><col_header>TOTAL</col_1></row_0>
<row_1><col_0><body>3785 0000 0000 1234</col_0><col_1><body>233.50</col_1></row_1>
<row_2><col_0><body>3785 1111 1111 1234</col_0><col_1><body>105.10</col_1></row_2>
<row_3><col_0><body>3785 2222 2222 1234</col_0><col_1><body>300.00</col_1></row_3>
@ -1328,10 +1411,11 @@
<row_9><col_0><body>6011 8888 8888 1234</col_0><col_1><body>750.33</col_1></row_9>
<row_10><col_0><body>6011 9999 9999 0001</col_0><col_1><body>10.00</col_1></row_10>
</table>
<caption><location><page_102><loc_51><loc_41><loc_77><loc_61></location>Figure 6-1 Timing of column masking</caption>
<caption><location><page_102><loc_22><loc_39><loc_47><loc_40></location>Figure 6-1 Timing of column masking</caption>
<table>
<location><page_102><loc_51><loc_41><loc_77><loc_61></location>
<row_0><col_0><body>CREDIT CARD NUMBER</col_0><col_1><body>TOTAL</col_1></row_0>
<caption>Figure 6-1 Timing of column masking</caption>
<row_0><col_0><col_header>CREDIT CARD NUMBER</col_0><col_1><col_header>TOTAL</col_1></row_0>
<row_1><col_0><body>**** **** **** 1234</col_0><col_1><body>233.50</col_1></row_1>
<row_2><col_0><body>**** **** **** 1234</col_0><col_1><body>105.10</col_1></row_2>
<row_3><col_0><body>**** **** **** 1234</col_0><col_1><body>300.00</col_1></row_3>
@ -1349,12 +1433,14 @@
<caption><location><page_103><loc_22><loc_32><loc_63><loc_33></location>Figure 6-2 Masking differences between Fieldproc and RCAC</caption>
<figure>
<location><page_103><loc_22><loc_33><loc_67><loc_68></location>
<caption>Figure 6-2 Masking differences between Fieldproc and RCAC</caption>
</figure>
<section_header><location><page_104><loc_11><loc_89><loc_56><loc_91></location>6.2 RCAC effects on data movement</section_header>
<text><location><page_104><loc_22><loc_80><loc_89><loc_87></location>As described earlier and shown in Figure 6-3, RCAC is applied pervasively regardless of the data access programming interface, SQL statement, or IBM i command. The effects of RCAC on data movement scenarios can be profound and possibly problematic. It is important to understand these effects and make the appropriate adjustments to avoid incorrect results or data loss.</text>
<caption><location><page_104><loc_22><loc_62><loc_48><loc_63></location>Figure 6-3 RCAC and data movement</caption>
<figure>
<location><page_104><loc_22><loc_63><loc_81><loc_78></location>
<caption>Figure 6-3 RCAC and data movement</caption>
</figure>
<text><location><page_104><loc_22><loc_50><loc_89><loc_60></location>The "user" that is running the data movement application or process, whether it be a high availability (HA) scenario, an extract, transform, load (ETL) scenario, or just copying data from one file or table to another one, must have permission to all the source rows without masking, and not be restricted from putting rows into the target. Allowing the data movement application or process to bypass the RCAC rules must be based on a clear and concise understanding of the organization's object security and data access policy. Proper design, implementation, and testing are critical success factors when applying RCAC.</text>
<text><location><page_104><loc_23><loc_40><loc_88><loc_47></location>Important: RCAC is applied to the table or physical file access. It is not applied to the journal receiver access. Any and all database transactions are represented in the journal regardless of RCAC row permissions and column masks. This makes it essential that IBM i security is used to ensure that only authorized personnel have access to the journaled data.</text>
@ -1370,6 +1456,7 @@
<caption><location><page_105><loc_22><loc_47><loc_62><loc_49></location>Figure 6-4 RCAC effects on data movement from SOURCE</caption>
<figure>
<location><page_105><loc_22><loc_49><loc_80><loc_80></location>
<caption>Figure 6-4 RCAC effects on data movement from SOURCE</caption>
</figure>
<section_header><location><page_105><loc_10><loc_43><loc_66><loc_45></location>6.2.2 Effects when RCAC is defined on the target table</section_header>
<text><location><page_105><loc_22><loc_39><loc_88><loc_42></location>Example 6-2 shows a simple example that illustrates the effect of RCAC as defined on the target table.</text>
@ -1379,6 +1466,7 @@
<caption><location><page_106><loc_22><loc_47><loc_60><loc_48></location>Figure 6-5 RCAC effects on data movement on TARGET</caption>
<figure>
<location><page_106><loc_22><loc_48><loc_82><loc_80></location>
<caption>Figure 6-5 RCAC effects on data movement on TARGET</caption>
</figure>
<section_header><location><page_106><loc_10><loc_42><loc_80><loc_44></location>6.2.3 Effects when RCAC is defined on both source and target tables</section_header>
<text><location><page_106><loc_22><loc_38><loc_89><loc_41></location>Example 6-3 shows a simple example that illustrates the effect of RCAC as defined on both the source and the target tables.</text>
@ -1389,6 +1477,7 @@
<caption><location><page_107><loc_22><loc_50><loc_70><loc_51></location>Figure 6-6 RCAC effects on data movement on SOURCE and TARGET</caption>
<figure>
<location><page_107><loc_22><loc_51><loc_82><loc_83></location>
<caption>Figure 6-6 RCAC effects on data movement on SOURCE and TARGET</caption>
</figure>
<section_header><location><page_107><loc_11><loc_44><loc_43><loc_46></location>6.3 RCAC effects on joins</section_header>
<text><location><page_107><loc_22><loc_38><loc_89><loc_42></location>As mentioned previously, a fundamental concept of row permission is that it defines a logical subset of rows that a user or group of users is permitted to access and use. This subset becomes the new basis of any query against the table that has RCAC enabled.</text>
@ -1397,41 +1486,48 @@
<caption><location><page_108><loc_22><loc_58><loc_55><loc_60></location>Figure 6-7 Set A and set B with row permissions</caption>
<figure>
<location><page_108><loc_22><loc_60><loc_82><loc_86></location>
<caption>Figure 6-7 Set A and set B with row permissions</caption>
</figure>
<section_header><location><page_108><loc_10><loc_54><loc_27><loc_56></location>6.3.1 Inner joins</section_header>
<text><location><page_108><loc_22><loc_50><loc_89><loc_53></location>Inner join defines the intersection of two data sets. For a row to be returned from the inner join query, it must appear in both sets, as shown in Figure 6-8.</text>
<caption><location><page_108><loc_22><loc_20><loc_53><loc_21></location>Figure 6-8 Inner join without RCAC permission</caption>
<figure>
<location><page_108><loc_22><loc_21><loc_76><loc_48></location>
<caption>Figure 6-8 Inner join without RCAC permission</caption>
</figure>
<text><location><page_109><loc_22><loc_85><loc_89><loc_91></location>Given that row permission serves to logically eliminate rows from one or more sets, the result set from an inner join (and a subquery) can be different when RCAC is applied. RCAC can reduce the number of rows that are permitted to be accessed by the join, as shown in Figure 6-9.</text>
<text><location><page_109><loc_23><loc_79><loc_88><loc_83></location>Effect of column masks on inner joins: Because column masks are applied after the query final results are determined, the masked value has no effect on the join processing and corresponding query result set.</text>
<caption><location><page_109><loc_22><loc_45><loc_51><loc_47></location>Figure 6-9 Inner join with RCAC permission</caption>
<figure>
<location><page_109><loc_22><loc_47><loc_77><loc_76></location>
<caption>Figure 6-9 Inner join with RCAC permission</caption>
</figure>
<section_header><location><page_110><loc_10><loc_89><loc_28><loc_91></location>6.3.2 Outer joins</section_header>
<text><location><page_110><loc_22><loc_82><loc_89><loc_88></location>Outer joins preserve one or both sides of two data sets. A row can be returned from the outer join query if it appears in the primary set (LEFT, RIGHT, or both in the case of FULL), as shown in Figure 6-10. Column values from the secondary set are returned if the row has a match in the primary set. Otherwise, NULL is returned for the column value by default.</text>
<caption><location><page_110><loc_22><loc_51><loc_55><loc_53></location>Figure 6-10 Outer join without RCAC permission</caption>
<figure>
<location><page_110><loc_22><loc_53><loc_78><loc_80></location>
<caption>Figure 6-10 Outer join without RCAC permission</caption>
</figure>
<text><location><page_111><loc_22><loc_87><loc_87><loc_91></location>Given that row permission serves to logically eliminate rows from one or more sets, more column values that are returned from the secondary table in an outer join can be NULL when RCAC is applied, as shown in Figure 6-11.</text>
<text><location><page_111><loc_23><loc_80><loc_88><loc_85></location>Effect of column masks on outer joins: Because column masks are applied after the query final results are determined, the masked value has no effect on the join processing and corresponding query result set.</text>
<caption><location><page_111><loc_22><loc_45><loc_53><loc_46></location>Figure 6-11 Outer join with RCAC permission</caption>
<figure>
<location><page_111><loc_22><loc_47><loc_79><loc_77></location>
<caption>Figure 6-11 Outer join with RCAC permission</caption>
</figure>
<section_header><location><page_112><loc_11><loc_89><loc_32><loc_91></location>6.3.3 Exception joins</section_header>
<text><location><page_112><loc_22><loc_82><loc_89><loc_88></location>Exception joins preserve one side of two data sets. A row can be returned from the exception join query if it appears in the primary set (LEFT or RIGHT) and the row does not appear in the secondary set, as shown in Figure 6-12. Column values from the secondary set are returned as NULL by default.</text>
<caption><location><page_112><loc_22><loc_49><loc_57><loc_50></location>Figure 6-12 Exception join without RCAC permission</caption>
<figure>
<location><page_112><loc_22><loc_50><loc_79><loc_80></location>
<caption>Figure 6-12 Exception join without RCAC permission</caption>
</figure>
<text><location><page_112><loc_22><loc_41><loc_89><loc_47></location>Given that row permission serves to logically eliminate rows from one or more sets, more rows can appear to be exceptions when RCAC is applied, as shown in Figure 6-13. Also, because column masks are applied after the query final results are determined, the masked value has no effect on the join processing and corresponding query result set.</text>
<caption><location><page_112><loc_22><loc_8><loc_55><loc_9></location>Figure 6-13 Exception join with RCAC permission</caption>
<figure>
<location><page_112><loc_22><loc_9><loc_79><loc_39></location>
<caption>Figure 6-13 Exception join with RCAC permission</caption>
</figure>
<section_header><location><page_113><loc_11><loc_89><loc_77><loc_91></location>6.4 Monitoring, analyzing, and debugging with RCAC</section_header>
<text><location><page_113><loc_22><loc_83><loc_89><loc_87></location>It is assumed (and it is a critical success factor) that the database engineer or application developer has a thorough understanding of the DB2 for i Query Optimizer, Database Engine, and all the associated tools and techniques.</text>
@ -1450,21 +1546,25 @@
<caption><location><page_114><loc_10><loc_54><loc_49><loc_56></location>Figure 6-14 Visual Explain indicating that RCAC is applied</caption>
<figure>
<location><page_114><loc_10><loc_56><loc_89><loc_88></location>
<caption>Figure 6-14 Visual Explain indicating that RCAC is applied</caption>
</figure>
<text><location><page_114><loc_22><loc_51><loc_87><loc_52></location>Figure 6-15 shows the main dashboard of an SQL Performance Monitor. Click Summary .</text>
<caption><location><page_114><loc_10><loc_30><loc_36><loc_31></location>Figure 6-15 SQL Performance Monitor</caption>
<figure>
<location><page_114><loc_10><loc_32><loc_89><loc_49></location>
<caption>Figure 6-15 SQL Performance Monitor</caption>
</figure>
<text><location><page_114><loc_22><loc_25><loc_86><loc_28></location>Figure 6-16 shows the summary of an SQL Performance Monitor with an indication that RCAC is applied.</text>
<caption><location><page_114><loc_11><loc_12><loc_57><loc_13></location>Figure 6-16 SQL Performance Monitor indicating that RCAC is applied</caption>
<figure>
<location><page_114><loc_10><loc_13><loc_89><loc_23></location>
<caption>Figure 6-16 SQL Performance Monitor indicating that RCAC is applied</caption>
</figure>
<text><location><page_115><loc_22><loc_88><loc_84><loc_91></location>Figure 6-17 shows the statements of an SQL Performance Monitor and how RCAC is externalized.</text>
<caption><location><page_115><loc_11><loc_71><loc_57><loc_72></location>Figure 6-17 SQL Performance Monitor showing statements and RCAC</caption>
<figure>
<location><page_115><loc_10><loc_73><loc_89><loc_86></location>
<caption>Figure 6-17 SQL Performance Monitor showing statements and RCAC</caption>
</figure>
<text><location><page_115><loc_22><loc_59><loc_89><loc_69></location>When implementing RCAC as part of a comprehensive and pervasive data access control initiative, consider that the database monitoring and analysis tools can collect literal values that are passed as part of SQL statements. These literal values can be viewed as part of the information collected. If any of the literals are based on or are used with masked columns, it is important to review the database engineer's policy for viewing these data elements. For example, suppose that column CUSTOMER_TAX_ID is deemed masked for the database engineer and the CUSTOMER_TAX_ID column is used in a predicate as follows:</text>
<text><location><page_115><loc_22><loc_56><loc_53><loc_58></location>WHERE CUSTOMER_TAX_ID = '123-45-7890'</text>
@ -1477,11 +1577,13 @@
<caption><location><page_116><loc_22><loc_51><loc_47><loc_52></location>Figure 6-18 Index advice and RCAC</caption>
<figure>
<location><page_116><loc_22><loc_52><loc_89><loc_86></location>
<caption>Figure 6-18 Index advice and RCAC</caption>
</figure>
<text><location><page_116><loc_22><loc_46><loc_87><loc_49></location>In Figure 6-19, the index advisor shows an index for the ACCOUNTS and CUSTOMERS tables based on the RCAC rule text.</text>
<caption><location><page_116><loc_11><loc_28><loc_44><loc_30></location>Figure 6-19 Index advisor based on the RCAC rule</caption>
<figure>
<location><page_116><loc_10><loc_30><loc_83><loc_44></location>
<caption>Figure 6-19 Index advisor based on the RCAC rule</caption>
</figure>
<text><location><page_116><loc_22><loc_24><loc_89><loc_26></location>For more information about creating and using indexes, see IBM DB2 for i indexing methods and strategies, found at:</text>
<text><location><page_116><loc_22><loc_20><loc_89><loc_23></location>http://www.ibm.com/partnerworld/wps/servlet/ContentHandler/stg_ast_sys_wp_db2_i_indexing_methods_strategies</text>
@ -1531,11 +1633,13 @@
<caption><location><page_118><loc_22><loc_32><loc_51><loc_33></location>Figure 6-21 View definition and user query</caption>
<figure>
<location><page_118><loc_22><loc_33><loc_80><loc_67></location>
<caption>Figure 6-21 View definition and user query</caption>
</figure>
<text><location><page_119><loc_22><loc_88><loc_85><loc_91></location>What the query optimizer plans for and what the database engine runs is shown in Figure 6-22.</text>
<caption><location><page_119><loc_22><loc_50><loc_48><loc_51></location>Figure 6-22 Query rewrite with RCAC</caption>
<figure>
<location><page_119><loc_22><loc_51><loc_83><loc_87></location>
<caption>Figure 6-22 Query rewrite with RCAC</caption>
</figure>
<section_header><location><page_119><loc_10><loc_46><loc_42><loc_47></location>6.5.2 Materialized query tables</section_header>
<text><location><page_119><loc_22><loc_40><loc_89><loc_44></location>When the query to populate a materialized query table (MQT) is run by the system on either the create table or a refresh table, and one or more source tables have RCAC defined, the row permissions and column masks are ignored. This means that the MQT has all of the data.</text>
@ -1574,11 +1678,13 @@
<caption><location><page_122><loc_22><loc_48><loc_54><loc_49></location>Figure 6-23 Native record access with no RCAC</caption>
<figure>
<location><page_122><loc_22><loc_49><loc_83><loc_83></location>
<caption>Figure 6-23 Native record access with no RCAC</caption>
</figure>
<text><location><page_123><loc_22><loc_85><loc_89><loc_91></location>Before the record, as identified by the key, is considered available, the RCAC logic must be run. If the record is rejected by RCAC, the next record in sequence that is permissible must be identified. This spinning through the records can take a long time and use many resources, as shown in Figure 6-24.</text>
<caption><location><page_123><loc_22><loc_47><loc_56><loc_48></location>Figure 6-24 Native record level access with RCAC</caption>
<figure>
<location><page_123><loc_22><loc_49><loc_84><loc_83></location>
<caption>Figure 6-24 Native record level access with RCAC</caption>
</figure>
<text><location><page_123><loc_22><loc_42><loc_86><loc_45></location>After the row permissions and column masks are designed and implemented, adequate performance and scalability testing are recommended.</text>
<section_header><location><page_123><loc_11><loc_37><loc_83><loc_39></location>6.7 Exclusive lock to implement RCAC (availability issues)</section_header>
@ -1635,6 +1741,7 @@
<caption><location><page_127><loc_22><loc_28><loc_60><loc_29></location>Figure 6-25 Object-level security and RCAC permissions</caption>
<figure>
<location><page_127><loc_22><loc_29><loc_83><loc_65></location>
<caption>Figure 6-25 Object-level security and RCAC permissions</caption>
</figure>
<text><location><page_127><loc_22><loc_23><loc_89><loc_25></location>To get access to the table and the rows, the user must pass the object level authority test and the RCAC permission test.</text>
<text><location><page_127><loc_22><loc_17><loc_89><loc_21></location>The IBM i journal captures the transactional data and places an image of the row in the journal receiver. If the user has authority to the journal receiver, the row image can be viewed.</text>
@ -1675,6 +1782,7 @@
<caption><location><page_132><loc_22><loc_41><loc_55><loc_42></location>Figure 7-1 Restoring tables to different schemas</caption>
<figure>
<location><page_132><loc_22><loc_43><loc_85><loc_73></location>
<caption>Figure 7-1 Restoring tables to different schemas</caption>
</figure>
<text><location><page_132><loc_22><loc_30><loc_89><loc_39></location>The only way to fix this issue is to re-create the row permission or column mask after the restore operation. Re-creation of the row permission or column mask is required only for definitions that reference other DB2 objects, but it is simpler to re-create all of the RCAC definitions instead of a subset. For example, generate the SQL by using System i Navigator, clear the "Schema qualify names for objects" option, select the "OR REPLACE clause" option, and then run the generated script.</text>
<section_header><location><page_132><loc_10><loc_26><loc_32><loc_28></location>7.2.2 Table migration</section_header>

File diff suppressed because one or more lines are too long

View File

@ -459,6 +459,7 @@ As shown in Figure 1-1, it is an all-or-nothing access to the rows of a table.
Figure 1-1 All-or-nothing access to the rows of a table
<!-- image -->
Many businesses are trying to limit data access to a need-to-know basis. This security goal means that users should be given access only to the minimum set of data that is required to perform their job. Often, users with object-level access are given access to row and column values that are beyond what their business task requires because that object-level security provides an all-or-nothing solution. For example, object-level controls allow a manager to access data about all employees. Most security policies limit a manager to accessing data only for the employees that they manage.
@ -473,6 +474,7 @@ Even if you are willing to live with these performance and management issues, a
Figure 1-2 Existing row and column controls
<!-- image -->
## 1.3.2 New controls: Row and Column Access Control
@ -583,6 +585,7 @@ The FUNCTION_USAGE view contains function usage configuration details. Table 2-1
Table 2-1 FUNCTION_USAGE view
| Column name | Data type | Description |
|---------------|-------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| FUNCTION_ID | VARCHAR(30) | ID of the function. |
@ -614,6 +617,7 @@ Table 2-2 shows a comparison of the different function usage IDs and *JOBCTL aut
Table 2-2 Comparison of the different function usage IDs and *JOBCTL authority
| User action | *JOBCTL | QIBM_DB_SECADM | QIBM_DB_SQLADM | QIBM_DB_SYSMON | No Authority |
|--------------------------------------------------------------------------------|-----------|------------------|------------------|------------------|----------------|
| SET CURRENT DEGREE (SQL statement) | X | | X | | |
@ -707,6 +711,9 @@ The SQL CREATE PERMISSION statement that is shown in Figure 3-1 is used to defin
Figure 3-1 CREATE PERMISSION SQL statement
<!-- image -->
## Column mask
@ -717,6 +724,7 @@ Column masks replace the need to create and use views to implement access contro
Figure 3-2 CREATE MASK SQL statement
<!-- image -->
## 3.1.2 Enabling and activating RCAC
@ -729,6 +737,7 @@ Note: An exclusive lock is required on the table object to perform the alter ope
Figure 3-3 ALTER PERMISSION and ALTER MASK SQL statements
<!-- image -->
You can activate and deactivate RCAC for new or existing tables by using the SQL ALTER TABLE statement (Figure 3-4). The ACTIVATE or DEACTIVATE clause must be the only option that is specified in the statement; no other alterations are permitted at the same time. Activating or deactivating effectively turns all RCAC processing for the table on or off. Only enabled row permissions and column masks take effect when RCAC is activated.
@ -737,6 +746,7 @@ Note: An exclusive lock is required on the table object to perform the alter ope
Figure 3-4 ALTER TABLE SQL statement
<!-- image -->
When row access control is activated on a table, a default permission is established for that table. The name of this permission is QIBM_DEFAULT_<table-name>_<schema-name>. This default permission contains a simple piece of logic (0=1), which is never true. The default permission effectively denies access to every user unless there is a permission defined that allows access explicitly. If row access control is activated on a table and there is no permission defined, no one has permission to any rows. All queries against the table produce an empty set.
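As a minimal sketch (assuming the HR_SCHEMA.EMPLOYEES table that is used later in this chapter), activating row access control before any permission is defined leaves only the default permission in effect:

```sql
-- Minimal sketch: activate row access control with no permission defined
ALTER TABLE HR_SCHEMA.EMPLOYEES
    ACTIVATE ROW ACCESS CONTROL;

-- Only QIBM_DEFAULT_EMPLOYEES_HR_SCHEMA (search condition 0=1) exists,
-- so every query against the table returns an empty set
SELECT COUNT(*) AS ROW_COUNT
    FROM HR_SCHEMA.EMPLOYEES;   -- ROW_COUNT = 0
```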
@ -769,6 +779,7 @@ Table 3-1 summarizes these special registers and their values.
Table 3-1 Special registers and their corresponding values
| Special register | Corresponding value |
|----------------------|---------------------------------------------------------------------------------------------------------------------------------------|
| USER or SESSION_USER | The effective user of the thread excluding adopted authority. |
@ -789,6 +800,7 @@ GLYPH<SM590000> When proc1 ends, the session reverts to its original state with
Figure 3-5 Special registers and adopted authority
<!-- image -->
## 3.2.2 Built-in global variables
@ -801,6 +813,7 @@ Table 3-2 lists the nine built-in global variables.
Table 3-2 Built-in global variables
| Global variable | Type | Description |
|-----------------------|--------------|----------------------------------------------------------------|
| CLIENT_HOST | VARCHAR(255) | Host name of the current client as returned by the system |
@ -947,6 +960,7 @@ The result of this query is shown in Figure 3-7, which is the total number of em
Figure 3-7 Number of employees
<!-- image -->
2. Run a second SQL statement (shown in Example 3-6) that lists the employees. If you have read access to the table, you see all the rows no matter who you are.
@ -983,6 +997,7 @@ CREATE PERMISSION HR_SCHEMA.PERMISSION1_ON_EMPLOYEES ON HR_SCHEMA.EMPLOYEES AS E
Figure 3-9 Row permissions that are shown in System i Navigator
<!-- image -->
## 3.6.5 Defining and creating column masks
@ -1027,6 +1042,7 @@ CREATE MASK HR_SCHEMA.MASK_TAX_ID_ON_EMPLOYEES ON HR_SCHEMA.EMPLOYEES AS EMPLOYE
Figure 3-10 Column masks shown in System i Navigator
<!-- image -->
## 3.6.6 Activating RCAC
@ -1047,12 +1063,14 @@ Now that you have created the row permission and the two column masks, RCAC must
Figure 3-11 Selecting the EMPLOYEES table from System i Navigator
<!-- image -->
3. The EMPLOYEES table definition is displayed, as shown in Figure 3-12. Note that the Row access control and Column access control options are checked.
Figure 3-12 RCAC enabled on the EMPLOYEES table
<!-- image -->
## 3.6.7 Demonstrating data access with RCAC
@ -1069,24 +1087,28 @@ SELECT COUNT(*) as ROW_COUNT FROM HR_SCHEMA.EMPLOYEES;
Figure 3-13 Count of EMPLOYEES by HR
<!-- image -->
3. The result of the same query for a user who is logged on as TQSPENSER (Manager) is shown in Figure 3-14. TQSPENSER has five employees in his department and he can also see his own row, which is why the count is 6.
Figure 3-14 Count of EMPLOYEES by a manager
<!-- image -->
4. The result of the same query that is run by an employee (DSSMITH) gives the result that is shown in Figure 3-15. Each employee can see only his or her own data (row).
Figure 3-15 Count of EMPLOYEES by an employee
<!-- image -->
5. The result of the same query that is run by the Consultant/DBE gives the result that is shown in Figure 3-16. The consultants/DBE can manage and implement RCAC, but they do not see any rows at all.
Figure 3-16 Count of EMPLOYEES by a consultant
<!-- image -->
Does the result make sense? Yes, it does because RCAC is enabled.
@ -1213,6 +1235,7 @@ GLYPH<SM590000> The row permission for the TRANSACTIONS table is based on the AC
Figure 4-1 Internet banking example
<!-- image -->
## 4.2 Description of the users roles and responsibilities
@ -1235,12 +1258,14 @@ Based on their respective roles and responsibilities, the users (that is, a grou
Figure 4-2 Rules for row and column access
<!-- image -->
The chart that is shown in Figure 4-3 shows the column access that is allowed by group and lists the column masks by table.
Figure 4-3 Column masks
| | | CUSTOMERS | ACCOUNTS |
|----------|----------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------|
| SECURITY | No Rows | CUSTOMER_DRIVERS_LICENSE_NUMBER CUSTOMER_EMAIL CUSTOMER_LOGIN_ID CUSTOMER_SECURITY_QUESTION CUSTOMER_SECURITY_QUESTION_ANSWER CUSTOMER_TAX_ID | ACCOUNT_NUMBER |
@ -1272,6 +1297,7 @@ Figure 4-4 shows the data model of the banking scenario that is used in this exa
Figure 4-4 Data model of the banking scenario
<!-- image -->
This section covers the following steps:
@ -1304,6 +1330,7 @@ This section reviews the tables that are used in this example. As shown in Figur
Figure 4-5 Tables that are used in the banking example
<!-- image -->
Note: Appendix A, "Database definitions for the RCAC banking example" on page 121 provides a script that you can use to create all the database definitions or DDLs to re-create this RCAC example.
@ -1316,6 +1343,7 @@ To review the attributes of each table that is used in this banking example, com
Figure 4-6 CUSTOMERS table attributes
<!-- image -->
3. Click the Columns tab to see the columns of the CUSTOMERS table, as shown in Figure 4-7.
@ -1326,12 +1354,14 @@ Figure 4-7 Column definitions of the CUSTOMERS table
Figure 4-8 Reviewing the constraints on the CUSTOMERS table
<!-- image -->
5. Review the definition of the ACCOUNTS table. The definition of the ACCOUNTS table is shown in Figure 4-9. RCAC has not been defined for this table yet.
Figure 4-9 ACCOUNTS table attributes
<!-- image -->
6. Click the Columns tab to see the columns of the ACCOUNTS table, as shown in Figure 4-10.
@ -1342,12 +1372,14 @@ Figure 4-10 Column definitions of the ACCOUNTS table
Figure 4-11 Reviewing the constraints on the ACCOUNTS table
<!-- image -->
8. Review the definition of the TRANSACTIONS table. The definition of the TRANSACTIONS table is shown in Figure 4-12. RCAC is not defined for this table yet.
Figure 4-12 TRANSACTIONS table attributes
<!-- image -->
9. Click the Columns tab to see the columns of the TRANSACTIONS table, as shown in Figure 4-13.
@ -1358,6 +1390,7 @@ Figure 4-13 Column definitions of the TRANSACTIONS table
Figure 4-14 Reviewing the constraints on the TRANSACTIONS table
<!-- image -->
Now that you have reviewed the database model for this example, the following sections describe the steps that are required to implement RCAC in this banking scenario.
@ -1372,36 +1405,42 @@ The first step is to assign the appropriate function usage ID to the Database En
Figure 4-15 Application administration
<!-- image -->
2. The Application Administration window opens, as shown in Figure 4-16. Click IBM i → Database and select the function usage ID of Database Security Administrator.
Figure 4-16 Application administration for IBM i
<!-- image -->
3. Click Customize for the function usage ID of Database Security Administrator, as shown in Figure 4-17.
Figure 4-17 Customizing the Database Security Administrator function usage ID
<!-- image -->
4. The Customize Access window opens, as shown in Figure 4-18. Click the users that need to implement RCAC. For this example, HBEDOYA and MCAIN are selected. Click Add and then click OK .
Figure 4-18 Customize Access window
<!-- image -->
5. The Application Administration window opens again. The function usage ID of Database Security Administrator now has an X in the Customized Access column, as shown in Figure 4-19.
Figure 4-19 Function usage ID Database Security Administrator customized
<!-- image -->
6. Run an SQL query that shows which user profiles are enabled to define RCAC. The SQL query is shown in Figure 4-20.
Figure 4-20 Query to display user profiles with function usage ID for RCAC
<!-- image -->
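The statement in Figure 4-20 is not reproduced here; a hypothetical sketch of such a query, using the FUNCTION_USAGE catalog view that is described in Chapter 2 (the QSYS2 schema is an assumption), could be:

```sql
-- List the user profiles that are enabled for the Database Security
-- Administrator function usage ID
SELECT FUNCTION_ID, USER_NAME, USAGE, USER_TYPE
    FROM QSYS2.FUNCTION_USAGE
    WHERE FUNCTION_ID = 'QIBM_DB_SECADM'
    ORDER BY USER_NAME;
```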
## 4.3.3 Creating group profiles for the users and their roles
@ -1414,6 +1453,7 @@ The next step is to create the different group profiles (ADMIN, CUSTOMER, TELLER
Figure 4-21 Creating group profiles
<!-- image -->
2. The New Group window opens, as shown in Figure 4-22. For each new group, enter the Group name (ADMIN, CUSTOMER, TELLER, and DBE) and add the user profiles that are associated to this group by selecting the user profile and clicking Add .
@ -1422,12 +1462,14 @@ Figure 4-22 shows adding user TQSPENCER to the TELLER group profile.
Figure 4-22 Creating group profiles and adding users
<!-- image -->
3. After you create all the group profiles, you should see them listed in System i Navigator under Users and Groups → Groups, as shown in Figure 4-23.
Figure 4-23 Newly created group profiles
<!-- image -->
## 4.3.4 Creating the CUSTOMER_LOGIN_ID global variable
@ -1440,24 +1482,28 @@ In this step, you create a global variable that is used to capture the Customer_
Figure 4-24 Creating a global variable
<!-- image -->
2. The New Global Variable window opens, as shown in Figure 4-25. Enter the global variable name of CUSTOMER_LOGIN_ID, select the data type of VARCHAR, and leave the default value of NULL. This default value ensures that users who do not use the web interface do not have permission to access the data. Click OK. (A SQL sketch of the equivalent statement follows this list.)
Figure 4-25 Creating a global variable called CUSTOMER_LOGIN_ID
<!-- image -->
3. Now that the global variable is created, assign permissions to the variable so that it can be set by the program. Right-click the CUSTOMER_LOGIN_ID global variable and select Permissions , as shown in Figure 4-26.
Figure 4-26 Setting permissions on the CUSTOMER_LOGIN_ID global variable
<!-- image -->
4. The Permissions window opens, as shown in Figure 4-27. Select Change authority for Webuser so that the application can set this global variable.
Figure 4-27 Setting change permissions for Webuser on the CUSTOMER_LOGIN_ID global variable
<!-- image -->
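The Navigator dialogs above correspond roughly to the following SQL sketch; the VARCHAR length is an assumption because the window in Figure 4-25 does not show it:

```sql
-- Hypothetical SQL equivalent of steps 1 and 2 (length is assumed)
CREATE VARIABLE BANK_SCHEMA.CUSTOMER_LOGIN_ID VARCHAR(30)
    DEFAULT NULL;

-- Steps 3 and 4 then give WEBUSER change authority to the variable so
-- that the web application can set it at run time, for example:
-- SET BANK_SCHEMA.CUSTOMER_LOGIN_ID = 'XWAGNER';   -- login ID is hypothetical
```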
## 4.3.5 Defining and creating row permissions
@ -1468,6 +1514,7 @@ You now ready to define the row permissions of the tables. Complete the followin
Figure 4-28 Selecting new row permissions
<!-- image -->
2. The New Row Permission window opens, as shown in Figure 4-29. Enter the information regarding the row permissions on the CUSTOMERS table. This row permission defines what is established in the following policy (a SQL sketch of a similar permission follows this list):
@ -1482,6 +1529,7 @@ Select the Enabled option. Click OK .
Figure 4-29 New row permissions on the CUSTOMERS table
<!-- image -->
3. Define the row permissions for the ACCOUNTS table. The New Row Permission window opens, as shown in Figure 4-30. Enter the information regarding the row permissions on the ACCOUNTS table. This row permission defines what is established in the following policy:
@ -1496,6 +1544,7 @@ Select the Enabled option. Click OK .
Figure 4-30 New row permissions on the ACCOUNTS table
<!-- image -->
4. Define the row permissions on the TRANSACTIONS table. The New Row Permission window opens, as shown in Figure 4-31. Enter the information regarding the row permissions on the TRANSACTIONS table. This row permission defines what is established in the following policy:
@ -1512,12 +1561,14 @@ Select the Enabled option. Click OK .
Figure 4-31 New row permissions on the TRANSACTIONS table
<!-- image -->
5. To verify that the row permissions are enabled, from System i Navigator, click Row Permissions , as shown in Figure 4-32. The three row permissions are created and enabled.
Figure 4-32 List of row permissions on BANK_SCHEMA
<!-- image -->
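As a hedged sketch, a row permission like the ones created above has the following general SQL shape. The exact rule text is in the windows above and in Appendix A; the use of VERIFY_GROUP_FOR_USER and the specific predicates are assumptions based on the policies that are described:

```sql
-- Hypothetical sketch of a row permission on the CUSTOMERS table
CREATE PERMISSION BANK_SCHEMA.PERMISSION1_ON_CUSTOMERS
    ON BANK_SCHEMA.CUSTOMERS AS C
    FOR ROWS
    WHERE VERIFY_GROUP_FOR_USER(SESSION_USER, 'ADMIN') = 1
       OR (VERIFY_GROUP_FOR_USER(SESSION_USER, 'CUSTOMER') = 1
           AND C.CUSTOMER_LOGIN_ID = BANK_SCHEMA.CUSTOMER_LOGIN_ID)
    ENFORCED FOR ALL ACCESS
    ENABLE;
```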
## 4.3.6 Defining and creating column masks
@ -1528,6 +1579,7 @@ This section defines the masks on the columns. Complete the following steps:
Figure 4-33 Creating a column mask
<!-- image -->
2. In the New Column Mask window, which is shown in Figure 4-34, enter the following information (a SQL sketch of a similar mask follows this list):
@ -1542,6 +1594,7 @@ Select the Enabled option. Click OK .
Figure 4-34 Defining a column mask on the CUSTOMERS table
<!-- image -->
3. Repeat step 1 on page 58 and step 2 to create column masks for the following columns:
@ -1562,6 +1615,7 @@ Figure 4-34 Defining a column mask on the CUSTOMERS table
Figure 4-35 List of column masks on BANK_SCHEMA
<!-- image -->
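In SQL terms, each of the masks above is a CREATE MASK statement. A hypothetical sketch for the ACCOUNT_NUMBER column (the mask expression and the group test are assumptions):

```sql
-- Hypothetical sketch of a column mask on the ACCOUNTS table
CREATE MASK BANK_SCHEMA.MASK_ACCOUNT_NUMBER_ON_ACCOUNTS
    ON BANK_SCHEMA.ACCOUNTS AS A
    FOR COLUMN ACCOUNT_NUMBER
    RETURN CASE
        WHEN VERIFY_GROUP_FOR_USER(SESSION_USER, 'ADMIN') = 1
            THEN A.ACCOUNT_NUMBER
        ELSE '*****' CONCAT SUBSTR(A.ACCOUNT_NUMBER, 6, 4)
    END
    ENABLE;
```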
## 4.3.7 Restricting the inserting and updating of masked data
@ -1574,12 +1628,14 @@ This step defines the check constraints that support the column masks to make su
Figure 4-36 Definition of the CUSTOMERS table
<!-- image -->
2. From the CUSTOMERS definition window, click the Check Constraints tab and click Add , as shown in Figure 4-37.
Figure 4-37 Adding a check constraint
<!-- image -->
3. The New Check Constraint window opens, as shown in Figure 4-38. Complete the following steps:
@ -1592,18 +1648,21 @@ c. Select the On update violation, preserve column value option and click OK .
Figure 4-38 Specifying a new check constraint on the CUSTOMERS table
<!-- image -->
4. Figure 4-39 shows that there is now a check constraint on the CUSTOMERS table that prevents any masked data from being updated to the CUSTOMER_EMAIL column.
Figure 4-39 Check constraint on the CUSTOMERS table
<!-- image -->
5. Create all the other check constraints that are associated with each of the masks on the CUSTOMERS table. After this is done, these constraints should look like the ones that are shown in Figure 4-40. (A SQL sketch of one such constraint follows this list.)
Figure 4-40 List of check constraints on the CUSTOMERS table
<!-- image -->
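A hypothetical sketch of one such check constraint, using the ON UPDATE clause that preserves the existing value when a masked value is detected (the masked-value literal is an assumption):

```sql
-- Reject masked e-mail values; on update, keep the current column value
ALTER TABLE BANK_SCHEMA.CUSTOMERS
    ADD CONSTRAINT CHECK_CUSTOMER_EMAIL
    CHECK (CUSTOMER_EMAIL <> '****@****')
    ON UPDATE VIOLATION PRESERVE COLUMN VALUE;
```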
## 4.3.8 Activating row and column access control
@ -1614,18 +1673,21 @@ You are now ready to activate RCAC on all three tables in this example. Complete
Figure 4-41 Enabling RCAC on the CUSTOMERS table
<!-- image -->
2. Enable RCAC on the ACCOUNTS table. Right-click the ACCOUNTS table and select Definition . As shown Figure 4-42, make sure that you select Row access control and Column access control . Click OK .
Figure 4-42 Enabling RCAC on ACCOUNTS
<!-- image -->
3. Enable RCAC on the TRANSACTIONS table. Right-click the TRANSACTIONS table and select Definition. As shown in Figure 4-43, make sure that you select Row access control. Click OK. (The equivalent SQL statements are sketched after this list.)
Figure 4-43 Enabling RCAC on TRANSACTIONS
<!-- image -->
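A sketch of the equivalent SQL for the three Navigator actions above (TRANSACTIONS gets row access control only, matching step 3):

```sql
ALTER TABLE BANK_SCHEMA.CUSTOMERS
    ACTIVATE ROW ACCESS CONTROL
    ACTIVATE COLUMN ACCESS CONTROL;

ALTER TABLE BANK_SCHEMA.ACCOUNTS
    ACTIVATE ROW ACCESS CONTROL
    ACTIVATE COLUMN ACCESS CONTROL;

-- Row access control only, as in Figure 4-43
ALTER TABLE BANK_SCHEMA.TRANSACTIONS
    ACTIVATE ROW ACCESS CONTROL;
```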
## 4.3.9 Reviewing row permissions
@ -1636,18 +1698,21 @@ This section displays all the row permissions after enabling RCAC. Complete the
Figure 4-44 Row permissions after enabling RCAC
<!-- image -->
2. Look at one of the row permission definitions by right-clicking it and selecting Definition , as shown in Figure 4-45.
Figure 4-45 Selecting row permission definition
<!-- image -->
3. A window opens, as shown in Figure 4-46. Take note of the nonsensical search condition (0=1) of the QIBM_DEFAULT row permission. This permission is ORed with all of the others, and it ensures that if someone does not meet any of the criteria of the other row permissions, this condition is tested and, because it is false, access is denied.
Figure 4-46 Search condition of the QIBM_DEFAULT row permission
<!-- image -->
## 4.3.10 Demonstrating data access with RCAC
@ -1678,12 +1743,14 @@ To test a DBE (MCAIN) user, complete the following steps:
Figure 4-47 DBE session user
<!-- image -->
2. The number of rows that the DBE user MCAIN can see is shown in Figure 4-48.
Figure 4-48 Number of rows that DBE user can see in the CUSTOMERS table
<!-- image -->
3. The result of the third SQL statement is shown in Figure 4-49. Note the masked columns. User MCAIN can see all the rows in the CUSTOMERS table, but there are some columns where the result is masked.
@ -1698,18 +1765,21 @@ To test a SECURITY user, complete the following steps:
Figure 4-50 SECURITY session user
<!-- image -->
2. The number of rows in the CUSTOMERS table that the security officer can see is shown in Figure 4-51. The security officer cannot see any data at all.
Figure 4-51 Number of rows that the security officer can see in the CUSTOMERS table
<!-- image -->
3. The result of the third SQL statement is shown in Figure 4-52. Note the empty set that is returned to the security officer.
Figure 4-52 SQL statement that is run by the SECURITY user - no results
<!-- image -->
## Data access for TELLER user with RCAC
@ -1720,18 +1790,21 @@ To test a Teller (TQSPENCER) user, complete the following steps:
Figure 4-53 TELLER session user
<!-- image -->
2. The number of rows in the CUSTOMERS table that the TELLER user can see is shown in Figure 4-54. The TELLER user can see all the rows.
Figure 4-54 Number of rows that the TELLER user can see in the CUSTOMERS table
<!-- image -->
3. The result of the third SQL statement is shown in Figure 4-55. Note the masked columns. The TELLER user, TQSPENCER, can see all the rows, but there are some columns where the result is masked.
Figure 4-55 SQL statement that is run by the TELLER user with masked columns
<!-- image -->
## Data access for ADMIN user with RCAC
@ -1742,12 +1815,14 @@ To test an ADMIN (VGLUCCHESS) user, complete the following steps:
Figure 4-56 ADMIN session user
<!-- image -->
2. The number of rows that the ADMIN user can see is shown in Figure 4-57. The ADMIN user can see all the rows.
Figure 4-57 Number of rows that the ADMIN can see in the CUSTOMERS table
<!-- image -->
3. The result of the third SQL statement is shown in Figure 4-58. There are no masked columns.
@ -1762,30 +1837,35 @@ To test a CUSTOMERS (WEBUSER) user that accesses the database by using the web a
Figure 4-59 WEBUSER session user
<!-- image -->
2. A global variable (CUSTOMER_LOGIN_ID) is set by the web application and then is used to check the row permissions. Figure 4-60 shows setting the global variable by using the customer login ID. (A SQL sketch of this flow follows this list.)
Figure 4-60 Setting the global variable CUSTOMER_LOGIN_ID
<!-- image -->
3. Verify that the global variable was set with the correct value by clicking the Global Variable tab, as shown in Figure 4-61.
Figure 4-61 Viewing the global variable value
<!-- image -->
4. The number of rows that the WEBUSER can see is shown in Figure 4-62. This user can see only the one row that belongs to his web-based user ID.
Figure 4-62 Number of rows that the WEBUSER can see in the CUSTOMERS table
<!-- image -->
5. The result of the third SQL statement is shown in Figure 4-63. There are no masked columns, and the user can see only one row, which is the user's own row.
Figure 4-63 SQL statement that is run by WEBUSER - no masked columns
<!-- image -->
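A short sketch of the WEBUSER flow above (the login ID literal is hypothetical):

```sql
-- The web application sets the global variable for the session...
SET BANK_SCHEMA.CUSTOMER_LOGIN_ID = 'XWAGNER';

-- ...and the row permission on CUSTOMERS now exposes exactly one row
SELECT COUNT(*) AS ROW_COUNT
    FROM BANK_SCHEMA.CUSTOMERS;   -- 1, as in Figure 4-62
```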
## Other examples of data access with RCAC
@ -1814,24 +1894,28 @@ This section looks at some other interesting information that is related to RCAC
Figure 4-67 Visual Explain with no RCAC enabled
<!-- image -->
2. Figure 4-68 shows the Visual Explain of the same SQL statement, but with RCAC enabled. It is clear that the implementation of the SQL statement is more complex because the row permission rule becomes part of the WHERE clause.
Figure 4-68 Visual Explain with RCAC enabled
<!-- image -->
3. Compare the advised indexes that are provided by the Optimizer without RCAC and with RCAC enabled. Figure 4-69 shows the index advice for the SQL statement without RCAC enabled. The index being advised is for the ORDER BY clause.
Figure 4-69 Index advice with no RCAC
<!-- image -->
4. Now, look at the advised indexes with RCAC enabled. As shown in Figure 4-70, there is an additional index being advised, which is basically for the row permission rule. For more information, see 6.4.2, "Index advisor" on page 99.
Figure 4-70 Index advice with RCAC enabled
<!-- image -->
<!-- image -->
@ -1922,6 +2006,7 @@ In this example, the application reads the data for an update to correct the mis
Figure 5-1 Accidental update with masked values scenario
<!-- image -->
Obviously, careful planning and testing should be exercised to avoid accidental updates with masked values.
@ -2039,6 +2124,7 @@ CREDIT_CARD_NUMBER;
Figure 6-1 Timing of column masking
| CREDIT CARD NUMBER | TOTAL |
|---------------------------|---------------|
| **** **** **** 1234 | 233.50 |
@ -2060,6 +2146,7 @@ Note: Column masks can influence an SQL INSERT or UPDATE . For example, you cann
Figure 6-2 Masking differences between Fieldproc and RCAC
<!-- image -->
## 6.2 RCAC effects on data movement
@ -2068,6 +2155,7 @@ As described earlier and shown in Figure 6-3, RCAC is applied pervasively regard
Figure 6-3 RCAC and data movement
<!-- image -->
The "user" that is running the data movement application or process, whether it be a high availability (HA) scenario, an extract, transform, load (ETL) scenario, or just copying data from one file or table to another one, must have permission to all the source rows without masking, and not be restricted from putting rows into the target. Allowing the data movement application or process to bypass the RCAC rules must be based on a clear and concise understanding of the organization's object security and data access policy. Proper design, implementation, and testing are critical success factors when applying RCAC.
@ -2094,6 +2182,7 @@ For example, given a "source" table with a row permission defined as NAME <> 'CA
Figure 6-4 RCAC effects on data movement from SOURCE
<!-- image -->
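A minimal sketch of the scenario in Figure 6-4: the copy statement itself is ordinary SQL, and the filtering happens implicitly through the source table's RCAC rules:

```sql
-- Rows the user is not permitted to see (for example, NAME = 'CAIN')
-- are silently omitted, and masked column values are copied as masked
INSERT INTO TARGET
    SELECT * FROM SOURCE;
```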
## 6.2.2 Effects when RCAC is defined on the target table
@ -2108,6 +2197,7 @@ Given a "target" table with a row permission defined as NAME <> 'CAIN' and a col
Figure 6-5 RCAC effects on data movement on TARGET
<!-- image -->
## 6.2.3 Effects when RCAC is defined on both source and target tables
@ -2124,6 +2214,7 @@ Although the source rows where NAME <> 'CAIN' do satisfy the target table's perm
Figure 6-6 RCAC effects on data movement on SOURCE and TARGET
<!-- image -->
## 6.3 RCAC effects on joins
@ -2136,6 +2227,7 @@ As shown in Figure 6-7, there are two different sets, set A and set B. However,
Figure 6-7 Set A and set B with row permissions
<!-- image -->
## 6.3.1 Inner joins
@ -2144,6 +2236,7 @@ Inner join defines the intersection of two data sets. For a row to be returned f
Figure 6-8 Inner join without RCAC permission
<!-- image -->
Given that row permission serves to logically eliminate rows from one or more sets, the result set from an inner join (and a subquery) can be different when RCAC is applied. RCAC can reduce the number of rows that are permitted to be accessed by the join, as shown in Figure 6-9.
@ -2152,6 +2245,7 @@ Effect of column masks on inner joins: Because column masks are applied after th
Figure 6-9 Inner join with RCAC permission
<!-- image -->
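As a sketch (the join columns are assumptions), the row permissions act as implicit predicates on both join inputs:

```sql
-- Hypothetical inner join over two RCAC-protected tables: each table's
-- row permission filters its rows before the join, so the intersection
-- can shrink; column masks are applied only to the final result set
SELECT C.CUSTOMER_LOGIN_ID, A.ACCOUNT_NUMBER
    FROM BANK_SCHEMA.CUSTOMERS C
    INNER JOIN BANK_SCHEMA.ACCOUNTS A
        ON A.CUSTOMER_ID = C.CUSTOMER_ID;
```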
## 6.3.2 Outer joins
@ -2160,6 +2254,7 @@ Outer joins preserve one or both sides of two data sets. A row can be returned f
Figure 6-10 Outer join without RCAC permission
<!-- image -->
Given that row permission serves to logically eliminate rows from one or more sets, more column values that are returned from the secondary table in an outer join can be NULL when RCAC is applied, as shown in Figure 6-11.
Effect of column masks on outer joins: Because column masks are applied after the query final results are determined, the masked value has no effect on the join processing and corresponding query result set.
Figure 6-11 Outer join with RCAC permission
<!-- image -->
## 6.3.3 Exception joins
@ -2176,12 +2272,14 @@ Exception joins preserve one side of two data sets. A row can be returned from t
Figure 6-12 Exception join without RCAC permission
<!-- image -->
Given that row permission serves to logically eliminate rows from one or more sets, more rows can appear to be exceptions when RCAC is applied, as shown in Figure 6-13. Also, because column masks are applied after the query final results are determined, the masked value has no effect on the join processing and corresponding query result set.
Figure 6-13 Exception join with RCAC permission
<!-- image -->
## 6.4 Monitoring, analyzing, and debugging with RCAC
@ -2214,24 +2312,28 @@ Figure 6-14 shows how Visual Explain externalizes RCAC.
Figure 6-14 Visual Explain indicating that RCAC is applied
<!-- image -->
Figure 6-15 shows the main dashboard of an SQL Performance Monitor. Click Summary .
Figure 6-15 SQL Performance Monitor
<!-- image -->
Figure 6-16 shows the summary of an SQL Performance Monitor with an indication that RCAC is applied.
Figure 6-16 SQL Performance Monitor indicating that RCAC is applied
<!-- image -->
Figure 6-17 shows the statements of an SQL Performance Monitor and how RCAC is externalized.
Figure 6-17 SQL Performance Monitor showing statements and RCAC
<!-- image -->
When implementing RCAC as part of a comprehensive and pervasive data access control initiative, consider that the database monitoring and analysis tools can collect literal values that are passed as part of SQL statements. These literal values can be viewed as part of the information collected. If any of the literals are based on or are used with masked columns, it is important to review the database engineer's policy for viewing these data elements. For example, suppose that column CUSTOMER_TAX_ID is deemed masked for the database engineer and the CUSTOMER_TAX_ID column is used in a predicate as follows:
@ -2252,12 +2354,14 @@ For example, the query that is shown in Figure 6-18 produces index advice for th
Figure 6-18 Index advice and RCAC
<!-- image -->
In Figure 6-19, the index advisor shows an index for the ACCOUNTS and CUSTOMERS tables that is based on the RCAC rule text.
Figure 6-19 Index advisor based on the RCAC rule
<!-- image -->
For more information about creating and using indexes, see IBM DB2 for i indexing methods and strategies, found at:
@ -2352,12 +2456,14 @@ Any access to an SQL view that is over one or more tables that have RCAC also ha
Figure 6-21 View definition and user query
<!-- image -->
What the query optimizer plans for and what the database engine runs are shown in Figure 6-22.
Figure 6-22 Query rewrite with RCAC
<!-- image -->
## 6.5.2 Materialized query tables
@ -2430,12 +2536,14 @@ For programs that access records sequentially, in or out of key order, the added
Figure 6-23 Native record access with no RCAC
<!-- image -->
Before the record, as identified by the key, is considered available, the RCAC logic must be run. If the record is rejected by RCAC, the next permissible record in sequence must be identified. This spinning through the records can take a long time and use many resources, as shown in Figure 6-24.
Figure 6-24 Native record level access with RCAC
<!-- image -->
After the row permissions and column masks are designed and implemented, adequate performance and scalability testing is recommended.
@ -2544,6 +2652,7 @@ Figure 6-25 illustrates that object level security is the first check and that R
Figure 6-25 Object-level security and RCAC permissions
<!-- image -->
To get access to the table and the rows, the user must pass the object level authority test and the RCAC permission test.
@ -2616,6 +2725,7 @@ For example, assume that the BANKSCHEMA library (which is the system name or sho
Figure 7-1 Restoring tables to different schemas
<!-- image -->
The only way to fix this issue is to re-create the row permission or column mask after the restore operation. Re-creation of the row permission or column mask is required only for definitions that reference other DB2 objects, but it is simpler to re-create all of the RCAC definitions instead of a subset. For example, generate the SQL using System i Navigator, clear the "Schema qualify names for objects" and select the "OR REPLACE clause", and then run the generated script.

View File

@ -38,6 +38,7 @@
<caption><location><page_5><loc_22><loc_42><loc_34><loc_43></location>Figure 1 IBM z16</caption>
<figure>
<location><page_5><loc_22><loc_44><loc_71><loc_90></location>
<caption>Figure 1 IBM z16</caption>
</figure>
<section_header><location><page_5><loc_11><loc_38><loc_58><loc_40></location>IBM z16 and IBM LinuxONE Emperor 4 features</section_header>
<text><location><page_5><loc_22><loc_29><loc_89><loc_36></location>IBM Z is based on enterprise mainframe technology. Starting with transaction-based workloads and databases, IBM Z has undergone tremendous transformations in its system design for many generations to build servers that cater to Linux-based workloads and security with a cyberresilient system, and support quantum computing and modernization by using a hybrid cloud with a focus on data and AI.</text>
@ -45,12 +46,14 @@
<caption><location><page_6><loc_11><loc_51><loc_35><loc_52></location>Figure 2 IBM Z: Processor roadmap</caption>
<figure>
<location><page_6><loc_10><loc_53><loc_89><loc_86></location>
<caption>Figure 2 IBM Z: Processor roadmap</caption>
</figure>
<text><location><page_6><loc_22><loc_38><loc_89><loc_49></location>The IBM z16 and IBM LinuxONE Emperor 4 are the latest of the IBM Z, and they are developed with a 'built to build' focus to provide a powerful, cyberresilient, open, and secure platform for business with an extra focus on sustainability to help build sustainable data centers. Although the z16 server can host both IBM z/OS® and Linux workloads, LinuxONE Emperor 4 is built to host Linux-only workloads with a focus on consolidation and resiliency. Depending on the workload, consolidation from numerous x86 servers into a LinuxONE Emperor 4 can help reduce energy consumption by 75% and data center floor space by 50%, which helps to achieve the sustainability goals of the organization.</text>
<text><location><page_6><loc_22><loc_29><loc_89><loc_36></location>Figure 3 on page 5 shows a summary of the system design of IBM LinuxONE Emperor 4 with the IBM Telum™ processor. The IBM Telum processor chip is designed to run enterprise applications efficiently where their data resides to embed AI with super low latency. The support for higher bandwidth and I/O rates is supported through FCP Express cards with an endpoint security solution. The memory subsystem supports up to 40 TB of memory.</text>
<caption><location><page_7><loc_11><loc_54><loc_49><loc_56></location>Figure 3 System design of IBM z16 LinuxONE Emperor 4</caption>
<figure>
<location><page_7><loc_11><loc_56><loc_89><loc_90></location>
<caption>Figure 3 System design of IBM z16 LinuxONE Emperor 4</caption>
</figure>
<text><location><page_7><loc_22><loc_45><loc_89><loc_53></location>The IBM z16 and IBM LinuxONE Emperor 4 servers are built with 7-nm technology at a 5.2 GHz speed. They consist of four dual-chip modules (DCMs) per central processor complex (CPC) drawer, each of which is built with two 8-core Telum processor chips that have "first in the industry" on-chip acceleration for mid-transaction, real-time AI inferencing, which supports many different use cases, including fraud detection.</text>
<text><location><page_7><loc_22><loc_35><loc_89><loc_44></location>Each core has access to a huge private 32 MB L2 cache where up to 16 MB of the L2 cache of an inactive core can be used as virtual cache (L3 / L4) by neighboring active cores on the chip. This cache helps address translation and access checking by prefetching the same virtual cache into the L2 cache. The virtual cache also includes Neural Network Processing Assist instructions and direct memory access with protection, and per chip GZIP compression.</text>
@ -58,12 +61,14 @@
<caption><location><page_8><loc_10><loc_53><loc_63><loc_54></location>Figure 4 IBM z16 on-chip AI Accelerator integration with IBM Z processor cores</caption>
<figure>
<location><page_8><loc_11><loc_54><loc_90><loc_86></location>
<caption>Figure 4 IBM z16 on-chip AI Accelerator integration with IBM Z processor cores</caption>
</figure>
<text><location><page_8><loc_22><loc_41><loc_89><loc_51></location>The IBM z16 and IBM LinuxONE Emperor 4 server platforms are built with the hardware features that are shown in Figure 4 with data and AI workloads in mind. Regardless of where the ML and deep learning (DL) frameworks are used to build and train data and AI models, the inferencing on existing enterprise application data can happen alongside currently running enterprise business applications. CP4D 4.6 supports TensorFlow and IBM Snap ML frameworks, which are optimized to use the on-chip AI Accelerator during inferencing. Support for various other frameworks is planned for future releases.</text>
<text><location><page_8><loc_22><loc_37><loc_89><loc_39></location>Figure 5 on page 7 shows the seamless integration of AI into existing enterprises workloads on the IBM z16 while leveraging the underlying hardware capabilities.</text>
<caption><location><page_9><loc_11><loc_61><loc_31><loc_62></location>Figure 5 Seamless integration</caption>
<figure>
<location><page_9><loc_10><loc_62><loc_89><loc_90></location>
<caption>Figure 5 Seamless integration</caption>
</figure>
<section_header><location><page_9><loc_11><loc_55><loc_56><loc_57></location>What is Cloud Pak for Data on IBM Z</section_header>
<text><location><page_9><loc_22><loc_47><loc_89><loc_53></location>IBM Cloud Pak for Data allows enterprises to simplify, unify, and automate the delivery of data and AI. It categorizes the activities within the journey to AI as four rungs of the AI Ladder: Collect, Organize, Analyze, and Infuse. For more information about each of the AI Ladder rungs, see Become Data Driven with IBM Z Infused Data Fabric , REDP-5680.</text>
@ -72,6 +77,7 @@
<caption><location><page_10><loc_11><loc_38><loc_43><loc_39></location>Figure 6 Solution overview of Cloud Pak for Data</caption>
<figure>
<location><page_10><loc_10><loc_39><loc_89><loc_77></location>
<caption>Figure 6 Solution overview of Cloud Pak for Data</caption>
</figure>
<text><location><page_10><loc_22><loc_35><loc_85><loc_36></location>We highlight the four main pillars that make IBM Z the correct infrastructure for CP4D:</text>
<list_item><location><page_10><loc_22><loc_33><loc_42><loc_34></location>Performance and Scale</list_item>
@ -105,6 +111,7 @@
<caption><location><page_13><loc_10><loc_54><loc_83><loc_55></location>Figure 7 Developing, training, and deploying an AI model on Cloud Pak for Data on IBM Z and IBM LinuxONE</caption>
<figure>
<location><page_13><loc_10><loc_56><loc_89><loc_90></location>
<caption>Figure 7 Developing, training, and deploying an AI model on Cloud Pak for Data on IBM Z and IBM LinuxONE</caption>
</figure>
<text><location><page_13><loc_22><loc_51><loc_81><loc_53></location>In summary, here are some of the reasons why you should choose AI on IBM Z:</text>
<list_item><location><page_13><loc_22><loc_49><loc_68><loc_50></location>World-class AI inference platform for enterprise workloads:</list_item>
@ -155,6 +162,7 @@
<caption><location><page_16><loc_10><loc_57><loc_34><loc_58></location>Figure 8 Typical AI model lifecycle</caption>
<figure>
<location><page_16><loc_10><loc_58><loc_89><loc_83></location>
<caption>Figure 8 Typical AI model lifecycle</caption>
</figure>
<text><location><page_16><loc_22><loc_46><loc_88><loc_55></location>Due to regulations, more stakeholders adopt the typical AI model lifecycle to protect their brand from new end-to-end risks. To ensure various aspects of both regulatory compliance and security, the personas that must be involved include the chief financial officer (CFO), chief marketing officer (CMO), chief data officer (CDO), HR, and chief regulatory officer (CRO), along with the data engineers, data scientists, and business analysts, who build AI workflows.</text>
<section_header><location><page_16><loc_11><loc_42><loc_46><loc_44></location>IBM governance solution for IBM Z</section_header>
@ -184,52 +192,62 @@
<caption><location><page_18><loc_11><loc_62><loc_48><loc_63></location>Figure 9 Remote AI governance solution end-to-end flow</caption>
<figure>
<location><page_18><loc_11><loc_63><loc_89><loc_90></location>
<caption>Figure 9 Remote AI governance solution end-to-end flow</caption>
</figure>
<text><location><page_18><loc_22><loc_59><loc_72><loc_60></location>To achieve end-to-end AI governance, complete the following steps:</text>
<list_item><location><page_18><loc_22><loc_55><loc_89><loc_58></location>1. Create a model entry in IBM OpenPages by using CP4D on an x86 platform, as shown in Figure 10.</list_item>
<caption><location><page_18><loc_10><loc_14><loc_46><loc_16></location>Figure 10 Creating a model entry in IBM OpenPages</caption>
<figure>
<location><page_18><loc_10><loc_16><loc_89><loc_53></location>
<caption>Figure 10 Creating a model entry in IBM OpenPages</caption>
</figure>
<list_item><location><page_19><loc_22><loc_87><loc_89><loc_91></location>2. Train a model by using Watson Studio and by using development tools such as Jupyter Notebook or JupyterLab on CP4D on Red Hat OpenShift on a virtual machine on IBM Z, as shown in Figure 11.</list_item>
<caption><location><page_19><loc_11><loc_46><loc_47><loc_47></location>Figure 11 Training an AI model by using Watson Studio</caption>
<figure>
<location><page_19><loc_10><loc_48><loc_89><loc_85></location>
<caption>Figure 11 Training an AI model by using Watson Studio</caption>
</figure>
<list_item><location><page_19><loc_22><loc_42><loc_89><loc_45></location>3. Deploy the model by using WML on CP4D on Red Hat OpenShift on a virtual machine on IBM Z, as shown in Figure 12.</list_item>
<caption><location><page_19><loc_11><loc_7><loc_57><loc_8></location>Figure 12 Deploying an AI model by using WML on Cloud Pak for Data</caption>
<figure>
<location><page_19><loc_11><loc_9><loc_90><loc_40></location>
<caption>Figure 12 Deploying an AI model by using WML on Cloud Pak for Data</caption>
</figure>
<list_item><location><page_20><loc_22><loc_85><loc_89><loc_91></location>4. Track the external model lifecycle by browsing through the Catalogs/Platform assets catalog by using AI Factsheets and OpenPages while using CP4D on an x86 platform, as shown in Figure 13. The external model (deployed on CP4D on Red Hat OpenShift on a virtual machine on IBM Z) is saved as a platform asset catalog on the x86 platform.</list_item>
<caption><location><page_20><loc_22><loc_50><loc_40><loc_51></location>Figure 13 External model</caption>
<figure>
<location><page_20><loc_22><loc_51><loc_87><loc_83></location>
<caption>Figure 13 External model</caption>
</figure>
<text><location><page_20><loc_25><loc_45><loc_89><loc_48></location>You can track the model through each stage of the model lifecycle, as shown in Figure 14, by using AI Factsheets and OpenPages.</text>
<caption><location><page_20><loc_11><loc_9><loc_31><loc_10></location>Figure 14 Tracking the model</caption>
<figure>
<location><page_20><loc_10><loc_11><loc_90><loc_44></location>
<caption>Figure 14 Tracking the model</caption>
</figure>
<text><location><page_21><loc_25><loc_88><loc_89><loc_91></location>You can see that the model facts are tracked and synchronized to IBM OpenPages for risk management, as shown in Figure 15.</text>
<caption><location><page_21><loc_10><loc_46><loc_74><loc_48></location>Figure 15 Model facts that are tracked and synchronized to IBM OpenPages on an x86 platform</caption>
<figure>
<location><page_21><loc_10><loc_48><loc_89><loc_86></location>
<caption>Figure 15 Model facts that are tracked and synchronized to IBM OpenPages on an x86 platform</caption>
</figure>
<list_item><location><page_22><loc_22><loc_88><loc_86><loc_91></location>5. Create an external model by using IBM OpenScale on the x86 platform, as shown in Figure 16.</list_item>
<caption><location><page_22><loc_11><loc_50><loc_48><loc_52></location>Figure 16 Creating an external model on an x86 platform</caption>
<figure>
<location><page_22><loc_10><loc_52><loc_89><loc_86></location>
<caption>Figure 16 Creating an external model on an x86 platform</caption>
</figure>
<text><location><page_22><loc_22><loc_43><loc_89><loc_49></location>IBM OpenScale provides a comprehensive dashboard that tracks fairness, quality monitoring, drift, and explainability of a model. Fairness determines whether your model produces biased outcomes. Quality determines how well your model predicts outcomes. Drift is the degradation of predictive performance over time. A sample is shown in Figure 17 on page 21.</text>
<caption><location><page_23><loc_11><loc_54><loc_63><loc_55></location>Figure 17 IBM OpenScale dashboard that is used to monitor the external model</caption>
<figure>
<location><page_23><loc_10><loc_56><loc_89><loc_90></location>
<caption>Figure 17 IBM OpenScale dashboard that is used to monitor the external model</caption>
</figure>
<text><location><page_23><loc_22><loc_45><loc_89><loc_53></location>You developed and deployed the AI model by using Watson Studio, WML on CP4D on Red Hat OpenShift on a virtual machine on IBM Z, and end-to-end AI model governance by leveraging AI Factsheets, OpenScale, and OpenPages on CP4D on an x86 platform. Figure 18 shows end-to-end AI governance when using IBM OpenPages, AI Factsheets, and OpenScale.</text>
<caption><location><page_23><loc_11><loc_7><loc_83><loc_8></location>Figure 18 Final result: End-to-end AI governance when using IBM OpenPages, AI Factsheets, and OpenScale</caption>
<figure>
<location><page_23><loc_10><loc_9><loc_90><loc_44></location>
<caption>Figure 18 Final result: End-to-end AI governance when using IBM OpenPages, AI Factsheets, and OpenScale</caption>
</figure>
<section_header><location><page_24><loc_11><loc_89><loc_64><loc_91></location>Use case 2: Credit default risk assessment</section_header>
<text><location><page_24><loc_22><loc_83><loc_89><loc_87></location>In today's world, many individuals or businesses seeking loans to meet their growing business needs often look to financial institutions. Financial institutions can offer loans to individuals or businesses and charge interest based on the current market situations.</text>
@ -242,6 +260,7 @@
<caption><location><page_25><loc_10><loc_55><loc_65><loc_57></location>Figure 19 Architecture for credit risk prediction by using an ML AI model on IBM Z</caption>
<figure>
<location><page_25><loc_11><loc_57><loc_89><loc_90></location>
<caption>Figure 19 Architecture for credit risk prediction by using an ML AI model on IBM Z</caption>
</figure>
<text><location><page_25><loc_22><loc_48><loc_89><loc_54></location>A data scientist can leverage Watson Studio to develop and train an AI model and WML to deploy and score the model. In this sample architecture, the WML Python run time leverages the ML framework, IBM Snap Machine Learning (Snap ML), for scoring, which can leverage an integrated AI accelerator at the time of model import.</text>
<text><location><page_25><loc_22><loc_39><loc_89><loc_47></location>Then, the banking loan approval team can send a loan applicant request to the IBM WebSphere Application Server, which can make a request to the AI inference endpoint. The AI inference engine scores the transaction and sends the result back to the loan approval team. Based on the results, the approval team can decide on whether to approve a loan or not, and also decide how much they can lend, timelines, and other factors.</text>
@ -252,6 +271,7 @@
<caption><location><page_26><loc_11><loc_53><loc_56><loc_54></location>Figure 20 Architecture for credit risk prediction by using DL on IBM Z</caption>
<figure>
<location><page_26><loc_11><loc_55><loc_89><loc_88></location>
<caption>Figure 20 Architecture for credit risk prediction by using DL on IBM Z</caption>
</figure>
<text><location><page_26><loc_22><loc_46><loc_87><loc_52></location>Data scientists can start creating and training a DL AI model by using a Jupyter Notebook instance and Watson Studio. Then, they can deploy the model by using WML on CP4D running on IBM Z, which provides an endpoint. Other applications, including the IBM WebSphere server, can produce credit risk results by using the model's endpoint.</text>
<text><location><page_26><loc_22><loc_42><loc_89><loc_44></location>In summary, here are some considerations for developing real-time AI models, such as credit risk assessment:</text>
@ -273,6 +293,7 @@
<caption><location><page_28><loc_10><loc_59><loc_75><loc_60></location>Figure 21 Clearing and settlement use case for financial transactions by using Cloud Pak for Data</caption>
<figure>
<location><page_28><loc_10><loc_61><loc_89><loc_86></location>
<caption>Figure 21 Clearing and settlement use case for financial transactions by using Cloud Pak for Data</caption>
</figure>
<text><location><page_28><loc_22><loc_56><loc_58><loc_57></location>Here are the steps of the high-level process flow:</text>
<list_item><location><page_28><loc_22><loc_53><loc_86><loc_55></location>1. Create a connection to a database (for example, an IBM Db2® database) where the historical data will be used for ML model building.</list_item>
@ -304,6 +325,7 @@
<caption><location><page_29><loc_11><loc_20><loc_40><loc_22></location>Figure 22 Inferencing architecture on IBM Z</caption>
<figure>
<location><page_29><loc_10><loc_22><loc_88><loc_52></location>
<caption>Figure 22 Inferencing architecture on IBM Z</caption>
</figure>
<text><location><page_29><loc_22><loc_8><loc_89><loc_19></location>Because we are looking into data-driven model development, the data set of our target is the run-to-failure data of the engine. We are looking into a supervised learning problem, and we use regression techniques to learn from the data. DL techniques such as Long Short-Term Memory (LSTM) or Gated Recurrent Units (GRU) are our choice because we are looking into a time series data set. TensorFlow or PyTorch frameworks are leveraged to create models. AI governance monitors the data and model drift to maintain the model quality throughout the model's life.</text>
<text><location><page_30><loc_22><loc_78><loc_89><loc_91></location>Open-source data from NASA was used to build the AI model, which then was deployed on CP4D. CP4D enables the data-scientist's journey from modeling to deployment in a seamless process. Data engineers leverage Db2 to host the data set, which includes the training, testing, and validation data sets. Since data is hosted on Db2, you can expect low latency while retrieving the data and meet data security needs because Db2 is hosted on the IBM Z platform. Data is fetched by the data refinery to do the necessary pre-processing and data imputations. You can use the programming languages Golang or C++ for real-time predictions, depending on customer needs. For more information about this topic, see "Use case 3: Clearing and settlement" on page 25.</text>
@ -317,6 +339,7 @@
<caption><location><page_31><loc_11><loc_43><loc_35><loc_44></location>Figure 23 In-depth architectural view</caption>
<figure>
<location><page_31><loc_10><loc_45><loc_90><loc_90></location>
<caption>Figure 23 In-depth architectural view</caption>
</figure>
<text><location><page_31><loc_22><loc_39><loc_82><loc_41></location>In summary, consider the following points while developing an AI-based predictive maintenance application:</text>
<list_item><location><page_31><loc_22><loc_33><loc_89><loc_38></location>CP4D offers a Python run time to build a custom solution stack, but also supports different components like Watson Studio, WML, Db2, Data Refinery, OpenScale, AI Factsheets, and OpenPages.</list_item>
@ -341,6 +364,7 @@
<caption><location><page_33><loc_11><loc_47><loc_46><loc_48></location>Figure 24 Architecture for AI-powered video analytics</caption>
<figure>
<location><page_33><loc_10><loc_48><loc_89><loc_79></location>
<caption>Figure 24 Architecture for AI-powered video analytics</caption>
</figure>
<text><location><page_33><loc_22><loc_35><loc_89><loc_45></location>Live camera feeds or recorded videos of an infant's movement are the inputs for a pose detection model. This video streaming data was stored in IBM Cloud® Object Storage for image processing. Video data must be transformed into frames so that the infant's body poses can be detected. The pose-estimation components of the pipeline predict the location of all 17 person key points with 3 degrees of freedom each (x, y location and visibility) plus two virtual alignment key points. This approach also embraces a compute-intensive heat map prediction of infant body posture.</text>
<text><location><page_33><loc_22><loc_24><loc_88><loc_33></location>When changes in body posture or movement happen, analytics can be performed, and a threshold can be set for the angle of the body and posture movements. An analysis can be performed on movement that is based on that threshold to help to predict an infant's health index in the output video stream by leveraging the IBM z16 on-chip AI acceleration, which provides an execution speed in real time on an edge device, which cannot be achieved by other means.</text>

File diff suppressed because one or more lines are too long

View File

@ -56,6 +56,7 @@ Figure 1 on page 3 shows a picture of the IBM z16 mainframe.
Figure 1 IBM z16
<!-- image -->
## IBM z16 and IBM LinuxONE Emperor 4 features
@ -66,6 +67,7 @@ Figure 2 provides a snapshot of the IBM Z processor roadmap, which depicts the j
Figure 2 IBM Z: Processor roadmap
<!-- image -->
The IBM z16 and IBM LinuxONE Emperor 4 are the latest of the IBM Z, and they are developed with a 'built to build' focus to provide a powerful, cyberresilient, open, and secure platform for business with an extra focus on sustainability to help build sustainable data centers. Although the z16 server can host both IBM z/OS® and Linux workloads, LinuxONE Emperor 4 is built to host Linux-only workloads with a focus on consolidation and resiliency. Depending on the workload, consolidation from numerous x86 servers into a LinuxONE Emperor 4 can help reduce energy consumption by 75% and data center floor space by 50%, which helps to achieve the sustainability goals of the organization.
@ -74,6 +76,7 @@ Figure 3 on page 5 shows a summary of the system design of IBM LinuxONE Emperor
Figure 3 System design of IBM z16 LinuxONE Emperor 4
<!-- image -->
The IBM z16 and IBM LinuxONE Emperor 4 servers are built with 7-nm technology at a 5.2 GHz speed. They consist of four dual-chip modules (DCMs) per central processor complex (CPC) drawer, each of which is built with two 8-core Telum processor chips that have "first in the industry" on-chip acceleration for mid-transaction, real-time AI inferencing, which supports many different use cases, including fraud detection.
@ -84,6 +87,7 @@ Figure 4 provides more information about the features of AI Accelerator integrat
Figure 4 IBM z16 on-chip AI Accelerator integration with IBM Z processor cores
<!-- image -->
The IBM z16 and IBM LinuxONE Emperor 4 server platforms are built with the hardware features that are shown in Figure 4 with data and AI workloads in mind. Regardless of where the ML and deep learning (DL) frameworks are used to build and train data and AI models, the inferencing on existing enterprise application data can happen alongside currently running enterprise business applications. CP4D 4.6 supports TensorFlow and IBM Snap ML frameworks, which are optimized to use the on-chip AI Accelerator during inferencing. Support for various other frameworks is planned for future releases.
@ -92,6 +96,7 @@ Figure 5 on page 7 shows the seamless integration of AI into existing enterprise
Figure 5 Seamless integration
<!-- image -->
## What is Cloud Pak for Data on IBM Z
@ -104,6 +109,7 @@ Figure 6 shows a solution overview of CP4D. The infrastructure alternatives are
Figure 6 Solution overview of Cloud Pak for Data
<!-- image -->
We highlight the four main pillars that make IBM Z the correct infrastructure for CP4D:
@ -166,6 +172,7 @@ Figure 7 on page 11 provides an overview of the components that are supported on
Figure 7 Developing, training, and deploying an AI model on Cloud Pak for Data on IBM Z and IBM LinuxONE
<!-- image -->
In summary, here are some of the reasons why you should choose AI on IBM Z:
@ -262,6 +269,7 @@ For example, a business can start testing a model before production for fairness
Figure 8 Typical AI model lifecycle
<!-- image -->
Due to regulations, more stakeholders adopt the typical AI model lifecycle to protect their brand from new end-to-end risks. To ensure various aspects of both regulatory compliance and security, the personas that must be involved include the chief financial officer (CFO), chief marketing officer (CMO), chief data officer (CDO), HR, and chief regulatory officer (CRO), along with the data engineers, data scientists, and business analysts, who build AI workflows.
@ -316,6 +324,7 @@ Figure 9 on page 16 shows the end-to-end flow for a remote AI governance solutio
Figure 9 Remote AI governance solution end-to-end flow
<!-- image -->
To achieve end-to-end AI governance, complete the following steps:
@ -324,54 +333,63 @@ To achieve end-to-end AI governance, complete the following steps:
Figure 10 Creating a model entry in IBM OpenPages
<!-- image -->
2. Train a model by using Watson Studio and by using development tools such as Jupyter Notebook or JupyterLab on CP4D on Red Hat OpenShift on a virtual machine on IBM Z, as shown in Figure 11.
Figure 11 Training an AI model by using Watson Studio
<!-- image -->
3. Deploy the model by using WML on CP4D on Red Hat OpenShift on a virtual machine on IBM Z, as shown in Figure 12.
Figure 12 Deploying an AI model by using WML on Cloud Pak for Data
<!-- image -->
4. Track the external model lifecycle by browsing through the Catalogs/Platform assets catalog by using AI Factsheets and OpenPages while using CP4D on an x86 platform, as shown in Figure 13. The external model (deployed on CP4D on Red Hat OpenShift on a virtual machine on IBM Z) is saved as a platform asset catalog on the x86 platform.
Figure 13 External model
<!-- image -->
You can track the model through each stage of the model lifecycle, as shown in Figure 14, by using AI Factsheets and OpenPages.
Figure 14 Tracking the model
<!-- image -->
You can see that the model facts are tracked and synchronized to IBM OpenPages for risk management, as shown in Figure 15.
Figure 15 Model facts that are tracked and synchronized to IBM OpenPages on an x86 platform
<!-- image -->
5. Create an external model by using IBM OpenScale on the x86 platform, as shown in Figure 16.
Figure 16 Creating an external model on an x86 platform
<!-- image -->
IBM OpenScale provides a comprehensive dashboard that tracks fairness, quality monitoring, drift, and explainability of a model. Fairness determines whether your model produces biased outcomes. Quality determines how well your model predicts outcomes. Drift is the degradation of predictive performance over time. A sample is shown in Figure 17 on page 21.
Figure 17 IBM OpenScale dashboard that is used to monitor the external model
<!-- image -->
You developed and deployed the AI model by using Watson Studio, WML on CP4D on Red Hat OpenShift on a virtual machine on IBM Z, and end-to-end AI model governance by leveraging AI Factsheets, OpenScale, and OpenPages on CP4D on an x86 platform. Figure 18 shows end-to-end AI governance when using IBM OpenPages, AI Factsheets, and OpenScale.
Figure 18 Final result: End-to-end AI governance when using IBM OpenPages, AI Factsheets, and OpenScale
<!-- image -->
## Use case 2: Credit default risk assessment
@ -392,6 +410,7 @@ Figure 19 on page 23 shows a sample architecture about how to design and develop
Figure 19 Architecture for credit risk prediction by using an ML AI model on IBM Z
<!-- image -->
A data scientist can leverage Watson Studio to develop and train an AI model and WML to deploy and score the model. In this sample architecture, the WML Python run time leverages the ML framework, IBM Snap Machine Learning (Snap ML), for scoring, which can leverage an integrated AI accelerator at the time of model import.
@ -408,6 +427,7 @@ Figure 20 shows an architecture for predicting credit risk by using DL on IBM Z.
Figure 20 Architecture for credit risk prediction by using DL on IBM Z
<!-- image -->
Data scientists can start creating and training a DL AI model by using a Jupyter Notebook instance and Watson Studio. Then, they can deploy the model by using WML on CP4D running on IBM Z, which provides an endpoint. Other applications, including the IBM WebSphere server, can produce credit risk results by using the model's endpoint.
@ -446,6 +466,7 @@ Figure 21 provides a high-level diagram of a clearing and settlement use case fo
Figure 21 Clearing and settlement use case for financial transactions by using Cloud Pak for Data
<!-- image -->
Here are the steps of the high-level process flow:
@ -504,6 +525,7 @@ Figure 22 provides an overview of the inferencing architecture for the RUL of an
Figure 22 Inferencing architecture on IBM Z
<!-- image -->
Because we are looking into data-driven model development, the data set of our target is the run-to-failure data of the engine. We are looking into a supervised learning problem, and we use regression techniques to learn from the data. DL techniques such as Long Short-Term Memory (LSTM) or Gated Recurrent Units (GRU) are our choice because we are looking into a time series data set. TensorFlow or PyTorch frameworks are leveraged to create models. AI governance monitors the data and model drift to maintain the model quality throughout the model's life.
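For concreteness, a minimal sketch of the kind of LSTM regressor described above, assuming TensorFlow/Keras and windows of 30 time steps by 14 sensor channels (both shapes are illustrative, not taken from the NASA data set):

```python
import tensorflow as tf

# Windowed run-to-failure sensor data: (batch, 30 time steps, 14 sensor channels).
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(30, 14)),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(1),  # predicted remaining useful life (regression)
])
model.compile(optimizer="adam", loss="mse")
```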
@ -526,6 +548,7 @@ Figure 23 on page 29 provides a more in-depth view of the architecture of an AI-
Figure 23 In-depth architectural view
<!-- image -->
In summary, consider the following points while developing an AI-based predictive maintenance application:
@ -570,6 +593,7 @@ Figure 24 shows an architectural diagram about how to design and develop an AI m
Figure 24 Architecture for AI-powered video analytics
<!-- image -->
Live camera feeds or recorded videos of an infant's movement are the inputs for a pose detection model. This video streaming data was stored in IBM Cloud® Object Storage for image processing. Video data must be transformed into frames so that the infant's body poses can be detected. The pose-estimation components of the pipeline predict the location of all 17 person key points with 3 degrees of freedom each (x, y location and visibility) plus two virtual alignment key points. This approach also embraces a compute-intensive heat map prediction of infant body posture.

View File

@ -8,8 +8,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
GENERATE_V1 = False
GENERATE_V2 = False
GENERATE_V1 = True
GENERATE_V2 = True
def get_pdf_paths():
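With both flags set to True, the verify helpers regenerate the stored ground truth instead of asserting against it. A minimal sketch of that gating pattern, with an assumed helper name and signature (this is not the actual verify_utils code):

```python
from pathlib import Path


def verify_or_generate(gt_file: Path, predicted: str, generate: bool) -> None:
    if generate:
        # Regenerate mode: overwrite the stored ground truth with the new output.
        gt_file.write_text(predicted)
        return
    # Verify mode: the prediction must match the stored ground truth exactly.
    assert gt_file.read_text() == predicted, f"{gt_file} does not match prediction"
```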

View File

@ -29,7 +29,7 @@ def save_output(pdf_path: Path, doc_result: ConversionResult, engine: str):
dict_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.json")
with open(dict_fn, "w") as fd:
json.dump(doc_result.render_as_dict(), fd)
json.dump(doc_result.legacy_document.export_to_dict(), fd)
pages_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.pages.json")
pages = [p.model_dump() for p in doc_result.pages]
@ -38,11 +38,11 @@ def save_output(pdf_path: Path, doc_result: ConversionResult, engine: str):
doctags_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.doctags.txt")
with open(doctags_fn, "w") as fd:
fd.write(doc_result.render_as_doctags())
fd.write(doc_result.legacy_document.export_to_doctags())
md_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.md")
with open(md_fn, "w") as fd:
fd.write(doc_result.render_as_markdown())
fd.write(doc_result.legacy_document.export_to_markdown())
def get_pdf_paths():
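The deprecated render_as_* helpers on ConversionResult map one-to-one onto exports of the legacy document. A short sketch of the replacement calls, using only method names that appear in this commit (the sample path is illustrative):

```python
from pathlib import Path

from docling.document_converter import DocumentConverter

doc_result = DocumentConverter().convert(Path("tests/data/2206.01062.pdf"))

as_dict = doc_result.legacy_document.export_to_dict()  # was render_as_dict()
as_md = doc_result.legacy_document.export_to_markdown()  # was render_as_markdown()
as_doctags = doc_result.legacy_document.export_to_doctags()  # was render_as_doctags()
```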

View File

@ -10,6 +10,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
GENERATE = True
def get_pdf_path():
@ -42,8 +44,12 @@ def test_convert_path(converter: DocumentConverter):
print(f"converting {pdf_path}")
doc_result = converter.convert(pdf_path)
verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)
verify_conversion_result_v1(
input_path=pdf_path, doc_result=doc_result, generate=GENERATE
)
verify_conversion_result_v2(
input_path=pdf_path, doc_result=doc_result, generate=GENERATE
)
def test_convert_stream(converter: DocumentConverter):
@ -55,5 +61,9 @@ def test_convert_stream(converter: DocumentConverter):
stream = DocumentStream(name=pdf_path.name, stream=buf)
doc_result = converter.convert(stream)
verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)
verify_conversion_result_v1(
input_path=pdf_path, doc_result=doc_result, generate=GENERATE
)
verify_conversion_result_v2(
input_path=pdf_path, doc_result=doc_result, generate=GENERATE
)
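The stream path mirrors the file path: wrap the raw bytes in a DocumentStream and pass it to the converter. A minimal sketch, assuming DocumentStream is importable from docling.datamodel.base_models and the test PDF exists locally:

```python
from io import BytesIO
from pathlib import Path

from docling.datamodel.base_models import DocumentStream
from docling.document_converter import DocumentConverter

pdf_path = Path("tests/data/2206.01062.pdf")
buf = BytesIO(pdf_path.read_bytes())  # hold the raw PDF bytes in memory

stream = DocumentStream(name=pdf_path.name, stream=buf)
doc_result = DocumentConverter().convert(stream)
print(doc_result.document.export_to_markdown()[:200])
```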

View File

@ -0,0 +1,53 @@
import json
from pathlib import Path
import pytest
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
@pytest.fixture
def test_doc_paths():
return [
Path("tests/data/wiki_duck.html"),
Path("tests/data/word_sample.docx"),
Path("tests/data/lorem_ipsum.docx"),
Path("tests/data/powerpoint_sample.pptx"),
Path("tests/data/2305.03393v1-pg9-img.png"),
Path("tests/data/2206.01062.pdf"),
]
def get_converter():
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False
converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
}
)
return converter
def test_compare_legacy_output(test_doc_paths):
converter = get_converter()
res = converter.convert_all(test_doc_paths, raises_on_error=True)
for conv_res in res:
print(f"Results for {conv_res.input.file}")
print(
json.dumps(
conv_res.legacy_document.model_dump(
mode="json", by_alias=True, exclude_none=True
)
)
)
# assert res.legacy_output == res.legacy_output_transformed
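For quick inspection outside the assertions, the same result object exposes both representations side by side. A small sketch reusing the fixture helpers defined above in this test module (the slice length is arbitrary):

```python
conv_res = get_converter().convert(Path("tests/data/2206.01062.pdf"))
print(conv_res.document.export_to_markdown()[:300])  # new DoclingDocument view
print(conv_res.legacy_document.export_to_markdown()[:300])  # deprecated legacy view
```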

View File

@ -131,6 +131,10 @@ def verify_tables_v1(doc_pred: DsDocument, doc_true: DsDocument, fuzzy: bool):
assert true_item.data is not None, "documents are expected to have table data"
assert pred_item.data is not None, "documents are expected to have table data"
print("True: \n", true_item.export_to_dataframe().to_markdown())
print("Pred: \n", true_item.export_to_dataframe().to_markdown())
for i, row in enumerate(true_item.data):
for j, col in enumerate(true_item.data[i]):
@ -175,6 +179,10 @@ def verify_tables_v2(doc_pred: DoclingDocument, doc_true: DoclingDocument, fuzzy
assert true_item.data is not None, "documents are expected to have table data"
assert pred_item.data is not None, "documents are expected to have table data"
print("True: \n", true_item.export_to_dataframe().to_markdown())
print("Pred: \n", true_item.export_to_dataframe().to_markdown())
for i, row in enumerate(true_item.data.grid):
for j, col in enumerate(true_item.data.grid[i]):
@ -234,11 +242,11 @@ def verify_conversion_result_v1(
), f"Doc {input_path} did not convert successfully."
doc_pred_pages: List[Page] = doc_result.pages
doc_pred: DsDocument = doc_result.legacy_output
doc_pred: DsDocument = doc_result.legacy_document
with warnings.catch_warnings():
warnings.simplefilter("ignore", DeprecationWarning)
doc_pred_md = doc_result.render_as_markdown()
doc_pred_dt = doc_result.render_as_doctags()
doc_pred_md = doc_result.legacy_document.export_to_markdown()
doc_pred_dt = doc_result.legacy_document.export_to_document_tokens()
engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"
gt_subpath = input_path.parent / "groundtruth" / "docling_v1" / input_path.name
@ -308,9 +316,9 @@ def verify_conversion_result_v2(
), f"Doc {input_path} did not convert successfully."
doc_pred_pages: List[Page] = doc_result.pages
doc_pred: DoclingDocument = doc_result.output
doc_pred_md = doc_result.output.export_to_markdown()
doc_pred_dt = doc_result.output.export_to_document_tokens()
doc_pred: DoclingDocument = doc_result.document
doc_pred_md = doc_result.document.export_to_markdown()
doc_pred_dt = doc_result.document.export_to_document_tokens()
engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"
gt_subpath = input_path.parent / "groundtruth" / "docling_v2" / input_path.name