mirror of https://github.com/DS4SD/docling.git (synced 2025-07-26 20:14:47 +00:00)

update with improved docling-core

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

parent: ddb20be002
commit: 1571e1e17d

Changed file: poetry.lock (generated): 37 changed lines
@@ -957,13 +957,13 @@ files = [

 [[package]]
 name = "docling-core"
-version = "1.6.0"
+version = "1.6.2"
 description = "A python library to define and validate data types in Docling."
 optional = false
 python-versions = "<4.0,>=3.9"
 files = [
-    {file = "docling_core-1.6.0-py3-none-any.whl", hash = "sha256:a947a6585377ad9b74484adbe541383e0c3d55bf31176faef2fd72560fee24f0"},
-    {file = "docling_core-1.6.0.tar.gz", hash = "sha256:e3325c12948f5ef426b8862189690db833dd49b0b335ffa399fcf2b54fbf2b44"},
+    {file = "docling_core-1.6.2-py3-none-any.whl", hash = "sha256:1473ab13910d76552015c10fe351b90079a00c225f76ada3cd4fc7442183ffd0"},
+    {file = "docling_core-1.6.2.tar.gz", hash = "sha256:63f2b8a683dec56568ee1cd7d25cea419c0291211a88a11f74079ff2d62ccd5e"},
 ]

 [package.dependencies]
@@ -972,7 +972,6 @@ jsonref = ">=1.1.0,<2.0.0"
 jsonschema = ">=4.16.0,<5.0.0"
 pandas = ">=2.2.2,<3.0.0"
 pydantic = ">=2.6.0,<3.0.0"
-pyproject-toml = ">=0.0.10,<0.0.11"
 tabulate = ">=0.9.0,<0.10.0"

 [[package]]
@@ -4481,23 +4480,6 @@ files = [
 flake8 = "6.1.0"
 tomli = {version = "*", markers = "python_version < \"3.11\""}

-[[package]]
-name = "pyproject-toml"
-version = "0.0.10"
-description = "Project intend to implement PEP 517, 518, 621, 631 and so on."
-optional = false
-python-versions = "*"
-files = [
-    {file = "pyproject-toml-0.0.10.tar.gz", hash = "sha256:f0ce0e9934ecb00c0e529b4a1c380edd3034c4be65516769c5f080bdb23dfcb3"},
-    {file = "pyproject_toml-0.0.10-py3-none-any.whl", hash = "sha256:257a7070617e1a0bcfd8f790817b30bd9193876023a9b9e7a6b4fc976acf4c3e"},
-]
-
-[package.dependencies]
-jsonschema = "*"
-setuptools = ">=42"
-toml = "*"
-wheel = "*"
-
 [[package]]
 name = "pyreadline3"
 version = "3.5.3"
@@ -6256,17 +6238,6 @@ dev = ["tokenizers[testing]"]
 docs = ["setuptools-rust", "sphinx", "sphinx-rtd-theme"]
 testing = ["black (==22.3)", "datasets", "numpy", "pytest", "requests", "ruff"]

-[[package]]
-name = "toml"
-version = "0.10.2"
-description = "Python Library for Tom's Obvious, Minimal Language"
-optional = false
-python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*"
-files = [
-    {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"},
-    {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"},
-]
-
 [[package]]
 name = "tomli"
 version = "2.0.1"
@@ -7257,4 +7228,4 @@ examples = ["langchain-huggingface", "langchain-milvus", "langchain-text-splitte
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "ecdd1d482db8b0bf76fa72b791048197a745ccdbbe7fdaa5a6f1e40306115166"
+content-hash = "d6ede0493d8d2d0e250ba391d9ad32ced98541fbd4795b2b955d6f640736b3bc"
pyproject.toml

@@ -23,7 +23,7 @@ packages = [{include = "docling"}]
 [tool.poetry.dependencies]
 python = "^3.10"
 pydantic = "^2.0.0"
-docling-core = "^1.6.0"
+docling-core = "^1.6.2"
 docling-ibm-models = "^1.2.0"
 deepsearch-glm = "^0.21.1"
 filetype = "^1.2.0"
@ -14,18 +14,19 @@ The occurrence of tables in documents is ubiquitous. They often summarise quanti
Tables organize valuable content in a concise and compact representation. This content is extremely valuable for systems such as search engines, Knowledge Graph's, etc, since they enhance their predictive capabilities. Unfortunately, tables come in a large variety of shapes and sizes. Furthermore, they can have complex column/row-header configurations, multiline rows, different variety of separation lines, missing entries, etc. As such, the correct identification of the table-structure from an image is a nontrivial task. In this paper, we present a new table-structure identification model. The latter improves the latest end-toend deep learning model (i.e. encoder-dual-decoder from PubTabNet) in two significant ways. First, we introduce a new object detection decoder for table-cells. In this way, we can obtain the content of the table-cells from programmatic PDF's directly from the PDF source and avoid the training of the custom OCR decoders. This architectural change leads to more accurate table-content extraction and allows us to tackle non-english tables. Second, we replace the LSTM decoders with transformer based decoders. This upgrade improves significantly the previous state-of-the-art tree-editing-distance-score (TEDS) from 91% to 98.5% on simple tables and from 88.7% to 95% on complex tables.
| | 3 | 1 |
|----|-----|-----|
| 2 | | |

b. Red-annotation of bounding boxes, Blue-predictions by TableFormer

<!-- image -->

c. Structure predicted by TableFormer:

Figure 1: Picture of a table with subtle, complex features such as (1) multi-column headers, (2) cell with multi-row text and (3) cells with no content. Image from PubTabNet evaluation set, filename: 'PMC2944238 004 02'.

| 0 | 1 | 1 | 2 1 | 2 1 | |
|-----|-----|-----|-------|-------|----|
| 3 | 4 | 5 3 | 6 | 7 | |
@ -78,9 +79,7 @@ Hybrid Deep Learning-Rule-Based approach : A popular current model for table-str
We rely on large-scale datasets such as PubTabNet [37], FinTabNet [36], and TableBank [17] datasets to train and evaluate our models. These datasets span over various appearance styles and content. We also introduce our own synthetically generated SynthTabNet dataset to fix an im-
Figure 2: Distribution of the tables across different table dimensions in PubTabNet + FinTabNet datasets

<!-- image -->

balance in the previous datasets.
@ -99,7 +98,6 @@ Motivated by those observations we aimed at generating a synthetic table dataset
In this regard, we have prepared four synthetic datasets, each one containing 150k examples. The corpora to generate the table text consists of the most frequent terms appearing in PubTabNet and FinTabNet together with randomly generated text. The first two synthetic datasets have been fine-tuned to mimic the appearance of the original datasets but encompass more complicated table structures. The third
Table 1: Both "Combined-Tabnet" and "CombinedTabnet" are variations of the following: (*) The CombinedTabnet dataset is the processed combination of PubTabNet and Fintabnet. (**) The combined dataset is the processed combination of PubTabNet, Fintabnet and TableBank.
| | Tags | Bbox | Size | Format |
|--------------------|--------|--------|--------|----------|
| PubTabNet | 3 | 3 | 509k | PNG |
@ -124,14 +122,10 @@ We now describe in detail the proposed method, which is composed of three main c
CNN Backbone Network. A ResNet-18 CNN is the backbone that receives the table image and encodes it as a vector of predefined length. The network has been modified by removing the linear and pooling layer, as we are not per-
Figure 3: TableFormer takes in an image of the PDF and creates bounding box and HTML structure predictions that are synchronized. The bounding boxes grabs the content from the PDF and inserts it in the structure.
<!-- image -->
Figure 4: Given an input image of a table, the Encoder produces fixed-length features that represent the input image. The features are then passed to both the Structure Decoder and Cell BBox Decoder . During training, the Structure Decoder receives 'tokenized tags' of the HTML code that represent the table structure. Afterwards, a transformer encoder and decoder architecture is employed to produce features that are received by a linear layer, and the Cell BBox Decoder. The linear layer is applied to the features to predict the tags. Simultaneously, the Cell BBox Decoder selects features referring to the data cells (' < td > ', ' < ') and passes them through an attention network, an MLP, and a linear layer to predict the bounding boxes.
<!-- image -->
forming classification, and adding an adaptive pooling layer of size 28*28. ResNet by default downsamples the image resolution by 32 and then the encoded image is provided to both the Structure Decoder , and Cell BBox Decoder .
@ -186,7 +180,6 @@ where T$_{a}$ and T$_{b}$ represent tables in tree structure HTML format. EditDi
Structure. As shown in Tab. 2, TableFormer outperforms all SOTA methods across different datasets by a large margin for predicting the table structure from an image. All the more, our model outperforms pre-trained methods. During the evaluation we do not apply any table filtering. We also provide our baseline results on the SynthTabNet dataset. It has been observed that large tables (e.g. tables that occupy half of the page or more) yield poor predictions. We attribute this issue to the image resizing during the preprocessing step, that produces downsampled images with indistinguishable features. This problem can be addressed by treating such big tables with a separate model which accepts a large input image size.
Table 2: Structure results on PubTabNet (PTN), FinTabNet (FTN), TableBank (TB) and SynthTabNet (STN).

| Model | Dataset | Simple | TEDS Complex | All |
|-------------|-----------|----------|----------------|-------|
| EDD | PTN | 91.1 | 88.7 | 89.9 |
@ -207,7 +200,6 @@ Cell Detection. Like any object detector, our Cell BBox Detector provides boundi
our Cell BBox Decoder accuracy for cells with a class label of 'content' only using the PASCAL VOC mAP metric for pre-processing and post-processing. Note that we do not have post-processing results for SynthTabNet as images are only provided. To compare the performance of our proposed approach, we've integrated TableFormer's Cell BBox Decoder into EDD architecture. As mentioned previously, the Structure Decoder provides the Cell BBox Decoder with the features needed to predict the bounding box predictions. Therefore, the accuracy of the Structure Decoder directly influences the accuracy of the Cell BBox Decoder . If the Structure Decoder predicts an extra column, this will result in an extra column of predicted bounding boxes.
Table 3: Cell Bounding Box detection results on PubTabNet, and FinTabNet. PP: Post-processing.

| Model | Dataset | mAP | mAP (PP) |
|-------------|-------------|-------|------------|
| EDD+BBox | PubTabNet | 79.2 | 82.7 |
@ -217,7 +209,6 @@ Table 3: Cell Bounding Box detection results on PubTabNet, and FinTabNet. PP: Po
Cell Content. In this section, we evaluate the entire pipeline of recovering a table with content. Here we put our approach to test by capitalizing on extracting content from the PDF cells rather than decoding from images. Tab. 4 shows the TEDs score of HTML code representing the structure of the table along with the content inserted in the data cell and compared with the ground-truth. Our method achieved a 5.3% increase over the state-of-the-art, and commercial solutions. We believe our scores would be higher if the HTML ground-truth matched the extracted PDF cell content. Unfortunately, there are small discrepancies such as spacings around words or special characters with various unicode representations.
Table 4: Results of structure with content retrieved using cell detection on PubTabNet. In all cases the input is PDF documents with cropped tables.

| Model | Simple | TEDS Complex | All |
|-------------|----------|----------------|-------|
| Tabula | 78 | 57.8 | 67.9 |
@ -233,12 +224,15 @@ Japanese language (previously unseen by TableFormer):
Example table from FinTabNet:

<!-- image -->

<!-- image -->

b. Structure predicted by TableFormer, with superimposed matched PDF cell text:
| | | 論文ファイル | 論文ファイル | 参考文献 | 参考文献 |
|----------------------------------------------------|-------------|----------------|----------------|------------|------------|
| 出典 | ファイル 数 | 英語 | 日本語 | 英語 | 日本語 |
@ -252,7 +246,6 @@ b. Structure predicted by TableFormer, with superimposed matched PDF cell text:
| | 945 | 294 | 651 | 1122 | 955 |

Text is aligned to match original for ease of viewing
| | Shares (in millions) | Shares (in millions) | Weighted Average Grant Date Fair Value | Weighted Average Grant Date Fair Value |
|--------------------------|------------------------|------------------------|------------------------------------------|------------------------------------------|
| | RS U s | PSUs | RSUs | PSUs |
@ -263,16 +256,13 @@ Text is aligned to match original for ease of viewing
| Nonvested on December 31 | 1.0 | 0.3 | 104.85 $ | $ 104.51 |
Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.
<!-- image -->

<!-- image -->
Figure 6: An example of TableFormer predictions (bounding boxes and structure) from generated SynthTabNet table.

<!-- image -->
## 5.5. Qualitative Analysis
@ -403,9 +393,7 @@ The process of generating a synthetic dataset can be decomposed into the followi
Although TableFormer can predict the table structure and the bounding boxes for tables recognized inside PDF documents, this is not enough when a full reconstruction of the original table is required. This happens mainly due the following reasons:
Figure 7: Distribution of the tables across different dimensions per dataset. Simple vs complex tables per dataset and split, strict vs non strict html structures per dataset and table complexity, missing bboxes per dataset and table complexity.
<!-- image -->
· TableFormer output does not include the table cell content.
@ -458,47 +446,33 @@ Aditional images with examples of TableFormer predictions and post-processing ca
Figure 8: Example of a table with multi-line header.

Figure 9: Example of a table with big empty distance between cells.

<!-- image -->

Figure 10: Example of a complex table with empty cells.

<!-- image -->

<!-- image -->

Figure 11: Simple table with different style and empty cells.

<!-- image -->

Figure 12: Simple table predictions and post processing.

<!-- image -->

Figure 13: Table predictions example on colorful table.

Figure 14: Example with multi-line text.

<!-- image -->

Figure 16: Example of how post-processing helps to restore mis-aligned bounding boxes prediction artifact.

<!-- image -->

<!-- image -->

Figure 15: Example with triangular table.

<!-- image -->

Figure 17: Example of long table. End-to-end example from initial PDF cells to prediction of bounding boxes, post processing and prediction of structure.

<!-- image -->
@ -23,9 +23,7 @@ Permission to make digital or hard copies of part or all of this work for person
KDD '22, August 14-18, 2022, Washington, DC, USA © 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043
Figure 1: Four examples of complex page layouts across different document categories

<!-- image -->

## KEYWORDS
@ -72,9 +70,7 @@ DocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of hum
In addition to open intellectual property constraints for the source documents, we required that the documents in DocLayNet adhere to a few conditions. Firstly, we kept scanned documents
Figure 2: Distribution of DocLayNet pages across document categories.

<!-- image -->
to a minimum, since they introduce difficulties in annotation (see Section 4). As a second condition, we focussed on medium to large documents ( > 10 pages) with technical content, dense in complex tables, figures, plots and captions. Such documents carry a lot of information value, but are often hard to analyse with high accuracy due to their challenging layouts. Counterexamples of documents not included in the dataset are receipts, invoices, hand-written documents or photographs showing "text in the wild".
@ -95,7 +91,6 @@ Despite being cost-intense and far less scalable than automation, human annotati
The annotation campaign was carried out in four phases. In phase one, we identified and prepared the data sources for annotation. In phase two, we determined the class labels and how annotations should be done on the documents in order to obtain maximum consistency. The latter was guided by a detailed requirement analysis and exhaustive experiments. In phase three, we trained the annotation staff and performed exams for quality assurance. In phase four,
Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row "Total") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.
| | | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |
|----------------|---------|--------------|--------------|--------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|
| class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten |
@ -113,9 +108,7 @@ Table 1: DocLayNet dataset overview. Along with the frequency of each class labe
| Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 |
Figure 3: Corpus Conversion Service annotation user interface. The PDF page is shown in the background, with overlaid text-cells (in darker shades). The annotation boxes can be drawn by dragging a rectangle over each segment with the respective label from the palette on the right.
<!-- image -->
we distributed the annotation workload and performed continuous quality controls. Phase one and two required a small team of experts only. For phases three and four, a group of 40 dedicated annotators were assembled and supervised.
@ -150,9 +143,7 @@ The complete annotation guideline is over 100 pages long and a detailed descript
Phase 3: Training. After a first trial with a small group of people, we realised that providing the annotation guideline and a set of random practice pages did not yield the desired quality level for layout annotation. Therefore we prepared a subset of pages with two different complexity levels, each with a practice and an exam part. 974 pages were reference-annotated by one proficient core team member. Annotation staff were then given the task to annotate the same subsets (blinded from the reference). By comparing the annotations of each staff member with the reference annotations, we could quantify how closely their annotations matched the reference. Only after passing two exam levels with high annotation quality, staff were admitted into the production phase. Practice iterations
Figure 4: Examples of plausible annotation alternatives for the same page. Criteria in our annotation guideline can resolve cases A to C, while the case D remains ambiguous.
<!-- image -->
were carried out over a timeframe of 12 weeks, after which 8 of the 40 initially allocated annotators did not pass the bar.
@ -160,6 +151,7 @@ Phase 4: Production annotation. The previously selected 80K pages were annotated
Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.
| | human | MRCNN | MRCNN | FRCNN | YOLO |
|----------------|---------|---------|---------|---------|--------|
| | human | R50 | R101 | R101 | v5x6 |
@ -183,9 +175,7 @@ to avoid this at any cost in order to have clear, unbiased baseline numbers for
The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. As such, we will relate to these object detection methods in this
Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNN network with ResNet50 backbone trained on increasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.
<!-- image -->
paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.
@ -196,7 +186,6 @@ In this section, we will present several aspects related to the performance of o
In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 × 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . This is not entirely surprising, as Text , Table and Picture are abundant and the most visually distinctive in a document.
Table 3: Performance of a Mask R-CNN R50 network in mAP@0.5-0.95 scores trained on DocLayNet with different class label sets. The reduced label sets were obtained by either down-mapping or dropping labels.
| Class-count | 11 | 6 | 5 | 4 |
|----------------|------|---------|---------|---------|
| Caption | 68 | Text | Text | Text |
@ -221,7 +210,6 @@ One of the fundamental questions related to any dataset is if it is "large enoug
The choice and number of labels can have a significant effect on the overall model performance. Since PubLayNet, DocBank and DocLayNet all have different label sets, it is of particular interest to understand and quantify this influence of the label set on the model performance. We investigate this by either down-mapping labels into more common ones (e.g. Caption → Text ) or excluding them from the annotations entirely. Furthermore, it must be stressed that all mappings and exclusions were performed on the data before model training. In Table 3, we present the mAP scores for a Mask R-CNN R50 network on different label sets. Where a label is down-mapped, we show its corresponding label, otherwise it was excluded. We present three different label sets, with 6, 5 and 4 different labels respectively. The set of 5 labels contains the same labels as PubLayNet. However, due to the different definition of
Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wise split for different label sets. Naive page-wise split will result in GLYPH<tildelow> 10% point improvement.
| Class-count | 11 | 11 | 5 | 5 |
|----------------|------|------|-----|------|
| Split | Doc | Page | Doc | Page |
@ -250,6 +238,7 @@ Throughout this paper, we claim that DocLayNet's wider variety of document layou
Table 5: Prediction Performance (mAP@0.5-0.95) of a Mask R-CNN R50 network across the PubLayNet, DocBank & DocLayNet data-sets. By evaluating on common label classes of each dataset, we observe that the DocLayNet-trained model has much less pronounced variations in performance across all datasets.
| | Testing on | Testing on | Testing on |
|------------|--------------|--------------|--------------|
| labels | PLN | DB | DLN |
@ -312,9 +301,7 @@ To date, there is still a significant gap between human and ML accuracy on the l
[13] Glenn Jocher, Alex Stoken, Ayush Chaurasia, Jirka Borovec, NanoCode012, TaoXie, Yonghye Kwon, Kalen Michael, Liu Changyu, Jiacong Fang, Abhiram V, Laughing, tkianai, yxNONG, Piotr Skalski, Adam Hogan, Jebastin Nadar, imyhxy, Lorenzo Mammana, Alex Wang, Cristi Fati, Diego Montes, Jan Hajek, Laurentiu
Figure 6: Example layout predictions on selected pages from the DocLayNet test-set. (A, D) exhibit favourable results on coloured backgrounds. (B, C) show accurate list-item and paragraph differentiation despite densely-spaced lines. (E) demonstrates good table and figure distinction. (F) shows predictions on a Chinese patent with multiple overlaps, label confusion and missing boxes.
<!-- image -->
Diaconu, Mai Thanh Minh, Marc, albinxavi, fatih, oleg, and wanghao yang. ultralytics/yolov5: v6.0 - yolov5n nano models, roboflow integration, tensorflow export, opencv dnn support, October 2021.
@ -5,7 +5,6 @@ order to compute the TED score. Inference timing results for all experiments wer
We have chosen the PubTabNet data set to perform HPO, since it includes a highly diverse set of tables. Also we report TED scores separately for simple and complex tables (tables with cell spans). Results are presented in Table. 1. It is evident that with OTSL, our model achieves the same TED score and slightly better mAP scores in comparison to HTML. However OTSL yields a 2x speed up in the inference runtime over HTML.
Table 1. HPO performed in OTSL and HTML representation on the same transformer-based TableFormer [9] architecture, trained only on PubTabNet [22]. Effects of reducing the # of layers in encoder and decoder stages of the model show that smaller models trained on OTSL perform better, especially in recognizing complex table structures, and maintain a much higher mAP score than the HTML counterpart.
| # | # | Language | TEDs | TEDs | TEDs | mAP | Inference |
|------------|------------|------------|-------------|-------------------|-------------|-------------|-------------|
| enc-layers | dec-layers | Language | simple | complex | all | (0.75) | time (secs) |
@ -16,8 +16,8 @@ In modern document understanding systems [1,15], table extraction is typically a
Fig. 1. Comparison between HTML and OTSL table structure representation: (A) table-example with complex row and column headers, including a 2D empty span, (B) minimal graphical representation of table structure using rectangular layout, (C) HTML representation, (D) OTSL representation. This example demonstrates many of the key-features of OTSL, namely its reduced vocabulary size (12 versus 5 in this case), its reduced sequence length (55 versus 30) and a enhanced internal structure (variable token sequence length per row in HTML versus a fixed length of rows in OTSL).
<!-- image -->

<!-- image -->
today, table detection in documents is a well understood problem, and the latest state-of-the-art (SOTA) object detection methods provide an accuracy comparable to human observers [7,8,10,14,23]. On the other hand, the problem of table structure recognition (TSR) is a lot more challenging and remains a very active area of research, in which many novel machine learning algorithms are being explored [3,4,5,9,11,12,13,14,17,18,21,22].
@ -46,9 +46,7 @@ All known Im2Seq based models for TSR fundamentally work in similar ways. Given
ulary and can be interpreted as a table structure. For example, with the HTML tokens <table> , </table> , <tr> , </tr> , <td> and </td> , one can construct simple table structures without any spanning cells. In reality though, one needs at least 28 HTML tokens to describe the most common complex tables observed in real-world documents [21,22], due to a variety of spanning cells definitions in the HTML token vocabulary.
Fig. 2. Frequency of tokens in HTML and OTSL as they appear in PubTabNet.

<!-- image -->
Obviously, HTML and other general-purpose markup languages were not designed for Im2Seq models. As such, they have some serious drawbacks. First, the token vocabulary needs to be artificially large in order to describe all plausible tabular structures. Since most Im2Seq models use an autoregressive approach, they generate the sequence token by token. Therefore, to reduce inference time, a shorter sequence length is critical. Every table-cell is represented by at least two tokens ( <td> and </td> ). Furthermore, when tokenizing the HTML structure, one needs to explicitly enumerate possible column-spans and row-spans as words. In practice, this ends up requiring 28 different HTML tokens (when including column- and row-spans up to 10 cells) just to describe every table in the PubTabNet dataset. Clearly, not every token is equally represented, as is depicted in Figure 2. This skewed distribution of tokens in combination with variable token row-length makes it challenging for models to learn the HTML structure.
@ -83,9 +81,7 @@ The OTSL vocabulary is comprised of the following tokens:
A notable attribute of OTSL is that it has the capability of achieving lossless conversion to HTML.
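To make the sequence-length argument concrete, a hedged sketch follows (the OTSL tokens C, L and NL reflect the vocabulary described in this paper, and the toy encodings are illustrative, not produced by the published converter): it compares structural-token counts for a two-column table whose first row is a single cell spanning both columns.

```python
# Hedged sketch: structural-token sequences for a 2-column table whose first row
# is one cell spanning both columns. The HTML tokenization is one plausible
# variant; the OTSL tokens (C = new cell, L = merged with the cell to the left,
# NL = end of row) follow the vocabulary described in the paper.
html_tokens = [
    "<tr>", '<td colspan="2">', "</td>", "</tr>",
    "<tr>", "<td>", "</td>", "<td>", "</td>", "</tr>",
]
otsl_tokens = [
    "C", "L", "NL",   # spanning header row: cell plus left-looking cell
    "C", "C", "NL",   # two ordinary data cells
]
print(len(html_tokens), len(otsl_tokens))  # 10 vs 6 structural tokens
```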
Fig. 3. OTSL description of table structure: A - table example; B - graphical representation of table structure; C - mapping structure on a grid; D - OTSL structure encoding; E - explanation on cell encoding

<!-- image -->
## 4.2 Language Syntax
@ -120,9 +116,7 @@ The design of OTSL allows to validate a table structure easily on an unfinished
To evaluate the impact of OTSL on prediction accuracy and inference times, we conducted a series of experiments based on the TableFormer model (Figure 4) with two objectives: Firstly we evaluate the prediction quality and performance of OTSL vs. HTML after performing Hyper Parameter Optimization (HPO) on the canonical PubTabNet data set. Secondly we pick the best hyper-parameters found in the first step and evaluate how OTSL impacts the performance of TableFormer after training on other publicly available data sets (FinTabNet, PubTables-1M [14]). The ground truth (GT) from all data sets has been converted into OTSL format for this purpose, and will be made publicly available.
Fig. 4. Architecture sketch of the TableFormer model, which is a representative for the Im2Seq approach.

<!-- image -->
We rely on standard metrics such as Tree Edit Distance score (TEDs) for table structure prediction, and Mean Average Precision (mAP) with 0.75 Intersection Over Union (IOU) threshold for the bounding-box predictions of table cells. The predicted OTSL structures were converted back to HTML format in
@ -133,7 +127,6 @@ order to compute the TED score. Inference timing results for all experiments wer
We have chosen the PubTabNet data set to perform HPO, since it includes a highly diverse set of tables. Also we report TED scores separately for simple and complex tables (tables with cell spans). Results are presented in Table. 1. It is evident that with OTSL, our model achieves the same TED score and slightly better mAP scores in comparison to HTML. However OTSL yields a 2x speed up in the inference runtime over HTML.
Table 1. HPO performed in OTSL and HTML representation on the same transformer-based TableFormer [9] architecture, trained only on PubTabNet [22]. Effects of reducing the # of layers in encoder and decoder stages of the model show that smaller models trained on OTSL perform better, especially in recognizing complex table structures, and maintain a much higher mAP score than the HTML counterpart.
| # | # | Language | TEDs | TEDs | TEDs | mAP | Inference |
|------------|------------|------------|-------------|-------------|-------------|-------------|-------------|
| enc-layers | dec-layers | Language | simple | complex | all | (0.75) | time (secs) |
@ -150,7 +143,6 @@ We picked the model parameter configuration that produced the best prediction qu
Additionally, the results show that OTSL has an advantage over HTML when applied on a bigger data set like PubTables-1M and achieves significantly improved scores. Finally, OTSL achieves faster inference due to fewer decoding steps which is a result of the reduced sequence representation.
Table 2. TSR and cell detection results compared between OTSL and HTML on the PubTabNet [22], FinTabNet [21] and PubTables-1M [14] data sets using TableFormer [9] (with enc=6, dec=6, heads=8).
| | Language | TEDs | TEDs | TEDs | mAP(0.75) | Inference time (secs) |
|--------------|------------|--------|---------|--------|-------------|-------------------------|
| | Language | simple | complex | all | mAP(0.75) | Inference time (secs) |
@ -166,18 +158,14 @@ Table 2. TSR and cell detection results compared between OTSL and HTML on the Pu
To illustrate the qualitative differences between OTSL and HTML, Figure 5 demonstrates less overlap and more accurate bounding boxes with OTSL. In Figure 6, OTSL proves to be more effective in handling tables with longer token sequences, resulting in even more precise structure prediction and bounding boxes.
Fig. 5. The OTSL model produces more accurate bounding boxes with less overlap (E) than the HTML model (D), when predicting the structure of a sparse table (A), at twice the inference speed because of shorter sequence length (B),(C). "PMC2807444_006_00.png" PubTabNet. μ
<!-- image -->

μ

≥
Fig. 6. Visualization of predicted structure and detected bounding boxes on a complex table with many rows. The OTSL model (B) captured repeating pattern of horizontally merged cells from the GT (A), unlike the HTML model (C). The HTML model also didn't complete the HTML sequence correctly and displayed a lot more of drift and overlap of bounding boxes. "PMC5406406_003_01.png" PubTabNet.
<!-- image -->
## 6 Conclusion
File diff suppressed because it is too large
@ -1,11 +1,14 @@
Front cover

<!-- image -->

## IBM Cloud Pak for Data on IBM Z

<!-- image -->

<!-- image -->
## Executive overview
@ -43,9 +46,7 @@ To learn more about these features, see the IBM z16 product page.
Figure 1 on page 3 shows a picture of the IBM z16 mainframe.

Figure 1 IBM z16

<!-- image -->
## IBM z16 and IBM LinuxONE Emperor 4 features
@ -54,18 +55,14 @@ IBM Z are based on enterprise mainframe technology. Starting with transaction-ba
Figure 2 provides a snapshot of the IBM Z processor roadmap, which depicts the journey of transformation and improvement.

Figure 2 IBM Z: Processor roadmap

<!-- image -->
The IBM z16 and IBM LinuxONE Emperor 4 are the latest of the IBM Z, and they are developed with a 'built to build' focus to provide a powerful, cyberresilient, open, and secure platform for business with an extra focus on sustainability to help build sustainable data centers. Although the z16 server can host both IBM z/OSfi and Linux workloads, LinuxONE Emperor 4 is built to host Linux only workloads with a focus on consolidation and resiliency. Depending on the workload, consolidation from numerous x86 servers into a LinuxONE Emperor 4 can help reduce energy consumption by 75% and data center floor space by 50%, which helps to achieve the sustainability goals of the organization.
Figure 3 on page 5 shows a summary of the system design of IBM LinuxONE Emperor 4 with the IBM Telum™ processor. The IBM Telum processor chip is designed to run enterprise applications efficiently where their data resides to embed AI with super low latency. The support for higher bandwidth and I/O rates is supported through FCP Express cards with an endpoint security solution. The memory subsystem supports up to 40 TB of memory.
Figure 3 System design of IBM z16 LinuxONE Emperor 4

<!-- image -->
The IBM z16 and IBM LinuxONE Emperor 4 servers are built with 7-nm technology at a 5.2 GHz speed. They consist of four dual-chip modules (DCMs) per central processor complex (CPC) drawer, each of which is built with two 8-core Telum processor chips that has "first in the industry" on-chip acceleration for mid-transaction, real-time AI inferencing, which supports many different use cases, including fraud detection.
@ -74,18 +71,14 @@ Each core has access to a huge private 32 MB L2 cache where up to 16 MB of the L
Figure 4 provides more information about the features of AI Accelerator integration with the IBM Z processor cores.

Figure 4 IBM z16 on-chip AI Accelerator integration with IBM Z processor cores

<!-- image -->
The IBM z16 and IBM LinuxONE Emperor 4 server platforms are built with the hardware features that are shown in Figure 4 with addressing data and AI workloads in mind. Regardless of where the ML and deep learning (DL) frameworks are used to build and train data and AI models, the inferencing on existing enterprise application data can happen along currently running enterprise business applications. CP4D 4.6 supports Tensorflow and IBM Snap ML frameworks, which are optimized to use the on-chip AI Accelerator during inferencing. Support for various other frameworks is planned for future releases.
Figure 5 on page 7 shows the seamless integration of AI into existing enterprises workloads on the IBM z16 while leveraging the underlying hardware capabilities.

Figure 5 Seamless integration

<!-- image -->
## What is Cloud Pak for Data on IBM Z
@ -96,9 +89,7 @@ CP4D on IBM Z provides enterprises with a resilient and secure private cloud pla
Figure 6 shows a solution overview of CP4D. The infrastructure alternatives are shown at the bottom, and they include IBM Z and LinuxONE. They all leverage Red Hat OpenShift. Common Foundational Services come next, which offer clarity throughout the data and AI lifecycle, that is, from user access management to monitoring and service provisioning. A high-level view of the services is shown in the middle section. The services have several different capabilities that span the AI hierarchy. The platform can be expanded, and it offers a seamless user experience for all distinct personas across the AI lifecycle, from data gathering through AI infusion.
Figure 6 Solution overview of Cloud Pak for Data

<!-- image -->
We highlight the four main pillars that make IBM Z the correct infrastructure for CP4D:
@ -159,9 +150,7 @@ Traditional ML models' power most of today's ML applications in business and amo
Figure 7 on page 11 provides an overview of the components that are supported on CP4D on IBM Z. You can leverage Watson Studio for model building, training, and validation, and WML for deployment of the model. Eventually, applications can use the AI inference endpoint to score the model.
Figure 7 Developing, training, and deploying an AI model on Cloud Pak for Data on IBM Z and IBM LinuxONE

<!-- image -->
In summary, here are some of the reasons why you should choose AI on IBM Z:
@ -254,9 +243,7 @@ The key point here is that risk exists throughout the entire AI lifecycle starti
For example, a business can start testing a model before production for fairness metrics. For this task, enterprises need an end-to-end workflow with approvals to mitigate these risks and increase the scale of AI investments, as shown in Figure 8, which presents a typical AI model lifecycle in an enterprise.
Figure 8 Typical AI model lifecycle

<!-- image -->
Due to regulations, more stakeholders adopt the typical AI model lifecycle to protect their brand from new end-to-end risks. To ensure various aspects of both regulatory compliance and security, the personas that must be involved include the chief financial officer (CFO), chief marketing officer (CMO), chief data officer (CDO), HR, and chief regulatory officer (CRO), along with the data engineers, data scientists, and business analysts, who build AI workflows.
@ -309,74 +296,54 @@ GLYPH<SM590000> Enterprises can develop AI models by creating and training model
Figure 9 on page 16 shows the end-to-end flow for a remote AI governance solution.

Figure 9 Remote AI governance solution end-to-end flow

<!-- image -->

To achieve end-to-end AI governance, complete the following steps:

1. Create a model entry in IBM OpenPages by using CP4D on a x86 platform, as shown in Figure 10.

Figure 10 Creating a model entry in IBM OpenPages

<!-- image -->

2. Train a model by using Watson Studio and by using development tools such as Jupyter Notebook or JupyterLab on CP4D on Red Hat OpenShift on a virtual machine on IBM Z, as shown in Figure 11.

Figure 11 Training an AI model by using Watson Studio

<!-- image -->

3. Deploy the model by using WML on CP4D on Red Hat OpenShift on a virtual machine on IBM Z, as shown in Figure 12.

Figure 12 Deploying an AI model by using WML on Cloud Pak for Data

<!-- image -->
4. Track the external model lifecycle by browsing through the Catalogs/Platform assets catalog by using AI Factsheets and OpenPages while using CP4D on an x86 platform, as shown in Figure 13. The external model (deployed on CP4D on Red Hat OpenShift on a virtual machine on IBM Z) is saved as a platform asset catalog on the x86 platform.
Figure 13 External model

<!-- image -->

You can track the model through each stage of the model lifecycle, as shown in Figure 14, by using AI Factsheets and OpenPages.

Figure 14 Tracking the model

<!-- image -->

You can see that the model facts are tracked and synchronized to IBM OpenPages for risk management, as shown in Figure 15.

Figure 15 Model facts that are tracked and synchronized to IBM OpenPages on an x86 platform

<!-- image -->

5. Create an external model by using IBM OpenScale on the x86 platform, as shown in Figure 16.

Figure 16 Creating an external model on an x86 platform

<!-- image -->
IBM OpenScale provides a comprehensive dashboard that tracks fairness, quality monitoring, drift, and explainability of a model. Fairness determines whether your model produces biased outcomes. Quality determines how well your model predicts outcomes. Drift is the degradation of predictive performance over time. A sample is shown in Figure 17 on page 21.
Figure 17 IBM OpenScale dashboard that is used to monitor the external model

<!-- image -->
You developed and deployed the AI model by using Watson Studio, WML on CP4D on Red Hat OpenShift on a virtual machine on IBM Z, and end-to-end AI model governance by leveraging AI Factsheets, OpenScale, and OpenPages on CP4D on a x86 platform. Figure 18 shows end-to-end AI governance when using IBM OpenPages, AI Factsheets, and OpenScale.
Figure 18 Final result: End-to-end AI governance when using IBM OpenPages, AI Factsheets, and OpenScale

<!-- image -->
## Use case 2: Credit default risk assessment
@ -395,9 +362,7 @@ Financial institutions can leverage AI solutions by using ML techniques to predi
Figure 19 on page 23 shows a sample architecture about how to design and develop an AI model for credit risk assessment on IBM Z. An IBM WebSpherefi Application Server is used for handling in-bound transactions, and CP4D is used for AI model lifecycle management that includes building, training, and deploying the model.
Figure 19 Architecture for credit risk prediction by using an ML AI model on IBM Z

<!-- image -->
A data scientist can leverage Watson Studio to develop and train an AI model and WML to deploy and score the model. In this sample architecture, the WML Python run time leverages the ML framework, IBM Snap Machine Learning (Snap ML), for scoring, can leverage an integrated AI accelerator at the time of model import.
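As an illustration of the scoring flow described above, a hedged sketch follows (the endpoint URL, credentials, deployment ID and feature names are placeholders invented for the example, not values from this document); an application can call the WML deployment through the Python client roughly like this:

```python
# Hedged sketch: score a model deployed with Watson Machine Learning on CP4D.
# URL, credentials, deployment ID and feature names are placeholders.
from ibm_watson_machine_learning import APIClient

wml_credentials = {
    "url": "https://<cp4d-host>",
    "username": "<username>",
    "apikey": "<api-key>",
    "instance_id": "openshift",
    "version": "4.6",
}
client = APIClient(wml_credentials)
client.set.default_space("<deployment-space-id>")

payload = {
    "input_data": [
        {"fields": ["age", "income", "loan_amount"], "values": [[42, 55000, 12000]]}
    ]
}
result = client.deployments.score("<deployment-id>", payload)
print(result["predictions"])
```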
@ -412,9 +377,7 @@ We showed how IBM Z enable customers to use AI frameworks to detect credit risk.
Figure 20 shows an architecture for predicting credit risk by using DL on IBM Z.

Figure 20 Architecture for credit risk prediction by using DL on IBM Z

<!-- image -->
Data scientists can start creating and training a DL AI model by using a Jupyter Notebook instance and Watson Studio. Then, they can deploy the model by using WML on CP4D running on IBM Z, which provides an endpoint. Other applications, including the IBM WebSphere server, can produce credit risk results by using the model's endpoint.
@ -451,9 +414,7 @@ One possible solution is to build and train a TensorFlow based DL model that lea
Figure 21 provides a high-level diagram of a clearing and settlement use case for financial transactions that uses CP4D on IBM Z and IBM LinuxONE.

Figure 21 Clearing and settlement use case for financial transactions by using Cloud Pak for Data

<!-- image -->
Here are the steps of the high-level process flow:
@ -510,9 +471,7 @@ Remaining Useful Life (RUL) is the remaining time or cycles that an aircraft eng
Figure 22 provides an overview of the inferencing architecture for the RUL of an aircraft engine when using IBM Z.

Figure 22 Inferencing architecture on IBM Z

<!-- image -->
Because we are looking into data-driven model development, the data set of our target is the run-to-failure data of the engine. We are looking into a supervised learning problem, and we use regression techniques to learn from the data. DL techniques such as Long Short-Term Memory (LSTM) or Gated Recurrent Units (GRU) are our choice because we are looking into a time series data set. TensorFlow or PyTorch frameworks are leveraged to create models. AI governance monitors the data and model drift to maintain the model quality throughout the model's life.
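A hedged sketch of the kind of model this paragraph describes (window length, feature count and layer sizes are illustrative assumptions, not values from the solution): an LSTM regression network in TensorFlow/Keras that maps a window of sensor readings to a RUL estimate.

```python
# Hedged sketch: an LSTM regressor for Remaining Useful Life estimation.
# Window length, feature count and layer sizes are illustrative assumptions.
import numpy as np
import tensorflow as tf

window, n_features = 30, 14          # 30 time steps of 14 sensor channels
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(window, n_features)),
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(1),        # predicted RUL in cycles
])
model.compile(optimizer="adam", loss="mse", metrics=["mae"])

# Dummy run-to-failure style batch, just to show the expected tensor shapes.
x = np.random.rand(8, window, n_features).astype("float32")
y = np.random.rand(8, 1).astype("float32")
model.fit(x, y, epochs=1, verbose=0)
print(model.predict(x[:1], verbose=0).shape)   # (1, 1)
```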
@ -533,9 +492,7 @@ Client-side applications can invoke a REST apiserver that handles some preproces
Figure 23 on page 29 provides a more in-depth view of the architecture of an AI-based predictive maintenance application.

Figure 23 In-depth architectural view

<!-- image -->
In summary, consider the following points while developing an AI-based predictive maintenance application:
@ -576,9 +533,7 @@ AI is the current "market trend evolution" in video analytics and advancing the
S

Figure 24 Architecture for AI-powered video analytics

<!-- image -->
Live camera feeds or recorded videos of an infant's movement are the inputs for a pose detection model. This video streaming data was stored in IBM Cloudfi Object Storage for image processing. Video data must be transformed into frames so that the infant's body poses can be detected. These post-estimation components of the pipeline predict the location of all 17-person key points with 3 degrees of freedom each (x, y location and visibility) plus two virtual alignment key points. This approach also embraces a compute-intensive heat map prediction of infant body posture.
@ -698,6 +653,7 @@ IBM, the IBM logo, and ibm.com are trademarks or registered trademarks of Intern
The following terms are trademarks or registered trademarks of International Business Machines Corporation, and might also be trademarks or registered trademarks in other countries.
| Db2fi IBMfi | IBM Watsonfi | Redbooks (log o) fi Turbon |
|----------------------|----------------|------------------------------|
| | IBM z16™ | omicfi |
@ -718,6 +674,7 @@ UNIX is a registered trademark of The Open Group in the United States and other
Other company, product, or service names may be trademarks or service marks of others.
<!-- image -->

Back cover
@ -728,4 +685,5 @@ ISBN 0738461067
Printed in U.S.A.
<!-- image -->