diff --git a/poetry.lock b/poetry.lock index 7242c6d7..407ecc66 100644 --- a/poetry.lock +++ b/poetry.lock @@ -897,8 +897,8 @@ chunking = ["semchunk (>=2.2.0,<3.0.0)", "transformers (>=4.34.0,<5.0.0)"] [package.source] type = "git" url = "https://github.com/docling-project/docling-core.git" -reference = "80704dce4b21656b50bf04182d4636f448c49827" -resolved_reference = "80704dce4b21656b50bf04182d4636f448c49827" +reference = "30c2e87536fb2fded0170baed39b4d001d620823" +resolved_reference = "30c2e87536fb2fded0170baed39b4d001d620823" [[package]] name = "docling-ibm-models" @@ -7842,4 +7842,4 @@ vlm = ["accelerate", "transformers", "transformers"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "891d40e03852f5de1979c76134f1ea6b4876abf6cb14a27c3c22f4aa3d0e37f4" +content-hash = "356cabb283e4d64913d3345d8b387a47f38ed69f358914112e2b1f11c498d999" diff --git a/pyproject.toml b/pyproject.toml index 5b818109..a3c2dbf7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,7 +47,7 @@ packages = [{ include = "docling" }] python = "^3.9" pydantic = "^2.0.0" # docling-core = { git = "https://github.com/docling-project/docling-core.git", extras = ["chunking"], branch = "main" } -docling-core = { git = "https://github.com/docling-project/docling-core.git", extras = ["chunking"], rev = "80704dce4b21656b50bf04182d4636f448c49827" } +docling-core = { git = "https://github.com/docling-project/docling-core.git", extras = ["chunking"], rev = "30c2e87536fb2fded0170baed39b4d001d620823" } docling-ibm-models = "^3.4.0" docling-parse = "^4.0.0" filetype = "^1.2.0" diff --git a/tests/data/groundtruth/docling_v2/2203.01017v2.doctags.txt b/tests/data/groundtruth/docling_v2/2203.01017v2.doctags.txt index 81761b1a..9f023791 100644 --- a/tests/data/groundtruth/docling_v2/2203.01017v2.doctags.txt +++ b/tests/data/groundtruth/docling_v2/2203.01017v2.doctags.txt @@ -229,6 +229,7 @@ +Figure 8: Example of a table with multi-line header. Figure 9: Example of a table with big empty distance between cells. @@ -243,10 +244,12 @@ +Figure 11: Simple table with different style and empty cells. +Figure 12: Simple table predictions and post processing. @@ -270,6 +273,7 @@ +Figure 16: Example of how post-processing helps to restore mis-aligned bounding boxes prediction artifact. 15 Figure 17: Example of long table. End-to-end example from initial PDF cells to prediction of bounding boxes, post processing and prediction of structure. diff --git a/tests/data/groundtruth/docling_v2/2203.01017v2.md b/tests/data/groundtruth/docling_v2/2203.01017v2.md index b990b1a0..6204f60f 100644 --- a/tests/data/groundtruth/docling_v2/2203.01017v2.md +++ b/tests/data/groundtruth/docling_v2/2203.01017v2.md @@ -395,6 +395,8 @@ phan cell. Aditional images with examples of TableFormer predictions and post-processing can be found below. +Figure 8: Example of a table with multi-line header. + Figure 9: Example of a table with big empty distance between cells. @@ -405,8 +407,12 @@ Figure 10: Example of a complex table with empty cells. +Figure 11: Simple table with different style and empty cells. + +Figure 12: Simple table predictions and post processing. + Figure 13: Table predictions example on colorful table. @@ -427,6 +433,8 @@ Figure 15: Example with triangular table. +Figure 16: Example of how post-processing helps to restore mis-aligned bounding boxes prediction artifact. + Figure 17: Example of long table. End-to-end example from initial PDF cells to prediction of bounding boxes, post processing and prediction of structure. \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/2206.01062.doctags.txt b/tests/data/groundtruth/docling_v2/2206.01062.doctags.txt index a2b799ae..8bd743dd 100644 --- a/tests/data/groundtruth/docling_v2/2206.01062.doctags.txt +++ b/tests/data/groundtruth/docling_v2/2206.01062.doctags.txt @@ -82,6 +82,7 @@ Phase 3: Training. After a first trial with a small group of people, we realised that providing the annotation guideline and a set of random practice pages did not yield the desired quality level for layout annotation. Therefore we prepared a subset of pages with two different complexity levels, each with a practice and an exam part. 974 pages were reference-annotated by one proficient core team member. Annotation staff were then given the task to annotate the same subsets (blinded from the reference). By comparing the annotations of each staff member with the reference annotations, we could quantify how closely their annotations matched the reference. Only after passing two exam levels with high annotation quality, staff were admitted into the production phase. Practice iterations 05237a14f2524e3f53c8454b074409d05078038a6a36b770fcc8ec7e540deae0 +Figure 4: Examples of plausible annotation alternatives for the same page. Criteria in our annotation guideline can resolve cases A to C, while the case D remains ambiguous. were carried out over a timeframe of 12 weeks, after which 8 of the 40 initially allocated annotators did not pass the bar. Phase 4: Production annotation. The previously selected 80K pages were annotated with the defined 11 class labels by 32 annotators. This production phase took around three months to complete. All annotations were created online through CCS, which visualises the programmatic PDF text-cells as an overlay on the page. The page annotation are obtained by drawing rectangular bounding-boxes, as shown in Figure 3. With regard to the annotation practices, we implemented a few constraints and capabilities on the tooling level. First, we only allow non-overlapping, vertically oriented, rectangular boxes. For the large majority of documents, this constraint was sufficient and it speeds up the annotation considerably in comparison with arbitrary segmentation shapes. Second, annotator staff were not able to see each other's annotations. This was enforced by design to avoid any bias in the annotation, which could skew the numbers of the inter-annotator agreement (see Table 1). We wanted diff --git a/tests/data/groundtruth/docling_v2/2206.01062.md b/tests/data/groundtruth/docling_v2/2206.01062.md index 24261534..1a323e9a 100644 --- a/tests/data/groundtruth/docling_v2/2206.01062.md +++ b/tests/data/groundtruth/docling_v2/2206.01062.md @@ -152,6 +152,8 @@ Phase 3: Training. After a first trial with a small group of people, we realised 05237a14f2524e3f53c8454b074409d05078038a6a36b770fcc8ec7e540deae0 +Figure 4: Examples of plausible annotation alternatives for the same page. Criteria in our annotation guideline can resolve cases A to C, while the case D remains ambiguous. + were carried out over a timeframe of 12 weeks, after which 8 of the 40 initially allocated annotators did not pass the bar. Phase 4: Production annotation. The previously selected 80K pages were annotated with the defined 11 class labels by 32 annotators. This production phase took around three months to complete. All annotations were created online through CCS, which visualises the programmatic PDF text-cells as an overlay on the page. The page annotation are obtained by drawing rectangular bounding-boxes, as shown in Figure 3. With regard to the annotation practices, we implemented a few constraints and capabilities on the tooling level. First, we only allow non-overlapping, vertically oriented, rectangular boxes. For the large majority of documents, this constraint was sufficient and it speeds up the annotation considerably in comparison with arbitrary segmentation shapes. Second, annotator staff were not able to see each other's annotations. This was enforced by design to avoid any bias in the annotation, which could skew the numbers of the inter-annotator agreement (see Table 1). We wanted diff --git a/tests/data/groundtruth/docling_v2/redp5110_sampled.doctags.txt b/tests/data/groundtruth/docling_v2/redp5110_sampled.doctags.txt index 51b2fee9..4b5d8b7d 100644 --- a/tests/data/groundtruth/docling_v2/redp5110_sampled.doctags.txt +++ b/tests/data/groundtruth/docling_v2/redp5110_sampled.doctags.txt @@ -105,6 +105,7 @@ The FUNCTION_USAGE view contains function usage configuration details. Table 2-1 describes the columns in the FUNCTION_USAGE view. Column nameData typeDescriptionFUNCTION_IDVARCHAR(30)ID of the function.USER_NAMEVARCHAR(10)Name of the user profile that has a usage setting for this function.USAGEVARCHAR(7)Usage setting: GLYPH ALLOWED: The user profile is allowed to use the function. GLYPH DENIED: The user profile is not allowed to use the function.USER_TYPEVARCHAR(5)Type of user profile: GLYPH USER: The user profile is a user. GLYPH GROUP: The user profile is a group.Table 2-1 FUNCTION_USAGE view To discover who has authorization to define and manage RCAC, you can use the query that is shown in Example 2-1. +Example 2-1 Query to determine who has authority to define and manage RCAC SELECT function_id, user_name, @@ -131,12 +132,14 @@ Chapter 2. Roles and separation of duties 11 +The SQL CREATE PERMISSION statement that is shown in Figure 3-1 is used to define and initially enable or disable the row access rules. Figure 3-1 CREATE PERMISSION SQL statement Column mask A column mask is a database object that manifests a column value access control rule for a specific column in a specific table. It uses a CASE expression that describes what you see when you access the column. For example, a teller can see only the last four digits of a tax identification number. Chapter 3. Row and Column Access Control 15 +Table 3-1 summarizes these special registers and their values. Special registerCorresponding valueUSER or SESSION_USERThe effective user of the thread excluding adopted authority.CURRENT_USERThe effective user of the thread including adopted authority. When no adopted authority is present, this has the same value as USER.SYSTEM_USERThe authorization ID that initiated the connection.Table 3-1 Special registers and their corresponding values Figure 3-5 shows the difference in the special register values when an adopted authority is used: GLYPH A user connects to the server using the user profile ALICE. diff --git a/tests/data/groundtruth/docling_v2/redp5110_sampled.md b/tests/data/groundtruth/docling_v2/redp5110_sampled.md index 55e37645..bff19284 100644 --- a/tests/data/groundtruth/docling_v2/redp5110_sampled.md +++ b/tests/data/groundtruth/docling_v2/redp5110_sampled.md @@ -199,6 +199,8 @@ Table 2-1 FUNCTION\_USAGE view To discover who has authorization to define and manage RCAC, you can use the query that is shown in Example 2-1. +Example 2-1 Query to determine who has authority to define and manage RCAC + SELECT function\_id, @@ -254,6 +256,8 @@ Table 2-2 Comparison of the different function usage IDs and *JOBCTL authority | MODIFY PLAN CACHE PROPERTIES procedure (currently does not check authority) | X | | X | | | | CHANGE PLAN CACHE SIZE procedure (currently does not check authority) | X | | X | | | +The SQL CREATE PERMISSION statement that is shown in Figure 3-1 is used to define and initially enable or disable the row access rules. + Figure 3-1 CREATE PERMISSION SQL statement @@ -262,6 +266,8 @@ Figure 3-1 CREATE PERMISSION SQL statement A column mask is a database object that manifests a column value access control rule for a specific column in a specific table. It uses a CASE expression that describes what you see when you access the column. For example, a teller can see only the last four digits of a tax identification number. +Table 3-1 summarizes these special registers and their values. + Table 3-1 Special registers and their corresponding values | Special register | Corresponding value |