Merge branch 'main' of github.com:DS4SD/docling into cau/docling-parse-init

This commit is contained in:
Christoph Auer 2024-09-18 16:53:17 +02:00
commit 7684ba95c8
17 changed files with 546 additions and 443 deletions

20
.github/PULL_REQUEST_TEMPLATE.md vendored Normal file
View File

@ -0,0 +1,20 @@
<!-- Thank you for contributing to Docling! -->
<!-- STEPS TO FOLLOW:
1. Add a description of the changes (frequently the same as the commit description)
2. Enter the issue number next to "Resolves #" below (if there is no tracking issue resolved, **remove that section**)
3. Follow the steps in the checklist below, starting with the **Commit Message Formatting**.
-->
<!-- Uncomment this section with the issue number if an issue is being resolved
**Issue resolved by this Pull Request:**
Resolves #
--->
**Checklist:**
- [ ] **Commit Message Formatting**: Commit titles and messages follow the guidelines in the
[Conventional Commits specification](https://www.conventionalcommits.org/en/v1.0.0/#summary).
- [ ] Documentation has been updated, if necessary.
- [ ] Examples have been added, if necessary.
- [ ] Tests have been added, if necessary.

View File

@ -1,3 +1,23 @@
## [v1.13.0](https://github.com/DS4SD/docling/releases/tag/v1.13.0) - 2024-09-18
### Feature
* Add table exports ([#86](https://github.com/DS4SD/docling/issues/86)) ([`f19bd43`](https://github.com/DS4SD/docling/commit/f19bd437984f77067d33d591e25c5d5c92d7e0a9))
### Fix
* Bumped the glm version and adjusted the tests ([#83](https://github.com/DS4SD/docling/issues/83)) ([`442443a`](https://github.com/DS4SD/docling/commit/442443a102d91b19a7eb38b316dada89c86ea8a8))
### Documentation
* Updated Docling logo.png with transparent background ([#88](https://github.com/DS4SD/docling/issues/88)) ([`0da7519`](https://github.com/DS4SD/docling/commit/0da75198967c9cffd42be3f3acd6ade2341fc1f5))
## [v1.12.2](https://github.com/DS4SD/docling/releases/tag/v1.12.2) - 2024-09-17
### Fix
* **tests:** Adjust the test data to match the new version of LayoutPredictor ([#82](https://github.com/DS4SD/docling/issues/82)) ([`fa9699f`](https://github.com/DS4SD/docling/commit/fa9699fa3cd2d367382d7b952d0365983a870848))
## [v1.12.1](https://github.com/DS4SD/docling/releases/tag/v1.12.1) - 2024-09-16 ## [v1.12.1](https://github.com/DS4SD/docling/releases/tag/v1.12.1) - 2024-09-16
### Fix ### Fix

View File

@ -9,67 +9,6 @@ from docling.datamodel.document import ConversionResult, Page
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
def _export_table_to_html(table: Table):
    """Render ``table`` as a minimal ``<table>`` HTML string.

    Cells are read from ``table.data[i][j]`` for every row ``i`` below
    ``table.num_rows`` and every column ``j`` below ``table.num_cols``.
    A cell that spans several rows or columns is emitted exactly once, at the
    smallest row/column index found in its ``spans``, with matching
    ``rowspan`` / ``colspan`` attributes; its other grid positions are
    skipped.

    Cells labelled ``col_header`` or ``col_multi_header`` become ``<th>``;
    every other cell (row headers included) becomes ``<td>``.

    The stripped cell text is HTML-escaped so that characters such as ``<``,
    ``>`` and ``&`` cannot break or inject markup.

    Args:
        table: Object exposing ``data``, ``num_rows`` and ``num_cols``; each
            cell exposes ``text``, ``obj_type`` and ``spans``.

    Returns:
        The HTML string, or ``""`` when ``table.data`` is ``None``.
    """
    # TODO: this is flagged as internal, because we will move it
    # to the docling-core package.

    # Imported locally: the module's import block is outside this function.
    import html

    def _get_tablecell_span(cell: TableCell, ix):
        """Return ``(extent, first, last)`` of the cell along axis ``ix``.

        ``ix`` selects the component of each span entry (0 is used for rows,
        1 for columns by the caller). A cell without span information counts
        as a single cell with unknown position: ``(1, None, None)``.
        """
        if cell.spans is None:
            span = set()
        else:
            span = {s[ix] for s in cell.spans}
        if not span:
            return 1, None, None
        return len(span), min(span), max(span)

    nrows = table.num_rows
    ncols = table.num_cols

    if table.data is None:
        return ""

    parts = []
    for i in range(nrows):
        parts.append("<tr>")
        for j in range(ncols):
            cell: TableCell = table.data[i][j]

            rowspan, rowstart, _ = _get_tablecell_span(cell, 0)
            colspan, colstart, _ = _get_tablecell_span(cell, 1)

            # A spanning cell occupies several grid slots; only emit it at
            # the slot where the span begins.
            if rowstart is not None and rowstart != i:
                continue
            if colstart is not None and colstart != j:
                continue

            content = html.escape(cell.text.strip())

            # Only column headers are rendered as <th>.
            if cell.obj_type in ("col_header", "col_multi_header"):
                celltag = "th"
            else:
                celltag = "td"

            opening_tag = celltag
            if rowspan > 1:
                opening_tag += f' rowspan="{rowspan}"'
            if colspan > 1:
                opening_tag += f' colspan="{colspan}"'

            parts.append(f"<{opening_tag}>{content}</{celltag}>")
        parts.append("</tr>")

    return f"<table>{''.join(parts)}</table>"
def generate_multimodal_pages( def generate_multimodal_pages(
doc_result: ConversionResult, doc_result: ConversionResult,
) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]: ) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
@ -129,7 +68,7 @@ def generate_multimodal_pages(
} }
if isinstance(item, Table): if isinstance(item, Table):
table_html = _export_table_to_html(item) table_html = item.export_to_html()
new_segment["data"].append( new_segment["data"].append(
{ {
"html_seq": table_html, "html_seq": table_html,

74
examples/export_tables.py Normal file
View File

@ -0,0 +1,74 @@
import logging
import time
from pathlib import Path
from typing import Tuple
import pandas as pd
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter
_log = logging.getLogger(__name__)
def main():
    """Convert the sample PDF and export each of its tables as CSV and HTML.

    Every table of a successfully converted document is printed to stdout as
    Markdown and written to ``./scratch`` as
    ``<document-stem>-table-<n>.csv`` and ``<document-stem>-table-<n>.html``
    (``n`` starts at 1).

    Raises:
        RuntimeError: If at least one input document failed to convert.
    """
    logging.basicConfig(level=logging.INFO)

    input_doc_paths = [
        Path("./tests/data/2206.01062.pdf"),
    ]
    output_dir = Path("./scratch")

    input_files = DocumentConversionInput.from_paths(input_doc_paths)

    doc_converter = DocumentConverter()

    start_time = time.time()

    conv_results = doc_converter.convert(input_files)

    failure_count = 0
    output_dir.mkdir(parents=True, exist_ok=True)

    for conv_res in conv_results:
        if conv_res.status != ConversionStatus.SUCCESS:
            _log.info(f"Document {conv_res.input.file} failed to convert.")
            failure_count += 1
            continue

        doc_filename = conv_res.input.file.stem

        # Export tables
        for table_ix, table in enumerate(conv_res.output.tables):
            table_df: pd.DataFrame = table.export_to_dataframe()
            print(f"## Table {table_ix}")
            print(table_df.to_markdown())

            # Save the table as csv
            element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
            _log.info(f"Saving CSV table to {element_csv_filename}")
            table_df.to_csv(element_csv_filename)

            # Save the table as html
            element_html_filename = (
                output_dir / f"{doc_filename}-table-{table_ix+1}.html"
            )
            _log.info(f"Saving HTML table to {element_html_filename}")
            # Explicit UTF-8 so the output does not depend on the platform's
            # default encoding (table text may contain non-ASCII characters).
            with element_html_filename.open("w", encoding="utf-8") as fp:
                fp.write(table.export_to_html())

    elapsed = time.time() - start_time

    _log.info(f"All documents were converted in {elapsed:.2f} seconds.")

    if failure_count > 0:
        raise RuntimeError(
            f"The example failed converting {failure_count} on {len(input_doc_paths)}."
        )
if __name__ == "__main__":
main()

BIN
logo.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 690 KiB

After

Width:  |  Height:  |  Size: 677 KiB

765
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "docling" name = "docling"
version = "1.12.1" # DO NOT EDIT, updated automatically version = "1.13.0" # DO NOT EDIT, updated automatically
description = "Docling PDF conversion package" description = "Docling PDF conversion package"
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"] authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
license = "MIT" license = "MIT"
@ -23,9 +23,9 @@ packages = [{include = "docling"}]
[tool.poetry.dependencies] [tool.poetry.dependencies]
python = "^3.10" python = "^3.10"
pydantic = "^2.0.0" pydantic = "^2.0.0"
docling-core = "^1.3.0" docling-core = "^1.4.0"
docling-ibm-models = "^1.1.7" docling-ibm-models = "^1.2.0"
deepsearch-glm = "^0.21.0" deepsearch-glm = "^0.21.1"
filetype = "^1.2.0" filetype = "^1.2.0"
pypdfium2 = "^4.30.0" pypdfium2 = "^4.30.0"
pydantic-settings = "^2.3.0" pydantic-settings = "^2.3.0"

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1776,6 +1776,10 @@ An important design and implementation consideration is the fact that RCAC colum
An example of this situation is shown in Figure 6-1. However, note that aggregate functions (a form of grouping) are based on masked values. An example of this situation is shown in Figure 6-1. However, note that aggregate functions (a form of grouping) are based on masked values.
SELECT
FROM GROUP BY ORDER BY
## Without RCAC Masking ## Without RCAC Masking
## With RCAC Masking ## With RCAC Masking
@ -1808,6 +1812,12 @@ Figure 6-1 Timing of column masking
| **** **** **** 1234 | 750.33 | | **** **** **** 1234 | 750.33 |
| **** **** **** 0001 | 10.00 | | **** **** **** 0001 | 10.00 |
CREDIT_CARD_NUMBER, SUM(AMOUNT) AS TOTAL TRANSACTIONS
CREDIT_CARD_NUMBER
CREDIT_CARD_NUMBER;
Conversely, field procedure masking causes the column values to be changed (that is, masked) and stored in the row. When the table is queried and the masked columns are referenced, the masked data is used for any local selection, joining, grouping, or ordering operations. This situation can have a profound effect on the query's final result set and not just on the column values that are returned. Field procedure masking occurs when the column values are read from disk before any query processing. RCAC masking occurs when the column values are returned to the application after query processing. This difference in behavior is shown in Figure 6-2. Conversely, field procedure masking causes the column values to be changed (that is, masked) and stored in the row. When the table is queried and the masked columns are referenced, the masked data is used for any local selection, joining, grouping, or ordering operations. This situation can have a profound effect on the query's final result set and not just on the column values that are returned. Field procedure masking occurs when the column values are read from disk before any query processing. RCAC masking occurs when the column values are returned to the application after query processing. This difference in behavior is shown in Figure 6-2.
Note: Column masks can influence an SQL INSERT or UPDATE . For example, you cannot insert or update a table with column access control activated with masked data generated from an expression within the same statement that is based on a column with a column mask. Note: Column masks can influence an SQL INSERT or UPDATE . For example, you cannot insert or update a table with column access control activated with masked data generated from an expression within the same statement that is based on a column with a column mask.

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -96,10 +96,17 @@ def verify_tables(doc_pred: DsDocument, doc_true: DsDocument):
for i, row in enumerate(true_item.data): for i, row in enumerate(true_item.data):
for j, col in enumerate(true_item.data[i]): for j, col in enumerate(true_item.data[i]):
# print("true: ", true_item.data[i][j])
# print("pred: ", pred_item.data[i][j])
assert ( assert (
true_item.data[i][j].text == pred_item.data[i][j].text true_item.data[i][j].text == pred_item.data[i][j].text
), "table-cell does not have the same text" ), "table-cell does not have the same text"
assert (
true_item.data[i][j].obj_type == pred_item.data[i][j].obj_type
), "table-cell does not have the same type"
return True return True
@ -156,9 +163,13 @@ def verify_conversion_result(
), f"Mismatch in PDF cell prediction for {input_path}" ), f"Mismatch in PDF cell prediction for {input_path}"
# assert verify_output( # assert verify_output(
# doc_pred, doc_true # doc_pred, doc_true
# ), f"Mismatch in JSON prediction for {input_path}" # ), f"Mismatch in JSON prediction for {input_path}"
assert verify_tables(
doc_pred, doc_true
), f"verify_tables(doc_pred, doc_true) mismatch for {input_path}"
assert verify_md( assert verify_md(
doc_pred_md, doc_true_md doc_pred_md, doc_true_md
), f"Mismatch in Markdown prediction for {input_path}" ), f"Mismatch in Markdown prediction for {input_path}"