fix: Fixes for wordx (#432)

* fixes for referencing drawing blip in wordx

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Added a safety try-except around loading a Pillow image from a docx blob. Added an explicit dependency on lxml.

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Added a test for a Word file with embedded EMF images, regenerated the full tests for docx, and relaxed the dependency on lxml

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Updated lxml dependency version

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

---------

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maxim Lysak
2024-11-26 14:44:43 +01:00
committed by GitHub
parent d7072b4b56
commit d0a1180478
13 changed files with 1295 additions and 152 deletions

Binary file not shown.

View File

@@ -0,0 +1,10 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: list: group list
item-2 at level 2: list_item: Hello world1
item-3 at level 2: list_item: Hello2
item-4 at level 1: paragraph:
item-5 at level 1: paragraph: Some text before
item-6 at level 1: table with [3x3]
item-7 at level 1: paragraph:
item-8 at level 1: paragraph:
item-9 at level 1: paragraph: Some text after

View File

@@ -0,0 +1,392 @@
{
"schema_name": "DoclingDocument",
"version": "1.0.0",
"name": "tablecell",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"binary_hash": 1111850039819445035,
"filename": "tablecell.docx"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/groups/0"
},
{
"$ref": "#/texts/2"
},
{
"$ref": "#/texts/3"
},
{
"$ref": "#/tables/0"
},
{
"$ref": "#/texts/4"
},
{
"$ref": "#/texts/5"
},
{
"$ref": "#/texts/6"
}
],
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/0"
},
{
"$ref": "#/texts/1"
}
],
"name": "list",
"label": "list"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"label": "list_item",
"prov": [],
"orig": "Hello world1",
"text": "Hello world1",
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"label": "list_item",
"prov": [],
"orig": "Hello2",
"text": "Hello2",
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/body"
},
"children": [],
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/3",
"parent": {
"$ref": "#/body"
},
"children": [],
"label": "paragraph",
"prov": [],
"orig": "Some text before",
"text": "Some text before"
},
{
"self_ref": "#/texts/4",
"parent": {
"$ref": "#/body"
},
"children": [],
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/5",
"parent": {
"$ref": "#/body"
},
"children": [],
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/6",
"parent": {
"$ref": "#/body"
},
"children": [],
"label": "paragraph",
"prov": [],
"orig": "Some text after",
"text": "Some text after"
}
],
"pictures": [],
"tables": [
{
"self_ref": "#/tables/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"label": "table",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"data": {
"table_cells": [
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Tab1",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Tab2",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Tab3",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "A",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "B",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "C",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "D",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "E",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "F",
"column_header": false,
"row_header": false,
"row_section": false
}
],
"num_rows": 3,
"num_cols": 3,
"grid": [
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Tab1",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Tab2",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "Tab3",
"column_header": false,
"row_header": false,
"row_section": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "A",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "B",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "C",
"column_header": false,
"row_header": false,
"row_section": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "D",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "E",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "F",
"column_header": false,
"row_header": false,
"row_section": false
}
]
]
}
}
],
"key_value_items": [],
"pages": {}
}

View File

@@ -0,0 +1,11 @@
- Hello world1
- Hello2
Some text before
| Tab1 | Tab2 | Tab3 |
|--------|--------|--------|
| A | B | C |
| D | E | F |
Some text after

View File

@@ -0,0 +1,8 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: paragraph: Test with three images in unusual formats
item-2 at level 1: paragraph: Raster in emf:
item-3 at level 1: picture
item-4 at level 1: paragraph: Vector in emf:
item-5 at level 1: picture
item-6 at level 1: paragraph: Raster in webp:
item-7 at level 1: picture

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,13 @@
Test with three images in unusual formats
Raster in emf:
<!-- image -->
Vector in emf:
<!-- image -->
Raster in webp:
<!-- image -->

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,45 @@
Summer activities
# Swimming in the lake
Duck
<!-- image -->
Figure 1: This is a cute duckling
## Lets swim!
To get started with swimming, first lay down in a water and try not to drown:
- You can relax and look around
- Paddle about
- Enjoy summer warmth
Also, dont forget:
- Wear sunglasses
- Dont forget to drink water
- Use sun cream
Hmm, what else…
### Lets eat
After we had a good day of swimming in the lake, its important to eat something nice
I like to eat leaves
Here are some interesting things a respectful duck could eat:
| | Food | Calories per portion |
|---------|----------------------------------|------------------------|
| Leaves | Ash, Elm, Maple | 50 |
| Berries | Blueberry, Strawberry, Cranberry | 150 |
| Grain | Corn, Buckwheat, Barley | 200 |
And lets add another list in the end:
- Leaves
- Berries
- Grain

File diff suppressed because one or more lines are too long