mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
update more tests
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
8895fb546f
commit
beab8ce3ba
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -6,20 +6,48 @@ Some text
|
||||
|
||||
Here is a table:
|
||||
|
||||
| Character | Name in German | Name in French | Name in Italian |
|
||||
|----------------|------------------|------------------|-------------------|
|
||||
| Scrooge McDuck | Dagobert Duck | Balthazar Picsou | Paperone |
|
||||
| Huey | Tick | Riri | Qui |
|
||||
| Dewey | Trick | Fifi | Quo |
|
||||
| Louie | Track | Loulou | Qua |
|
||||
<table>
|
||||
<tr>
|
||||
<th>Character</th>
|
||||
<th>Name in German</th>
|
||||
<th>Name in French</th>
|
||||
<th>Name in Italian</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Scrooge McDuck</td>
|
||||
<td>Dagobert Duck</td>
|
||||
<td>Balthazar Picsou</td>
|
||||
<td>Paperone</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Huey</td>
|
||||
<td>Tick</td>
|
||||
<td>Riri</td>
|
||||
<td>Qui</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Dewey</td>
|
||||
<td>Trick</td>
|
||||
<td>Fifi</td>
|
||||
<td>Quo</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Louie</td>
|
||||
<td>Track</td>
|
||||
<td>Loulou</td>
|
||||
<td>Qua</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
And here is more HTML:
|
||||
|
||||
Some paragraph.
|
||||
|
||||
Now a div — almost there...
|
||||
|
||||
- foo
|
||||
- bar
|
||||
<p>Some paragraph.</p>
|
||||
<div>
|
||||
<p>Now a div — almost there...</p>
|
||||
<ul>
|
||||
<li>foo</li>
|
||||
<li>bar</li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
The end!
|
||||
|
@ -1,5 +1,7 @@
|
||||
## Some heading
|
||||
|
||||
<!-- This is HTML -->
|
||||
|
||||
- A. first
|
||||
- subitem
|
||||
- B. second
|
||||
|
@ -24,8 +24,40 @@ A list featuring nesting:
|
||||
|
||||
A nested HTML list:
|
||||
|
||||
- First item
|
||||
- Second item with subitems:
|
||||
- Subitem 1
|
||||
- Subitem 2
|
||||
- Last list item
|
||||
<ul>
|
||||
<li>First item</li>
|
||||
<li>Second item with subitems:
|
||||
<ul>
|
||||
<li>Subitem 1</li>
|
||||
<li>Subitem 2</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li>Last list item</li>
|
||||
</ul>
|
||||
<!--
|
||||
Table nesting apparently not yet supported by HTML backend:
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<td>Cell</td>
|
||||
<td>Nested Table
|
||||
<table>
|
||||
<tr>
|
||||
<td>Cell 1</td>
|
||||
<>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Cell 2</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Cell 3</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Cell 4</td>
|
||||
</tr>
|
||||
</table>
|
||||
</td>
|
||||
</tr>
|
||||
<tr><td>additional row</td></tr>
|
||||
</table>
|
||||
-->
|
||||
|
@ -1,74 +1,129 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<html>
|
||||
<head>
|
||||
<link rel="icon" type="image/png"
|
||||
href="https://raw.githubusercontent.com/docling-project/docling/refs/heads/main/docs/assets/logo.svg"/>
|
||||
<meta charset="UTF-8">
|
||||
<title>
|
||||
Powered by Docling
|
||||
</title>
|
||||
<style>
|
||||
<meta charset="UTF-8">
|
||||
<title>word_tables</title>
|
||||
<meta name="generator" content="Docling HTML Serializer">
|
||||
<style>
|
||||
html {
|
||||
background-color: LightGray;
|
||||
background-color: #f5f5f5;
|
||||
font-family: Arial, sans-serif;
|
||||
line-height: 1.6;
|
||||
}
|
||||
body {
|
||||
margin: 0 auto;
|
||||
width:800px;
|
||||
padding: 30px;
|
||||
background-color: White;
|
||||
font-family: Arial, sans-serif;
|
||||
box-shadow: 10px 10px 10px grey;
|
||||
max-width: 800px;
|
||||
margin: 0 auto;
|
||||
padding: 2rem;
|
||||
background-color: white;
|
||||
box-shadow: 0 0 10px rgba(0,0,0,0.1);
|
||||
}
|
||||
figure{
|
||||
display: block;
|
||||
width: 100%;
|
||||
margin: 0px;
|
||||
margin-top: 10px;
|
||||
margin-bottom: 10px;
|
||||
h1, h2, h3, h4, h5, h6 {
|
||||
color: #333;
|
||||
margin-top: 1.5em;
|
||||
margin-bottom: 0.5em;
|
||||
}
|
||||
img {
|
||||
display: block;
|
||||
margin: auto;
|
||||
margin-top: 10px;
|
||||
margin-bottom: 10px;
|
||||
max-width: 640px;
|
||||
max-height: 640px;
|
||||
h1 {
|
||||
font-size: 2em;
|
||||
border-bottom: 1px solid #eee;
|
||||
padding-bottom: 0.3em;
|
||||
}
|
||||
table {
|
||||
min-width:500px;
|
||||
background-color: White;
|
||||
border-collapse: collapse;
|
||||
cell-padding: 5px;
|
||||
margin: auto;
|
||||
margin-top: 10px;
|
||||
margin-bottom: 10px;
|
||||
border-collapse: collapse;
|
||||
margin: 1em 0;
|
||||
width: 100%;
|
||||
}
|
||||
th, td {
|
||||
border: 1px solid black;
|
||||
padding: 8px;
|
||||
border: 1px solid #ddd;
|
||||
padding: 8px;
|
||||
text-align: left;
|
||||
}
|
||||
th {
|
||||
font-weight: bold;
|
||||
background-color: #f2f2f2;
|
||||
font-weight: bold;
|
||||
}
|
||||
table tr:nth-child(even) td{
|
||||
background-color: LightGray;
|
||||
figure {
|
||||
margin: 1.5em 0;
|
||||
text-align: center;
|
||||
}
|
||||
math annotation {
|
||||
display: none;
|
||||
figcaption {
|
||||
color: #666;
|
||||
font-style: italic;
|
||||
margin-top: 0.5em;
|
||||
}
|
||||
img {
|
||||
max-width: 100%;
|
||||
height: auto;
|
||||
}
|
||||
pre {
|
||||
background-color: #f6f8fa;
|
||||
border-radius: 3px;
|
||||
padding: 1em;
|
||||
overflow: auto;
|
||||
}
|
||||
code {
|
||||
font-family: monospace;
|
||||
background-color: #f6f8fa;
|
||||
padding: 0.2em 0.4em;
|
||||
border-radius: 3px;
|
||||
}
|
||||
pre code {
|
||||
background-color: transparent;
|
||||
padding: 0;
|
||||
}
|
||||
.formula {
|
||||
text-align: center;
|
||||
padding: 0.5em;
|
||||
margin: 1em 0;
|
||||
background-color: #f9f9f9;
|
||||
}
|
||||
.formula-not-decoded {
|
||||
background: repeating-linear-gradient(
|
||||
45deg, /* Angle of the stripes */
|
||||
LightGray, /* First color */
|
||||
LightGray 10px, /* Length of the first color */
|
||||
White 10px, /* Second color */
|
||||
White 20px /* Length of the second color */
|
||||
);
|
||||
margin: 0;
|
||||
text-align: center;
|
||||
text-align: center;
|
||||
padding: 0.5em;
|
||||
margin: 1em 0;
|
||||
background: repeating-linear-gradient(
|
||||
45deg,
|
||||
#f0f0f0,
|
||||
#f0f0f0 10px,
|
||||
#f9f9f9 10px,
|
||||
#f9f9f9 20px
|
||||
);
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
.page-break {
|
||||
page-break-after: always;
|
||||
border-top: 1px dashed #ccc;
|
||||
margin: 2em 0;
|
||||
}
|
||||
.key-value-region {
|
||||
background-color: #f9f9f9;
|
||||
padding: 1em;
|
||||
border-radius: 4px;
|
||||
margin: 1em 0;
|
||||
}
|
||||
.key-value-region dt {
|
||||
font-weight: bold;
|
||||
}
|
||||
.key-value-region dd {
|
||||
margin-left: 1em;
|
||||
margin-bottom: 0.5em;
|
||||
}
|
||||
.form-container {
|
||||
border: 1px solid #ddd;
|
||||
padding: 1em;
|
||||
border-radius: 4px;
|
||||
margin: 1em 0;
|
||||
}
|
||||
.form-item {
|
||||
margin-bottom: 0.5em;
|
||||
}
|
||||
.image-classification {
|
||||
font-size: 0.9em;
|
||||
color: #666;
|
||||
margin-top: 0.5em;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class='page'>
|
||||
<h2>Test with tables</h2>
|
||||
<p>A uniform table</p>
|
||||
<table><tbody><tr><th>Header 0.0</th><th>Header 0.1</th><th>Header 0.2</th></tr><tr><td>Cell 1.0</td><td>Cell 1.1</td><td>Cell 1.2</td></tr><tr><td>Cell 2.0</td><td>Cell 2.1</td><td>Cell 2.2</td></tr></tbody></table>
|
||||
@ -86,4 +141,6 @@
|
||||
<table><tbody><tr><th>Header 0.0</th><th>Header 0.1</th><th>Header 0.2</th><th></th><th></th></tr><tr><td>Cell 1.0</td><td rowspan="2">Merged Cell 1.1 2.1</td><td>Cell 1.2</td><td></td><td></td></tr><tr><td>Cell 2.0</td><td>Cell 2.2</td><td></td><td></td></tr><tr><td>Cell 3.0</td><td rowspan="2">Merged Cell 3.1 4.1</td><td>Cell 3.2</td><td rowspan="3"></td><td></td></tr><tr><td>Cell 4.0</td><td>Cell 4.2</td><td rowspan="2">Merged Cell 4.4 5.4</td></tr><tr><td></td><td></td><td></td></tr><tr><td></td><td></td><td></td><td></td><td></td></tr><tr><td colspan="5"></td></tr><tr><td></td><td></td><td></td><td></td><td>Cell 8.4</td></tr></tbody></table>
|
||||
<p></p>
|
||||
<p></p>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
@ -1 +1 @@
|
||||
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test.pdf", "filename-prov": null, "document-hash": "80f38f5b87a84870681556176a9622186fd200dd32c5557be9e0c0af05b8bc61", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "14d896dc8bcb7ee7c08c0347eb6be8dcb92a3782501992f1ea14d2e58077d4e3", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [69.0, 688.5883585611979, 506.6666666666667, 767.2550252278646], "page": 1, "span": [0, 94], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
|
||||
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test.pdf", "filename-prov": null, "document-hash": "80f38f5b87a84870681556176a9622186fd200dd32c5557be9e0c0af05b8bc61", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "14d896dc8bcb7ee7c08c0347eb6be8dcb92a3782501992f1ea14d2e58077d4e3", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [69.6796630536824, 689.0124221922704, 504.8720051760782, 764.9216921155637], "page": 1, "span": [0, 94], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
|
File diff suppressed because one or more lines are too long
@ -1,2 +1,2 @@
|
||||
<doctag><text><loc_58><loc_44><loc_426><loc_91>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
|
||||
<doctag><text><loc_59><loc_46><loc_424><loc_91>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
|
||||
</doctag>
|
@ -1 +1 @@
|
||||
{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 69.0, "t": 767.2550252278646, "r": 506.6666666666667, "b": 688.5883585611979, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
|
||||
{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 69.6796630536824, "t": 764.9216921155637, "r": 504.8720051760782, "b": 689.0124221922704, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user