mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 12:04:31 +00:00
fix: in .simplify_text_elements(), always put a space between chunks; checking for alphanumeric characters creates more problems than it solves. Also commit the new test files that were forgotten in the last commit.
This commit is contained in:
parent
0c88c5b90f
commit
08beb406d9
@ -84,13 +84,6 @@ class AnnotatedTextList(list):
|
||||
for i in range(1, len(self)):
|
||||
if hyperlink == self[i].hyperlink:
|
||||
sep = " "
|
||||
if (
|
||||
text
|
||||
and re.match(r"\w", text[-1])
|
||||
and self[i].text
|
||||
and re.match(r"\w", self[i].text[0])
|
||||
):
|
||||
sep = " "
|
||||
text += sep + self[i].text
|
||||
else:
|
||||
simplified.append(AnnotatedText(text=text, hyperlink=hyperlink))
|
||||
|
6
tests/data/groundtruth/docling_v2/hyperlink_01.html.itxt
Normal file
6
tests/data/groundtruth/docling_v2/hyperlink_01.html.itxt
Normal file
@ -0,0 +1,6 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: title: Something
|
||||
item-2 at level 2: inline: group group
|
||||
item-3 at level 3: text: Please follow the link to:
|
||||
item-4 at level 3: text: This page
|
||||
item-5 at level 3: text: .
|
110
tests/data/groundtruth/docling_v2/hyperlink_01.html.json
Normal file
110
tests/data/groundtruth/docling_v2/hyperlink_01.html.json
Normal file
@ -0,0 +1,110 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.3.0",
|
||||
"name": "hyperlink_01",
|
||||
"origin": {
|
||||
"mimetype": "text/html",
|
||||
"binary_hash": 17149231461445569313,
|
||||
"filename": "hyperlink_01.html"
|
||||
},
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"body": {
|
||||
"self_ref": "#/body",
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/0"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"groups": [
|
||||
{
|
||||
"self_ref": "#/groups/0",
|
||||
"parent": {
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/1"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/2"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/3"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "group",
|
||||
"label": "inline"
|
||||
}
|
||||
],
|
||||
"texts": [
|
||||
{
|
||||
"self_ref": "#/texts/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/0"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "title",
|
||||
"prov": [],
|
||||
"orig": "Something",
|
||||
"text": "Something"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/1",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Please follow the link to:",
|
||||
"text": "Please follow the link to:"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/2",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "This page",
|
||||
"text": "This page",
|
||||
"hyperlink": "#"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/3",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": ".",
|
||||
"text": "."
|
||||
}
|
||||
],
|
||||
"pictures": [],
|
||||
"tables": [],
|
||||
"key_value_items": [],
|
||||
"form_items": [],
|
||||
"pages": {}
|
||||
}
|
3
tests/data/groundtruth/docling_v2/hyperlink_01.html.md
Normal file
3
tests/data/groundtruth/docling_v2/hyperlink_01.html.md
Normal file
@ -0,0 +1,3 @@
|
||||
# Something
|
||||
|
||||
Please follow the link to: [This page](#) .
|
3
tests/data/groundtruth/docling_v2/hyperlink_02.html.itxt
Normal file
3
tests/data/groundtruth/docling_v2/hyperlink_02.html.itxt
Normal file
@ -0,0 +1,3 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: section: group header-1
|
||||
item-2 at level 2: section_header: Home
|
83
tests/data/groundtruth/docling_v2/hyperlink_02.html.json
Normal file
83
tests/data/groundtruth/docling_v2/hyperlink_02.html.json
Normal file
@ -0,0 +1,83 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.3.0",
|
||||
"name": "hyperlink_02",
|
||||
"origin": {
|
||||
"mimetype": "text/html",
|
||||
"binary_hash": 15683290523889238210,
|
||||
"filename": "hyperlink_02.html"
|
||||
},
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"body": {
|
||||
"self_ref": "#/body",
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/pictures/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/0"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"groups": [
|
||||
{
|
||||
"self_ref": "#/groups/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/0"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "header-1",
|
||||
"label": "section"
|
||||
}
|
||||
],
|
||||
"texts": [
|
||||
{
|
||||
"self_ref": "#/texts/0",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "section_header",
|
||||
"prov": [],
|
||||
"orig": "Home",
|
||||
"text": "Home",
|
||||
"hyperlink": "/home.html",
|
||||
"level": 1
|
||||
}
|
||||
],
|
||||
"pictures": [
|
||||
{
|
||||
"self_ref": "#/pictures/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"label": "picture",
|
||||
"prov": [],
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"annotations": []
|
||||
}
|
||||
],
|
||||
"tables": [],
|
||||
"key_value_items": [],
|
||||
"form_items": [],
|
||||
"pages": {}
|
||||
}
|
1
tests/data/groundtruth/docling_v2/hyperlink_02.html.md
Normal file
1
tests/data/groundtruth/docling_v2/hyperlink_02.html.md
Normal file
@ -0,0 +1 @@
|
||||
[## Home](/home.html)
|
11
tests/data/groundtruth/docling_v2/hyperlink_03.html.itxt
Normal file
11
tests/data/groundtruth/docling_v2/hyperlink_03.html.itxt
Normal file
@ -0,0 +1,11 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: list: group list
|
||||
item-2 at level 2: list_item: My Section
|
||||
item-3 at level 3: list: group list
|
||||
item-4 at level 4: list_item: Some page
|
||||
item-5 at level 5: list: group list
|
||||
item-6 at level 6: list_item: A sub page
|
||||
item-7 at level 5: list: group list
|
||||
item-8 at level 6: list_item: This is my Homepage
|
||||
item-9 at level 6: list_item: Main navigation
|
||||
item-10 at level 2: list_item: My organisation
|
200
tests/data/groundtruth/docling_v2/hyperlink_03.html.json
Normal file
200
tests/data/groundtruth/docling_v2/hyperlink_03.html.json
Normal file
@ -0,0 +1,200 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.3.0",
|
||||
"name": "hyperlink_03",
|
||||
"origin": {
|
||||
"mimetype": "text/html",
|
||||
"binary_hash": 14556394815653517177,
|
||||
"filename": "hyperlink_03.html"
|
||||
},
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"body": {
|
||||
"self_ref": "#/body",
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/0"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"groups": [
|
||||
{
|
||||
"self_ref": "#/groups/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/5"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "list",
|
||||
"label": "list"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/1",
|
||||
"parent": {
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/1"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "list",
|
||||
"label": "list"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/2",
|
||||
"parent": {
|
||||
"$ref": "#/texts/1"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/2"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "list",
|
||||
"label": "list"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/3",
|
||||
"parent": {
|
||||
"$ref": "#/texts/1"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/3"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/4"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "list",
|
||||
"label": "list"
|
||||
}
|
||||
],
|
||||
"texts": [
|
||||
{
|
||||
"self_ref": "#/texts/0",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/1"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "My Section",
|
||||
"text": "My Section",
|
||||
"hyperlink": "#",
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/1",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/3"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "Some page",
|
||||
"text": "Some page",
|
||||
"hyperlink": "/start.html",
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/2",
|
||||
"parent": {
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "A sub page",
|
||||
"text": "A sub page",
|
||||
"hyperlink": "/home2.html",
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/3",
|
||||
"parent": {
|
||||
"$ref": "#/groups/3"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "This is my Homepage",
|
||||
"text": "This is my Homepage",
|
||||
"hyperlink": "/home.html",
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/4",
|
||||
"parent": {
|
||||
"$ref": "#/groups/3"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "Main navigation",
|
||||
"text": "Main navigation",
|
||||
"hyperlink": "#main-navigation",
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/5",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "My organisation",
|
||||
"text": "My organisation",
|
||||
"hyperlink": "#",
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
}
|
||||
],
|
||||
"pictures": [],
|
||||
"tables": [],
|
||||
"key_value_items": [],
|
||||
"form_items": [],
|
||||
"pages": {}
|
||||
}
|
6
tests/data/groundtruth/docling_v2/hyperlink_03.html.md
Normal file
6
tests/data/groundtruth/docling_v2/hyperlink_03.html.md
Normal file
@ -0,0 +1,6 @@
|
||||
- [My Section](#)
|
||||
- [Some page](/start.html)
|
||||
- [A sub page](/home2.html)
|
||||
- [This is my Homepage](/home.html)
|
||||
- [Main navigation](#main-navigation)
|
||||
- [My organisation](#)
|
@ -251,7 +251,7 @@ item-0 at level 0: unspecified: group _root_
|
||||
item-250 at level 3: inline: group group
|
||||
item-251 at level 4: text: The word duck comes from
|
||||
item-252 at level 4: text: Old English
|
||||
item-253 at level 4: text: dūce'diver', a derivative of the ... because of the way many species in the
|
||||
item-253 at level 4: text: dūce 'diver', a derivative of th ... because of the way many species in the
|
||||
item-254 at level 4: text: dabbling duck
|
||||
item-255 at level 4: text: group feed by upending; compare with
|
||||
item-256 at level 4: text: Dutch
|
||||
@ -261,7 +261,7 @@ item-0 at level 0: unspecified: group _root_
|
||||
item-260 at level 3: picture
|
||||
item-260 at level 4: caption: Pacific black duck displaying the characteristic upending "duck"
|
||||
item-261 at level 3: inline: group group
|
||||
item-262 at level 4: text: This word replaced Old English e ... r example, Dutch eend, German Ente and
|
||||
item-262 at level 4: text: This word replaced Old English e ... example, Dutch eend , German Ente and
|
||||
item-263 at level 4: text: Norwegian
|
||||
item-264 at level 4: text: and . The word ened / ænid was inherited from
|
||||
item-265 at level 4: text: Proto-Indo-European
|
||||
@ -523,7 +523,7 @@ item-0 at level 0: unspecified: group _root_
|
||||
item-512 at level 5: text: [ 29 ]
|
||||
item-513 at level 5: text: [
|
||||
item-514 at level 5: text: self-published source?
|
||||
item-515 at level 5: text: ]but, despite widespread misconc ... , most species of duck do not "quack".
|
||||
item-515 at level 5: text: ] but, despite widespread miscon ... , most species of duck do not "quack".
|
||||
item-516 at level 5: text: [ 30 ]
|
||||
item-517 at level 5: text: In general, ducks make a range of
|
||||
item-518 at level 5: text: calls
|
||||
|
17
tests/data/html/hyperlink_01.html
Normal file
17
tests/data/html/hyperlink_01.html
Normal file
@ -0,0 +1,17 @@
|
||||
<html>
|
||||
|
||||
<body>
|
||||
<h1>Something</h1>
|
||||
<p>
|
||||
Please follow the link to:
|
||||
<a href="#">
|
||||
<span class="icon icon--right"></span> This page
|
||||
</a>
|
||||
.
|
||||
</p>
|
||||
<div class="mod mod-contentpage">
|
||||
|
||||
</div>
|
||||
</body>
|
||||
|
||||
</html>
|
18
tests/data/html/hyperlink_02.html
Normal file
18
tests/data/html/hyperlink_02.html
Normal file
@ -0,0 +1,18 @@
|
||||
<html>
|
||||
|
||||
<body>
|
||||
<div class="nav-mobile-header">
|
||||
<div class="table-row">
|
||||
<span class="nav-mobile-logo">
|
||||
<img src="/etc/designs/core/frontend/guidelines/img/xyz.svg"
|
||||
onerror="this.onerror=null; this.src='/etc/designs/core/frontend/guidelines/img/xyz.png'"
|
||||
alt="Image alt text" />
|
||||
</span>
|
||||
<h2>
|
||||
<a href="/home.html" title="My home page " aria-label="My home page ">Home</a>
|
||||
</h2>
|
||||
</div>
|
||||
</div>
|
||||
</body>
|
||||
|
||||
</html>
|
31
tests/data/html/hyperlink_03.html
Normal file
31
tests/data/html/hyperlink_03.html
Normal file
@ -0,0 +1,31 @@
|
||||
<html>
|
||||
|
||||
<body>
|
||||
<ul class="nav navbar-nav">
|
||||
<li class="dropdown">
|
||||
<a id="main-dropdown" href="#" aria-label="My Section" class="dropdown-toggle" data-toggle="dropdown"><span
|
||||
class="icon icon--right"></span> My Section</a>
|
||||
<ul class="dropdown-menu" role="menu">
|
||||
<li class="dropdown-header">
|
||||
<a href="/start.html" aria-label="Some page" target="_blank" title="">Some
|
||||
page</a>
|
||||
<ul>
|
||||
<li>
|
||||
<a href="/home2.html" aria-label="Some other page" target="_blank" title=""> A sub page</a>
|
||||
</li>
|
||||
</ul>
|
||||
<ul>
|
||||
<li>This is my <a href="/home.html">Homepage</a></li>
|
||||
<li><a href="#main-navigation">Main navigation</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="dropdown">
|
||||
<a id="other-dropdown" href="#" aria-label="My Org" class="dropdown-toggle"><span
|
||||
class="icon icon--right"></span> My organisation</a>
|
||||
</li>
|
||||
</ul>
|
||||
</body>
|
||||
|
||||
</html>
|
Loading…
Reference in New Issue
Block a user