fix: in .simplify_text_elements(), always put a space between chunks; checking for alphanumeric characters creates more problems than it solves. Also commit the new test files that were forgotten in the last commit.

This commit is contained in:
Roman Kayan BAZG 2025-05-25 18:14:32 +02:00
parent 0c88c5b90f
commit 08beb406d9
16 changed files with 762 additions and 280 deletions

View File

@ -84,13 +84,6 @@ class AnnotatedTextList(list):
for i in range(1, len(self)): for i in range(1, len(self)):
if hyperlink == self[i].hyperlink: if hyperlink == self[i].hyperlink:
sep = " " sep = " "
if (
text
and re.match(r"\w", text[-1])
and self[i].text
and re.match(r"\w", self[i].text[0])
):
sep = " "
text += sep + self[i].text text += sep + self[i].text
else: else:
simplified.append(AnnotatedText(text=text, hyperlink=hyperlink)) simplified.append(AnnotatedText(text=text, hyperlink=hyperlink))

View File

@ -0,0 +1,6 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: title: Something
item-2 at level 2: inline: group group
item-3 at level 3: text: Please follow the link to:
item-4 at level 3: text: This page
item-5 at level 3: text: .

View File

@ -0,0 +1,110 @@
{
"schema_name": "DoclingDocument",
"version": "1.3.0",
"name": "hyperlink_01",
"origin": {
"mimetype": "text/html",
"binary_hash": 17149231461445569313,
"filename": "hyperlink_01.html"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/texts/0"
},
"children": [
{
"$ref": "#/texts/1"
},
{
"$ref": "#/texts/2"
},
{
"$ref": "#/texts/3"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/groups/0"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Something",
"text": "Something"
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Please follow the link to:",
"text": "Please follow the link to:"
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "This page",
"text": "This page",
"hyperlink": "#"
},
{
"self_ref": "#/texts/3",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": ".",
"text": "."
}
],
"pictures": [],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {}
}

View File

@ -0,0 +1,3 @@
# Something
Please follow the link to: [This page](#) .

View File

@ -0,0 +1,3 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: section: group header-1
item-2 at level 2: section_header: Home

View File

@ -0,0 +1,83 @@
{
"schema_name": "DoclingDocument",
"version": "1.3.0",
"name": "hyperlink_02",
"origin": {
"mimetype": "text/html",
"binary_hash": 15683290523889238210,
"filename": "hyperlink_02.html"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/pictures/0"
},
{
"$ref": "#/groups/0"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/0"
}
],
"content_layer": "body",
"name": "header-1",
"label": "section"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "section_header",
"prov": [],
"orig": "Home",
"text": "Home",
"hyperlink": "/home.html",
"level": 1
}
],
"pictures": [
{
"self_ref": "#/pictures/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "furniture",
"label": "picture",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"annotations": []
}
],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {}
}

View File

@ -0,0 +1 @@
[## Home](/home.html)

View File

@ -0,0 +1,11 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: list: group list
item-2 at level 2: list_item: My Section
item-3 at level 3: list: group list
item-4 at level 4: list_item: Some page
item-5 at level 5: list: group list
item-6 at level 6: list_item: A sub page
item-7 at level 5: list: group list
item-8 at level 6: list_item: This is my Homepage
item-9 at level 6: list_item: Main navigation
item-10 at level 2: list_item: My organisation

View File

@ -0,0 +1,200 @@
{
"schema_name": "DoclingDocument",
"version": "1.3.0",
"name": "hyperlink_03",
"origin": {
"mimetype": "text/html",
"binary_hash": 14556394815653517177,
"filename": "hyperlink_03.html"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/groups/0"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/0"
},
{
"$ref": "#/texts/5"
}
],
"content_layer": "body",
"name": "list",
"label": "list"
},
{
"self_ref": "#/groups/1",
"parent": {
"$ref": "#/texts/0"
},
"children": [
{
"$ref": "#/texts/1"
}
],
"content_layer": "body",
"name": "list",
"label": "list"
},
{
"self_ref": "#/groups/2",
"parent": {
"$ref": "#/texts/1"
},
"children": [
{
"$ref": "#/texts/2"
}
],
"content_layer": "body",
"name": "list",
"label": "list"
},
{
"self_ref": "#/groups/3",
"parent": {
"$ref": "#/texts/1"
},
"children": [
{
"$ref": "#/texts/3"
},
{
"$ref": "#/texts/4"
}
],
"content_layer": "body",
"name": "list",
"label": "list"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/groups/0"
},
"children": [
{
"$ref": "#/groups/1"
}
],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "My Section",
"text": "My Section",
"hyperlink": "#",
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/groups/1"
},
"children": [
{
"$ref": "#/groups/2"
},
{
"$ref": "#/groups/3"
}
],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Some page",
"text": "Some page",
"hyperlink": "/start.html",
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "A sub page",
"text": "A sub page",
"hyperlink": "/home2.html",
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/3",
"parent": {
"$ref": "#/groups/3"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "This is my Homepage",
"text": "This is my Homepage",
"hyperlink": "/home.html",
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/4",
"parent": {
"$ref": "#/groups/3"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Main navigation",
"text": "Main navigation",
"hyperlink": "#main-navigation",
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/5",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "My organisation",
"text": "My organisation",
"hyperlink": "#",
"enumerated": false,
"marker": "-"
}
],
"pictures": [],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {}
}

View File

@ -0,0 +1,6 @@
- [My Section](#)
- [Some page](/start.html)
- [A sub page](/home2.html)
- [This is my Homepage](/home.html)
- [Main navigation](#main-navigation)
- [My organisation](#)

View File

@ -251,7 +251,7 @@ item-0 at level 0: unspecified: group _root_
item-250 at level 3: inline: group group item-250 at level 3: inline: group group
item-251 at level 4: text: The word duck comes from item-251 at level 4: text: The word duck comes from
item-252 at level 4: text: Old English item-252 at level 4: text: Old English
item-253 at level 4: text: dūce'diver', a derivative of the ... because of the way many species in the item-253 at level 4: text: dūce 'diver', a derivative of th ... because of the way many species in the
item-254 at level 4: text: dabbling duck item-254 at level 4: text: dabbling duck
item-255 at level 4: text: group feed by upending; compare with item-255 at level 4: text: group feed by upending; compare with
item-256 at level 4: text: Dutch item-256 at level 4: text: Dutch
@ -261,7 +261,7 @@ item-0 at level 0: unspecified: group _root_
item-260 at level 3: picture item-260 at level 3: picture
item-260 at level 4: caption: Pacific black duck displaying the characteristic upending "duck" item-260 at level 4: caption: Pacific black duck displaying the characteristic upending "duck"
item-261 at level 3: inline: group group item-261 at level 3: inline: group group
item-262 at level 4: text: This word replaced Old English e ... r example, Dutch eend, German Ente and item-262 at level 4: text: This word replaced Old English e ... example, Dutch eend , German Ente and
item-263 at level 4: text: Norwegian item-263 at level 4: text: Norwegian
item-264 at level 4: text: and . The word ened / ænid was inherited from item-264 at level 4: text: and . The word ened / ænid was inherited from
item-265 at level 4: text: Proto-Indo-European item-265 at level 4: text: Proto-Indo-European
@ -523,7 +523,7 @@ item-0 at level 0: unspecified: group _root_
item-512 at level 5: text: [ 29 ] item-512 at level 5: text: [ 29 ]
item-513 at level 5: text: [ item-513 at level 5: text: [
item-514 at level 5: text: self-published source? item-514 at level 5: text: self-published source?
item-515 at level 5: text: ]but, despite widespread misconc ... , most species of duck do not "quack". item-515 at level 5: text: ] but, despite widespread miscon ... , most species of duck do not "quack".
item-516 at level 5: text: [ 30 ] item-516 at level 5: text: [ 30 ]
item-517 at level 5: text: In general, ducks make a range of item-517 at level 5: text: In general, ducks make a range of
item-518 at level 5: text: calls item-518 at level 5: text: calls

View File

@ -0,0 +1,17 @@
<html>
<body>
<h1>Something</h1>
<p>
Please follow the link to:
<a href="#">
<span class="icon icon--right"></span> This page
</a>
.
</p>
<div class="mod mod-contentpage">
</div>
</body>
</html>

View File

@ -0,0 +1,18 @@
<html>
<body>
<div class="nav-mobile-header">
<div class="table-row">
<span class="nav-mobile-logo">
<img src="/etc/designs/core/frontend/guidelines/img/xyz.svg"
onerror="this.onerror=null; this.src='/etc/designs/core/frontend/guidelines/img/xyz.png'"
alt="Image alt text" />
</span>
<h2>
<a href="/home.html" title="My home page " aria-label="My home page ">Home</a>
</h2>
</div>
</div>
</body>
</html>

View File

@ -0,0 +1,31 @@
<html>
<body>
<ul class="nav navbar-nav">
<li class="dropdown">
<a id="main-dropdown" href="#" aria-label="My Section" class="dropdown-toggle" data-toggle="dropdown"><span
class="icon icon--right"></span> My Section</a>
<ul class="dropdown-menu" role="menu">
<li class="dropdown-header">
<a href="/start.html" aria-label="Some page" target="_blank" title="">Some
page</a>
<ul>
<li>
<a href="/home2.html" aria-label="Some other page" target="_blank" title=""> A sub page</a>
</li>
</ul>
<ul>
<li>This is my <a href="/home.html">Homepage</a></li>
<li><a href="#main-navigation">Main navigation</a></li>
</ul>
</li>
</ul>
</li>
<li class="dropdown">
<a id="other-dropdown" href="#" aria-label="My Org" class="dropdown-toggle"><span
class="icon icon--right"></span> My organisation</a>
</li>
</ul>
</body>
</html>