mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
adding test_02.asciidoc
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
e60c52586b
commit
c23d049270
@ -76,7 +76,7 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
content = ""
|
content = ""
|
||||||
if isinstance(self.path_or_stream, Path):
|
if isinstance(self.path_or_stream, Path):
|
||||||
with open(self.path_or_stream.name, "r") as fr:
|
with open(self.path_or_stream, "r") as fr:
|
||||||
self.lines = fr.readlines()
|
self.lines = fr.readlines()
|
||||||
|
|
||||||
# self.lines = file_content.splitlines()
|
# self.lines = file_content.splitlines()
|
||||||
@ -85,30 +85,50 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
in_table = False
|
in_table = False
|
||||||
table_data = []
|
table_data = []
|
||||||
|
|
||||||
|
parents = {}
|
||||||
|
for i in range(0, 10):
|
||||||
|
parents[i] = None
|
||||||
|
|
||||||
for line in self.lines:
|
for line in self.lines:
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
|
|
||||||
# Title
|
# Title
|
||||||
if self.is_title(line):
|
if self.is_title(line):
|
||||||
item = self.parse_title(line)
|
item = self.parse_title(line)
|
||||||
doc.add_text(text=item["text"], label=DocItemLabel.TITLE)
|
level = item["level"]
|
||||||
|
|
||||||
|
parents[level] = doc.add_text(text=item["text"], label=DocItemLabel.TITLE)
|
||||||
|
|
||||||
# Section headers
|
# Section headers
|
||||||
elif self.is_section_header(line):
|
elif self.is_section_header(line):
|
||||||
heading = self.parse_section_header(line)
|
item = self.parse_section_header(line)
|
||||||
doc.add_heading(text=heading["text"], level=heading["level"])
|
level = item["level"]
|
||||||
|
|
||||||
|
parents[level] = doc.add_heading(text=item["text"], level=item["level"], parent=parents[level-1])
|
||||||
|
for k,v in parents.items():
|
||||||
|
if k>level:
|
||||||
|
parents[k] = None
|
||||||
|
|
||||||
# Lists
|
# Lists
|
||||||
elif self.is_list_item(line):
|
elif self.is_list_item(line):
|
||||||
if not in_list:
|
if not in_list:
|
||||||
in_list = True
|
in_list = True
|
||||||
|
|
||||||
|
level = self.get_current_level(parents)
|
||||||
|
|
||||||
|
parents[level+1] = doc.add_group(
|
||||||
|
parent=parents[level], name="list", label=GroupLabel.LIST
|
||||||
|
)
|
||||||
|
|
||||||
item = self.parse_list_item(line)
|
item = self.parse_list_item(line)
|
||||||
doc.add_list_item(item["text"])
|
doc.add_list_item(item["text"], parent=self.get_current_parent(parents))
|
||||||
|
|
||||||
elif in_list and not self.is_list_item(line):
|
elif in_list and not self.is_list_item(line):
|
||||||
in_list = False
|
in_list = False
|
||||||
|
|
||||||
|
level = self.get_current_level(parents)
|
||||||
|
parents[level]=None
|
||||||
|
|
||||||
# Tables
|
# Tables
|
||||||
elif self.is_table_line(line):
|
elif self.is_table_line(line):
|
||||||
in_table = True
|
in_table = True
|
||||||
@ -117,42 +137,61 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
elif in_table and not self.is_table_line(line):
|
elif in_table and not self.is_table_line(line):
|
||||||
|
|
||||||
data = self.populate_table_as_grid(table_data)
|
data = self.populate_table_as_grid(table_data)
|
||||||
doc.add_table(data=data)
|
doc.add_table(data=data, parent=self.get_current_parent(parents))
|
||||||
|
|
||||||
in_table = False
|
in_table = False
|
||||||
table_data = []
|
table_data = []
|
||||||
|
|
||||||
# Plain text
|
# Plain text
|
||||||
elif line:
|
elif len(line)>0:
|
||||||
item = self.parse_text(line)
|
item = self.parse_text(line)
|
||||||
doc.add_text(text=item["text"], label=DocItemLabel.TEXT)
|
doc.add_text(text=item["text"], label=DocItemLabel.PARAGRAPH, parent=self.get_current_parent(parents))
|
||||||
|
|
||||||
if in_table and len(table_data) > 0:
|
if in_table and len(table_data) > 0:
|
||||||
data = self.populate_table_as_grid(table_data)
|
data = self.populate_table_as_grid(table_data)
|
||||||
doc.add_table(data=data)
|
doc.add_table(data=data, parent=self.get_current_parent(parents))
|
||||||
|
|
||||||
in_table = False
|
in_table = False
|
||||||
table_data = []
|
table_data = []
|
||||||
|
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
|
def get_current_level(self, parents):
    """Return the level just above the first empty slot in ``parents``.

    ``parents`` maps integer levels to the node registered at that level,
    or ``None`` when nothing is registered there.  The mapping is scanned
    in its iteration order; level 0 is never treated as the empty slot.

    Returns 0 when no level above 0 is empty (including an empty mapping).
    """
    for level, node in parents.items():
        # Identity check on purpose: a node whose ``__eq__`` is permissive
        # must not be mistaken for an empty slot.
        if node is None and level > 0:
            return level - 1

    return 0
|
||||||
|
|
||||||
|
def get_current_parent(self, parents):
    """Return the node sitting just above the first empty slot in ``parents``.

    ``parents`` maps integer levels to the node registered at that level,
    or ``None`` when nothing is registered there.  The mapping is scanned
    in its iteration order; level 0 is never treated as the empty slot.

    Returns ``None`` when no level above 0 is empty, or when the level
    above the first empty slot holds no node itself.
    """
    for level, node in parents.items():
        # Identity check on purpose: a node whose ``__eq__`` is permissive
        # must not be mistaken for an empty slot.
        if node is None and level > 0:
            return parents[level - 1]

    return None
|
||||||
|
|
||||||
# Title
|
# Title
|
||||||
def is_title(self, line):
    """Return a match object when ``line`` starts with ``"= "``, else ``None``."""
    title_pattern = r"^= "
    found = re.match(title_pattern, line)
    return found
|
||||||
|
|
||||||
def parse_title(self, line):
    """Build the title record for ``line``.

    The first two characters (the ``"= "`` marker) are dropped and the
    remainder is stripped of surrounding whitespace.  The level of a
    title is always 0.
    """
    title_text = line[2:].strip()
    record = {"type": "title"}
    record["text"] = title_text
    record["level"] = 0
    return record
|
||||||
|
|
||||||
# Section headers
|
# Section headers
|
||||||
def is_section_header(self, line):
    """Return a match object when ``line`` is a section header, else ``None``.

    A section header is two or more ``=`` characters followed by
    whitespace.  Requiring the whitespace keeps this test consistent with
    the pattern ``parse_section_header`` uses to split marker and text, so
    lines such as ``"===="`` or ``"==text"`` are not reported as headers.
    """
    return re.match(r"^==+\s+", line)
|
||||||
|
|
||||||
def parse_section_header(self, line):
    """Split a section-header line into its level and text.

    The level is the number of leading ``=`` characters minus one, so
    ``"== x"`` is level 1.  Only the leading marker is counted; ``=``
    characters inside the heading text do not affect the level.

    Returns a dict with keys ``"type"`` (always ``"header"``), ``"level"``
    and ``"text"``.
    """
    match = re.match(r"^(=+)\s+(.*)", line)

    if match:
        marker = match.group(1)  # the run of '=' that opens the heading
        text = match.group(2)  # the heading text after the marker
    else:
        # No whitespace after the marker (e.g. "==text" or "===="): split
        # on the leading run of '=' by hand instead of failing on None.
        text = line.lstrip("=")
        marker = line[: len(line) - len(text)]

    # Clamp so a line with no marker at all never yields a negative level.
    header_level = max(len(marker) - 1, 0)

    return {
        "type": "header",
        "level": header_level,
        "text": text.strip(),
    }
|
||||||
|
|
||||||
# Lists
|
# Lists
|
||||||
@ -160,8 +199,20 @@ class AsciidocBackend(DeclarativeDocumentBackend):
|
|||||||
return re.match(r"^(\*|-|\d+\.|\w+\.) ", line)
|
return re.match(r"^(\*|-|\d+\.|\w+\.) ", line)
|
||||||
|
|
||||||
def parse_list_item(self, line):
    """Extract the item marker (number or bullet symbol) and the text of the item.

    Recognised markers are ``*``, ``-``, digits followed by a dot and word
    characters followed by a dot -- the same set ``is_list_item`` accepts.
    Bullet markers (``*`` and ``-``) give ``"numbered": False``; every
    other marker gives ``"numbered": True``.

    Returns a dict with keys ``"type"`` (always ``"list_item"``),
    ``"marker"``, ``"text"`` and ``"numbered"``.  When the line carries no
    recognisable marker, the marker is the empty string and the text is
    the line unchanged.
    """
    match = re.match(r"^(\*|-|\d+\.|\w+\.)\s+(.*)", line)

    if match is None:
        # Fallback if no match: there is no marker to report, so use an
        # empty one rather than reading a variable that was never bound.
        return {"type": "list_item", "marker": "", "text": line, "numbered": False}

    item_marker = match.group(1)  # the list marker (e.g., "*", "-", "1.")
    item_text = match.group(2)  # the actual text of the list item

    return {
        "type": "list_item",
        "marker": item_marker,
        "text": item_text.strip(),
        "numbered": item_marker not in ("*", "-"),
    }
|
||||||
|
|
||||||
# Tables
|
# Tables
|
||||||
def is_table_line(self, line):
    """Return a match object when ``line`` looks like a table row, else ``None``.

    A table row starts with ``|`` and contains at least one further ``|``.
    """
    row_pattern = r"^\|.*\|"
    found = re.match(row_pattern, line)
    return found
|
||||||
|
1109
poetry.lock
generated
1109
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -37,7 +37,8 @@ torchvision = [
|
|||||||
######################
|
######################
|
||||||
python = "^3.10"
|
python = "^3.10"
|
||||||
pydantic = "^2.0.0"
|
pydantic = "^2.0.0"
|
||||||
docling-core = "^2.0.1"
|
#docling-core = "^2.0.1"
|
||||||
|
docling-core = { git = "https://github.com/DS4SD/docling-core.git", rev = "334a9871fc6c666431ed6c59e5d8d69972b41f19" }
|
||||||
docling-ibm-models = "^2.0.1"
|
docling-ibm-models = "^2.0.1"
|
||||||
deepsearch-glm = "^0.25.0"
|
deepsearch-glm = "^0.25.0"
|
||||||
filetype = "^1.2.0"
|
filetype = "^1.2.0"
|
||||||
|
@ -1,8 +1,13 @@
|
|||||||
= Sample Document Title
|
= 1st Sample Document Title
|
||||||
|
|
||||||
|
This is an abstract.
|
||||||
|
|
||||||
== Section 1
|
== Section 1
|
||||||
|
|
||||||
This is some introductory text in section 1.
|
This is some introductory text in section 1.
|
||||||
|
|
||||||
|
This spans multiple lines but should be treated
|
||||||
|
as a single paragraph.
|
||||||
|
|
||||||
=== Subsection 1.1
|
=== Subsection 1.1
|
||||||
* First list item
|
* First list item
|
||||||
|
68
tests/data/test_02.asciidoc
Normal file
68
tests/data/test_02.asciidoc
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
= 2nd Sample Document Title
|
||||||
|
|
||||||
|
This is an abstract.
|
||||||
|
|
||||||
|
== Section 1: Testing nestedlists
|
||||||
|
|
||||||
|
* First item
|
||||||
|
* Nested item 1
|
||||||
|
* Nested item 2
|
||||||
|
* Second item
|
||||||
|
1. Nested ordered item 1
|
||||||
|
2. Nested ordered item 2
|
||||||
|
* Deeper nested unordered item
|
||||||
|
* Third item
|
||||||
|
1. Nested ordered item 1
|
||||||
|
2. Nested ordered item 2
|
||||||
|
* Deeper nested unordered item
|
||||||
|
3. Nested ordered item 2
|
||||||
|
|
||||||
|
== Section 2
|
||||||
|
|
||||||
|
bla bla
|
||||||
|
|
||||||
|
==== SubSubSection 2.1.1
|
||||||
|
|
||||||
|
bla bla bla
|
||||||
|
|
||||||
|
== Section 3: test image
|
||||||
|
|
||||||
|
image::images/example1.png[Example Image, width=200, height=150, align=center]
|
||||||
|
|
||||||
|
.An example caption for the image
|
||||||
|
image::images/example2.png[Example Image, width=200, height=150, align=center]
|
||||||
|
|
||||||
|
== Section 4: test tables
|
||||||
|
|
||||||
|
|Header 1|Header 2|
|
||||||
|
|Value 1|Value 2|
|
||||||
|
|Value 3|Value 4|
|
||||||
|
|
||||||
|
.Caption for the table 1
|
||||||
|
|===
|
||||||
|
|Header 1 |Header 2
|
||||||
|
|Value 1 |Value 2
|
||||||
|
|Value 3 |Value 4
|
||||||
|
|===
|
||||||
|
|
||||||
|
.Caption for the table 2
|
||||||
|
|===
|
||||||
|
|Column 1 Heading |Column 2 Heading |Column 3 Heading
|
||||||
|
|Cell 1 |Cell 2 |Cell 3
|
||||||
|
|Cell 4 |Cell 5 colspan=2|Cell spans two columns
|
||||||
|
|===
|
||||||
|
|
||||||
|
.Caption for the table 3
|
||||||
|
|===
|
||||||
|
|Column 1 Heading |Column 2 Heading |Column 3 Heading
|
||||||
|
|Rowspan=2 |Cell 2 |Cell 3
|
||||||
|
| |Cell 5 |Cell 6
|
||||||
|
|===
|
||||||
|
|
||||||
|
.Caption for the table 4
|
||||||
|
|===
|
||||||
|
|Col 1 |Col 2 |Col 3 |Col 4
|
||||||
|
|Rowspan=2.Colspan=2|Cell spanning 2 rows and 2 columns |Col 3 |Col 4
|
||||||
|
| | |Col 3 |Col 4
|
||||||
|
|Col 1 |Col 2 |Col 3 |Col 4
|
||||||
|
|===
|
@ -27,15 +27,22 @@ def test_asciidocs_examples():
|
|||||||
|
|
||||||
for fname in fnames:
|
for fname in fnames:
|
||||||
print(f"reading {fname}")
|
print(f"reading {fname}")
|
||||||
|
|
||||||
bname = os.path.basename(fname)
|
bname = os.path.basename(fname)
|
||||||
gname = os.path.join("./tests/data/groundtruth/docling_v2/", bname + ".md")
|
gname = os.path.join("./tests/data/groundtruth/docling_v2/", bname + ".md")
|
||||||
|
|
||||||
doc_backend = _get_backend(Path(fname))
|
doc_backend = _get_backend(Path(fname))
|
||||||
doc = doc_backend.convert()
|
doc = doc_backend.convert()
|
||||||
|
|
||||||
|
pred_itdoc = doc.export_to_indented_text(max_text_len=16)
|
||||||
|
print("\n\n", pred_itdoc)
|
||||||
|
|
||||||
pred_mddoc = doc.export_to_markdown()
|
pred_mddoc = doc.export_to_markdown()
|
||||||
|
print("\n\n", pred_mddoc)
|
||||||
|
|
||||||
|
pred_mddocv2 = doc.export_to_markdown(version="v2")
|
||||||
|
print("\n\n", pred_mddocv2)
|
||||||
|
|
||||||
if os.path.exists(gname):
|
if os.path.exists(gname):
|
||||||
with open(gname, "r") as fr:
|
with open(gname, "r") as fr:
|
||||||
true_mddoc = fr.read()
|
true_mddoc = fr.read()
|
||||||
|
Loading…
Reference in New Issue
Block a user