adding test_02.asciidoc

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter Staar 2024-10-21 06:01:56 +02:00
parent e60c52586b
commit c23d049270
6 changed files with 782 additions and 499 deletions

View File

@ -76,7 +76,7 @@ class AsciidocBackend(DeclarativeDocumentBackend):
content = "" content = ""
if isinstance(self.path_or_stream, Path): if isinstance(self.path_or_stream, Path):
with open(self.path_or_stream.name, "r") as fr: with open(self.path_or_stream, "r") as fr:
self.lines = fr.readlines() self.lines = fr.readlines()
# self.lines = file_content.splitlines() # self.lines = file_content.splitlines()
@ -85,30 +85,50 @@ class AsciidocBackend(DeclarativeDocumentBackend):
in_table = False in_table = False
table_data = [] table_data = []
parents = {}
for i in range(0, 10):
parents[i] = None
for line in self.lines: for line in self.lines:
line = line.strip() line = line.strip()
# Title # Title
if self.is_title(line): if self.is_title(line):
item = self.parse_title(line) item = self.parse_title(line)
doc.add_text(text=item["text"], label=DocItemLabel.TITLE) level = item["level"]
parents[level] = doc.add_text(text=item["text"], label=DocItemLabel.TITLE)
# Section headers # Section headers
elif self.is_section_header(line): elif self.is_section_header(line):
heading = self.parse_section_header(line) item = self.parse_section_header(line)
doc.add_heading(text=heading["text"], level=heading["level"]) level = item["level"]
parents[level] = doc.add_heading(text=item["text"], level=item["level"], parent=parents[level-1])
for k,v in parents.items():
if k>level:
parents[k] = None
# Lists # Lists
elif self.is_list_item(line): elif self.is_list_item(line):
if not in_list: if not in_list:
in_list = True in_list = True
level = self.get_current_level(parents)
parents[level+1] = doc.add_group(
parent=parents[level], name="list", label=GroupLabel.LIST
)
item = self.parse_list_item(line) item = self.parse_list_item(line)
doc.add_list_item(item["text"]) doc.add_list_item(item["text"], parent=self.get_current_parent(parents))
elif in_list and not self.is_list_item(line): elif in_list and not self.is_list_item(line):
in_list = False in_list = False
level = self.get_current_level(parents)
parents[level]=None
# Tables # Tables
elif self.is_table_line(line): elif self.is_table_line(line):
in_table = True in_table = True
@ -117,42 +137,61 @@ class AsciidocBackend(DeclarativeDocumentBackend):
elif in_table and not self.is_table_line(line): elif in_table and not self.is_table_line(line):
data = self.populate_table_as_grid(table_data) data = self.populate_table_as_grid(table_data)
doc.add_table(data=data) doc.add_table(data=data, parent=self.get_current_parent(parents))
in_table = False in_table = False
table_data = [] table_data = []
# Plain text # Plain text
elif line: elif len(line)>0:
item = self.parse_text(line) item = self.parse_text(line)
doc.add_text(text=item["text"], label=DocItemLabel.TEXT) doc.add_text(text=item["text"], label=DocItemLabel.PARAGRAPH, parent=self.get_current_parent(parents))
if in_table and len(table_data) > 0: if in_table and len(table_data) > 0:
data = self.populate_table_as_grid(table_data) data = self.populate_table_as_grid(table_data)
doc.add_table(data=data) doc.add_table(data=data, parent=self.get_current_parent(parents))
in_table = False in_table = False
table_data = [] table_data = []
return doc return doc
def get_current_level(self, parents):
    """Return the level just above the first empty slot in ``parents``.

    ``parents`` maps an integer nesting level to the document item that is
    currently open at that level, or to ``None`` when nothing is open there.
    The mapping is scanned in its iteration order; for the first level
    greater than 0 whose slot is ``None``, the preceding level is returned.

    Returns 0 when no such empty slot exists (including an empty mapping).
    """
    for level, node in parents.items():
        # Identity check: an item type with a custom ``__eq__`` must never
        # be mistaken for an empty slot.
        if node is None and level > 0:
            return level - 1
    return 0
def get_current_parent(self, parents):
    """Return the item stored just above the first empty slot in ``parents``.

    ``parents`` maps an integer nesting level to the document item that is
    currently open at that level, or to ``None`` when nothing is open there.
    The mapping is scanned in its iteration order; for the first level
    greater than 0 whose slot is ``None``, the entry of the preceding level
    is returned (which may itself be ``None``).

    Returns ``None`` when no such empty slot exists.
    """
    for level, node in parents.items():
        # Identity check: an item type with a custom ``__eq__`` must never
        # be mistaken for an empty slot.
        if node is None and level > 0:
            return parents[level - 1]
    return None
# Title # Title
def is_title(self, line): def is_title(self, line):
return re.match(r"^= ", line) return re.match(r"^= ", line)
def parse_title(self, line): def parse_title(self, line):
return {"type": "title", "text": line[2:].strip()} return {"type": "title", "text": line[2:].strip(), "level":0}
# Section headers # Section headers
def is_section_header(self, line): def is_section_header(self, line):
return re.match(r"^==+", line) return re.match(r"^==+", line)
def parse_section_header(self, line):
    """Split a section header line into its nesting level and title text.

    The run of leading ``=`` characters is the header marker; everything
    after it (surrounding whitespace removed) is the title. Only the
    leading run is counted, so ``=`` characters inside the title do not
    affect the level.

    Returns a dict with keys ``"type"`` (always ``"header"``), ``"level"``
    (number of leading ``=`` minus one) and ``"text"``.

    Raises:
        ValueError: if ``line`` does not start with ``=``.
    """
    # Whitespace after the marker is optional so that every line starting
    # with a run of "=" still parses, even with no space or no title.
    match = re.match(r"^(=+)\s*(.*)", line)
    if match is None:
        raise ValueError(f"not a section header line: {line!r}")

    marker, text = match.groups()  # leading "=" run, header title
    header_level = len(marker)  # number of leading '=' represents level

    return {
        "type": "header",
        "level": header_level - 1,
        "text": text.strip(),
    }
# Lists # Lists
@ -160,7 +199,19 @@ class AsciidocBackend(DeclarativeDocumentBackend):
return re.match(r"^(\*|-|\d+\.|\w+\.) ", line) return re.match(r"^(\*|-|\d+\.|\w+\.) ", line)
def parse_list_item(self, line):
    """Extract the item marker (number or bullet symbol) and the text of the item.

    Recognised markers are ``*``, ``-`` and ``<digits>.`` followed by
    whitespace.

    Returns a dict with keys ``"type"`` (always ``"list_item"``),
    ``"marker"``, ``"text"`` and ``"numbered"`` (``True`` only for
    ``<digits>.`` markers). When no recognised marker is found, the whole
    line is returned as the text with an empty marker and
    ``"numbered": False``.
    """
    match = re.match(r"^(\*|-|\d+\.)\s+(.*)", line)
    if match is None:
        # Fallback if no match: there is no marker to report, so use an
        # empty string rather than reading a variable that was never bound.
        return {"type": "list_item", "marker": "", "text": line, "numbered": False}

    item_marker, item_text = match.groups()
    return {
        "type": "list_item",
        "marker": item_marker,
        "text": item_text.strip(),
        # Bullet symbols are unordered; anything else matched is "<digits>.".
        "numbered": item_marker not in ("*", "-"),
    }
# Tables # Tables
def is_table_line(self, line): def is_table_line(self, line):

1109
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -37,7 +37,8 @@ torchvision = [
###################### ######################
python = "^3.10" python = "^3.10"
pydantic = "^2.0.0" pydantic = "^2.0.0"
docling-core = "^2.0.1" #docling-core = "^2.0.1"
docling-core = { git = "https://github.com/DS4SD/docling-core.git", rev = "334a9871fc6c666431ed6c59e5d8d69972b41f19" }
docling-ibm-models = "^2.0.1" docling-ibm-models = "^2.0.1"
deepsearch-glm = "^0.25.0" deepsearch-glm = "^0.25.0"
filetype = "^1.2.0" filetype = "^1.2.0"

View File

@ -1,9 +1,14 @@
= Sample Document Title = 1st Sample Document Title
This is an abstract.
== Section 1 == Section 1
This is some introductory text in section 1. This is some introductory text in section 1.
This spans multiple lines but should be treated
as a single paragraph.
=== Subsection 1.1 === Subsection 1.1
* First list item * First list item
* Second list item * Second list item

View File

@ -0,0 +1,68 @@
= 2nd Sample Document Title
This is an abstract.
== Section 1: Testing nestedlists
* First item
* Nested item 1
* Nested item 2
* Second item
1. Nested ordered item 1
2. Nested ordered item 2
* Deeper nested unordered item
* Third item
1. Nested ordered item 1
2. Nested ordered item 2
* Deeper nested unordered item
3. Nested ordered item 2
== Section 2
bla bla
==== SubSubSection 2.1.1
bla bla bla
== Section 3: test image
image::images/example1.png[Example Image, width=200, height=150, align=center]
.An example caption for the image
image::images/example2.png[Example Image, width=200, height=150, align=center]
== Section 4: test tables
|Header 1|Header 2|
|Value 1|Value 2|
|Value 3|Value 4|
.Caption for the table 1
|===
|Header 1 |Header 2
|Value 1 |Value 2
|Value 3 |Value 4
|===
.Caption for the table 2
|===
|Column 1 Heading |Column 2 Heading |Column 3 Heading
|Cell 1 |Cell 2 |Cell 3
|Cell 4 |Cell 5 colspan=2|Cell spans two columns
|===
.Caption for the table 3
|===
|Column 1 Heading |Column 2 Heading |Column 3 Heading
|Rowspan=2 |Cell 2 |Cell 3
| |Cell 5 |Cell 6
|===
.Caption for the table 4
|===
|Col 1 |Col 2 |Col 3 |Col 4
|Rowspan=2.Colspan=2|Cell spanning 2 rows and 2 columns |Col 3 |Col 4
| | |Col 3 |Col 4
|Col 1 |Col 2 |Col 3 |Col 4
|===

View File

@ -34,7 +34,14 @@ def test_asciidocs_examples():
doc_backend = _get_backend(Path(fname)) doc_backend = _get_backend(Path(fname))
doc = doc_backend.convert() doc = doc_backend.convert()
pred_itdoc = doc.export_to_indented_text(max_text_len=16)
print("\n\n", pred_itdoc)
pred_mddoc = doc.export_to_markdown() pred_mddoc = doc.export_to_markdown()
print("\n\n", pred_mddoc)
pred_mddocv2 = doc.export_to_markdown(version="v2")
print("\n\n", pred_mddocv2)
if os.path.exists(gname): if os.path.exists(gname):
with open(gname, "r") as fr: with open(gname, "r") as fr: