# Reconstructed from git patch c1d9241b39012463dc836b99996efc02a6ffd693
# ("updated the asciidoc backend", Peter Staar, 2024-10-18), which targets
# docling/backend/asciidoc_backend.py.  The patch adds `import re` to the
# module's existing imports and adds the `parse` method below.
import logging
import re
from io import BytesIO
from pathlib import Path


# NOTE: per the patch's hunk context, `parse` is a method of
# ASCIIDocDocumentBackend; the class header is outside this excerpt.
def parse(self, doc: "DoclingDocument"):
    """Parse the AsciiDoc source line by line and populate ``doc``.

    Each stripped line is classified with the ``is_*`` helpers and forwarded
    to the matching ``doc`` call: ``set_title``, ``add_heading``,
    ``add_listitem``, ``add_table`` or ``add_text``.  Consecutive table lines
    are buffered and emitted as a single grid as soon as a non-table line
    (or the end of the input) is reached.

    Args:
        doc: Document object to fill.

    Returns:
        The same ``doc`` object, after population.

    Side effects:
        Stores the raw input lines on ``self.lines``.
    """
    source = self.path_or_stream
    if hasattr(source, "read"):
        # File-like input (e.g. BytesIO): open() cannot take a stream.
        raw = source.read()
        if isinstance(raw, (bytes, bytearray)):
            # NOTE(review): UTF-8 is assumed for byte streams -- confirm.
            content = raw.decode("utf-8")
        else:
            content = raw
    else:
        with open(source, "r", encoding="utf-8") as fr:
            content = fr.read()
    self.lines = content.splitlines()

    # Rows of the table currently being collected (empty when not in a table).
    table_data = []

    for raw_line in self.lines:
        line = raw_line.strip()

        # Tables: keep buffering while the lines look like table rows.
        if self.is_table_line(line):
            table_data.append(self.parse_table_line(line))
            continue

        # Any other line terminates a pending table.  Flush it first so the
        # table keeps its position in the document, then process the current
        # line normally instead of discarding it.
        if table_data:
            doc.add_table(data=self.populate_table_as_grid(table_data))
            table_data = []

        if self.is_title(line):
            item = self.parse_title(line)
            doc.set_title(text=item["text"])
        elif self.is_section_header(line):
            heading = self.parse_section_header(line)
            doc.add_heading(text=heading["text"], level=heading["level"])
        elif self.is_list_item(line):
            item = self.parse_list_item(line)
            doc.add_listitem(item["text"])
        elif line:
            # Non-empty line that matched nothing else: plain text.
            item = self.parse_text(line)
            doc.add_text(text=item["text"])

    # A table that runs to the end of the input still has to be emitted.
    if table_data:
        doc.add_table(data=self.populate_table_as_grid(table_data))

    return doc
# Title
def is_title(self, line):
    """Return a truthy match when ``line`` is a document title (``= Text``)."""
    return re.match(r"^= ", line)


def parse_title(self, line):
    """Return the title record for ``line``, dropping the leading ``= ``."""
    return {"type": "title", "text": line[2:].strip()}


# Section headers
def is_section_header(self, line):
    """Return a truthy match when ``line`` is a section header.

    A header is two or more ``=`` followed by whitespace and some text, which
    mirrors the space required by ``is_title``.  Bare runs such as ``====``
    therefore no longer produce an empty heading.
    """
    return re.match(r"^={2,}\s+\S", line)


def parse_section_header(self, line):
    """Return the header record for ``line``.

    The level is the number of *leading* ``=`` characters.  ``=`` signs that
    appear inside the heading text are not counted, so ``== a = b`` is level
    2 with text ``a = b``.
    """
    remainder = line.lstrip("=")
    header_level = len(line) - len(remainder)
    return {"type": "header", "level": header_level, "text": remainder.strip()}


# Lists
def is_list_item(self, line):
    """Return a truthy match when ``line`` starts with a list marker.

    Accepted markers, each followed by a space: ``*``, ``-``, a number
    (``12.``), a single letter (``a.``) or a roman numeral (``iv.``).
    Arbitrary words ending in a period ("Hello. World") are not markers.
    """
    return re.match(r"^(\*|-|\d+\.|[A-Za-z]\.|[ivxlcdmIVXLCDM]+\.) ", line)


def parse_list_item(self, line):
    """Return the list-item record for ``line`` without its marker.

    If ``line`` does not start with a recognised marker, the stripped line is
    returned as the text unchanged.
    """
    match = re.match(
        r"^(\*|-|\d+\.|[A-Za-z]\.|[ivxlcdmIVXLCDM]+\.)\s+(.*)$", line
    )
    text = match.group(2) if match else line
    return {"type": "list_item", "text": text.strip()}


# Tables
def is_table_line(self, line):
    """Return a truthy match when ``line`` starts with ``|`` and has a second ``|``."""
    return re.match(r"^\|.*\|", line)


def parse_table_line(self, line):
    """Split a table row into trimmed cell strings.

    Only the outer border pipes are removed; empty interior cells are kept
    as ``""`` so that later cells stay in their own column.
    """
    row = line.strip()
    if row.startswith("|"):
        row = row[1:]
    if row.endswith("|"):
        row = row[:-1]
    return [cell.strip() for cell in row.split("|")]


def populate_table_as_grid(self, table_data):
    """Pad every row with ``""`` so all rows share the widest row's length.

    Returns an empty list for empty ``table_data`` instead of raising.
    """
    max_cols = max((len(row) for row in table_data), default=0)
    return [list(row) + [""] * (max_cols - len(row)) for row in table_data]


# Plain text
def parse_text(self, line):
    """Return the plain-text record for ``line``."""
    return {"type": "text", "text": line}