Merge branch 'cau/input-format-abstraction' of github.com:DS4SD/docling into cau/input-format-abstraction

This commit is contained in:
Christoph Auer 2024-10-15 14:58:10 +02:00
commit a66c4ee8eb
5 changed files with 91 additions and 25 deletions

View File

@ -94,7 +94,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
) )
doc = DoclingDocument( doc = DoclingDocument(
description=DescriptionItem(), name="name_without_extension", origin=origin description=DescriptionItem(), name="name_without_extension", origin=origin
) # TODO must add origin information ) # must add origin information
doc = self.walk_linear(self.pptx_obj, doc) doc = self.walk_linear(self.pptx_obj, doc)
return doc return doc
@ -183,7 +183,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
e_is_a_list_item = False e_is_a_list_item = False
if e_is_a_list_item: if e_is_a_list_item:
# TODO: Set marker and enumerated arguments if this is an enumeration element. # Set marker and enumerated arguments if this is an enumeration element.
enum_marker = str(enum_list_item_value) + "." enum_marker = str(enum_list_item_value) + "."
doc.add_list_item( doc.add_list_item(
marker=enum_marker, marker=enum_marker,

View File

@ -44,6 +44,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.parents[i] = None self.parents[i] = None
self.level = 0 self.level = 0
self.listIter = 0
self.history = { self.history = {
"names": [None], "names": [None],
@ -124,6 +125,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
for element in body: for element in body:
tag_name = etree.QName(element).localname tag_name = etree.QName(element).localname
# Check for Inline Images (drawings or blip elements)
found_drawing = etree.ElementBase.xpath( found_drawing = etree.ElementBase.xpath(
element, ".//w:drawing", namespaces=self.xml_namespaces element, ".//w:drawing", namespaces=self.xml_namespaces
) )
@ -137,8 +139,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.handle_tables(element, docx_obj, doc) self.handle_tables(element, docx_obj, doc)
except Exception: except Exception:
_log.error("could not parse a table, broken docx table") _log.error("could not parse a table, broken docx table")
# Check for Inline Images (drawings or blip elements)
# elif element.xpath(".//w:drawing", namespaces = self.xml_namespaces) or element.xpath(".//w:pict", namespaces = self.xml_namespaces):
elif found_drawing or found_pict: elif found_drawing or found_pict:
self.handle_pictures(element, docx_obj, doc) self.handle_pictures(element, docx_obj, doc)
# Check for Text # Check for Text
@ -166,7 +167,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# Get the numId element and extract the value # Get the numId element and extract the value
numId_elem = numPr.find("w:numId", namespaces=paragraph._element.nsmap) numId_elem = numPr.find("w:numId", namespaces=paragraph._element.nsmap)
ilvl_elem = numPr.find("w:ilvl", namespaces=paragraph._element.nsmap) ilvl_elem = numPr.find("w:ilvl", namespaces=paragraph._element.nsmap)
numId = numId_elem.get(self.XML_KEY) if numId_elem is not None else None numId = numId_elem.get(self.XML_KEY) if numId_elem is not None else None
ilvl = ilvl_elem.get(self.XML_KEY) if ilvl_elem is not None else None ilvl = ilvl_elem.get(self.XML_KEY) if ilvl_elem is not None else None
@ -215,15 +215,31 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
text = paragraph.text.strip() text = paragraph.text.strip()
# if len(text)==0 # keep empty paragraphs, they separate adjacent lists! # if len(text)==0 # keep empty paragraphs, they separate adjacent lists!
# Common styles for bullet and numbered lists.
# "List Bullet", "List Number", "List Paragraph"
# TODO: reliably identify whether list is a numbered list or not
# is_numbered = "List Bullet" not in paragraph.style.name
is_numbered = False
p_style_name, p_level = self.get_label_and_level(paragraph) p_style_name, p_level = self.get_label_and_level(paragraph)
numid, ilevel = self.get_numId_and_ilvl(paragraph) numid, ilevel = self.get_numId_and_ilvl(paragraph)
# print("numid: {}, ilevel: {}, text: {}".format(numid, ilevel, text))
# print("paragraph.text: {} | numid: {} | ilevel: {}".format(paragraph.text, numid, ilevel)) if numid == 0:
numid = None
# Handle lists # Handle lists
if numid is not None and ilevel is not None: if numid is not None and ilevel is not None:
self.add_listitem( self.add_listitem(
element, docx_obj, doc, p_style_name, p_level, numid, ilevel, text element,
docx_obj,
doc,
p_style_name,
p_level,
numid,
ilevel,
text,
is_numbered,
) )
self.update_history(p_style_name, p_level, numid, ilevel) self.update_history(p_style_name, p_level, numid, ilevel)
return return
@ -233,14 +249,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.parents[key] = None self.parents[key] = None
self.level = self.level_at_new_list - 1 self.level = self.level_at_new_list - 1
self.level_at_new_list = None self.level_at_new_list = None
if p_style_name in ["Title"]: if p_style_name in ["Title"]:
for key, val in self.parents.items(): for key, val in self.parents.items():
self.parents[key] = None self.parents[key] = None
self.parents[0] = doc.add_text( self.parents[0] = doc.add_text(
parent=None, label=DocItemLabel.TITLE, text=text parent=None, label=DocItemLabel.TITLE, text=text
) )
elif "Heading" in p_style_name: elif "Heading" in p_style_name:
self.add_header(element, docx_obj, doc, p_style_name, p_level, text) self.add_header(element, docx_obj, doc, p_style_name, p_level, text)
@ -312,8 +326,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
return return
def add_listitem( def add_listitem(
self, element, docx_obj, doc, p_style_name, p_level, numid, ilevel, text: str self,
element,
docx_obj,
doc,
p_style_name,
p_level,
numid,
ilevel,
text: str,
is_numbered=False,
): ):
# is_numbered = is_numbered
enum_marker = ""
level = self.get_level() level = self.get_level()
if self.prev_numid() is None: # Open new list if self.prev_numid() is None: # Open new list
self.level_at_new_list = level # type: ignore self.level_at_new_list = level # type: ignore
@ -323,7 +349,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
) )
# TODO: Set marker and enumerated arguments if this is an enumeration element. # TODO: Set marker and enumerated arguments if this is an enumeration element.
doc.add_list_item(parent=self.parents[level], text=text) self.listIter += 1
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=self.parents[level],
text=text,
)
elif ( elif (
self.prev_numid() == numid and self.prev_indent() < ilevel self.prev_numid() == numid and self.prev_indent() < ilevel
@ -334,12 +369,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
): ):
# TODO: determine if this is an unordered list or an ordered list. # TODO: determine if this is an unordered list or an ordered list.
# Set GroupLabel.ORDERED_LIST when it fits. # Set GroupLabel.ORDERED_LIST when it fits.
self.listIter = 0
if is_numbered:
self.parents[i] = doc.add_group(
label=GroupLabel.ORDERED_LIST,
name="list",
parent=self.parents[i - 1],
)
else:
self.parents[i] = doc.add_group( self.parents[i] = doc.add_group(
label=GroupLabel.LIST, name="list", parent=self.parents[i - 1] label=GroupLabel.LIST, name="list", parent=self.parents[i - 1]
) )
# TODO: Set marker and enumerated arguments if this is an enumeration element. # TODO: Set marker and enumerated arguments if this is an enumeration element.
self.listIter += 1
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
doc.add_list_item( doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=self.parents[self.level_at_new_list + ilevel], parent=self.parents[self.level_at_new_list + ilevel],
text=text, text=text,
) )
@ -350,14 +399,30 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.parents[k] = None self.parents[k] = None
# TODO: Set marker and enumerated arguments if this is an enumeration element. # TODO: Set marker and enumerated arguments if this is an enumeration element.
self.listIter += 1
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
doc.add_list_item( doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=self.parents[self.level_at_new_list + ilevel], parent=self.parents[self.level_at_new_list + ilevel],
text=text, text=text,
) )
self.listIter = 0
elif self.prev_numid() == numid or self.prev_indent() == ilevel: elif self.prev_numid() == numid or self.prev_indent() == ilevel:
# TODO: Set marker and enumerated arguments if this is an enumeration element. # TODO: Set marker and enumerated arguments if this is an enumeration element.
doc.add_list_item(parent=self.parents[level - 1], text=text) self.listIter += 1
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=self.parents[level - 1],
text=text,
)
return return
def handle_tables(self, element, docx_obj, doc): def handle_tables(self, element, docx_obj, doc):

17
poetry.lock generated
View File

@ -885,7 +885,7 @@ files = []
develop = false develop = false
[package.dependencies] [package.dependencies]
docling-core = {git = "https://github.com/DS4SD/docling-core.git", rev = "6fee533a101ca08f62e88826218c33e0aab2f417"} docling-core = {git = "https://github.com/DS4SD/docling-core.git", rev = "7c104d61aa5d003dd8d9711c37e23ce04799f4c9"}
docutils = "!=0.21" docutils = "!=0.21"
matplotlib = "^3.7.1" matplotlib = "^3.7.1"
networkx = "^3.1" networkx = "^3.1"
@ -909,8 +909,8 @@ toolkit = ["deepsearch-toolkit (>=0.31.0)"]
[package.source] [package.source]
type = "git" type = "git"
url = "https://github.com/DS4SD/deepsearch-glm.git" url = "https://github.com/DS4SD/deepsearch-glm.git"
reference = "c13a6cdda25206911d63a5a28e990217ad823068" reference = "c185c4f985ccd29a470a1cddd3bec43880b739ee"
resolved_reference = "c13a6cdda25206911d63a5a28e990217ad823068" resolved_reference = "c185c4f985ccd29a470a1cddd3bec43880b739ee"
[[package]] [[package]]
name = "dill" name = "dill"
@ -952,14 +952,15 @@ json-schema-for-humans = "^1.0.0"
jsonref = "^1.1.0" jsonref = "^1.1.0"
jsonschema = "^4.16.0" jsonschema = "^4.16.0"
pandas = "^2.1.4" pandas = "^2.1.4"
pillow = "^10.3.0"
pydantic = "^2.6.0" pydantic = "^2.6.0"
tabulate = "^0.9.0" tabulate = "^0.9.0"
[package.source] [package.source]
type = "git" type = "git"
url = "https://github.com/DS4SD/docling-core.git" url = "https://github.com/DS4SD/docling-core.git"
reference = "6fee533a101ca08f62e88826218c33e0aab2f417" reference = "7c104d61aa5d003dd8d9711c37e23ce04799f4c9"
resolved_reference = "6fee533a101ca08f62e88826218c33e0aab2f417" resolved_reference = "7c104d61aa5d003dd8d9711c37e23ce04799f4c9"
[[package]] [[package]]
name = "docling-ibm-models" name = "docling-ibm-models"
@ -3441,9 +3442,9 @@ files = [
[package.dependencies] [package.dependencies]
numpy = [ numpy = [
{version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
] ]
[[package]] [[package]]
@ -3577,8 +3578,8 @@ files = [
[package.dependencies] [package.dependencies]
numpy = [ numpy = [
{version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
{version = ">=1.22.4", markers = "python_version < \"3.11\""}, {version = ">=1.22.4", markers = "python_version < \"3.11\""},
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
] ]
python-dateutil = ">=2.8.2" python-dateutil = ">=2.8.2"
pytz = ">=2020.1" pytz = ">=2020.1"
@ -7114,4 +7115,4 @@ tesserocr = ["tesserocr"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.10" python-versions = "^3.10"
content-hash = "46f6c1eb76034223f7d65760f6ebe0989ba9e8aff46fcdbce82c147030fcb8be" content-hash = "14143d6cc79f4c2c8a4d021711198697e91ca01ecf290dd270b483984461c3d1"

View File

@ -37,9 +37,9 @@ torchvision = [
###################### ######################
python = "^3.10" python = "^3.10"
pydantic = "^2.0.0" pydantic = "^2.0.0"
docling-core = {git = "https://github.com/DS4SD/docling-core.git", rev = "6fee533a101ca08f62e88826218c33e0aab2f417"} docling-core = {git = "https://github.com/DS4SD/docling-core.git", rev = "7c104d61aa5d003dd8d9711c37e23ce04799f4c9"}
docling-ibm-models = {git = "https://github.com/DS4SD/docling-ibm-models.git", rev = "1d2e2a2e6eb152c237f1383cdba20cf85db80b97"} docling-ibm-models = {git = "https://github.com/DS4SD/docling-ibm-models.git", rev = "1d2e2a2e6eb152c237f1383cdba20cf85db80b97"}
deepsearch-glm = {git = "https://github.com/DS4SD/deepsearch-glm.git", rev = "c13a6cdda25206911d63a5a28e990217ad823068"} deepsearch-glm = {git = "https://github.com/DS4SD/deepsearch-glm.git", rev = "c185c4f985ccd29a470a1cddd3bec43880b739ee"}
docling-parse = "^1.5.1" docling-parse = "^1.5.1"
filetype = "^1.2.0" filetype = "^1.2.0"

Binary file not shown.