mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-01 15:02:21 +00:00
Merge branch 'main' into force_ocr
This commit is contained in:
commit
088ce5f696
@ -120,6 +120,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.handle_header(element, idx, doc)
|
self.handle_header(element, idx, doc)
|
||||||
elif element.name in ["p"]:
|
elif element.name in ["p"]:
|
||||||
self.handle_paragraph(element, idx, doc)
|
self.handle_paragraph(element, idx, doc)
|
||||||
|
elif element.name in ["pre"]:
|
||||||
|
self.handle_code(element, idx, doc)
|
||||||
elif element.name in ["ul", "ol"]:
|
elif element.name in ["ul", "ol"]:
|
||||||
self.handle_list(element, idx, doc)
|
self.handle_list(element, idx, doc)
|
||||||
elif element.name in ["li"]:
|
elif element.name in ["li"]:
|
||||||
@ -205,6 +207,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
level=hlevel,
|
level=hlevel,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def handle_code(self, element, idx, doc):
    """Handles monospace code snippets (pre).

    Adds the whitespace-stripped text of ``element`` to ``doc`` as a text
    item labelled ``DocItemLabel.CODE``, attached to the parent stored at
    the current nesting level (``self.parents[self.level]``).

    Nothing is added when ``element.text`` is ``None`` or contains only
    whitespace. ``idx`` is accepted for signature parity with the other
    handlers and is not used here.
    """
    raw_text = element.text
    if raw_text is None:
        return

    snippet = raw_text.strip()
    code_label = DocItemLabel.CODE
    if not snippet:
        # Whitespace-only <pre> blocks carry no content worth recording.
        return

    doc.add_text(
        parent=self.parents[self.level],
        label=code_label,
        text=snippet,
    )
|
||||||
|
|
||||||
def handle_paragraph(self, element, idx, doc):
|
def handle_paragraph(self, element, idx, doc):
|
||||||
"""Handles paragraph tags (p)."""
|
"""Handles paragraph tags (p)."""
|
||||||
if element.text is None:
|
if element.text is None:
|
||||||
|
@ -358,41 +358,36 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|||||||
|
|
||||||
size = Size(width=slide_width, height=slide_height)
|
size = Size(width=slide_width, height=slide_height)
|
||||||
parent_page = doc.add_page(page_no=slide_ind + 1, size=size)
|
parent_page = doc.add_page(page_no=slide_ind + 1, size=size)
|
||||||
# parent_page = doc.add_page(page_no=slide_ind, size=size, hash=hash)
|
|
||||||
|
|
||||||
# Loop through each shape in the slide
|
|
||||||
for shape in slide.shapes:
|
|
||||||
|
|
||||||
|
def handle_shapes(shape, parent_slide, slide_ind, doc):
|
||||||
|
handle_groups(shape, parent_slide, slide_ind, doc)
|
||||||
if shape.has_table:
|
if shape.has_table:
|
||||||
# Handle Tables
|
# Handle Tables
|
||||||
self.handle_tables(shape, parent_slide, slide_ind, doc)
|
self.handle_tables(shape, parent_slide, slide_ind, doc)
|
||||||
|
|
||||||
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
|
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
|
||||||
# Handle Tables
|
# Handle Pictures
|
||||||
self.handle_pictures(shape, parent_slide, slide_ind, doc)
|
self.handle_pictures(shape, parent_slide, slide_ind, doc)
|
||||||
|
|
||||||
# If shape doesn't have any text, move on to the next shape
|
# If shape doesn't have any text, move on to the next shape
|
||||||
if not hasattr(shape, "text"):
|
if not hasattr(shape, "text"):
|
||||||
continue
|
return
|
||||||
if shape.text is None:
|
if shape.text is None:
|
||||||
continue
|
return
|
||||||
if len(shape.text.strip()) == 0:
|
if len(shape.text.strip()) == 0:
|
||||||
continue
|
return
|
||||||
if not shape.has_text_frame:
|
if not shape.has_text_frame:
|
||||||
_log.warn("Warning: shape has text but not text_frame")
|
_log.warning("Warning: shape has text but not text_frame")
|
||||||
continue
|
return
|
||||||
|
|
||||||
# if shape.is_placeholder:
|
|
||||||
# Handle Titles (Headers) and Subtitles
|
|
||||||
# Check if the shape is a placeholder (titles are placeholders)
|
|
||||||
# self.handle_title(shape, parent_slide, slide_ind, doc)
|
|
||||||
# self.handle_text_elements(shape, parent_slide, slide_ind, doc)
|
|
||||||
# else:
|
|
||||||
|
|
||||||
# Handle other text elements, including lists (bullet lists, numbered lists)
|
# Handle other text elements, including lists (bullet lists, numbered lists)
|
||||||
self.handle_text_elements(shape, parent_slide, slide_ind, doc)
|
self.handle_text_elements(shape, parent_slide, slide_ind, doc)
|
||||||
|
return
|
||||||
|
|
||||||
# figures...
|
def handle_groups(shape, parent_slide, slide_ind, doc):
|
||||||
# doc.add_figure(data=BaseFigureData(), parent=self.parents[self.level], caption=None)
|
if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
|
||||||
|
for groupedshape in shape.shapes:
|
||||||
|
handle_shapes(groupedshape, parent_slide, slide_ind, doc)
|
||||||
|
|
||||||
|
# Loop through each shape in the slide
|
||||||
|
for shape in slide.shapes:
|
||||||
|
handle_shapes(shape, parent_slide, slide_ind, doc)
|
||||||
|
|
||||||
return doc
|
return doc
|
||||||
|
3
docs/concepts/index.md
Normal file
3
docs/concepts/index.md
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
In this area you can find guides on the main Docling concepts.
|
||||||
|
|
||||||
|
Use the navigation on the left to browse through them.
|
3
docs/examples/index.md
Normal file
3
docs/examples/index.md
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
In this area you can find examples covering a range of possible workflows and use cases.
|
||||||
|
|
||||||
|
Use the navigation on the left to browse through them.
|
3
docs/integrations/index.md
Normal file
3
docs/integrations/index.md
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
In this area you can find guides on the Docling integrations with popular frameworks and tools.
|
||||||
|
|
||||||
|
Use the navigation on the left to browse through them.
|
@ -39,7 +39,7 @@ theme:
|
|||||||
- content.code.copy
|
- content.code.copy
|
||||||
- announce.dismiss
|
- announce.dismiss
|
||||||
- navigation.tabs
|
- navigation.tabs
|
||||||
# - navigation.indexes # <= if set, each "section" can have its own page, if index.md is used
|
- navigation.indexes # <= if set, each "section" can have its own page, if index.md is used
|
||||||
- navigation.instant
|
- navigation.instant
|
||||||
- navigation.instant.prefetch
|
- navigation.instant.prefetch
|
||||||
# - navigation.instant.preview
|
# - navigation.instant.preview
|
||||||
@ -57,9 +57,11 @@ nav:
|
|||||||
- Usage: usage.md
|
- Usage: usage.md
|
||||||
- Docling v2: v2.md
|
- Docling v2: v2.md
|
||||||
- Concepts:
|
- Concepts:
|
||||||
|
- Concepts: concepts/index.md
|
||||||
- Docling Document: concepts/docling_document.md
|
- Docling Document: concepts/docling_document.md
|
||||||
# - Chunking: concepts/chunking.md
|
# - Chunking: concepts/chunking.md
|
||||||
- Examples:
|
- Examples:
|
||||||
|
- Examples: examples/index.md
|
||||||
- Conversion:
|
- Conversion:
|
||||||
- "Simple conversion": examples/minimal.py
|
- "Simple conversion": examples/minimal.py
|
||||||
- "Custom conversion": examples/custom_convert.py
|
- "Custom conversion": examples/custom_convert.py
|
||||||
@ -78,6 +80,7 @@ nav:
|
|||||||
# - CLI:
|
# - CLI:
|
||||||
# - CLI: examples/cli.md
|
# - CLI: examples/cli.md
|
||||||
- Integrations:
|
- Integrations:
|
||||||
|
- Integrations: integrations/index.md
|
||||||
- "LlamaIndex 🦙 extension": integrations/llamaindex.md
|
- "LlamaIndex 🦙 extension": integrations/llamaindex.md
|
||||||
# - "LangChain 🦜🔗 extension": integrations/langchain.md
|
# - "LangChain 🦜🔗 extension": integrations/langchain.md
|
||||||
# - API reference:
|
# - API reference:
|
||||||
|
Loading…
Reference in New Issue
Block a user