mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 14:04:27 +00:00
Merge branch 'main' into force_ocr
This commit is contained in:
commit
088ce5f696
@ -120,6 +120,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.handle_header(element, idx, doc)
|
||||
elif element.name in ["p"]:
|
||||
self.handle_paragraph(element, idx, doc)
|
||||
elif element.name in ["pre"]:
|
||||
self.handle_code(element, idx, doc)
|
||||
elif element.name in ["ul", "ol"]:
|
||||
self.handle_list(element, idx, doc)
|
||||
elif element.name in ["li"]:
|
||||
@ -205,6 +207,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
level=hlevel,
|
||||
)
|
||||
|
||||
def handle_code(self, element, idx, doc):
    """Handles monospace code snippets (pre).

    Adds the stripped text of ``element`` to ``doc`` as a code item under
    the current parent. Elements whose text is missing or whitespace-only
    are skipped and nothing is added.

    Args:
        element: Parsed ``pre`` element; only its ``text`` attribute is read.
        idx: Index passed by the caller; not used here.
        doc: Document being built; receives the snippet via ``add_text``.
    """
    # A missing text and a blank text are handled alike: nothing to add.
    text = (element.text or "").strip()
    if not text:
        return
    doc.add_text(
        parent=self.parents[self.level], label=DocItemLabel.CODE, text=text
    )
|
||||
|
||||
def handle_paragraph(self, element, idx, doc):
|
||||
"""Handles paragraph tags (p)."""
|
||||
if element.text is None:
|
||||
|
@ -358,41 +358,36 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
|
||||
size = Size(width=slide_width, height=slide_height)
|
||||
parent_page = doc.add_page(page_no=slide_ind + 1, size=size)
|
||||
# parent_page = doc.add_page(page_no=slide_ind, size=size, hash=hash)
|
||||
|
||||
# Loop through each shape in the slide
|
||||
for shape in slide.shapes:
|
||||
|
||||
def handle_shapes(shape, parent_slide, slide_ind, doc):
|
||||
handle_groups(shape, parent_slide, slide_ind, doc)
|
||||
if shape.has_table:
|
||||
# Handle Tables
|
||||
self.handle_tables(shape, parent_slide, slide_ind, doc)
|
||||
|
||||
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
|
||||
# Handle Tables
|
||||
# Handle Pictures
|
||||
self.handle_pictures(shape, parent_slide, slide_ind, doc)
|
||||
|
||||
# If shape doesn't have any text, move on to the next shape
|
||||
if not hasattr(shape, "text"):
|
||||
continue
|
||||
return
|
||||
if shape.text is None:
|
||||
continue
|
||||
return
|
||||
if len(shape.text.strip()) == 0:
|
||||
continue
|
||||
return
|
||||
if not shape.has_text_frame:
|
||||
_log.warn("Warning: shape has text but not text_frame")
|
||||
continue
|
||||
|
||||
# if shape.is_placeholder:
|
||||
# Handle Titles (Headers) and Subtitles
|
||||
# Check if the shape is a placeholder (titles are placeholders)
|
||||
# self.handle_title(shape, parent_slide, slide_ind, doc)
|
||||
# self.handle_text_elements(shape, parent_slide, slide_ind, doc)
|
||||
# else:
|
||||
|
||||
_log.warning("Warning: shape has text but not text_frame")
|
||||
return
|
||||
# Handle other text elements, including lists (bullet lists, numbered lists)
|
||||
self.handle_text_elements(shape, parent_slide, slide_ind, doc)
|
||||
return
|
||||
|
||||
# figures...
|
||||
# doc.add_figure(data=BaseFigureData(), parent=self.parents[self.level], caption=None)
|
||||
def handle_groups(shape, parent_slide, slide_ind, doc):
|
||||
if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
|
||||
for groupedshape in shape.shapes:
|
||||
handle_shapes(groupedshape, parent_slide, slide_ind, doc)
|
||||
|
||||
# Loop through each shape in the slide
|
||||
for shape in slide.shapes:
|
||||
handle_shapes(shape, parent_slide, slide_ind, doc)
|
||||
|
||||
return doc
|
||||
|
3
docs/concepts/index.md
Normal file
3
docs/concepts/index.md
Normal file
@ -0,0 +1,3 @@
|
||||
In this area you can find guides on the main Docling concepts.
|
||||
|
||||
Use the navigation on the left to browse through them.
|
3
docs/examples/index.md
Normal file
3
docs/examples/index.md
Normal file
@ -0,0 +1,3 @@
|
||||
In this area you can find examples covering a range of possible workflows and use cases.
|
||||
|
||||
Use the navigation on the left to browse through them.
|
3
docs/integrations/index.md
Normal file
3
docs/integrations/index.md
Normal file
@ -0,0 +1,3 @@
|
||||
In this area you can find guides on the Docling integrations with popular frameworks and tools.
|
||||
|
||||
Use the navigation on the left to browse through them.
|
@ -39,7 +39,7 @@ theme:
|
||||
- content.code.copy
|
||||
- announce.dismiss
|
||||
- navigation.tabs
|
||||
# - navigation.indexes # <= if set, each "section" can have its own page, if index.md is used
|
||||
- navigation.indexes # <= if set, each "section" can have its own page, if index.md is used
|
||||
- navigation.instant
|
||||
- navigation.instant.prefetch
|
||||
# - navigation.instant.preview
|
||||
@ -57,9 +57,11 @@ nav:
|
||||
- Usage: usage.md
|
||||
- Docling v2: v2.md
|
||||
- Concepts:
|
||||
- Concepts: concepts/index.md
|
||||
- Docling Document: concepts/docling_document.md
|
||||
# - Chunking: concepts/chunking.md
|
||||
- Examples:
|
||||
- Examples: examples/index.md
|
||||
- Conversion:
|
||||
- "Simple conversion": examples/minimal.py
|
||||
- "Custom conversion": examples/custom_convert.py
|
||||
@ -78,6 +80,7 @@ nav:
|
||||
# - CLI:
|
||||
# - CLI: examples/cli.md
|
||||
- Integrations:
|
||||
- Integrations: integrations/index.md
|
||||
- "LlamaIndex 🦙 extension": integrations/llamaindex.md
|
||||
# - "LangChain 🦜🔗 extension": integrations/langchain.md
|
||||
# - API reference:
|
||||
|
Loading…
Reference in New Issue
Block a user