Merge branch 'main' into force_ocr

This commit is contained in:
Nikos Livathinos 2024-11-11 17:47:02 +01:00
commit 088ce5f696
6 changed files with 42 additions and 23 deletions

View File

@ -120,6 +120,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.handle_header(element, idx, doc)
elif element.name in ["p"]:
self.handle_paragraph(element, idx, doc)
elif element.name in ["pre"]:
self.handle_code(element, idx, doc)
elif element.name in ["ul", "ol"]:
self.handle_list(element, idx, doc)
elif element.name in ["li"]:
@ -205,6 +207,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
level=hlevel,
)
def handle_code(self, element, idx, doc):
    """Add the text of a preformatted (pre) element to the document as code.

    Leading and trailing whitespace is stripped from the element's text.
    Nothing is added when the element has no text or when the stripped
    text is empty.
    """
    raw = element.text
    if raw is None:
        return

    snippet = raw.strip()
    if not snippet:
        return

    # Attach the snippet under the parent tracked for the current level.
    doc.add_text(
        parent=self.parents[self.level],
        label=DocItemLabel.CODE,
        text=snippet,
    )
def handle_paragraph(self, element, idx, doc):
"""Handles paragraph tags (p)."""
if element.text is None:

View File

@ -358,41 +358,36 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
size = Size(width=slide_width, height=slide_height)
parent_page = doc.add_page(page_no=slide_ind + 1, size=size)
# parent_page = doc.add_page(page_no=slide_ind, size=size, hash=hash)
# Loop through each shape in the slide
for shape in slide.shapes:
def handle_shapes(shape, parent_slide, slide_ind, doc):
handle_groups(shape, parent_slide, slide_ind, doc)
if shape.has_table:
# Handle Tables
self.handle_tables(shape, parent_slide, slide_ind, doc)
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
# Handle Tables
# Handle Pictures
self.handle_pictures(shape, parent_slide, slide_ind, doc)
# If shape doesn't have any text, move on to the next shape
if not hasattr(shape, "text"):
continue
return
if shape.text is None:
continue
return
if len(shape.text.strip()) == 0:
continue
return
if not shape.has_text_frame:
_log.warn("Warning: shape has text but not text_frame")
continue
# if shape.is_placeholder:
# Handle Titles (Headers) and Subtitles
# Check if the shape is a placeholder (titles are placeholders)
# self.handle_title(shape, parent_slide, slide_ind, doc)
# self.handle_text_elements(shape, parent_slide, slide_ind, doc)
# else:
_log.warning("Warning: shape has text but not text_frame")
return
# Handle other text elements, including lists (bullet lists, numbered lists)
self.handle_text_elements(shape, parent_slide, slide_ind, doc)
return
# figures...
# doc.add_figure(data=BaseFigureData(), parent=self.parents[self.level], caption=None)
def handle_groups(shape, parent_slide, slide_ind, doc):
if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
for groupedshape in shape.shapes:
handle_shapes(groupedshape, parent_slide, slide_ind, doc)
# Loop through each shape in the slide
for shape in slide.shapes:
handle_shapes(shape, parent_slide, slide_ind, doc)
return doc

3
docs/concepts/index.md Normal file
View File

@ -0,0 +1,3 @@
In this area you can find guides on the main Docling concepts.
Use the navigation on the left to browse through them.

3
docs/examples/index.md Normal file
View File

@ -0,0 +1,3 @@
In this area you can find examples covering a range of possible workflows and use cases.
Use the navigation on the left to browse through them.

View File

@ -0,0 +1,3 @@
In this area you can find guides on Docling's integrations with popular frameworks and tools.
Use the navigation on the left to browse through them.

View File

@ -39,7 +39,7 @@ theme:
- content.code.copy
- announce.dismiss
- navigation.tabs
# - navigation.indexes # <= if set, each "section" can have its own page, if index.md is used
- navigation.indexes # <= if set, each "section" can have its own page, if index.md is used
- navigation.instant
- navigation.instant.prefetch
# - navigation.instant.preview
@ -57,9 +57,11 @@ nav:
- Usage: usage.md
- Docling v2: v2.md
- Concepts:
- Concepts: concepts/index.md
- Docling Document: concepts/docling_document.md
# - Chunking: concepts/chunking.md
- Examples:
- Examples: examples/index.md
- Conversion:
- "Simple conversion": examples/minimal.py
- "Custom conversion": examples/custom_convert.py
@ -78,6 +80,7 @@ nav:
# - CLI:
# - CLI: examples/cli.md
- Integrations:
- Integrations: integrations/index.md
- "LlamaIndex 🦙 extension": integrations/llamaindex.md
# - "LangChain 🦜🔗 extension": integrations/langchain.md
# - API reference: