mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 03:55:00 +00:00
Merge branch 'main' into nli/layoutmodel_improvements
Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
commit
a553a1e5bf
13
.github/codecov.yml
vendored
13
.github/codecov.yml
vendored
@ -7,11 +7,12 @@ coverage:
|
||||
default:
|
||||
informational: true
|
||||
target: auto # auto compares coverage to the previous base commit
|
||||
if_ci_failed: success
|
||||
flags:
|
||||
- docling
|
||||
comment:
|
||||
layout: "reach, diff, flags, files"
|
||||
behavior: default
|
||||
require_changes: false # if true: only post the comment if coverage changes
|
||||
branches: # branch names that can post comment
|
||||
- "main"
|
||||
comment:
|
||||
layout: "reach, diff, flags, files"
|
||||
behavior: default
|
||||
require_changes: false # if true: only post the comment if coverage changes
|
||||
branches: # branch names that can post comment
|
||||
- "main"
|
||||
|
2
.github/workflows/checks.yml
vendored
2
.github/workflows/checks.yml
vendored
@ -46,7 +46,7 @@ jobs:
|
||||
uses: codecov/codecov-action@v5
|
||||
with:
|
||||
token: ${{ secrets.CODECOV_TOKEN }}
|
||||
file: ./coverage.xml
|
||||
files: ./coverage.xml
|
||||
- name: Run examples
|
||||
run: |
|
||||
for file in docs/examples/*.py; do
|
||||
|
@ -22,6 +22,7 @@
|
||||
[](https://opensource.org/licenses/MIT)
|
||||
[](https://pepy.tech/projects/docling)
|
||||
[](https://apify.com/vancura/docling)
|
||||
[](https://www.bestpractices.dev/projects/10101)
|
||||
[](https://lfaidata.foundation/projects/)
|
||||
|
||||
Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.
|
||||
|
@ -26,6 +26,8 @@ _log = logging.getLogger(__name__)
|
||||
|
||||
# tags that generate NodeItem elements
|
||||
TAGS_FOR_NODE_ITEMS: Final = [
|
||||
"address",
|
||||
"details",
|
||||
"h1",
|
||||
"h2",
|
||||
"h3",
|
||||
@ -38,6 +40,7 @@ TAGS_FOR_NODE_ITEMS: Final = [
|
||||
"ul",
|
||||
"ol",
|
||||
"li",
|
||||
"summary",
|
||||
"table",
|
||||
"figure",
|
||||
"img",
|
||||
@ -163,7 +166,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
|
||||
if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
|
||||
self.handle_header(tag, doc)
|
||||
elif tag.name in ["p"]:
|
||||
elif tag.name in ["p", "address", "summary"]:
|
||||
self.handle_paragraph(tag, doc)
|
||||
elif tag.name in ["pre", "code"]:
|
||||
self.handle_code(tag, doc)
|
||||
@ -177,6 +180,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.handle_figure(tag, doc)
|
||||
elif tag.name == "img":
|
||||
self.handle_image(tag, doc)
|
||||
elif tag.name == "details":
|
||||
self.handle_details(tag, doc)
|
||||
else:
|
||||
self.walk(tag, doc)
|
||||
|
||||
@ -201,6 +206,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
return ["".join(result) + " "]
|
||||
|
||||
def handle_details(self, element: Tag, doc: DoclingDocument) -> None:
|
||||
"""Handle details tag (details) and its content."""
|
||||
|
||||
self.parents[self.level + 1] = doc.add_group(
|
||||
name="details",
|
||||
label=GroupLabel.SECTION,
|
||||
parent=self.parents[self.level],
|
||||
content_layer=self.content_layer,
|
||||
)
|
||||
|
||||
self.level += 1
|
||||
self.walk(element, doc)
|
||||
self.parents[self.level + 1] = None
|
||||
self.level -= 1
|
||||
|
||||
def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
|
||||
"""Handles header tags (h1, h2, etc.)."""
|
||||
hlevel = int(element.name.replace("h", ""))
|
||||
@ -258,7 +278,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
|
||||
def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
|
||||
"""Handles paragraph tags (p)."""
|
||||
"""Handles paragraph tags (p) or equivalent ones."""
|
||||
if element.text is None:
|
||||
return
|
||||
text = element.text.strip()
|
||||
|
@ -421,7 +421,7 @@ def convert( # noqa: C901
|
||||
logging.basicConfig(level=logging.WARNING)
|
||||
elif verbose == 1:
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
elif verbose == 2:
|
||||
else:
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
settings.debug.visualize_cells = debug_visualize_cells
|
||||
|
@ -6,7 +6,7 @@ For each document format, the *document converter* knows which format-specific *
|
||||
|
||||
!!! tip
|
||||
|
||||
While the document converter holds a default mapping, this configuration is parametrizable, so e.g. for the PDF format, different backends and different pipeline options can be used — see [Usage](../usage.md#adjust-pipeline-features).
|
||||
While the document converter holds a default mapping, this configuration is parametrizable, so e.g. for the PDF format, different backends and different pipeline options can be used — see [Usage](../usage/index.md#adjust-pipeline-features).
|
||||
|
||||
The *conversion result* contains the [*Docling document*](./docling_document.md), Docling's fundamental document representation.
|
||||
|
||||
|
@ -31,7 +31,7 @@ The first category is the _content items_, which are stored in these fields:
|
||||
All of the above fields are lists and store items inheriting from the `DocItem` type. They can express different
|
||||
data structures depending on their type, and reference parents and children through JSON pointers.
|
||||
|
||||
The second category is _content structure_, which is encapsualted in:
|
||||
The second category is _content structure_, which is encapsulated in:
|
||||
|
||||
- `body`: The root node of a tree-structure for the main document body
|
||||
- `furniture`: The root node of a tree-structure for all items that don't belong into the body (headers, footers, ...)
|
||||
@ -49,7 +49,7 @@ Below example shows how all items in the first page are nested below the `title`
|
||||
|
||||
### Grouping
|
||||
|
||||
Below example shows how all items under the heading "Let's swim" (`#/texts/5`) are nested as chilrden. The children of
|
||||
Below example shows how all items under the heading "Let's swim" (`#/texts/5`) are nested as children. The children of
|
||||
"Let's swim" are both text items and groups, which contain the list elements. The group items are stored in the
|
||||
top-level `groups` field.
|
||||
|
||||
|
@ -80,7 +80,7 @@ for source in sources:
|
||||
fp.write(json.dumps(res.document.export_to_dict()))
|
||||
|
||||
res.document.save_as_json(
|
||||
out_path / f"{res.input.file.stem}.md",
|
||||
out_path / f"{res.input.file.stem}.json",
|
||||
image_mode=ImageRefMode.PLACEHOLDER,
|
||||
)
|
||||
|
||||
|
@ -13,6 +13,7 @@
|
||||
[](https://github.com/pre-commit/pre-commit)
|
||||
[](https://opensource.org/licenses/MIT)
|
||||
[](https://pepy.tech/projects/docling)
|
||||
[](https://www.bestpractices.dev/projects/10101)
|
||||
[](https://lfaidata.foundation/projects/)
|
||||
|
||||
Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.
|
||||
|
2267
poetry.lock
generated
2267
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -93,6 +93,7 @@ pluggy = "^1.0.0"
|
||||
pylatexenc = "^2.10"
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
python = "^3.9.2"
|
||||
black = { extras = ["jupyter"], version = "^24.4.2" }
|
||||
pytest = "^7.2.2"
|
||||
pre-commit = "^3.7.1"
|
||||
|
@ -4,4 +4,7 @@ item-0 at level 0: unspecified: group _root_
|
||||
item-3 at level 1: text: This is a regular paragraph.
|
||||
item-4 at level 1: text: This is a third div
|
||||
with a new line.
|
||||
item-5 at level 1: text: This is a fourth div with a bold paragraph.
|
||||
item-5 at level 1: section: group details
|
||||
item-6 at level 2: text: Heading for the details element
|
||||
item-7 at level 2: text: Description of the details element.
|
||||
item-8 at level 1: text: This is a fourth div with a bold paragraph.
|
@ -4,7 +4,7 @@
|
||||
"name": "example_06",
|
||||
"origin": {
|
||||
"mimetype": "text/html",
|
||||
"binary_hash": 14574683870626799530,
|
||||
"binary_hash": 10224930410364781672,
|
||||
"filename": "example_06.html"
|
||||
},
|
||||
"furniture": {
|
||||
@ -30,14 +30,35 @@
|
||||
"$ref": "#/texts/3"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/4"
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/6"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"groups": [],
|
||||
"groups": [
|
||||
{
|
||||
"self_ref": "#/groups/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/4"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/5"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "details",
|
||||
"label": "section"
|
||||
}
|
||||
],
|
||||
"texts": [
|
||||
{
|
||||
"self_ref": "#/texts/0",
|
||||
@ -89,6 +110,30 @@
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/4",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Heading for the details element",
|
||||
"text": "Heading for the details element"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/5",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Description of the details element.",
|
||||
"text": "Description of the details element."
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/6",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
|
@ -7,4 +7,8 @@ This is a regular paragraph.
|
||||
This is a third div
|
||||
with a new line.
|
||||
|
||||
Heading for the details element
|
||||
|
||||
Description of the details element.
|
||||
|
||||
This is a fourth div with a bold paragraph.
|
@ -7,6 +7,10 @@
|
||||
<div>This is another div with text.</div>
|
||||
<p>This is a regular paragraph.</p>
|
||||
<div>This is a third div<br/>with a new line.</div>
|
||||
<details>
|
||||
<summary>Heading for the details element</summary>
|
||||
<p>Description of the details element.</p>
|
||||
</details>
|
||||
<div><p>This is a fourth div with a <b>bold</b> paragraph.</p></div>
|
||||
</body>
|
||||
</html>
|
||||
|
Loading…
Reference in New Issue
Block a user