mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Documentation updates, remove DescriptionItem in DoclingDocument init
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
07206c5b3e
commit
734d77c8ae
@ -5,12 +5,12 @@ from typing import Set, Union
|
|||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
DescriptionItem,
|
DocItemLabel,
|
||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
|
GroupLabel,
|
||||||
TableCell,
|
TableCell,
|
||||||
TableData,
|
TableData,
|
||||||
)
|
)
|
||||||
from docling_core.types.doc.labels import DocItemLabel, GroupLabel
|
|
||||||
|
|
||||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
@ -66,7 +66,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
def convert(self) -> DoclingDocument:
|
def convert(self) -> DoclingDocument:
|
||||||
# access self.path_or_stream to load stuff
|
# access self.path_or_stream to load stuff
|
||||||
doc = DoclingDocument(description=DescriptionItem(), name="dummy")
|
doc = DoclingDocument(name="dummy")
|
||||||
_log.debug("Trying to convert HTML...")
|
_log.debug("Trying to convert HTML...")
|
||||||
|
|
||||||
if self.is_valid():
|
if self.is_valid():
|
||||||
|
@ -4,16 +4,17 @@ from pathlib import Path
|
|||||||
from typing import Set, Union
|
from typing import Set, Union
|
||||||
|
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
DescriptionItem,
|
BoundingBox,
|
||||||
|
CoordOrigin,
|
||||||
DocItemLabel,
|
DocItemLabel,
|
||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
DocumentOrigin,
|
DocumentOrigin,
|
||||||
GroupLabel,
|
GroupLabel,
|
||||||
ProvenanceItem,
|
ProvenanceItem,
|
||||||
|
Size,
|
||||||
TableCell,
|
TableCell,
|
||||||
TableData,
|
TableData,
|
||||||
)
|
)
|
||||||
from docling_core.types.doc.base import BoundingBox, CoordOrigin, Size
|
|
||||||
from pptx import Presentation
|
from pptx import Presentation
|
||||||
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
|
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
|
||||||
|
|
||||||
@ -96,7 +97,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|||||||
else:
|
else:
|
||||||
docname = "stream"
|
docname = "stream"
|
||||||
doc = DoclingDocument(
|
doc = DoclingDocument(
|
||||||
description=DescriptionItem(), name=docname, origin=origin
|
name=docname, origin=origin
|
||||||
) # must add origin information
|
) # must add origin information
|
||||||
doc = self.walk_linear(self.pptx_obj, doc)
|
doc = self.walk_linear(self.pptx_obj, doc)
|
||||||
|
|
||||||
|
@ -5,7 +5,6 @@ from typing import Set, Union
|
|||||||
|
|
||||||
import docx
|
import docx
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
DescriptionItem,
|
|
||||||
DocItemLabel,
|
DocItemLabel,
|
||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
DocumentOrigin,
|
DocumentOrigin,
|
||||||
@ -99,9 +98,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
docname = Path(fname).stem
|
docname = Path(fname).stem
|
||||||
else:
|
else:
|
||||||
docname = "stream"
|
docname = "stream"
|
||||||
doc = DoclingDocument(
|
doc = DoclingDocument(name=docname, origin=origin)
|
||||||
description=DescriptionItem(), name=docname, origin=origin
|
|
||||||
)
|
|
||||||
if self.is_valid():
|
if self.is_valid():
|
||||||
assert self.docx_obj is not None
|
assert self.docx_obj is not None
|
||||||
doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
|
doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
|
||||||
|
@ -2,9 +2,13 @@ from enum import Enum, auto
|
|||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union
|
from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union
|
||||||
|
|
||||||
from docling_core.types.doc import BoundingBox, Size
|
from docling_core.types.doc import (
|
||||||
from docling_core.types.doc.document import PictureDataType, TableCell
|
BoundingBox,
|
||||||
from docling_core.types.doc.labels import DocItemLabel
|
DocItemLabel,
|
||||||
|
PictureDataType,
|
||||||
|
Size,
|
||||||
|
TableCell,
|
||||||
|
)
|
||||||
from PIL.Image import Image
|
from PIL.Image import Image
|
||||||
from pydantic import BaseModel, ConfigDict
|
from pydantic import BaseModel, ConfigDict
|
||||||
|
|
||||||
|
@ -13,7 +13,6 @@ from docling_core.types import FileInfoObject as DsFileInfoObject
|
|||||||
from docling_core.types import PageDimensions, PageReference, Prov, Ref
|
from docling_core.types import PageDimensions, PageReference, Prov, Ref
|
||||||
from docling_core.types import Table as DsSchemaTable
|
from docling_core.types import Table as DsSchemaTable
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
DescriptionItem,
|
|
||||||
DocItem,
|
DocItem,
|
||||||
DocItemLabel,
|
DocItemLabel,
|
||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
@ -37,11 +36,9 @@ from docling.datamodel.base_models import (
|
|||||||
ConversionStatus,
|
ConversionStatus,
|
||||||
DocumentStream,
|
DocumentStream,
|
||||||
ErrorItem,
|
ErrorItem,
|
||||||
FigureElement,
|
|
||||||
InputFormat,
|
InputFormat,
|
||||||
MimeTypeToFormat,
|
MimeTypeToFormat,
|
||||||
Page,
|
Page,
|
||||||
PageElement,
|
|
||||||
)
|
)
|
||||||
from docling.datamodel.settings import DocumentLimits
|
from docling.datamodel.settings import DocumentLimits
|
||||||
from docling.utils.utils import create_file_hash, create_hash
|
from docling.utils.utils import create_file_hash, create_hash
|
||||||
@ -70,9 +67,7 @@ layout_label_to_ds_type = {
|
|||||||
DocItemLabel.PARAGRAPH: "paragraph",
|
DocItemLabel.PARAGRAPH: "paragraph",
|
||||||
}
|
}
|
||||||
|
|
||||||
_EMPTY_DOCLING_DOC = DoclingDocument(
|
_EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")
|
||||||
description=DescriptionItem(), name="dummy"
|
|
||||||
) # TODO: Stub
|
|
||||||
|
|
||||||
|
|
||||||
class InputDocument(BaseModel):
|
class InputDocument(BaseModel):
|
||||||
|
@ -11,8 +11,7 @@ from docling_core.types import DocumentDescription as DsDocumentDescription
|
|||||||
from docling_core.types import FileInfoObject as DsFileInfoObject
|
from docling_core.types import FileInfoObject as DsFileInfoObject
|
||||||
from docling_core.types import PageDimensions, PageReference, Prov, Ref
|
from docling_core.types import PageDimensions, PageReference, Prov, Ref
|
||||||
from docling_core.types import Table as DsSchemaTable
|
from docling_core.types import Table as DsSchemaTable
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
|
||||||
from docling_core.types.doc.document import DoclingDocument
|
|
||||||
from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
|
from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
|
||||||
from docling_core.types.legacy_doc.base import Figure, TableCell
|
from docling_core.types.legacy_doc.base import Figure, TableCell
|
||||||
from PIL import ImageDraw
|
from PIL import ImageDraw
|
||||||
@ -222,15 +221,8 @@ class GlmModel:
|
|||||||
ds_doc_dict = ds_doc.model_dump(by_alias=True)
|
ds_doc_dict = ds_doc.model_dump(by_alias=True)
|
||||||
|
|
||||||
glm_doc = self.model.apply_on_doc(ds_doc_dict)
|
glm_doc = self.model.apply_on_doc(ds_doc_dict)
|
||||||
# ds_doc_dict = to_legacy_document_format(
|
|
||||||
# glm_doc, ds_doc_dict, update_name_label=True
|
|
||||||
# )
|
|
||||||
|
|
||||||
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
|
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
|
||||||
# legacy_doc: DsLegacyDocument = None
|
|
||||||
|
|
||||||
# if self.create_legacy_output:
|
|
||||||
# legacy_doc = DsLegacyDocument.model_validate(ds_doc_dict)
|
|
||||||
|
|
||||||
# DEBUG code:
|
# DEBUG code:
|
||||||
def draw_clusters_and_cells(ds_document, page_no):
|
def draw_clusters_and_cells(ds_document, page_no):
|
||||||
|
@ -5,8 +5,7 @@ import time
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, List
|
from typing import Iterable, List
|
||||||
|
|
||||||
from docling_core.types.doc import CoordOrigin
|
from docling_core.types.doc import CoordOrigin, DocItemLabel
|
||||||
from docling_core.types.doc.labels import DocItemLabel
|
|
||||||
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
||||||
from PIL import ImageDraw
|
from PIL import ImageDraw
|
||||||
|
|
||||||
|
@ -3,9 +3,7 @@ from pathlib import Path
|
|||||||
from typing import Iterable, List
|
from typing import Iterable, List
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
from docling_core.types.doc import BoundingBox
|
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
|
||||||
from docling_core.types.doc.document import TableCell
|
|
||||||
from docling_core.types.doc.labels import DocItemLabel
|
|
||||||
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
|
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
|
||||||
from PIL import ImageDraw
|
from PIL import ImageDraw
|
||||||
|
|
||||||
|
@ -2,7 +2,7 @@ import logging
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from docling_core.types.doc.document import DocItem, ImageRef, PictureItem, TableItem
|
from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
|
||||||
|
|
||||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||||
|
@ -2,7 +2,7 @@ import copy
|
|||||||
import logging
|
import logging
|
||||||
|
|
||||||
import networkx as nx
|
import networkx as nx
|
||||||
from docling_core.types.doc.labels import DocItemLabel
|
from docling_core.types.doc import DocItemLabel
|
||||||
|
|
||||||
logger = logging.getLogger("layout_utils")
|
logger = logging.getLogger("layout_utils")
|
||||||
|
|
||||||
|
BIN
docs/assets/docling_doc_hierarchy_1.png
Normal file
BIN
docs/assets/docling_doc_hierarchy_1.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 369 KiB |
BIN
docs/assets/docling_doc_hierarchy_2.png
Normal file
BIN
docs/assets/docling_doc_hierarchy_2.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 358 KiB |
64
docs/concepts/docling_format.md
Normal file
64
docs/concepts/docling_format.md
Normal file
@ -0,0 +1,64 @@
|
|||||||
|
With Docling v2, we introduce a unified document representation format called `DoclingDocument`. It is defined as a
|
||||||
|
pydantic datatype, which can express several features common to documents, such as:
|
||||||
|
* Text, Tables, Pictures, and more
|
||||||
|
* Document hierarchy with sections and groups
|
||||||
|
* Disambiguation between main body and headers, footers (furniture)
|
||||||
|
* Layout information (i.e. bounding boxes) for all items, if available
|
||||||
|
* Provenance information
|
||||||
|
|
||||||
|
It also brings a set of document construction APIs to build up a `DoclingDocument` from scratch.
|
||||||
|
|
||||||
|
# Example document structures
|
||||||
|
|
||||||
|
To illustrate the features of the `DoclingDocument` format, consider the following side-by-side comparison of a
|
||||||
|
`DoclingDocument` converted from `tests/data/word_sample.docx`. Left side shows snippets from the converted document
|
||||||
|
serialized as YAML, right side shows the corresponding visual parts in MS Word.
|
||||||
|
|
||||||
|
## Basic structure
|
||||||
|
|
||||||
|
A `DoclingDocument` exposes top-level fields for the document content, organized in two categories.
|
||||||
|
The first category is the _content items_, which are stored in these fields:
|
||||||
|
|
||||||
|
- `texts`: All items that have a text representation (paragraph, section heading, equation, ...). Base class is `TextItem`.
|
||||||
|
- `tables`: All tables, type `TableItem`. Can carry structure annotations.
|
||||||
|
- `pictures`: All pictures, type `PictureItem`. Can carry structure annotations.
|
||||||
|
- `key_value_items`: All key-value items.
|
||||||
|
|
||||||
|
All of the above fields are lists and store items inheriting from the `DocItem` type. They can express different
|
||||||
|
data structures depending on their type, and reference parents and children through JSON pointers.
|
||||||
|
|
||||||
|
The second category is _content structure_, which is encapsulated in:
|
||||||
|
|
||||||
|
- `body`: The root node of a tree-structure for the main document body
|
||||||
|
- `furniture`: The root node of a tree-structure for all items that don't belong in the body (headers, footers, ...)
|
||||||
|
- `groups`: A set of items that don't represent content, but act as containers for other content items (e.g. a list, a chapter)
|
||||||
|
|
||||||
|
All of the above fields store only `NodeItem` instances, which reference children and parents
|
||||||
|
through JSON pointers.
|
||||||
|
|
||||||
|
The reading order of the document is encapsulated through the `body` tree and the order of _children_ in each item
|
||||||
|
in the tree.
|
||||||
|
|
||||||
|
The example below shows how all items in the first page are nested below the `title` item (`#/texts/1`).
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
## Grouping
|
||||||
|
|
||||||
|
The example below shows how all items under the heading "Let's swim" (`#/texts/5`) are nested as children. The children of
|
||||||
|
"Let's swim" are both text items and groups, which contain the list elements. The group items are stored in the
|
||||||
|
top-level `groups` field.
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
## Tables
|
||||||
|
|
||||||
|
TBD
|
||||||
|
|
||||||
|
## Pictures
|
||||||
|
|
||||||
|
TBD
|
||||||
|
|
||||||
|
## Provenance
|
||||||
|
|
||||||
|
TBD
|
@ -3,7 +3,7 @@ from pathlib import Path
|
|||||||
from typing import Any, Iterable
|
from typing import Any, Iterable
|
||||||
|
|
||||||
from docling_core.types.doc import DoclingDocument, NodeItem
|
from docling_core.types.doc import DoclingDocument, NodeItem
|
||||||
from docling_core.types.doc.document import PictureClassificationData, PictureItem, PictureClassificationClass
|
from docling_core.types.doc import PictureClassificationData, PictureItem, PictureClassificationClass
|
||||||
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
|
@ -2,7 +2,7 @@ import logging
|
|||||||
import time
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from docling_core.types.doc.document import PictureItem, TableItem
|
from docling_core.types.doc import PictureItem, TableItem
|
||||||
|
|
||||||
from docling.datamodel.base_models import FigureElement, InputFormat, Table
|
from docling.datamodel.base_models import FigureElement, InputFormat, Table
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
|
@ -16,57 +16,58 @@ from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
|||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
USE_EXPERIMENTAL = False
|
def main():
|
||||||
|
input_paths = [
|
||||||
|
Path("tests/data/wiki_duck.html"),
|
||||||
|
Path("tests/data/word_sample.docx"),
|
||||||
|
Path("tests/data/lorem_ipsum.docx"),
|
||||||
|
Path("tests/data/powerpoint_sample.pptx"),
|
||||||
|
Path("tests/data/2305.03393v1-pg9-img.png"),
|
||||||
|
Path("tests/data/2206.01062.pdf"),
|
||||||
|
]
|
||||||
|
|
||||||
input_paths = [
|
## for defaults use:
|
||||||
Path("tests/data/wiki_duck.html"),
|
# doc_converter = DocumentConverter()
|
||||||
Path("tests/data/word_sample.docx"),
|
|
||||||
Path("tests/data/lorem_ipsum.docx"),
|
|
||||||
Path("tests/data/powerpoint_sample.pptx"),
|
|
||||||
Path("tests/data/2305.03393v1-pg9-img.png"),
|
|
||||||
Path("tests/data/2206.01062.pdf"),
|
|
||||||
]
|
|
||||||
|
|
||||||
## for defaults use:
|
## to customize use:
|
||||||
# doc_converter = DocumentConverter()
|
|
||||||
|
|
||||||
## to customize use:
|
doc_converter = (
|
||||||
|
DocumentConverter( # all of the below is optional, has internal defaults.
|
||||||
doc_converter = (
|
allowed_formats=[
|
||||||
DocumentConverter( # all of the below is optional, has internal defaults.
|
InputFormat.PDF,
|
||||||
allowed_formats=[
|
InputFormat.IMAGE,
|
||||||
InputFormat.PDF,
|
InputFormat.DOCX,
|
||||||
InputFormat.IMAGE,
|
InputFormat.HTML,
|
||||||
InputFormat.DOCX,
|
InputFormat.PPTX,
|
||||||
InputFormat.HTML,
|
], # whitelist formats, non-matching files are ignored.
|
||||||
InputFormat.PPTX,
|
format_options={
|
||||||
], # whitelist formats, non-matching files are ignored.
|
InputFormat.PDF: PdfFormatOption(
|
||||||
format_options={
|
pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
|
||||||
InputFormat.PDF: PdfFormatOption(
|
),
|
||||||
pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
|
InputFormat.DOCX: WordFormatOption(
|
||||||
),
|
pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend
|
||||||
InputFormat.DOCX: WordFormatOption(
|
),
|
||||||
pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend
|
},
|
||||||
),
|
)
|
||||||
},
|
|
||||||
)
|
)
|
||||||
)
|
|
||||||
|
|
||||||
conv_results = doc_converter.convert_all(input_paths)
|
conv_results = doc_converter.convert_all(input_paths)
|
||||||
|
|
||||||
for res in conv_results:
|
for res in conv_results:
|
||||||
out_path = Path("scratch")
|
out_path = Path("scratch")
|
||||||
print(
|
print(
|
||||||
f"Document {res.input.file.name} converted."
|
f"Document {res.input.file.name} converted."
|
||||||
f"\nSaved markdown output to: {str(out_path)}"
|
f"\nSaved markdown output to: {str(out_path)}"
|
||||||
)
|
)
|
||||||
# print(res.docdocument.export_to_markdown())
|
# print(res.docdocument.export_to_markdown())
|
||||||
# Export Docling document format to markdowndoc:
|
# Export Docling document format to markdowndoc:
|
||||||
with (out_path / f"{res.input.file.name}.md").open("w") as fp:
|
with (out_path / f"{res.input.file.name}.md").open("w") as fp:
|
||||||
fp.write(res.document.export_to_markdown())
|
fp.write(res.document.export_to_markdown())
|
||||||
|
|
||||||
with (out_path / f"{res.input.file.name}.json").open("w") as fp:
|
with (out_path / f"{res.input.file.name}.json").open("w") as fp:
|
||||||
fp.write(json.dumps(res.document.export_to_dict()))
|
fp.write(json.dumps(res.document.export_to_dict()))
|
||||||
|
|
||||||
with (out_path / f"{res.input.file.name}.yaml").open("w") as fp:
|
with (out_path / f"{res.input.file.name}.yaml").open("w") as fp:
|
||||||
fp.write(yaml.safe_dump(res.document.export_to_dict()))
|
fp.write(yaml.safe_dump(res.document.export_to_dict()))
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
@ -92,8 +92,9 @@ doc_converter = (
|
|||||||
**Note**: If you work only with defaults, all remains the same as in Docling v1.
|
**Note**: If you work only with defaults, all remains the same as in Docling v1.
|
||||||
|
|
||||||
More options are shown in the following example units:
|
More options are shown in the following example units:
|
||||||
- [run_with_formats.py](docs/examples/run_with_formats.py)
|
|
||||||
- [custom_convert.py](docs/examples/custom_convert.py)
|
- [run_with_formats.py](../examples/run_with_formats/)
|
||||||
|
- [custom_convert.py](../examples/custom_convert/)
|
||||||
|
|
||||||
### Converting documents
|
### Converting documents
|
||||||
|
|
||||||
|
@ -55,15 +55,17 @@ nav:
|
|||||||
- Home: index.md
|
- Home: index.md
|
||||||
- Installation: installation.md
|
- Installation: installation.md
|
||||||
- Docling v2: v2.md
|
- Docling v2: v2.md
|
||||||
# - Concepts:
|
- Concepts:
|
||||||
# - Docling Document: concepts/document.md
|
- The Docling Document format: concepts/docling_format.md
|
||||||
# - Chunking: concepts/chunking.md
|
# - Chunking: concepts/chunking.md
|
||||||
- Examples:
|
- Examples:
|
||||||
- Conversion:
|
- Conversion:
|
||||||
- "Simple conversion": examples/minimal.py
|
- "Simple conversion": examples/minimal.py
|
||||||
- "Custom conversion": examples/custom_convert.py
|
- "Custom conversion": examples/custom_convert.py
|
||||||
- "Batch conversion": examples/batch_convert.py
|
- "Batch conversion": examples/batch_convert.py
|
||||||
|
- "Multi-format conversion": examples/run_with_formats.py
|
||||||
- "Figure export": examples/export_figures.py
|
- "Figure export": examples/export_figures.py
|
||||||
|
- "Figure enrichment": examples/develop_picture_enrichment.py
|
||||||
- "Table export": examples/export_tables.py
|
- "Table export": examples/export_tables.py
|
||||||
- "Multimodal export": examples/export_multimodal.py
|
- "Multimodal export": examples/export_multimodal.py
|
||||||
- RAG / QA:
|
- RAG / QA:
|
||||||
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -1 +1 @@
|
|||||||
{"schema_name": "DoclingDocument", "version": "1.0.0", "description": {}, "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 71.35887908935547, "t": 765.0995483398438, "r": 504.0870056152344, "b": 690.8582153320312, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "pictures": [], "tables": [], "key_value_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
|
{"schema_name": "DoclingDocument", "version": "1.0.0", "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 71.35887908935547, "t": 765.0995483398438, "r": 504.0870056152344, "b": 690.8582153320312, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "pictures": [], "tables": [], "key_value_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
|
@ -1,7 +1,7 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from docling_core.types.doc.base import BoundingBox
|
from docling_core.types.doc import BoundingBox
|
||||||
|
|
||||||
from docling.backend.pypdfium2_backend import (
|
from docling.backend.pypdfium2_backend import (
|
||||||
PyPdfiumDocumentBackend,
|
PyPdfiumDocumentBackend,
|
||||||
|
Loading…
Reference in New Issue
Block a user