Import statement updates from docling-core

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-10-15 10:11:10 +02:00
parent 8710506072
commit dac82ca7f2
29 changed files with 65 additions and 85 deletions

View File

@ -3,7 +3,7 @@ from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Set, Union
from docling_core.types.experimental import DoclingDocument
from docling_core.types.doc import DoclingDocument
if TYPE_CHECKING:
from docling.datamodel.base_models import InputFormat

View File

@ -5,7 +5,7 @@ from pathlib import Path
from typing import Iterable, List, Optional, Union
import pypdfium2 as pdfium
from docling_core.types.experimental import BoundingBox, CoordOrigin, Size
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
from docling_parse.docling_parse import pdf_parser
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage

View File

@ -4,14 +4,14 @@ from pathlib import Path
from typing import Set, Union
from bs4 import BeautifulSoup
from docling_core.types.experimental import (
from docling_core.types.doc import (
DescriptionItem,
DoclingDocument,
PictureData,
TableCell,
TableData,
)
from docling_core.types.experimental.labels import DocItemLabel, GroupLabel
from docling_core.types.doc.labels import DocItemLabel, GroupLabel
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat

View File

@ -3,7 +3,7 @@ from io import BytesIO
from pathlib import Path
from typing import Set, Union
from docling_core.types.experimental import (
from docling_core.types.doc import (
DescriptionItem,
DocItemLabel,
DoclingDocument,
@ -14,7 +14,7 @@ from docling_core.types.experimental import (
TableCell,
TableData,
)
from docling_core.types.experimental.base import BoundingBox, CoordOrigin, Size
from docling_core.types.doc.base import BoundingBox, CoordOrigin, Size
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER

View File

@ -4,7 +4,7 @@ from pathlib import Path
from typing import Set, Union
import docx
from docling_core.types.experimental import (
from docling_core.types.doc import (
DescriptionItem,
DocItemLabel,
DoclingDocument,

View File

@ -2,8 +2,8 @@ from abc import ABC, abstractmethod
from io import BytesIO
from typing import Iterable, Optional, Set, Union
from docling_core.types.doc.doc_ocr import Path
from docling_core.types.experimental import BoundingBox, Size
from docling_core.types.doc import BoundingBox, Size
from docling_core.types.legacy_doc.doc_ocr import Path
from PIL import Image
from docling.backend.abstract_backend import PaginatedDocumentBackend

View File

@ -6,7 +6,7 @@ from typing import Iterable, List, Optional, Union
import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c
from docling_core.types.experimental import BoundingBox, CoordOrigin, Size
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
from PIL import Image, ImageDraw
from pypdfium2 import PdfTextPage
from pypdfium2._helpers.misc import PdfiumError

View File

@ -137,10 +137,10 @@ def convert(
from_formats: List[InputFormat] = typer.Option(
None,
"--from",
help="Specify input formats " "to convert from. Defaults to all formats.",
help="Specify input formats to convert from. Defaults to all formats.",
),
to_formats: List[OutputFormat] = typer.Option(
None, "--to", help="Specify output formats. " "Defaults to Markdown."
None, "--to", help="Specify output formats. Defaults to Markdown."
),
ocr: Annotated[
bool,
@ -148,9 +148,6 @@ def convert(
..., help="If enabled, the bitmap content will be processed using OCR."
),
] = True,
# backend: Annotated[
# Backend, typer.Option(..., help="The PDF backend to use.")
# ] = Backend.DOCLING,
ocr_engine: Annotated[
OcrEngine, typer.Option(..., help="The OCR engine to use.")
] = OcrEngine.EASYOCR,
@ -196,16 +193,6 @@ def convert(
export_txt = OutputFormat.TEXT in to_formats
export_doctags = OutputFormat.DOCTAGS in to_formats
# match backend:
# case Backend.PYPDFIUM2:
# do_cell_matching = ocr # only do cell matching when OCR enabled
# pdf_backend = PyPdfiumDocumentBackend
# case Backend.DOCLING:
# do_cell_matching = True
# pdf_backend = DoclingParseDocumentBackend
# case _:
# raise RuntimeError(f"Unexpected backend type {backend}")
match ocr_engine:
case OcrEngine.EASYOCR:
ocr_options = EasyOcrOptions()

View File

@ -2,9 +2,9 @@ from enum import Enum, auto
from io import BytesIO
from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union
from docling_core.types.experimental import BoundingBox, Size
from docling_core.types.experimental.document import PictureData, TableCell
from docling_core.types.experimental.labels import DocItemLabel
from docling_core.types.doc import BoundingBox, Size
from docling_core.types.doc.document import PictureData, TableCell
from docling_core.types.doc.labels import DocItemLabel
from PIL.Image import Image
from pydantic import BaseModel, ConfigDict

View File

@ -12,9 +12,7 @@ from docling_core.types import DocumentDescription as DsDocumentDescription
from docling_core.types import FileInfoObject as DsFileInfoObject
from docling_core.types import PageDimensions, PageReference, Prov, Ref
from docling_core.types import Table as DsSchemaTable
from docling_core.types.doc.base import BoundingBox as DsBoundingBox
from docling_core.types.doc.base import Figure, GlmTableCell, TableCell
from docling_core.types.experimental import (
from docling_core.types.doc import (
DescriptionItem,
DocItem,
DocItemLabel,
@ -24,7 +22,9 @@ from docling_core.types.experimental import (
TableItem,
TextItem,
)
from docling_core.types.experimental.document import ListItem
from docling_core.types.doc.document import ListItem
from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
from docling_core.types.legacy_doc.base import Figure, GlmTableCell, TableCell
from docling_core.utils.file import resolve_file_source
from pydantic import BaseModel
from typing_extensions import deprecated

View File

@ -1,7 +1,7 @@
from abc import ABC, abstractmethod
from typing import Any, Iterable
from docling_core.types.experimental import DoclingDocument, NodeItem
from docling_core.types.doc import DoclingDocument, NodeItem
from docling.datamodel.base_models import Page

View File

@ -4,7 +4,7 @@ from abc import abstractmethod
from typing import Iterable, List, Tuple
import numpy as np
from docling_core.types.experimental import BoundingBox, CoordOrigin
from docling_core.types.doc import BoundingBox, CoordOrigin
from PIL import Image, ImageDraw
from rtree import index
from scipy.ndimage import find_objects, label

View File

@ -15,10 +15,10 @@ from docling_core.types import DocumentDescription as DsDocumentDescription
from docling_core.types import FileInfoObject as DsFileInfoObject
from docling_core.types import PageDimensions, PageReference, Prov, Ref
from docling_core.types import Table as DsSchemaTable
from docling_core.types.doc.base import BoundingBox as DsBoundingBox
from docling_core.types.doc.base import Figure, TableCell
from docling_core.types.experimental import BoundingBox, CoordOrigin
from docling_core.types.experimental.document import DoclingDocument
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.document import DoclingDocument
from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
from docling_core.types.legacy_doc.base import Figure, TableCell
from PIL import ImageDraw
from pydantic import BaseModel, ConfigDict

View File

@ -1,10 +1,7 @@
from typing import Any, Iterable
from docling_core.types.experimental import DoclingDocument, NodeItem
from docling_core.types.experimental.document import (
PictureClassificationData,
PictureItem,
)
from docling_core.types.doc import DoclingDocument, NodeItem
from docling_core.types.doc.document import PictureClassificationData, PictureItem
from docling.models.base_model import BaseEnrichmentModel

View File

@ -2,7 +2,7 @@ import logging
from typing import Iterable
import numpy
from docling_core.types.experimental import BoundingBox, CoordOrigin
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.pipeline_options import EasyOcrOptions

View File

@ -5,8 +5,8 @@ import time
from pathlib import Path
from typing import Iterable, List
from docling_core.types.experimental import CoordOrigin
from docling_core.types.experimental.labels import DocItemLabel
from docling_core.types.doc import CoordOrigin
from docling_core.types.doc.labels import DocItemLabel
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
from PIL import ImageDraw

View File

@ -3,9 +3,9 @@ from pathlib import Path
from typing import Iterable, List
import numpy
from docling_core.types.experimental import BoundingBox
from docling_core.types.experimental.document import TableCell
from docling_core.types.experimental.labels import DocItemLabel
from docling_core.types.doc import BoundingBox
from docling_core.types.doc.document import TableCell
from docling_core.types.doc.labels import DocItemLabel
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
from PIL import ImageDraw

View File

@ -5,7 +5,7 @@ from subprocess import DEVNULL, PIPE, Popen
from typing import Iterable, Tuple
import pandas as pd
from docling_core.types.experimental import BoundingBox, CoordOrigin
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.pipeline_options import TesseractCliOcrOptions

View File

@ -1,7 +1,7 @@
import logging
from typing import Iterable
from docling_core.types.experimental import BoundingBox, CoordOrigin
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.pipeline_options import TesseractCliOcrOptions

View File

@ -5,7 +5,7 @@ import traceback
from abc import ABC, abstractmethod
from typing import Callable, Iterable, List
from docling_core.types.experimental import DoclingDocument, NodeItem
from docling_core.types.doc import DoclingDocument, NodeItem
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend

View File

@ -1,8 +1,8 @@
import logging
from typing import Any, Dict, Iterable, List, Tuple, Union
from docling_core.types.doc.base import BaseCell, BaseText, Ref, Table
from docling_core.types.experimental import BoundingBox, CoordOrigin
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
from docling.datamodel.base_models import OcrCell
from docling.datamodel.document import ConversionResult, Page

View File

@ -2,7 +2,7 @@ import copy
import logging
import networkx as nx
from docling_core.types.experimental.labels import DocItemLabel
from docling_core.types.doc.labels import DocItemLabel
logger = logging.getLogger("layout_utils")

View File

@ -32,23 +32,23 @@ def export_documents(
doc_filename = conv_res.input.file.stem
if USE_V2:
# Export Docling document format to JSON (experimental):
# Export Docling document format to JSON:
with (output_dir / f"{doc_filename}.json").open("w") as fp:
fp.write(json.dumps(conv_res.document.export_to_dict()))
# Export Docling document format to YAML (experimental):
# Export Docling document format to YAML:
with (output_dir / f"{doc_filename}.yaml").open("w") as fp:
fp.write(yaml.safe_dump(conv_res.document.export_to_dict()))
# Export Docling document format to doctags (experimental):
# Export Docling document format to doctags:
with (output_dir / f"{doc_filename}.doctags.txt").open("w") as fp:
fp.write(conv_res.document.export_to_document_tokens())
# Export Docling document format to markdown (experimental):
# Export Docling document format to markdown:
with (output_dir / f"{doc_filename}.md").open("w") as fp:
fp.write(conv_res.document.export_to_markdown())
# Export Docling document format to text (experimental):
# Export Docling document format to text:
with (output_dir / f"{doc_filename}.txt").open("w") as fp:
fp.write(conv_res.document.export_to_markdown(strict_text=True))

View File

@ -58,8 +58,8 @@ for res in conv_results:
f"Document {res.input.file.name} converted."
f"\nSaved markdown output to: {str(out_path)}"
)
# print(res.experimental.export_to_markdown())
# Export Docling document format to markdown (experimental):
# print(res.document.export_to_markdown())
# Export Docling document format to markdown:
with (out_path / f"{res.input.file.name}.md").open("w") as fp:
fp.write(res.document.export_to_markdown())

32
poetry.lock generated
View File

@ -885,7 +885,7 @@ files = []
develop = false
[package.dependencies]
docling-core = {git = "https://github.com/DS4SD/docling-core.git", rev = "4ddecf80cf5afb4b1488172ecafcf12cb2b8cb9b"}
docling-core = {git = "https://github.com/DS4SD/docling-core.git", rev = "6fee533a101ca08f62e88826218c33e0aab2f417"}
docutils = "!=0.21"
matplotlib = "^3.7.1"
networkx = "^3.1"
@ -909,8 +909,8 @@ toolkit = ["deepsearch-toolkit (>=0.31.0)"]
[package.source]
type = "git"
url = "https://github.com/DS4SD/deepsearch-glm.git"
reference = "58c589fc23d675e8098f24ec680a9bf93e2a796e"
resolved_reference = "58c589fc23d675e8098f24ec680a9bf93e2a796e"
reference = "c13a6cdda25206911d63a5a28e990217ad823068"
resolved_reference = "c13a6cdda25206911d63a5a28e990217ad823068"
[[package]]
name = "dill"
@ -958,8 +958,8 @@ tabulate = "^0.9.0"
[package.source]
type = "git"
url = "https://github.com/DS4SD/docling-core.git"
reference = "4ddecf80cf5afb4b1488172ecafcf12cb2b8cb9b"
resolved_reference = "4ddecf80cf5afb4b1488172ecafcf12cb2b8cb9b"
reference = "6fee533a101ca08f62e88826218c33e0aab2f417"
resolved_reference = "6fee533a101ca08f62e88826218c33e0aab2f417"
[[package]]
name = "docling-ibm-models"
@ -2296,22 +2296,18 @@ transformers = ">=4.39.0"
[[package]]
name = "langchain-milvus"
version = "0.1.5"
version = "0.1.6"
description = "An integration package connecting Milvus and LangChain"
optional = false
python-versions = "<4.0,>=3.8.1"
python-versions = "<4.0,>=3.9"
files = [
{file = "langchain_milvus-0.1.5-py3-none-any.whl", hash = "sha256:74aa487738afde4c3e1346433ef26f9556e599826161562b308d3357d86529fd"},
{file = "langchain_milvus-0.1.5.tar.gz", hash = "sha256:1cceab384783ba264055102e5831451482fd726a68feb64258f6dbbd8d702557"},
{file = "langchain_milvus-0.1.6-py3-none-any.whl", hash = "sha256:efab3fcf613bd6151735e2c75f3264dba9daecb317b9bb22604c2aac579049a9"},
{file = "langchain_milvus-0.1.6.tar.gz", hash = "sha256:155979a6e5aeb94b0e141a12d2fdb4c34a4d7a0e5da2cec1ae7c9bccf6649205"},
]
[package.dependencies]
langchain-core = {version = ">=0.2.38,<0.4", markers = "python_version >= \"3.9\""}
langchain-core = ">=0.2.38,<0.4"
pymilvus = ">=2.4.3,<3.0.0"
scipy = [
{version = ">=1.9,<2.0", markers = "python_version >= \"3.12\""},
{version = ">=1.7,<2.0", markers = "python_version < \"3.12\""},
]
[[package]]
name = "langchain-text-splitters"
@ -2329,13 +2325,13 @@ langchain-core = ">=0.2.38,<0.3.0"
[[package]]
name = "langsmith"
version = "0.1.134"
version = "0.1.135"
description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
optional = false
python-versions = "<4.0,>=3.8.1"
files = [
{file = "langsmith-0.1.134-py3-none-any.whl", hash = "sha256:ada98ad80ef38807725f32441a472da3dd28394010877751f48f458d3289da04"},
{file = "langsmith-0.1.134.tar.gz", hash = "sha256:23abee3b508875a0e63c602afafffc02442a19cfd88f9daae05b3e9054fd6b61"},
{file = "langsmith-0.1.135-py3-none-any.whl", hash = "sha256:b1d1ca3bad483a4239745c57e9b9157b4d099fbf3149be21e3d112c94ede06ac"},
{file = "langsmith-0.1.135.tar.gz", hash = "sha256:7abed7e141386af99a2177f0b3600b124ae3ad1b482879ba0724ce92ef998a11"},
]
[package.dependencies]
@ -7118,4 +7114,4 @@ tesserocr = ["tesserocr"]
[metadata]
lock-version = "2.0"
python-versions = "^3.10"
content-hash = "9678a9fb33ecbfbc6ec118fd3209aab5ab4e4c90d589e93c4dc7073dc9fb72ae"
content-hash = "46f6c1eb76034223f7d65760f6ebe0989ba9e8aff46fcdbce82c147030fcb8be"

View File

@ -37,9 +37,9 @@ torchvision = [
######################
python = "^3.10"
pydantic = "^2.0.0"
docling-core = {git = "https://github.com/DS4SD/docling-core.git", rev = "4ddecf80cf5afb4b1488172ecafcf12cb2b8cb9b"}
docling-core = {git = "https://github.com/DS4SD/docling-core.git", rev = "6fee533a101ca08f62e88826218c33e0aab2f417"}
docling-ibm-models = {git = "https://github.com/DS4SD/docling-ibm-models.git", rev = "1d2e2a2e6eb152c237f1383cdba20cf85db80b97"}
deepsearch-glm = {git = "https://github.com/DS4SD/deepsearch-glm.git", rev = "58c589fc23d675e8098f24ec680a9bf93e2a796e"}
deepsearch-glm = {git = "https://github.com/DS4SD/deepsearch-glm.git", rev = "c13a6cdda25206911d63a5a28e990217ad823068"}
docling-parse = "^1.5.1"
filetype = "^1.2.0"

View File

@ -1,7 +1,7 @@
from pathlib import Path
import pytest
from docling_core.types.experimental import BoundingBox
from docling_core.types.doc import BoundingBox
from docling.backend.docling_parse_backend import (
DoclingParseDocumentBackend,

View File

@ -1,7 +1,7 @@
from pathlib import Path
import pytest
from docling_core.types.experimental.base import BoundingBox
from docling_core.types.doc.base import BoundingBox
from docling.backend.pypdfium2_backend import (
PyPdfiumDocumentBackend,

View File

@ -4,7 +4,7 @@ from pathlib import Path
from typing import List
from docling_core.types import Document as DsDocument
from docling_core.types.experimental import DoclingDocument
from docling_core.types.doc import DoclingDocument
from pydantic import TypeAdapter
from pydantic.json import pydantic_encoder