mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-03 07:52:20 +00:00
MO-01 - Adding CSV backend support
This commit is contained in:
parent
447802b5d1
commit
4e17a51cf6
11
CITATION.cff
11
CITATION.cff
@ -2,14 +2,9 @@
|
|||||||
# Visit https://bit.ly/cffinit to generate yours today!
|
# Visit https://bit.ly/cffinit to generate yours today!
|
||||||
|
|
||||||
cff-version: 1.2.0
|
cff-version: 1.2.0
|
||||||
title: Docling
|
title: Docowling
|
||||||
message: 'If you use Docling, please consider citing as below.'
|
message: 'If you use Docowling, please consider citing as below.'
|
||||||
type: software
|
type: software
|
||||||
authors:
|
authors:
|
||||||
- name: Docling Team
|
- name: Docowling
|
||||||
identifiers:
|
|
||||||
- type: url
|
|
||||||
value: 'https://arxiv.org/abs/2408.09869'
|
|
||||||
description: 'arXiv:2408.09869'
|
|
||||||
repository-code: 'https://github.com/DS4SD/docling'
|
|
||||||
license: MIT
|
license: MIT
|
||||||
|
@ -17,7 +17,7 @@ ENV TORCH_HOME=/tmp/
|
|||||||
COPY docs/examples/minimal.py /root/minimal.py
|
COPY docs/examples/minimal.py /root/minimal.py
|
||||||
|
|
||||||
RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
|
RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
|
||||||
RUN python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; StandardPdfPipeline.download_models_hf(force=True);'
|
RUN python -c 'from docowling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; StandardPdfPipeline.download_models_hf(force=True);'
|
||||||
|
|
||||||
# On container environments, always set a thread budget to avoid undesired thread congestion.
|
# On container environments, always set a thread budget to avoid undesired thread congestion.
|
||||||
ENV OMP_NUM_THREADS=4
|
ENV OMP_NUM_THREADS=4
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
<p align="center">
|
<p align="center">
|
||||||
<a href="https://github.com/ds4sd/docling">
|
<a href="https://github.com/mouraworks/docowling">
|
||||||
<img loading="lazy" alt="Docling" src="https://github.com/mouraworks/docowling/blob/main/docs/assets/docowling.png" width="80%"/>
|
<img loading="lazy" alt="Docling" src="https://github.com/mouraworks/docowling/blob/main/docs/assets/docowling.png" width="80%"/>
|
||||||
</a>
|
</a>
|
||||||
</p>
|
</p>
|
||||||
|
@ -6,8 +6,8 @@ from typing import TYPE_CHECKING, Set, Union
|
|||||||
from docling_core.types.doc import DoclingDocument
|
from docling_core.types.doc import DoclingDocument
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docowling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import InputDocument
|
from docowling.datamodel.document import InputDocument
|
||||||
|
|
||||||
|
|
||||||
class AbstractDocumentBackend(ABC):
|
class AbstractDocumentBackend(ABC):
|
@ -16,9 +16,9 @@ from docling_core.types.doc import (
|
|||||||
TableData,
|
TableData,
|
||||||
)
|
)
|
||||||
|
|
||||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
from docowling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docowling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import InputDocument
|
from docowling.datamodel.document import InputDocument
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
105
docowling/backend/csv_backend.py
Normal file
105
docowling/backend/csv_backend.py
Normal file
@ -0,0 +1,105 @@
|
|||||||
|
import csv
|
||||||
|
from io import StringIO
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, List, Set, Tuple, Union
|
||||||
|
|
||||||
|
from docling_core.types.doc import (
|
||||||
|
DoclingDocument,
|
||||||
|
DocumentOrigin,
|
||||||
|
GroupLabel,
|
||||||
|
TableData,
|
||||||
|
TableCell,
|
||||||
|
)
|
||||||
|
from docowling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||||
|
from docowling.datamodel.base_models import InputFormat
|
||||||
|
from docowling.datamodel.document import InputDocument
|
||||||
|
|
||||||
|
|
||||||
|
class CsvDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend that converts a CSV source into a one-table document.

    All rows are parsed eagerly in ``__init__``; ``convert`` then wraps them in
    a "CSV Data" section group containing a single table whose cells mirror the
    CSV cells one-to-one (no header detection, no cell spans).
    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[StringIO, Path]):
        """Parse the CSV source and record whether loading succeeded.

        Args:
            in_doc: The input document descriptor handed to the base backend.
            path_or_stream: A filesystem path or an in-memory stream holding
                the CSV content.

        Raises:
            RuntimeError: If the source cannot be read or parsed. The original
                exception is chained as the cause.
        """
        super().__init__(in_doc, path_or_stream)
        self.rows = []
        try:
            self.rows = self._read_rows(self.path_or_stream)
            self.valid = True
        except Exception as e:
            self.valid = False
            raise RuntimeError(
                f"CsvDocumentBackend could not load document with hash {self.document_hash}"
            ) from e

    @staticmethod
    def _read_rows(source) -> List[List[str]]:
        """Return every CSV row found in *source* as a list of string cells.

        Accepts a ``Path``, a ``StringIO``, or any other object exposing
        ``read()`` that yields ``str`` or ``bytes``. Sources that are none of
        these raise ``TypeError`` instead of being reported as a valid but
        empty document.
        """
        if isinstance(source, Path):
            # newline="" lets the csv module handle line endings itself (needed
            # for quoted fields containing newlines); utf-8-sig decodes plain
            # UTF-8 unchanged but drops a leading BOM so it does not end up in
            # the first cell.
            with source.open(mode="r", encoding="utf-8-sig", newline="") as file:
                return list(csv.reader(file))

        if isinstance(source, StringIO):
            return list(csv.reader(source))

        read = getattr(source, "read", None)
        if read is None:
            raise TypeError(f"Unsupported CSV source type: {type(source).__name__}")

        # Generic stream (e.g. a binary buffer): decode if necessary, then parse.
        content = read()
        if isinstance(content, (bytes, bytearray)):
            content = bytes(content).decode("utf-8-sig")
        return list(csv.reader(StringIO(content, newline="")))

    def is_valid(self) -> bool:
        """Return True when the CSV source was loaded without error."""
        return self.valid

    @classmethod
    def supports_pagination(cls) -> bool:
        """Return False: this backend does not expose pages."""
        return False  # Typically, CSV files do not support pagination.

    def unload(self):
        """Drop the reference to the underlying path or stream."""
        self.path_or_stream = None

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the set of input formats handled by this backend."""
        return {InputFormat.CSV}

    def convert(self) -> DoclingDocument:
        """Build a ``DoclingDocument`` from the parsed CSV rows.

        Returns:
            The document; it has no content when the CSV had no rows.

        Raises:
            RuntimeError: If the backend did not initialise successfully.
        """
        if not self.is_valid():
            raise RuntimeError(
                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
            )

        origin = DocumentOrigin(
            filename=self.file.name or "file.csv",
            mimetype="text/csv",
            binary_hash=self.document_hash,
        )
        doc = DoclingDocument(name=self.file.stem or "file.csv", origin=origin)
        return self._convert_csv_to_document(doc)

    def _convert_csv_to_document(self, doc: DoclingDocument) -> DoclingDocument:
        """Append a "CSV Data" section with one table holding all rows to *doc*.

        The table width is the length of the longest row; shorter rows simply
        contribute fewer cells.
        """
        if not self.rows:
            return doc  # No data to process

        # Keep the section in a local variable: this class never creates a
        # ``self.parents`` mapping, so indexing into it is not safe.
        section = doc.add_group(
            parent=None,
            label=GroupLabel.SECTION,
            name="CSV Data",
        )

        table_cells = [
            TableCell(
                text=cell,
                row_span=1,
                col_span=1,
                start_row_offset_idx=row_idx,
                end_row_offset_idx=row_idx + 1,
                start_col_offset_idx=col_idx,
                end_col_offset_idx=col_idx + 1,
                col_header=False,
                row_header=False,
            )
            for row_idx, row in enumerate(self.rows)
            for col_idx, cell in enumerate(row)
        ]

        table_data = TableData(
            num_rows=len(self.rows),
            num_cols=max(len(row) for row in self.rows),
            table_cells=table_cells,
        )

        doc.add_table(data=table_data, parent=section)
        return doc
|
@ -10,9 +10,9 @@ from docling_parse.pdf_parsers import pdf_parser_v1
|
|||||||
from PIL import Image, ImageDraw
|
from PIL import Image, ImageDraw
|
||||||
from pypdfium2 import PdfPage
|
from pypdfium2 import PdfPage
|
||||||
|
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
from docowling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||||
from docling.datamodel.base_models import Cell
|
from docowling.datamodel.base_models import Cell
|
||||||
from docling.datamodel.document import InputDocument
|
from docowling.datamodel.document import InputDocument
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
@ -10,11 +10,11 @@ from docling_parse.pdf_parsers import pdf_parser_v2
|
|||||||
from PIL import Image, ImageDraw
|
from PIL import Image, ImageDraw
|
||||||
from pypdfium2 import PdfPage
|
from pypdfium2 import PdfPage
|
||||||
|
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
from docowling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||||
from docling.datamodel.base_models import Cell, Size
|
from docowling.datamodel.base_models import Cell, Size
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from docling.datamodel.document import InputDocument
|
from docowling.datamodel.document import InputDocument
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
@ -13,9 +13,9 @@ from docling_core.types.doc import (
|
|||||||
TableData,
|
TableData,
|
||||||
)
|
)
|
||||||
|
|
||||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
from docowling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docowling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import InputDocument
|
from docowling.datamodel.document import InputDocument
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
@ -19,9 +19,9 @@ from docling_core.types.doc import (
|
|||||||
)
|
)
|
||||||
from marko import Markdown
|
from marko import Markdown
|
||||||
|
|
||||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
from docowling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docowling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import InputDocument
|
from docowling.datamodel.document import InputDocument
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
@ -18,9 +18,9 @@ from openpyxl.cell.cell import Cell
|
|||||||
from openpyxl.drawing.image import Image
|
from openpyxl.drawing.image import Image
|
||||||
from openpyxl.worksheet.worksheet import Worksheet
|
from openpyxl.worksheet.worksheet import Worksheet
|
||||||
|
|
||||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
from docowling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docowling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import InputDocument
|
from docowling.datamodel.document import InputDocument
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
@ -20,12 +20,12 @@ from PIL import Image
|
|||||||
from pptx import Presentation
|
from pptx import Presentation
|
||||||
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
|
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
|
||||||
|
|
||||||
from docling.backend.abstract_backend import (
|
from docowling.backend.abstract_backend import (
|
||||||
DeclarativeDocumentBackend,
|
DeclarativeDocumentBackend,
|
||||||
PaginatedDocumentBackend,
|
PaginatedDocumentBackend,
|
||||||
)
|
)
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docowling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import InputDocument
|
from docowling.datamodel.document import InputDocument
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
@ -18,9 +18,9 @@ from lxml import etree
|
|||||||
from lxml.etree import XPath
|
from lxml.etree import XPath
|
||||||
from PIL import Image, UnidentifiedImageError
|
from PIL import Image, UnidentifiedImageError
|
||||||
|
|
||||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
from docowling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docowling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import InputDocument
|
from docowling.datamodel.document import InputDocument
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
@ -6,9 +6,9 @@ from typing import Iterable, Optional, Set, Union
|
|||||||
from docling_core.types.doc import BoundingBox, Size
|
from docling_core.types.doc import BoundingBox, Size
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from docling.backend.abstract_backend import PaginatedDocumentBackend
|
from docowling.backend.abstract_backend import PaginatedDocumentBackend
|
||||||
from docling.datamodel.base_models import Cell, InputFormat
|
from docowling.datamodel.base_models import Cell, InputFormat
|
||||||
from docling.datamodel.document import InputDocument
|
from docowling.datamodel.document import InputDocument
|
||||||
|
|
||||||
|
|
||||||
class PdfPageBackend(ABC):
|
class PdfPageBackend(ABC):
|
@ -11,11 +11,11 @@ from PIL import Image, ImageDraw
|
|||||||
from pypdfium2 import PdfTextPage
|
from pypdfium2 import PdfTextPage
|
||||||
from pypdfium2._helpers.misc import PdfiumError
|
from pypdfium2._helpers.misc import PdfiumError
|
||||||
|
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
from docowling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||||
from docling.datamodel.base_models import Cell
|
from docowling.datamodel.base_models import Cell
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from docling.datamodel.document import InputDocument
|
from docowling.datamodel.document import InputDocument
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
6
docling/backend/xml/pubmed_backend.py → docowling/backend/xml/pubmed_backend.py
Executable file → Normal file
6
docling/backend/xml/pubmed_backend.py → docowling/backend/xml/pubmed_backend.py
Executable file → Normal file
@ -16,9 +16,9 @@ from docling_core.types.doc import (
|
|||||||
from lxml import etree
|
from lxml import etree
|
||||||
from typing_extensions import TypedDict, override
|
from typing_extensions import TypedDict, override
|
||||||
|
|
||||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
from docowling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docowling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import InputDocument
|
from docowling.datamodel.document import InputDocument
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
@ -30,9 +30,9 @@ from docling_core.types.doc.document import LevelNumber
|
|||||||
from pydantic import NonNegativeInt
|
from pydantic import NonNegativeInt
|
||||||
from typing_extensions import Self, TypedDict, override
|
from typing_extensions import Self, TypedDict, override
|
||||||
|
|
||||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
from docowling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docowling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import InputDocument
|
from docowling.datamodel.document import InputDocument
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
@ -14,18 +14,18 @@ from docling_core.types.doc import ImageRefMode
|
|||||||
from docling_core.utils.file import resolve_source_to_path
|
from docling_core.utils.file import resolve_source_to_path
|
||||||
from pydantic import TypeAdapter, ValidationError
|
from pydantic import TypeAdapter, ValidationError
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
from docowling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
from docowling.backend.pdf_backend import PdfDocumentBackend
|
||||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
from docowling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
from docling.datamodel.base_models import (
|
from docowling.datamodel.base_models import (
|
||||||
ConversionStatus,
|
ConversionStatus,
|
||||||
FormatToExtensions,
|
FormatToExtensions,
|
||||||
InputFormat,
|
InputFormat,
|
||||||
OutputFormat,
|
OutputFormat,
|
||||||
)
|
)
|
||||||
from docling.datamodel.document import ConversionResult
|
from docowling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docowling.datamodel.pipeline_options import (
|
||||||
AcceleratorDevice,
|
AcceleratorDevice,
|
||||||
AcceleratorOptions,
|
AcceleratorOptions,
|
||||||
EasyOcrOptions,
|
EasyOcrOptions,
|
||||||
@ -39,8 +39,8 @@ from docling.datamodel.pipeline_options import (
|
|||||||
TesseractCliOcrOptions,
|
TesseractCliOcrOptions,
|
||||||
TesseractOcrOptions,
|
TesseractOcrOptions,
|
||||||
)
|
)
|
||||||
from docling.datamodel.settings import settings
|
from docowling.datamodel.settings import settings
|
||||||
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
from docowling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
||||||
|
|
||||||
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
||||||
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
@ -15,7 +15,7 @@ from PIL.Image import Image
|
|||||||
from pydantic import BaseModel, ConfigDict
|
from pydantic import BaseModel, ConfigDict
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from docling.backend.pdf_backend import PdfPageBackend
|
from docowling.backend.pdf_backend import PdfPageBackend
|
||||||
|
|
||||||
|
|
||||||
class ConversionStatus(str, Enum):
|
class ConversionStatus(str, Enum):
|
||||||
@ -39,6 +39,7 @@ class InputFormat(str, Enum):
|
|||||||
ASCIIDOC = "asciidoc"
|
ASCIIDOC = "asciidoc"
|
||||||
MD = "md"
|
MD = "md"
|
||||||
XLSX = "xlsx"
|
XLSX = "xlsx"
|
||||||
|
CSV = "csv"
|
||||||
XML_USPTO = "xml_uspto"
|
XML_USPTO = "xml_uspto"
|
||||||
|
|
||||||
|
|
||||||
@ -60,6 +61,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
|||||||
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
||||||
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
||||||
InputFormat.XLSX: ["xlsx"],
|
InputFormat.XLSX: ["xlsx"],
|
||||||
|
InputFormat.CSV: ["csv"],
|
||||||
InputFormat.XML_USPTO: ["xml", "txt"],
|
InputFormat.XML_USPTO: ["xml", "txt"],
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -88,6 +90,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
|
|||||||
InputFormat.XLSX: [
|
InputFormat.XLSX: [
|
||||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||||
],
|
],
|
||||||
|
InputFormat.CSV: ["text/csv"],
|
||||||
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
|
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
|
||||||
}
|
}
|
||||||
|
|
@ -47,11 +47,11 @@ from docling_core.utils.legacy import docling_document_to_legacy
|
|||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from typing_extensions import deprecated
|
from typing_extensions import deprecated
|
||||||
|
|
||||||
from docling.backend.abstract_backend import (
|
from docowling.backend.abstract_backend import (
|
||||||
AbstractDocumentBackend,
|
AbstractDocumentBackend,
|
||||||
PaginatedDocumentBackend,
|
PaginatedDocumentBackend,
|
||||||
)
|
)
|
||||||
from docling.datamodel.base_models import (
|
from docowling.datamodel.base_models import (
|
||||||
AssembledUnit,
|
AssembledUnit,
|
||||||
ConversionStatus,
|
ConversionStatus,
|
||||||
DocumentStream,
|
DocumentStream,
|
||||||
@ -62,12 +62,12 @@ from docling.datamodel.base_models import (
|
|||||||
MimeTypeToFormat,
|
MimeTypeToFormat,
|
||||||
Page,
|
Page,
|
||||||
)
|
)
|
||||||
from docling.datamodel.settings import DocumentLimits
|
from docowling.datamodel.settings import DocumentLimits
|
||||||
from docling.utils.profiling import ProfilingItem
|
from docowling.utils.profiling import ProfilingItem
|
||||||
from docling.utils.utils import create_file_hash, create_hash
|
from docowling.utils.utils import create_file_hash, create_hash
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from docling.document_converter import FormatOption
|
from docowling.document_converter import FormatOption
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
@ -7,35 +7,36 @@ from typing import Dict, Iterable, Iterator, List, Optional, Type, Union
|
|||||||
|
|
||||||
from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
||||||
|
|
||||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
from docowling.backend.abstract_backend import AbstractDocumentBackend
|
||||||
from docling.backend.asciidoc_backend import AsciiDocBackend
|
from docowling.backend.asciidoc_backend import AsciiDocBackend
|
||||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
from docowling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||||
from docling.backend.html_backend import HTMLDocumentBackend
|
from docowling.backend.html_backend import HTMLDocumentBackend
|
||||||
from docling.backend.md_backend import MarkdownDocumentBackend
|
from docowling.backend.md_backend import MarkdownDocumentBackend
|
||||||
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
from docowling.backend.msexcel_backend import MsExcelDocumentBackend
|
||||||
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
from docowling.backend.csv_backend import CsvDocumentBackend
|
||||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
from docowling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
||||||
from docling.backend.xml.pubmed_backend import PubMedDocumentBackend
|
from docowling.backend.msword_backend import MsWordDocumentBackend
|
||||||
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
|
from docowling.backend.xml.pubmed_backend import PubMedDocumentBackend
|
||||||
from docling.datamodel.base_models import (
|
from docowling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
|
||||||
|
from docowling.datamodel.base_models import (
|
||||||
ConversionStatus,
|
ConversionStatus,
|
||||||
DoclingComponentType,
|
DoclingComponentType,
|
||||||
DocumentStream,
|
DocumentStream,
|
||||||
ErrorItem,
|
ErrorItem,
|
||||||
InputFormat,
|
InputFormat,
|
||||||
)
|
)
|
||||||
from docling.datamodel.document import (
|
from docowling.datamodel.document import (
|
||||||
ConversionResult,
|
ConversionResult,
|
||||||
InputDocument,
|
InputDocument,
|
||||||
_DocumentConversionInput,
|
_DocumentConversionInput,
|
||||||
)
|
)
|
||||||
from docling.datamodel.pipeline_options import PipelineOptions
|
from docowling.datamodel.pipeline_options import PipelineOptions
|
||||||
from docling.datamodel.settings import DocumentLimits, settings
|
from docowling.datamodel.settings import DocumentLimits, settings
|
||||||
from docling.exceptions import ConversionError
|
from docowling.exceptions import ConversionError
|
||||||
from docling.pipeline.base_pipeline import BasePipeline
|
from docowling.pipeline.base_pipeline import BasePipeline
|
||||||
from docling.pipeline.simple_pipeline import SimplePipeline
|
from docowling.pipeline.simple_pipeline import SimplePipeline
|
||||||
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
from docowling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
||||||
from docling.utils.utils import chunkify
|
from docowling.utils.utils import chunkify
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -58,6 +59,9 @@ class ExcelFormatOption(FormatOption):
|
|||||||
pipeline_cls: Type = SimplePipeline
|
pipeline_cls: Type = SimplePipeline
|
||||||
backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
|
backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
|
||||||
|
|
||||||
|
class CsvFormatOption(FormatOption):
    """Format option that pairs CSV input with SimplePipeline and CsvDocumentBackend."""

    # Pipeline class used to process documents of this format.
    pipeline_cls: Type = SimplePipeline
    # Backend class used to parse the raw input.
    backend: Type[AbstractDocumentBackend] = CsvDocumentBackend
|
||||||
|
|
||||||
class WordFormatOption(FormatOption):
|
class WordFormatOption(FormatOption):
|
||||||
pipeline_cls: Type = SimplePipeline
|
pipeline_cls: Type = SimplePipeline
|
||||||
@ -109,6 +113,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
|
|||||||
InputFormat.XLSX: FormatOption(
|
InputFormat.XLSX: FormatOption(
|
||||||
pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
|
pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
|
||||||
),
|
),
|
||||||
|
InputFormat.CSV: FormatOption(
|
||||||
|
pipeline_cls=SimplePipeline, backend=CsvDocumentBackend
|
||||||
|
),
|
||||||
InputFormat.DOCX: FormatOption(
|
InputFormat.DOCX: FormatOption(
|
||||||
pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
|
pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
|
||||||
),
|
),
|
@ -3,8 +3,8 @@ from typing import Any, Iterable
|
|||||||
|
|
||||||
from docling_core.types.doc import DoclingDocument, NodeItem
|
from docling_core.types.doc import DoclingDocument, NodeItem
|
||||||
|
|
||||||
from docling.datamodel.base_models import Page
|
from docowling.datamodel.base_models import Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docowling.datamodel.document import ConversionResult
|
||||||
|
|
||||||
|
|
||||||
class BasePageModel(ABC):
|
class BasePageModel(ABC):
|
@ -10,11 +10,11 @@ from PIL import Image, ImageDraw
|
|||||||
from rtree import index
|
from rtree import index
|
||||||
from scipy.ndimage import find_objects, label
|
from scipy.ndimage import find_objects, label
|
||||||
|
|
||||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
from docowling.datamodel.base_models import Cell, OcrCell, Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docowling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import OcrOptions
|
from docowling.datamodel.pipeline_options import OcrOptions
|
||||||
from docling.datamodel.settings import settings
|
from docowling.datamodel.settings import settings
|
||||||
from docling.models.base_model import BasePageModel
|
from docowling.models.base_model import BasePageModel
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
@ -24,18 +24,18 @@ from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocu
|
|||||||
from PIL import ImageDraw
|
from PIL import ImageDraw
|
||||||
from pydantic import BaseModel, ConfigDict, TypeAdapter
|
from pydantic import BaseModel, ConfigDict, TypeAdapter
|
||||||
|
|
||||||
from docling.datamodel.base_models import (
|
from docowling.datamodel.base_models import (
|
||||||
Cluster,
|
Cluster,
|
||||||
ContainerElement,
|
ContainerElement,
|
||||||
FigureElement,
|
FigureElement,
|
||||||
Table,
|
Table,
|
||||||
TextElement,
|
TextElement,
|
||||||
)
|
)
|
||||||
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
|
from docowling.datamodel.document import ConversionResult, layout_label_to_ds_type
|
||||||
from docling.datamodel.settings import settings
|
from docowling.datamodel.settings import settings
|
||||||
from docling.utils.glm_utils import to_docling_document
|
from docowling.utils.glm_utils import to_docling_document
|
||||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
from docowling.utils.profiling import ProfilingScope, TimeRecorder
|
||||||
from docling.utils.utils import create_hash
|
from docowling.utils.utils import create_hash
|
||||||
|
|
||||||
|
|
||||||
class GlmOptions(BaseModel):
|
class GlmOptions(BaseModel):
|
@ -6,17 +6,17 @@ import numpy
|
|||||||
import torch
|
import torch
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
|
|
||||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
from docowling.datamodel.base_models import Cell, OcrCell, Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docowling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docowling.datamodel.pipeline_options import (
|
||||||
AcceleratorDevice,
|
AcceleratorDevice,
|
||||||
AcceleratorOptions,
|
AcceleratorOptions,
|
||||||
EasyOcrOptions,
|
EasyOcrOptions,
|
||||||
)
|
)
|
||||||
from docling.datamodel.settings import settings
|
from docowling.datamodel.settings import settings
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
from docowling.models.base_ocr_model import BaseOcrModel
|
||||||
from docling.utils.accelerator_utils import decide_device
|
from docowling.utils.accelerator_utils import decide_device
|
||||||
from docling.utils.profiling import TimeRecorder
|
from docowling.utils.profiling import TimeRecorder
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
@ -9,20 +9,20 @@ from docling_core.types.doc import CoordOrigin, DocItemLabel
|
|||||||
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
||||||
from PIL import Image, ImageDraw, ImageFont
|
from PIL import Image, ImageDraw, ImageFont
|
||||||
|
|
||||||
from docling.datamodel.base_models import (
|
from docowling.datamodel.base_models import (
|
||||||
BoundingBox,
|
BoundingBox,
|
||||||
Cell,
|
Cell,
|
||||||
Cluster,
|
Cluster,
|
||||||
LayoutPrediction,
|
LayoutPrediction,
|
||||||
Page,
|
Page,
|
||||||
)
|
)
|
||||||
from docling.datamodel.document import ConversionResult
|
from docowling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
|
from docowling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
|
||||||
from docling.datamodel.settings import settings
|
from docowling.datamodel.settings import settings
|
||||||
from docling.models.base_model import BasePageModel
|
from docowling.models.base_model import BasePageModel
|
||||||
from docling.utils.accelerator_utils import decide_device
|
from docowling.utils.accelerator_utils import decide_device
|
||||||
from docling.utils.layout_postprocessor import LayoutPostprocessor
|
from docowling.utils.layout_postprocessor import LayoutPostprocessor
|
||||||
from docling.utils.profiling import TimeRecorder
|
from docowling.utils.profiling import TimeRecorder
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
@ -4,12 +4,12 @@ from typing import Iterable, Optional, Tuple
|
|||||||
|
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
|
|
||||||
from docling.datamodel.base_models import OcrCell, Page
|
from docowling.datamodel.base_models import OcrCell, Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docowling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import OcrMacOptions
|
from docowling.datamodel.pipeline_options import OcrMacOptions
|
||||||
from docling.datamodel.settings import settings
|
from docowling.datamodel.settings import settings
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
from docowling.models.base_ocr_model import BaseOcrModel
|
||||||
from docling.utils.profiling import TimeRecorder
|
from docowling.utils.profiling import TimeRecorder
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
@ -4,7 +4,7 @@ from typing import Iterable, List
|
|||||||
|
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from docling.datamodel.base_models import (
|
from docowling.datamodel.base_models import (
|
||||||
AssembledUnit,
|
AssembledUnit,
|
||||||
ContainerElement,
|
ContainerElement,
|
||||||
FigureElement,
|
FigureElement,
|
||||||
@ -13,10 +13,10 @@ from docling.datamodel.base_models import (
|
|||||||
Table,
|
Table,
|
||||||
TextElement,
|
TextElement,
|
||||||
)
|
)
|
||||||
from docling.datamodel.document import ConversionResult
|
from docowling.datamodel.document import ConversionResult
|
||||||
from docling.models.base_model import BasePageModel
|
from docowling.models.base_model import BasePageModel
|
||||||
from docling.models.layout_model import LayoutModel
|
from docowling.models.layout_model import LayoutModel
|
||||||
from docling.utils.profiling import TimeRecorder
|
from docowling.utils.profiling import TimeRecorder
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
@ -4,11 +4,11 @@ from typing import Iterable, Optional
|
|||||||
from PIL import ImageDraw
|
from PIL import ImageDraw
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from docling.datamodel.base_models import Page
|
from docowling.datamodel.base_models import Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docowling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.settings import settings
|
from docowling.datamodel.settings import settings
|
||||||
from docling.models.base_model import BasePageModel
|
from docowling.models.base_model import BasePageModel
|
||||||
from docling.utils.profiling import TimeRecorder
|
from docowling.utils.profiling import TimeRecorder
|
||||||
|
|
||||||
|
|
||||||
class PagePreprocessingOptions(BaseModel):
|
class PagePreprocessingOptions(BaseModel):
|
@ -4,17 +4,17 @@ from typing import Iterable
|
|||||||
import numpy
|
import numpy
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
|
|
||||||
from docling.datamodel.base_models import OcrCell, Page
|
from docowling.datamodel.base_models import OcrCell, Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docowling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docowling.datamodel.pipeline_options import (
|
||||||
AcceleratorDevice,
|
AcceleratorDevice,
|
||||||
AcceleratorOptions,
|
AcceleratorOptions,
|
||||||
RapidOcrOptions,
|
RapidOcrOptions,
|
||||||
)
|
)
|
||||||
from docling.datamodel.settings import settings
|
from docowling.datamodel.settings import settings
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
from docowling.models.base_ocr_model import BaseOcrModel
|
||||||
from docling.utils.accelerator_utils import decide_device
|
from docowling.utils.accelerator_utils import decide_device
|
||||||
from docling.utils.profiling import TimeRecorder
|
from docowling.utils.profiling import TimeRecorder
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
@ -7,18 +7,18 @@ from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
|
|||||||
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
|
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
|
||||||
from PIL import ImageDraw
|
from PIL import ImageDraw
|
||||||
|
|
||||||
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
|
from docowling.datamodel.base_models import Page, Table, TableStructurePrediction
|
||||||
from docling.datamodel.document import ConversionResult
|
from docowling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docowling.datamodel.pipeline_options import (
|
||||||
AcceleratorDevice,
|
AcceleratorDevice,
|
||||||
AcceleratorOptions,
|
AcceleratorOptions,
|
||||||
TableFormerMode,
|
TableFormerMode,
|
||||||
TableStructureOptions,
|
TableStructureOptions,
|
||||||
)
|
)
|
||||||
from docling.datamodel.settings import settings
|
from docowling.datamodel.settings import settings
|
||||||
from docling.models.base_model import BasePageModel
|
from docowling.models.base_model import BasePageModel
|
||||||
from docling.utils.accelerator_utils import decide_device
|
from docowling.utils.accelerator_utils import decide_device
|
||||||
from docling.utils.profiling import TimeRecorder
|
from docowling.utils.profiling import TimeRecorder
|
||||||
|
|
||||||
|
|
||||||
class TableStructureModel(BasePageModel):
|
class TableStructureModel(BasePageModel):
|
@ -9,12 +9,12 @@ from typing import Iterable, Optional, Tuple
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
|
|
||||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
from docowling.datamodel.base_models import Cell, OcrCell, Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docowling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
|
from docowling.datamodel.pipeline_options import TesseractCliOcrOptions
|
||||||
from docling.datamodel.settings import settings
|
from docowling.datamodel.settings import settings
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
from docowling.models.base_ocr_model import BaseOcrModel
|
||||||
from docling.utils.profiling import TimeRecorder
|
from docowling.utils.profiling import TimeRecorder
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
@ -3,12 +3,12 @@ from typing import Iterable
|
|||||||
|
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
|
|
||||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
from docowling.datamodel.base_models import Cell, OcrCell, Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docowling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import TesseractOcrOptions
|
from docowling.datamodel.pipeline_options import TesseractOcrOptions
|
||||||
from docling.datamodel.settings import settings
|
from docowling.datamodel.settings import settings
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
from docowling.models.base_ocr_model import BaseOcrModel
|
||||||
from docling.utils.profiling import TimeRecorder
|
from docowling.utils.profiling import TimeRecorder
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
@ -7,20 +7,20 @@ from typing import Callable, Iterable, List
|
|||||||
|
|
||||||
from docling_core.types.doc import DoclingDocument, NodeItem
|
from docling_core.types.doc import DoclingDocument, NodeItem
|
||||||
|
|
||||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
from docowling.backend.abstract_backend import AbstractDocumentBackend
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
from docowling.backend.pdf_backend import PdfDocumentBackend
|
||||||
from docling.datamodel.base_models import (
|
from docowling.datamodel.base_models import (
|
||||||
ConversionStatus,
|
ConversionStatus,
|
||||||
DoclingComponentType,
|
DoclingComponentType,
|
||||||
ErrorItem,
|
ErrorItem,
|
||||||
Page,
|
Page,
|
||||||
)
|
)
|
||||||
from docling.datamodel.document import ConversionResult, InputDocument
|
from docowling.datamodel.document import ConversionResult, InputDocument
|
||||||
from docling.datamodel.pipeline_options import PipelineOptions
|
from docowling.datamodel.pipeline_options import PipelineOptions
|
||||||
from docling.datamodel.settings import settings
|
from docowling.datamodel.settings import settings
|
||||||
from docling.models.base_model import BaseEnrichmentModel
|
from docowling.models.base_model import BaseEnrichmentModel
|
||||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
from docowling.utils.profiling import ProfilingScope, TimeRecorder
|
||||||
from docling.utils.utils import chunkify
|
from docowling.utils.utils import chunkify
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
@ -1,14 +1,14 @@
|
|||||||
import logging
|
import logging
|
||||||
|
|
||||||
from docling.backend.abstract_backend import (
|
from docowling.backend.abstract_backend import (
|
||||||
AbstractDocumentBackend,
|
AbstractDocumentBackend,
|
||||||
DeclarativeDocumentBackend,
|
DeclarativeDocumentBackend,
|
||||||
)
|
)
|
||||||
from docling.datamodel.base_models import ConversionStatus
|
from docowling.datamodel.base_models import ConversionStatus
|
||||||
from docling.datamodel.document import ConversionResult
|
from docowling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import PipelineOptions
|
from docowling.datamodel.pipeline_options import PipelineOptions
|
||||||
from docling.pipeline.base_pipeline import BasePipeline
|
from docowling.pipeline.base_pipeline import BasePipeline
|
||||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
from docowling.utils.profiling import ProfilingScope, TimeRecorder
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
@ -5,11 +5,11 @@ from typing import Optional
|
|||||||
|
|
||||||
from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
|
from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
|
||||||
|
|
||||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
from docowling.backend.abstract_backend import AbstractDocumentBackend
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
from docowling.backend.pdf_backend import PdfDocumentBackend
|
||||||
from docling.datamodel.base_models import AssembledUnit, Page
|
from docowling.datamodel.base_models import AssembledUnit, Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docowling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docowling.datamodel.pipeline_options import (
|
||||||
EasyOcrOptions,
|
EasyOcrOptions,
|
||||||
OcrMacOptions,
|
OcrMacOptions,
|
||||||
PdfPipelineOptions,
|
PdfPipelineOptions,
|
||||||
@ -17,22 +17,22 @@ from docling.datamodel.pipeline_options import (
|
|||||||
TesseractCliOcrOptions,
|
TesseractCliOcrOptions,
|
||||||
TesseractOcrOptions,
|
TesseractOcrOptions,
|
||||||
)
|
)
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
from docowling.models.base_ocr_model import BaseOcrModel
|
||||||
from docling.models.ds_glm_model import GlmModel, GlmOptions
|
from docowling.models.ds_glm_model import GlmModel, GlmOptions
|
||||||
from docling.models.easyocr_model import EasyOcrModel
|
from docowling.models.easyocr_model import EasyOcrModel
|
||||||
from docling.models.layout_model import LayoutModel
|
from docowling.models.layout_model import LayoutModel
|
||||||
from docling.models.ocr_mac_model import OcrMacModel
|
from docowling.models.ocr_mac_model import OcrMacModel
|
||||||
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
|
from docowling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
|
||||||
from docling.models.page_preprocessing_model import (
|
from docowling.models.page_preprocessing_model import (
|
||||||
PagePreprocessingModel,
|
PagePreprocessingModel,
|
||||||
PagePreprocessingOptions,
|
PagePreprocessingOptions,
|
||||||
)
|
)
|
||||||
from docling.models.rapid_ocr_model import RapidOcrModel
|
from docowling.models.rapid_ocr_model import RapidOcrModel
|
||||||
from docling.models.table_structure_model import TableStructureModel
|
from docowling.models.table_structure_model import TableStructureModel
|
||||||
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
from docowling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
||||||
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
from docowling.models.tesseract_ocr_model import TesseractOcrModel
|
||||||
from docling.pipeline.base_pipeline import PaginatedPipeline
|
from docowling.pipeline.base_pipeline import PaginatedPipeline
|
||||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
from docowling.utils.profiling import ProfilingScope, TimeRecorder
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
@ -2,7 +2,7 @@ import logging
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from docling.datamodel.pipeline_options import AcceleratorDevice
|
from docowling.datamodel.pipeline_options import AcceleratorDevice
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
@ -4,8 +4,8 @@ from typing import Any, Dict, Iterable, List, Tuple, Union
|
|||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
|
from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
|
||||||
|
|
||||||
from docling.datamodel.base_models import OcrCell
|
from docowling.datamodel.base_models import OcrCell
|
||||||
from docling.datamodel.document import ConversionResult, Page
|
from docowling.datamodel.document import ConversionResult, Page
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
@ -7,7 +7,7 @@ from typing import Dict, List, Set, Tuple
|
|||||||
from docling_core.types.doc import DocItemLabel, Size
|
from docling_core.types.doc import DocItemLabel, Size
|
||||||
from rtree import index
|
from rtree import index
|
||||||
|
|
||||||
from docling.datamodel.base_models import BoundingBox, Cell, Cluster, OcrCell
|
from docowling.datamodel.base_models import BoundingBox, Cell, Cluster, OcrCell
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
@ -6,10 +6,10 @@ from typing import TYPE_CHECKING, List
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from docling.datamodel.settings import settings
|
from docowling.datamodel.settings import settings
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from docling.datamodel.document import ConversionResult
|
from docowling.datamodel.document import ConversionResult
|
||||||
|
|
||||||
|
|
||||||
class ProfilingScope(str, Enum):
|
class ProfilingScope(str, Enum):
|
@ -28,7 +28,7 @@ The `BaseChunker` base class API defines that any chunker should provide the fol
|
|||||||
|
|
||||||
- If you are using the `docling` package, you can import as follows:
|
- If you are using the `docling` package, you can import as follows:
|
||||||
```python
|
```python
|
||||||
from docling.chunking import HybridChunker
|
from docowling.chunking import HybridChunker
|
||||||
```
|
```
|
||||||
- If you are only using the `docling-core` package, you must ensure to install
|
- If you are only using the `docling-core` package, you must ensure to install
|
||||||
the `chunking` extra, e.g.
|
the `chunking` extra, e.g.
|
||||||
|
@ -6,10 +6,10 @@ from typing import Iterable
|
|||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
from docling.datamodel.base_models import ConversionStatus
|
from docowling.datamodel.base_models import ConversionStatus
|
||||||
from docling.datamodel.document import ConversionResult
|
from docowling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.settings import settings
|
from docowling.datamodel.settings import settings
|
||||||
from docling.document_converter import DocumentConverter
|
from docowling.document_converter import DocumentConverter
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@ -3,13 +3,13 @@ import logging
|
|||||||
import time
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
from docowling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docowling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docowling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docowling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
from docling.models.ocr_mac_model import OcrMacOptions
|
from docowling.models.ocr_mac_model import OcrMacOptions
|
||||||
from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
|
from docowling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
|
||||||
from docling.models.tesseract_ocr_model import TesseractOcrOptions
|
from docowling.models.tesseract_ocr_model import TesseractOcrOptions
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@ -10,11 +10,11 @@ from docling_core.types.doc import (
|
|||||||
PictureItem,
|
PictureItem,
|
||||||
)
|
)
|
||||||
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docowling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docowling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docowling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
from docling.models.base_model import BaseEnrichmentModel
|
from docowling.models.base_model import BaseEnrichmentModel
|
||||||
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
from docowling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
||||||
|
|
||||||
|
|
||||||
class ExamplePictureClassifierPipelineOptions(PdfPipelineOptions):
|
class ExamplePictureClassifierPipelineOptions(PdfPipelineOptions):
|
||||||
|
@ -4,9 +4,9 @@ from pathlib import Path
|
|||||||
|
|
||||||
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
|
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
|
||||||
|
|
||||||
from docling.datamodel.base_models import FigureElement, InputFormat, Table
|
from docowling.datamodel.base_models import FigureElement, InputFormat, Table
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docowling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docowling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@ -5,11 +5,11 @@ from pathlib import Path
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docowling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docowling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docowling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
from docling.utils.export import generate_multimodal_pages
|
from docowling.utils.export import generate_multimodal_pages
|
||||||
from docling.utils.utils import create_hash
|
from docowling.utils.utils import create_hash
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@ -4,7 +4,7 @@ from pathlib import Path
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from docling.document_converter import DocumentConverter
|
from docowling.document_converter import DocumentConverter
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@ -1,8 +1,8 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docowling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.pipeline_options import (
|
from docowling.datamodel.pipeline_options import (
|
||||||
EasyOcrOptions,
|
EasyOcrOptions,
|
||||||
OcrMacOptions,
|
OcrMacOptions,
|
||||||
PdfPipelineOptions,
|
PdfPipelineOptions,
|
||||||
@ -10,7 +10,7 @@ from docling.datamodel.pipeline_options import (
|
|||||||
TesseractCliOcrOptions,
|
TesseractCliOcrOptions,
|
||||||
TesseractOcrOptions,
|
TesseractOcrOptions,
|
||||||
)
|
)
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docowling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
@ -37,7 +37,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from docling.document_converter import DocumentConverter\n",
|
"from docowling.document_converter import DocumentConverter\n",
|
||||||
"\n",
|
"\n",
|
||||||
"DOC_SOURCE = \"../../tests/data/md/wiki.md\"\n",
|
"DOC_SOURCE = \"../../tests/data/md/wiki.md\"\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -68,7 +68,7 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"from transformers import AutoTokenizer\n",
|
"from transformers import AutoTokenizer\n",
|
||||||
"\n",
|
"\n",
|
||||||
"from docling.chunking import HybridChunker\n",
|
"from docowling.chunking import HybridChunker\n",
|
||||||
"\n",
|
"\n",
|
||||||
"EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
|
"EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
|
||||||
"MAX_TOKENS = 64\n",
|
"MAX_TOKENS = 64\n",
|
||||||
@ -404,7 +404,7 @@
|
|||||||
" return tbl\n",
|
" return tbl\n",
|
||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"db_uri = str(Path(mkdtemp()) / \"docling.db\")\n",
|
"db_uri = str(Path(mkdtemp()) / \"docowling.db\")\n",
|
||||||
"index = make_lancedb_index(db_uri, doc.name, chunks, embed_model)\n",
|
"index = make_lancedb_index(db_uri, doc.name, chunks, embed_model)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"sample_query = \"invent\"\n",
|
"sample_query = \"invent\"\n",
|
||||||
|
@ -81,8 +81,8 @@
|
|||||||
"from docling_core.transforms.chunker import HierarchicalChunker\n",
|
"from docling_core.transforms.chunker import HierarchicalChunker\n",
|
||||||
"from qdrant_client import QdrantClient\n",
|
"from qdrant_client import QdrantClient\n",
|
||||||
"\n",
|
"\n",
|
||||||
"from docling.datamodel.base_models import InputFormat\n",
|
"from docowling.datamodel.base_models import InputFormat\n",
|
||||||
"from docling.document_converter import DocumentConverter"
|
"from docowling.document_converter import DocumentConverter"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
from docling.document_converter import DocumentConverter
|
from docowling.document_converter import DocumentConverter
|
||||||
|
|
||||||
source = "https://arxiv.org/pdf/2408.09869" # document per local path or URL
|
source = "https://arxiv.org/pdf/2408.09869" # document per local path or URL
|
||||||
converter = DocumentConverter()
|
converter = DocumentConverter()
|
||||||
|
@ -110,7 +110,7 @@
|
|||||||
"EXPORT_TYPE = ExportType.DOC_CHUNKS\n",
|
"EXPORT_TYPE = ExportType.DOC_CHUNKS\n",
|
||||||
"QUESTION = \"Which are the main AI models in Docling?\"\n",
|
"QUESTION = \"Which are the main AI models in Docling?\"\n",
|
||||||
"TOP_K = 3\n",
|
"TOP_K = 3\n",
|
||||||
"MILVUS_URI = str(Path(mkdtemp()) / \"docling.db\")"
|
"MILVUS_URI = str(Path(mkdtemp()) / \"docowling.db\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -168,7 +168,7 @@
|
|||||||
"from haystack.components.writers import DocumentWriter\n",
|
"from haystack.components.writers import DocumentWriter\n",
|
||||||
"from milvus_haystack import MilvusDocumentStore, MilvusEmbeddingRetriever\n",
|
"from milvus_haystack import MilvusDocumentStore, MilvusEmbeddingRetriever\n",
|
||||||
"\n",
|
"\n",
|
||||||
"from docling.chunking import HybridChunker\n",
|
"from docowling.chunking import HybridChunker\n",
|
||||||
"\n",
|
"\n",
|
||||||
"document_store = MilvusDocumentStore(\n",
|
"document_store = MilvusDocumentStore(\n",
|
||||||
" connection_args={\"uri\": MILVUS_URI},\n",
|
" connection_args={\"uri\": MILVUS_URI},\n",
|
||||||
@ -329,7 +329,7 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"from docling.chunking import DocChunk\n",
|
"from docowling.chunking import DocChunk\n",
|
||||||
"\n",
|
"\n",
|
||||||
"print(f\"Question:\\n{QUESTION}\\n\")\n",
|
"print(f\"Question:\\n{QUESTION}\\n\")\n",
|
||||||
"print(f\"Answer:\\n{rag_res['answer_builder']['answers'][0].data.strip()}\\n\")\n",
|
"print(f\"Answer:\\n{rag_res['answer_builder']['answers'][0].data.strip()}\\n\")\n",
|
||||||
|
@ -83,7 +83,7 @@
|
|||||||
"from langchain_core.document_loaders import BaseLoader\n",
|
"from langchain_core.document_loaders import BaseLoader\n",
|
||||||
"from langchain_core.documents import Document as LCDocument\n",
|
"from langchain_core.documents import Document as LCDocument\n",
|
||||||
"\n",
|
"\n",
|
||||||
"from docling.document_converter import DocumentConverter\n",
|
"from docowling.document_converter import DocumentConverter\n",
|
||||||
"\n",
|
"\n",
|
||||||
"class DoclingPDFLoader(BaseLoader):\n",
|
"class DoclingPDFLoader(BaseLoader):\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
@ -117,7 +117,7 @@
|
|||||||
"from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n",
|
"from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n",
|
||||||
"\n",
|
"\n",
|
||||||
"EMBED_MODEL = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")\n",
|
"EMBED_MODEL = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")\n",
|
||||||
"MILVUS_URI = str(Path(mkdtemp()) / \"docling.db\")\n",
|
"MILVUS_URI = str(Path(mkdtemp()) / \"docowling.db\")\n",
|
||||||
"GEN_MODEL = HuggingFaceInferenceAPI(\n",
|
"GEN_MODEL = HuggingFaceInferenceAPI(\n",
|
||||||
" token=_get_env_from_colab_or_os(\"HF_TOKEN\"),\n",
|
" token=_get_env_from_colab_or_os(\"HF_TOKEN\"),\n",
|
||||||
" model_name=\"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n",
|
" model_name=\"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n",
|
||||||
@ -182,7 +182,7 @@
|
|||||||
"node_parser = MarkdownNodeParser()\n",
|
"node_parser = MarkdownNodeParser()\n",
|
||||||
"\n",
|
"\n",
|
||||||
"vector_store = MilvusVectorStore(\n",
|
"vector_store = MilvusVectorStore(\n",
|
||||||
" uri=str(Path(mkdtemp()) / \"docling.db\"), # or set as needed\n",
|
" uri=str(Path(mkdtemp()) / \"docowling.db\"), # or set as needed\n",
|
||||||
" dim=embed_dim,\n",
|
" dim=embed_dim,\n",
|
||||||
" overwrite=True,\n",
|
" overwrite=True,\n",
|
||||||
")\n",
|
")\n",
|
||||||
@ -282,7 +282,7 @@
|
|||||||
"node_parser = DoclingNodeParser()\n",
|
"node_parser = DoclingNodeParser()\n",
|
||||||
"\n",
|
"\n",
|
||||||
"vector_store = MilvusVectorStore(\n",
|
"vector_store = MilvusVectorStore(\n",
|
||||||
" uri=str(Path(mkdtemp()) / \"docling.db\"), # or set as needed\n",
|
" uri=str(Path(mkdtemp()) / \"docowling.db\"), # or set as needed\n",
|
||||||
" dim=embed_dim,\n",
|
" dim=embed_dim,\n",
|
||||||
" overwrite=True,\n",
|
" overwrite=True,\n",
|
||||||
")\n",
|
")\n",
|
||||||
@ -423,7 +423,7 @@
|
|||||||
")\n",
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"vector_store = MilvusVectorStore(\n",
|
"vector_store = MilvusVectorStore(\n",
|
||||||
" uri=str(Path(mkdtemp()) / \"docling.db\"), # or set as needed\n",
|
" uri=str(Path(mkdtemp()) / \"docowling.db\"), # or set as needed\n",
|
||||||
" dim=embed_dim,\n",
|
" dim=embed_dim,\n",
|
||||||
" overwrite=True,\n",
|
" overwrite=True,\n",
|
||||||
")\n",
|
")\n",
|
||||||
|
@ -207,8 +207,8 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"from docling.datamodel.document import ConversionResult\n",
|
"from docowling.datamodel.document import ConversionResult\n",
|
||||||
"from docling.document_converter import DocumentConverter\n",
|
"from docowling.document_converter import DocumentConverter\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Instantiate the doc converter\n",
|
"# Instantiate the doc converter\n",
|
||||||
"doc_converter = DocumentConverter()\n",
|
"doc_converter = DocumentConverter()\n",
|
||||||
|
@ -5,9 +5,9 @@ from pathlib import Path
|
|||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
from docling.backend.md_backend import MarkdownDocumentBackend
|
from docowling.backend.md_backend import MarkdownDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docowling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import InputDocument
|
from docowling.datamodel.document import InputDocument
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@ -1,16 +1,16 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docowling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.pipeline_options import (
|
from docowling.datamodel.pipeline_options import (
|
||||||
AcceleratorDevice,
|
AcceleratorDevice,
|
||||||
AcceleratorOptions,
|
AcceleratorOptions,
|
||||||
PdfPipelineOptions,
|
PdfPipelineOptions,
|
||||||
TesseractCliOcrOptions,
|
TesseractCliOcrOptions,
|
||||||
TesseractOcrOptions,
|
TesseractOcrOptions,
|
||||||
)
|
)
|
||||||
from docling.datamodel.settings import settings
|
from docowling.datamodel.settings import settings
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docowling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
@ -4,15 +4,15 @@ from pathlib import Path
|
|||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
from docowling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docowling.datamodel.base_models import InputFormat
|
||||||
from docling.document_converter import (
|
from docowling.document_converter import (
|
||||||
DocumentConverter,
|
DocumentConverter,
|
||||||
PdfFormatOption,
|
PdfFormatOption,
|
||||||
WordFormatOption,
|
WordFormatOption,
|
||||||
)
|
)
|
||||||
from docling.pipeline.simple_pipeline import SimplePipeline
|
from docowling.pipeline.simple_pipeline import SimplePipeline
|
||||||
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
from docowling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@ -140,7 +140,7 @@ This is a collection of FAQ collected from the user questions on <https://github
|
|||||||
Setting the OCR language in Docling is done via the OCR pipeline options:
|
Setting the OCR language in Docling is done via the OCR pipeline options:
|
||||||
|
|
||||||
```py
|
```py
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docowling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
|
|
||||||
pipeline_options = PdfPipelineOptions()
|
pipeline_options = PdfPipelineOptions()
|
||||||
pipeline_options.ocr_options.lang = ["fr", "de", "es", "en"] # example of languages for EasyOCR
|
pipeline_options.ocr_options.lang = ["fr", "de", "es", "en"] # example of languages for EasyOCR
|
||||||
|
@ -36,9 +36,9 @@ Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 archi
|
|||||||
The Docling `DocumentConverter` allows you to choose the OCR engine with the `ocr_options` settings. For example
|
The Docling `DocumentConverter` allows you to choose the OCR engine with the `ocr_options` settings. For example
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
|
from docowling.datamodel.base_models import ConversionStatus, PipelineOptions
|
||||||
from docling.datamodel.pipeline_options import PipelineOptions, EasyOcrOptions, TesseractOcrOptions
|
from docowling.datamodel.pipeline_options import PipelineOptions, EasyOcrOptions, TesseractOcrOptions
|
||||||
from docling.document_converter import DocumentConverter
|
from docowling.document_converter import DocumentConverter
|
||||||
|
|
||||||
pipeline_options = PipelineOptions()
|
pipeline_options = PipelineOptions()
|
||||||
pipeline_options.do_ocr = True
|
pipeline_options.do_ocr = True
|
||||||
|
@ -3,7 +3,7 @@
|
|||||||
This page provides documentation for our command line tools.
|
This page provides documentation for our command line tools.
|
||||||
|
|
||||||
::: mkdocs-click
|
::: mkdocs-click
|
||||||
:module: docling.cli.main
|
:module: docowling.cli.main
|
||||||
:command: click_app
|
:command: click_app
|
||||||
:prog_name: docling
|
:prog_name: docling
|
||||||
:style: table
|
:style: table
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
This is an automatically generated API reference of the main components of Docling.
|
This is an automatically generated API reference of the main components of Docling.
|
||||||
|
|
||||||
::: docling.document_converter
|
::: docowling.document_converter
|
||||||
handler: python
|
handler: python
|
||||||
options:
|
options:
|
||||||
members:
|
members:
|
||||||
|
@ -8,7 +8,7 @@ can be enabled with `do_xyz = True`.
|
|||||||
This is an automatically generated API reference of all the pipeline options available in Docling.
|
This is an automatically generated API reference of all the pipeline options available in Docling.
|
||||||
|
|
||||||
|
|
||||||
::: docling.datamodel.pipeline_options
|
::: docowling.datamodel.pipeline_options
|
||||||
handler: python
|
handler: python
|
||||||
options:
|
options:
|
||||||
show_if_no_docstring: true
|
show_if_no_docstring: true
|
||||||
@ -28,7 +28,7 @@ This is an automatic generated API reference of the all the pipeline options ava
|
|||||||
signature_crossrefs: true
|
signature_crossrefs: true
|
||||||
summary: true
|
summary: true
|
||||||
|
|
||||||
<!-- ::: docling.document_converter.DocumentConverter
|
<!-- ::: docowling.document_converter.DocumentConverter
|
||||||
handler: python
|
handler: python
|
||||||
options:
|
options:
|
||||||
show_if_no_docstring: true
|
show_if_no_docstring: true
|
||||||
|
@ -5,7 +5,7 @@
|
|||||||
To convert individual PDF documents, use `convert()`, for example:
|
To convert individual PDF documents, use `convert()`, for example:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from docling.document_converter import DocumentConverter
|
from docowling.document_converter import DocumentConverter
|
||||||
|
|
||||||
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
|
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
|
||||||
converter = DocumentConverter()
|
converter = DocumentConverter()
|
||||||
@ -39,9 +39,9 @@ This can improve output quality if you find that multiple columns in extracted t
|
|||||||
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docowling.datamodel.base_models import InputFormat
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docowling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docowling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
|
|
||||||
pipeline_options = PdfPipelineOptions(do_table_structure=True)
|
pipeline_options = PdfPipelineOptions(do_table_structure=True)
|
||||||
pipeline_options.table_structure_options.do_cell_matching = False # uses text cells predicted from table structure model
|
pipeline_options.table_structure_options.do_cell_matching = False # uses text cells predicted from table structure model
|
||||||
@ -56,9 +56,9 @@ doc_converter = DocumentConverter(
|
|||||||
Since docling 1.16.0: You can control which TableFormer mode you want to use. Choose between `TableFormerMode.FAST` (default) and `TableFormerMode.ACCURATE` (better, but slower) to receive better quality with difficult table structures.
|
Since docling 1.16.0: You can control which TableFormer mode you want to use. Choose between `TableFormerMode.FAST` (default) and `TableFormerMode.ACCURATE` (better, but slower) to receive better quality with difficult table structures.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docowling.datamodel.base_models import InputFormat
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docowling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
|
from docowling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
|
||||||
|
|
||||||
pipeline_options = PdfPipelineOptions(do_table_structure=True)
|
pipeline_options = PdfPipelineOptions(do_table_structure=True)
|
||||||
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE # use more accurate TableFormer model
|
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE # use more accurate TableFormer model
|
||||||
@ -75,10 +75,10 @@ doc_converter = DocumentConverter(
|
|||||||
By default, artifacts such as models are downloaded automatically upon first usage. If you would prefer to use a local path where the artifacts have been explicitly prefetched, you can do that as follows:
|
By default, artifacts such as models are downloaded automatically upon first usage. If you would prefer to use a local path where the artifacts have been explicitly prefetched, you can do that as follows:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docowling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docowling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docowling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
from docowling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
||||||
|
|
||||||
# # to explicitly prefetch:
|
# # to explicitly prefetch:
|
||||||
# artifacts_path = StandardPdfPipeline.download_models_hf()
|
# artifacts_path = StandardPdfPipeline.download_models_hf()
|
||||||
@ -99,7 +99,7 @@ You can limit the file size and number of pages which should be allowed to proce
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from docling.document_converter import DocumentConverter
|
from docowling.document_converter import DocumentConverter
|
||||||
|
|
||||||
source = "https://arxiv.org/pdf/2408.09869"
|
source = "https://arxiv.org/pdf/2408.09869"
|
||||||
converter = DocumentConverter()
|
converter = DocumentConverter()
|
||||||
@ -112,8 +112,8 @@ You can convert PDFs from a binary stream instead of from the filesystem as foll
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from docling.datamodel.base_models import DocumentStream
|
from docowling.datamodel.base_models import DocumentStream
|
||||||
from docling.document_converter import DocumentConverter
|
from docowling.document_converter import DocumentConverter
|
||||||
|
|
||||||
buf = BytesIO(your_binary_stream)
|
buf = BytesIO(your_binary_stream)
|
||||||
source = DocumentStream(name="my_doc.pdf", stream=buf)
|
source = DocumentStream(name="my_doc.pdf", stream=buf)
|
||||||
@ -133,8 +133,8 @@ You can chunk a Docling document using a [chunker](concepts/chunking.md), such a
|
|||||||
[this example](examples/hybrid_chunking.ipynb)):
|
[this example](examples/hybrid_chunking.ipynb)):
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from docling.document_converter import DocumentConverter
|
from docowling.document_converter import DocumentConverter
|
||||||
from docling.chunking import HybridChunker
|
from docowling.chunking import HybridChunker
|
||||||
|
|
||||||
conv_res = DocumentConverter().convert("https://arxiv.org/pdf/2206.01062")
|
conv_res = DocumentConverter().convert("https://arxiv.org/pdf/2206.01062")
|
||||||
doc = conv_res.document
|
doc = conv_res.document
|
||||||
|
16
docs/v2.md
16
docs/v2.md
@ -46,17 +46,17 @@ Format options can include the pipeline class to use, the options to provide to
|
|||||||
They are provided as format-specific types, such as `PdfFormatOption` or `WordFormatOption`, as seen below.
|
They are provided as format-specific types, such as `PdfFormatOption` or `WordFormatOption`, as seen below.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from docling.document_converter import DocumentConverter
|
from docowling.document_converter import DocumentConverter
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docowling.datamodel.base_models import InputFormat
|
||||||
from docling.document_converter import (
|
from docowling.document_converter import (
|
||||||
DocumentConverter,
|
DocumentConverter,
|
||||||
PdfFormatOption,
|
PdfFormatOption,
|
||||||
WordFormatOption,
|
WordFormatOption,
|
||||||
)
|
)
|
||||||
from docling.pipeline.simple_pipeline import SimplePipeline
|
from docowling.pipeline.simple_pipeline import SimplePipeline
|
||||||
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
from docowling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docowling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
from docowling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
|
|
||||||
## Default initialization still works as before:
|
## Default initialization still works as before:
|
||||||
# doc_converter = DocumentConverter()
|
# doc_converter = DocumentConverter()
|
||||||
@ -110,7 +110,7 @@ or `DocumentStream` objects, without constructing a `DocumentConversionInput` ob
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
...
|
...
|
||||||
from docling.datamodel.document import ConversionResult
|
from docowling.datamodel.document import ConversionResult
|
||||||
## Convert a single file (from URL or local path)
|
## Convert a single file (from URL or local path)
|
||||||
conv_result: ConversionResult = doc_converter.convert("https://arxiv.org/pdf/2408.09869") # previously `convert_single`
|
conv_result: ConversionResult = doc_converter.convert("https://arxiv.org/pdf/2408.09869") # previously `convert_single`
|
||||||
|
|
||||||
|
@ -118,7 +118,7 @@ ocrmac = ["ocrmac"]
|
|||||||
rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]
|
rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]
|
||||||
|
|
||||||
[tool.poetry.scripts]
|
[tool.poetry.scripts]
|
||||||
docling = "docling.cli.main:app"
|
docling = "docowling.cli.main:app"
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
requires = ["poetry-core"]
|
requires = ["poetry-core"]
|
||||||
|
@ -2,9 +2,9 @@ import glob
|
|||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from docling.backend.asciidoc_backend import AsciiDocBackend
|
from docowling.backend.asciidoc_backend import AsciiDocBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docowling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import InputDocument
|
from docowling.datamodel.document import InputDocument
|
||||||
|
|
||||||
|
|
||||||
def _get_backend(fname):
|
def _get_backend(fname):
|
||||||
|
@ -3,12 +3,12 @@ from pathlib import Path
|
|||||||
import pytest
|
import pytest
|
||||||
from docling_core.types.doc import BoundingBox
|
from docling_core.types.doc import BoundingBox
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import (
|
from docowling.backend.docling_parse_backend import (
|
||||||
DoclingParseDocumentBackend,
|
DoclingParseDocumentBackend,
|
||||||
DoclingParsePageBackend,
|
DoclingParsePageBackend,
|
||||||
)
|
)
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docowling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import InputDocument
|
from docowling.datamodel.document import InputDocument
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
@ -2,12 +2,12 @@ from pathlib import Path
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from docling.backend.docling_parse_v2_backend import (
|
from docowling.backend.docling_parse_v2_backend import (
|
||||||
DoclingParseV2DocumentBackend,
|
DoclingParseV2DocumentBackend,
|
||||||
DoclingParseV2PageBackend,
|
DoclingParseV2PageBackend,
|
||||||
)
|
)
|
||||||
from docling.datamodel.base_models import BoundingBox, InputFormat
|
from docowling.datamodel.base_models import BoundingBox, InputFormat
|
||||||
from docling.datamodel.document import InputDocument
|
from docowling.datamodel.document import InputDocument
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
@ -2,14 +2,14 @@ import json
|
|||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from docling.backend.html_backend import HTMLDocumentBackend
|
from docowling.backend.html_backend import HTMLDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docowling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import (
|
from docowling.datamodel.document import (
|
||||||
ConversionResult,
|
ConversionResult,
|
||||||
InputDocument,
|
InputDocument,
|
||||||
SectionHeaderItem,
|
SectionHeaderItem,
|
||||||
)
|
)
|
||||||
from docling.document_converter import DocumentConverter
|
from docowling.document_converter import DocumentConverter
|
||||||
|
|
||||||
GENERATE = False
|
GENERATE = False
|
||||||
|
|
||||||
|
@ -2,14 +2,14 @@ import json
|
|||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
from docowling.backend.msword_backend import MsWordDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docowling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import (
|
from docowling.datamodel.document import (
|
||||||
ConversionResult,
|
ConversionResult,
|
||||||
InputDocument,
|
InputDocument,
|
||||||
SectionHeaderItem,
|
SectionHeaderItem,
|
||||||
)
|
)
|
||||||
from docling.document_converter import DocumentConverter
|
from docowling.document_converter import DocumentConverter
|
||||||
|
|
||||||
GENERATE = False
|
GENERATE = False
|
||||||
|
|
||||||
|
@ -2,14 +2,14 @@ import json
|
|||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
from docowling.backend.msword_backend import MsWordDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docowling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import (
|
from docowling.datamodel.document import (
|
||||||
ConversionResult,
|
ConversionResult,
|
||||||
InputDocument,
|
InputDocument,
|
||||||
SectionHeaderItem,
|
SectionHeaderItem,
|
||||||
)
|
)
|
||||||
from docling.document_converter import DocumentConverter
|
from docowling.document_converter import DocumentConverter
|
||||||
|
|
||||||
GENERATE = False
|
GENERATE = False
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Test methods in module docling.backend.patent_uspto_backend.py."""
|
"""Test methods in module docowling.backend.patent_uspto_backend.py."""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
@ -12,14 +12,14 @@ import yaml
|
|||||||
from docling_core.types import DoclingDocument
|
from docling_core.types import DoclingDocument
|
||||||
from docling_core.types.doc import DocItemLabel, TableData, TextItem
|
from docling_core.types.doc import DocItemLabel, TableData, TextItem
|
||||||
|
|
||||||
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend, XmlTable
|
from docowling.backend.xml.uspto_backend import PatentUsptoDocumentBackend, XmlTable
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docowling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import (
|
from docowling.datamodel.document import (
|
||||||
ConversionResult,
|
ConversionResult,
|
||||||
InputDocument,
|
InputDocument,
|
||||||
SectionHeaderItem,
|
SectionHeaderItem,
|
||||||
)
|
)
|
||||||
from docling.document_converter import DocumentConverter
|
from docowling.document_converter import DocumentConverter
|
||||||
|
|
||||||
GENERATE: bool = True
|
GENERATE: bool = True
|
||||||
DATA_PATH: Path = Path("./tests/data/uspto/")
|
DATA_PATH: Path = Path("./tests/data/uspto/")
|
||||||
|
@ -3,12 +3,12 @@ from pathlib import Path
|
|||||||
import pytest
|
import pytest
|
||||||
from docling_core.types.doc import BoundingBox
|
from docling_core.types.doc import BoundingBox
|
||||||
|
|
||||||
from docling.backend.pypdfium2_backend import (
|
from docowling.backend.pypdfium2_backend import (
|
||||||
PyPdfiumDocumentBackend,
|
PyPdfiumDocumentBackend,
|
||||||
PyPdfiumPageBackend,
|
PyPdfiumPageBackend,
|
||||||
)
|
)
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docowling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import InputDocument
|
from docowling.datamodel.document import InputDocument
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
@ -2,9 +2,9 @@ import json
|
|||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docowling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import ConversionResult
|
from docowling.datamodel.document import ConversionResult
|
||||||
from docling.document_converter import DocumentConverter
|
from docowling.document_converter import DocumentConverter
|
||||||
|
|
||||||
GENERATE = False
|
GENERATE = False
|
||||||
|
|
||||||
|
@ -6,9 +6,9 @@ from pathlib import Path
|
|||||||
|
|
||||||
from docling_core.types.doc import DoclingDocument
|
from docling_core.types.doc import DoclingDocument
|
||||||
|
|
||||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
from docowling.datamodel.base_models import DocumentStream, InputFormat
|
||||||
from docling.datamodel.document import ConversionResult
|
from docowling.datamodel.document import ConversionResult
|
||||||
from docling.document_converter import DocumentConverter
|
from docowling.document_converter import DocumentConverter
|
||||||
|
|
||||||
GENERATE = False
|
GENERATE = False
|
||||||
|
|
||||||
|
@ -2,7 +2,7 @@ from pathlib import Path
|
|||||||
|
|
||||||
from typer.testing import CliRunner
|
from typer.testing import CliRunner
|
||||||
|
|
||||||
from docling.cli.main import app
|
from docowling.cli.main import app
|
||||||
|
|
||||||
runner = CliRunner()
|
runner = CliRunner()
|
||||||
|
|
||||||
|
@ -1,10 +1,10 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docowling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import ConversionResult
|
from docowling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docowling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docowling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
|
||||||
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
|
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
|
||||||
|
|
||||||
|
@ -2,10 +2,10 @@ import sys
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docowling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import ConversionResult
|
from docowling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docowling.datamodel.pipeline_options import (
|
||||||
EasyOcrOptions,
|
EasyOcrOptions,
|
||||||
OcrMacOptions,
|
OcrMacOptions,
|
||||||
OcrOptions,
|
OcrOptions,
|
||||||
@ -14,7 +14,7 @@ from docling.datamodel.pipeline_options import (
|
|||||||
TesseractCliOcrOptions,
|
TesseractCliOcrOptions,
|
||||||
TesseractOcrOptions,
|
TesseractOcrOptions,
|
||||||
)
|
)
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docowling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
|
||||||
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
|
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
|
||||||
|
|
||||||
|
@ -1,9 +1,9 @@
|
|||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
from docowling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
from docowling.datamodel.base_models import DocumentStream, InputFormat
|
||||||
from docling.datamodel.document import InputDocument, _DocumentConversionInput
|
from docowling.datamodel.document import InputDocument, _DocumentConversionInput
|
||||||
|
|
||||||
|
|
||||||
def test_in_doc_from_valid_path():
|
def test_in_doc_from_valid_path():
|
||||||
@ -40,7 +40,7 @@ def test_in_doc_from_invalid_buf():
|
|||||||
|
|
||||||
|
|
||||||
def test_guess_format(tmp_path):
|
def test_guess_format(tmp_path):
|
||||||
"""Test docling.datamodel.document._DocumentConversionInput.__guess_format"""
|
"""Test docowling.datamodel.document._DocumentConversionInput.__guess_format"""
|
||||||
dci = _DocumentConversionInput(path_or_stream_iterator=[])
|
dci = _DocumentConversionInput(path_or_stream_iterator=[])
|
||||||
temp_dir = tmp_path / "test_guess_format"
|
temp_dir = tmp_path / "test_guess_format"
|
||||||
temp_dir.mkdir()
|
temp_dir.mkdir()
|
||||||
|
@ -3,10 +3,10 @@ from pathlib import Path
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
from docowling.datamodel.base_models import DocumentStream, InputFormat
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docowling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docowling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
|
||||||
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
|
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
|
||||||
|
|
||||||
|
@ -3,8 +3,8 @@ from pathlib import Path
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from docling.datamodel.base_models import ConversionStatus, DocumentStream
|
from docowling.datamodel.base_models import ConversionStatus, DocumentStream
|
||||||
from docling.document_converter import ConversionError, DocumentConverter
|
from docowling.document_converter import ConversionError, DocumentConverter
|
||||||
|
|
||||||
|
|
||||||
def get_pdf_path():
|
def get_pdf_path():
|
||||||
|
@ -3,9 +3,9 @@ from pathlib import Path
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docowling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docowling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docowling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
@ -3,16 +3,16 @@ from pathlib import Path
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
from docowling.datamodel.base_models import ConversionStatus, InputFormat
|
||||||
from docling.datamodel.document import ConversionResult
|
from docowling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docowling.datamodel.pipeline_options import (
|
||||||
AcceleratorDevice,
|
AcceleratorDevice,
|
||||||
AcceleratorOptions,
|
AcceleratorOptions,
|
||||||
PdfPipelineOptions,
|
PdfPipelineOptions,
|
||||||
TableFormerMode,
|
TableFormerMode,
|
||||||
)
|
)
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docowling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
@ -8,8 +8,8 @@ from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocu
|
|||||||
from pydantic import TypeAdapter
|
from pydantic import TypeAdapter
|
||||||
from pydantic.json import pydantic_encoder
|
from pydantic.json import pydantic_encoder
|
||||||
|
|
||||||
from docling.datamodel.base_models import ConversionStatus, Page
|
from docowling.datamodel.base_models import ConversionStatus, Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docowling.datamodel.document import ConversionResult
|
||||||
|
|
||||||
|
|
||||||
def levenshtein(str1: str, str2: str) -> int:
|
def levenshtein(str1: str, str2: str) -> int:
|
||||||
|
Loading…
Reference in New Issue
Block a user