MO-01 - Adding CSV backend support

This commit is contained in:
matheus 2024-12-28 14:14:46 -03:00
parent 447802b5d1
commit 4e17a51cf6
100 changed files with 460 additions and 350 deletions

View File

@ -2,14 +2,9 @@
# Visit https://bit.ly/cffinit to generate yours today! # Visit https://bit.ly/cffinit to generate yours today!
cff-version: 1.2.0 cff-version: 1.2.0
title: Docling title: Docowling
message: 'If you use Docling, please consider citing as below.' message: 'If you use Docowling, please consider citing as below.'
type: software type: software
authors: authors:
- name: Docling Team - name: Docowling
identifiers:
- type: url
value: 'https://arxiv.org/abs/2408.09869'
description: 'arXiv:2408.09869'
repository-code: 'https://github.com/DS4SD/docling'
license: MIT license: MIT

View File

@ -17,7 +17,7 @@ ENV TORCH_HOME=/tmp/
COPY docs/examples/minimal.py /root/minimal.py COPY docs/examples/minimal.py /root/minimal.py
RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);' RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
RUN python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; StandardPdfPipeline.download_models_hf(force=True);' RUN python -c 'from docowling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; StandardPdfPipeline.download_models_hf(force=True);'
# On container environments, always set a thread budget to avoid undesired thread congestion. # On container environments, always set a thread budget to avoid undesired thread congestion.
ENV OMP_NUM_THREADS=4 ENV OMP_NUM_THREADS=4

View File

@ -1,5 +1,5 @@
<p align="center"> <p align="center">
<a href="https://github.com/ds4sd/docling"> <a href="https://github.com/mouraworks/docowling">
<img loading="lazy" alt="Docling" src="https://github.com/mouraworks/docowling/blob/main/docs/assets/docowling.png" width="80%"/> <img loading="lazy" alt="Docling" src="https://github.com/mouraworks/docowling/blob/main/docs/assets/docowling.png" width="80%"/>
</a> </a>
</p> </p>

View File

@ -6,8 +6,8 @@ from typing import TYPE_CHECKING, Set, Union
from docling_core.types.doc import DoclingDocument from docling_core.types.doc import DoclingDocument
if TYPE_CHECKING: if TYPE_CHECKING:
from docling.datamodel.base_models import InputFormat from docowling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument from docowling.datamodel.document import InputDocument
class AbstractDocumentBackend(ABC): class AbstractDocumentBackend(ABC):

View File

@ -16,9 +16,9 @@ from docling_core.types.doc import (
TableData, TableData,
) )
from docling.backend.abstract_backend import DeclarativeDocumentBackend from docowling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat from docowling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -0,0 +1,105 @@
import csv
from io import StringIO
from pathlib import Path
from typing import Dict, List, Set, Tuple, Union

from docling_core.types.doc import (
    DoclingDocument,
    DocumentOrigin,
    GroupLabel,
    TableCell,
    TableData,
)

from docowling.backend.abstract_backend import DeclarativeDocumentBackend
from docowling.datamodel.base_models import InputFormat
from docowling.datamodel.document import InputDocument


class CsvDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend that turns a CSV source into a one-table document.

    The CSV is parsed eagerly in ``__init__`` and kept in ``self.rows``.
    ``convert`` then emits a single "CSV Data" section group holding one table
    whose cells mirror the parsed rows.
    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[StringIO, Path]):
        """Read and parse the CSV source.

        Args:
            in_doc: The input document this backend belongs to.
            path_or_stream: A filesystem path or a readable stream. Text
                streams are parsed as-is; streams whose ``read()`` returns
                bytes are decoded as UTF-8.

        Raises:
            RuntimeError: If the source cannot be read, decoded or parsed.
        """
        super().__init__(in_doc, path_or_stream)
        self.rows: List[List[str]] = []
        # Groups created during conversion, keyed by nesting level. Must exist
        # before _convert_csv_to_document() stores the section group in it.
        self.parents: Dict[int, object] = {}
        self.valid = False
        try:
            source = self.path_or_stream
            if isinstance(source, Path):
                # newline="" lets the csv module handle line endings itself, so
                # quoted fields with embedded newlines survive intact.
                # "utf-8-sig" reads plain UTF-8 and also drops a leading BOM
                # that would otherwise end up inside the first cell.
                with source.open(mode="r", encoding="utf-8-sig", newline="") as file:
                    self.rows = list(csv.reader(file))
            elif isinstance(source, StringIO):
                self.rows = list(csv.reader(source))
            elif hasattr(source, "read"):
                # Any other file-like object (e.g. a binary stream): read it
                # fully and decode when it yields bytes, instead of silently
                # producing an empty document.
                payload = source.read()
                if isinstance(payload, (bytes, bytearray)):
                    payload = payload.decode("utf-8-sig")
                self.rows = list(csv.reader(StringIO(payload, newline="")))
            else:
                raise TypeError(
                    f"Unsupported CSV source type: {type(source).__name__}"
                )
            self.valid = True
        except Exception as e:
            self.valid = False
            raise RuntimeError(
                f"CsvDocumentBackend could not load document with hash {self.document_hash}"
            ) from e

    def is_valid(self) -> bool:
        """Return True when the CSV source was loaded successfully."""
        return self.valid

    @classmethod
    def supports_pagination(cls) -> bool:
        """Return False: this backend does not expose pages."""
        return False  # Typically, CSV files do not support pagination.

    def unload(self):
        """Drop the reference to the underlying path or stream."""
        self.path_or_stream = None

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the set of input formats handled by this backend."""
        return {InputFormat.CSV}

    def convert(self) -> DoclingDocument:
        """Build a DoclingDocument from the parsed CSV rows.

        Returns:
            The document. It has no section or table when the CSV had no rows.

        Raises:
            RuntimeError: If the backend is not in a valid state.
        """
        if not self.is_valid():
            raise RuntimeError(
                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
            )
        origin = DocumentOrigin(
            filename=self.file.name or "file.csv",
            mimetype="text/csv",
            binary_hash=self.document_hash,
        )
        doc = DoclingDocument(name=self.file.stem or "file.csv", origin=origin)
        return self._convert_csv_to_document(doc)

    def _convert_csv_to_document(self, doc: DoclingDocument) -> DoclingDocument:
        """Append a "CSV Data" section with one table built from ``self.rows``.

        Every parsed value becomes a 1x1 cell at its (row, column) position.
        Rows shorter than the widest row simply contribute fewer cells. No cell
        is marked as a header.
        """
        if not self.rows:
            return doc  # No data to process

        # Create a section for the CSV data and remember it as the level-0 parent.
        section = doc.add_group(
            parent=None,
            label=GroupLabel.SECTION,
            name="CSV Data",
        )
        self.parents[0] = section

        # The table is as wide as the longest row.
        table_data = TableData(
            num_rows=len(self.rows),
            num_cols=max(len(row) for row in self.rows),
            table_cells=[],
        )
        table_data.table_cells.extend(
            TableCell(
                text=cell,
                row_span=1,
                col_span=1,
                start_row_offset_idx=row_idx,
                end_row_offset_idx=row_idx + 1,
                start_col_offset_idx=col_idx,
                end_col_offset_idx=col_idx + 1,
                col_header=False,
                row_header=False,
            )
            for row_idx, row in enumerate(self.rows)
            for col_idx, cell in enumerate(row)
        )

        doc.add_table(data=table_data, parent=section)
        return doc

View File

@ -10,9 +10,9 @@ from docling_parse.pdf_parsers import pdf_parser_v1
from PIL import Image, ImageDraw from PIL import Image, ImageDraw
from pypdfium2 import PdfPage from pypdfium2 import PdfPage
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend from docowling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Cell from docowling.datamodel.base_models import Cell
from docling.datamodel.document import InputDocument from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -10,11 +10,11 @@ from docling_parse.pdf_parsers import pdf_parser_v2
from PIL import Image, ImageDraw from PIL import Image, ImageDraw
from pypdfium2 import PdfPage from pypdfium2 import PdfPage
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend from docowling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Cell, Size from docowling.datamodel.base_models import Cell, Size
if TYPE_CHECKING: if TYPE_CHECKING:
from docling.datamodel.document import InputDocument from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -13,9 +13,9 @@ from docling_core.types.doc import (
TableData, TableData,
) )
from docling.backend.abstract_backend import DeclarativeDocumentBackend from docowling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat from docowling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -19,9 +19,9 @@ from docling_core.types.doc import (
) )
from marko import Markdown from marko import Markdown
from docling.backend.abstract_backend import DeclarativeDocumentBackend from docowling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat from docowling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -18,9 +18,9 @@ from openpyxl.cell.cell import Cell
from openpyxl.drawing.image import Image from openpyxl.drawing.image import Image
from openpyxl.worksheet.worksheet import Worksheet from openpyxl.worksheet.worksheet import Worksheet
from docling.backend.abstract_backend import DeclarativeDocumentBackend from docowling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat from docowling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -20,12 +20,12 @@ from PIL import Image
from pptx import Presentation from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
from docling.backend.abstract_backend import ( from docowling.backend.abstract_backend import (
DeclarativeDocumentBackend, DeclarativeDocumentBackend,
PaginatedDocumentBackend, PaginatedDocumentBackend,
) )
from docling.datamodel.base_models import InputFormat from docowling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -18,9 +18,9 @@ from lxml import etree
from lxml.etree import XPath from lxml.etree import XPath
from PIL import Image, UnidentifiedImageError from PIL import Image, UnidentifiedImageError
from docling.backend.abstract_backend import DeclarativeDocumentBackend from docowling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat from docowling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -6,9 +6,9 @@ from typing import Iterable, Optional, Set, Union
from docling_core.types.doc import BoundingBox, Size from docling_core.types.doc import BoundingBox, Size
from PIL import Image from PIL import Image
from docling.backend.abstract_backend import PaginatedDocumentBackend from docowling.backend.abstract_backend import PaginatedDocumentBackend
from docling.datamodel.base_models import Cell, InputFormat from docowling.datamodel.base_models import Cell, InputFormat
from docling.datamodel.document import InputDocument from docowling.datamodel.document import InputDocument
class PdfPageBackend(ABC): class PdfPageBackend(ABC):

View File

@ -11,11 +11,11 @@ from PIL import Image, ImageDraw
from pypdfium2 import PdfTextPage from pypdfium2 import PdfTextPage
from pypdfium2._helpers.misc import PdfiumError from pypdfium2._helpers.misc import PdfiumError
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend from docowling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Cell from docowling.datamodel.base_models import Cell
if TYPE_CHECKING: if TYPE_CHECKING:
from docling.datamodel.document import InputDocument from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -16,9 +16,9 @@ from docling_core.types.doc import (
from lxml import etree from lxml import etree
from typing_extensions import TypedDict, override from typing_extensions import TypedDict, override
from docling.backend.abstract_backend import DeclarativeDocumentBackend from docowling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat from docowling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -30,9 +30,9 @@ from docling_core.types.doc.document import LevelNumber
from pydantic import NonNegativeInt from pydantic import NonNegativeInt
from typing_extensions import Self, TypedDict, override from typing_extensions import Self, TypedDict, override
from docling.backend.abstract_backend import DeclarativeDocumentBackend from docowling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat from docowling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -14,18 +14,18 @@ from docling_core.types.doc import ImageRefMode
from docling_core.utils.file import resolve_source_to_path from docling_core.utils.file import resolve_source_to_path
from pydantic import TypeAdapter, ValidationError from pydantic import TypeAdapter, ValidationError
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend from docowling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend from docowling.backend.pdf_backend import PdfDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docowling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ( from docowling.datamodel.base_models import (
ConversionStatus, ConversionStatus,
FormatToExtensions, FormatToExtensions,
InputFormat, InputFormat,
OutputFormat, OutputFormat,
) )
from docling.datamodel.document import ConversionResult from docowling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import ( from docowling.datamodel.pipeline_options import (
AcceleratorDevice, AcceleratorDevice,
AcceleratorOptions, AcceleratorOptions,
EasyOcrOptions, EasyOcrOptions,
@ -39,8 +39,8 @@ from docling.datamodel.pipeline_options import (
TesseractCliOcrOptions, TesseractCliOcrOptions,
TesseractOcrOptions, TesseractOcrOptions,
) )
from docling.datamodel.settings import settings from docowling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption from docowling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch") warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr") warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")

View File

@ -15,7 +15,7 @@ from PIL.Image import Image
from pydantic import BaseModel, ConfigDict from pydantic import BaseModel, ConfigDict
if TYPE_CHECKING: if TYPE_CHECKING:
from docling.backend.pdf_backend import PdfPageBackend from docowling.backend.pdf_backend import PdfPageBackend
class ConversionStatus(str, Enum): class ConversionStatus(str, Enum):
@ -39,6 +39,7 @@ class InputFormat(str, Enum):
ASCIIDOC = "asciidoc" ASCIIDOC = "asciidoc"
MD = "md" MD = "md"
XLSX = "xlsx" XLSX = "xlsx"
CSV = "csv"
XML_USPTO = "xml_uspto" XML_USPTO = "xml_uspto"
@ -60,6 +61,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"], InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"], InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
InputFormat.XLSX: ["xlsx"], InputFormat.XLSX: ["xlsx"],
InputFormat.CSV: ["csv"],
InputFormat.XML_USPTO: ["xml", "txt"], InputFormat.XML_USPTO: ["xml", "txt"],
} }
@ -88,6 +90,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
InputFormat.XLSX: [ InputFormat.XLSX: [
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
], ],
InputFormat.CSV: ["text/csv"],
InputFormat.XML_USPTO: ["application/xml", "text/plain"], InputFormat.XML_USPTO: ["application/xml", "text/plain"],
} }

View File

@ -47,11 +47,11 @@ from docling_core.utils.legacy import docling_document_to_legacy
from pydantic import BaseModel from pydantic import BaseModel
from typing_extensions import deprecated from typing_extensions import deprecated
from docling.backend.abstract_backend import ( from docowling.backend.abstract_backend import (
AbstractDocumentBackend, AbstractDocumentBackend,
PaginatedDocumentBackend, PaginatedDocumentBackend,
) )
from docling.datamodel.base_models import ( from docowling.datamodel.base_models import (
AssembledUnit, AssembledUnit,
ConversionStatus, ConversionStatus,
DocumentStream, DocumentStream,
@ -62,12 +62,12 @@ from docling.datamodel.base_models import (
MimeTypeToFormat, MimeTypeToFormat,
Page, Page,
) )
from docling.datamodel.settings import DocumentLimits from docowling.datamodel.settings import DocumentLimits
from docling.utils.profiling import ProfilingItem from docowling.utils.profiling import ProfilingItem
from docling.utils.utils import create_file_hash, create_hash from docowling.utils.utils import create_file_hash, create_hash
if TYPE_CHECKING: if TYPE_CHECKING:
from docling.document_converter import FormatOption from docowling.document_converter import FormatOption
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -7,35 +7,36 @@ from typing import Dict, Iterable, Iterator, List, Optional, Type, Union
from pydantic import BaseModel, ConfigDict, model_validator, validate_call from pydantic import BaseModel, ConfigDict, model_validator, validate_call
from docling.backend.abstract_backend import AbstractDocumentBackend from docowling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.asciidoc_backend import AsciiDocBackend from docowling.backend.asciidoc_backend import AsciiDocBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend from docowling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend from docowling.backend.html_backend import HTMLDocumentBackend
from docling.backend.md_backend import MarkdownDocumentBackend from docowling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.msexcel_backend import MsExcelDocumentBackend from docowling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend from docowling.backend.csv_backend import CsvDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend from docowling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.xml.pubmed_backend import PubMedDocumentBackend from docowling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend from docowling.backend.xml.pubmed_backend import PubMedDocumentBackend
from docling.datamodel.base_models import ( from docowling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
from docowling.datamodel.base_models import (
ConversionStatus, ConversionStatus,
DoclingComponentType, DoclingComponentType,
DocumentStream, DocumentStream,
ErrorItem, ErrorItem,
InputFormat, InputFormat,
) )
from docling.datamodel.document import ( from docowling.datamodel.document import (
ConversionResult, ConversionResult,
InputDocument, InputDocument,
_DocumentConversionInput, _DocumentConversionInput,
) )
from docling.datamodel.pipeline_options import PipelineOptions from docowling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.settings import DocumentLimits, settings from docowling.datamodel.settings import DocumentLimits, settings
from docling.exceptions import ConversionError from docowling.exceptions import ConversionError
from docling.pipeline.base_pipeline import BasePipeline from docowling.pipeline.base_pipeline import BasePipeline
from docling.pipeline.simple_pipeline import SimplePipeline from docowling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline from docowling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.utils.utils import chunkify from docowling.utils.utils import chunkify
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
@ -58,6 +59,9 @@ class ExcelFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
class CsvFormatOption(FormatOption):
    """Format option for CSV input: SimplePipeline with CsvDocumentBackend."""

    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = CsvDocumentBackend
class WordFormatOption(FormatOption): class WordFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline pipeline_cls: Type = SimplePipeline
@ -109,6 +113,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
InputFormat.XLSX: FormatOption( InputFormat.XLSX: FormatOption(
pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
), ),
InputFormat.CSV: FormatOption(
pipeline_cls=SimplePipeline, backend=CsvDocumentBackend
),
InputFormat.DOCX: FormatOption( InputFormat.DOCX: FormatOption(
pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
), ),

View File

@ -3,8 +3,8 @@ from typing import Any, Iterable
from docling_core.types.doc import DoclingDocument, NodeItem from docling_core.types.doc import DoclingDocument, NodeItem
from docling.datamodel.base_models import Page from docowling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult from docowling.datamodel.document import ConversionResult
class BasePageModel(ABC): class BasePageModel(ABC):

View File

@ -10,11 +10,11 @@ from PIL import Image, ImageDraw
from rtree import index from rtree import index
from scipy.ndimage import find_objects, label from scipy.ndimage import find_objects, label
from docling.datamodel.base_models import Cell, OcrCell, Page from docowling.datamodel.base_models import Cell, OcrCell, Page
from docling.datamodel.document import ConversionResult from docowling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import OcrOptions from docowling.datamodel.pipeline_options import OcrOptions
from docling.datamodel.settings import settings from docowling.datamodel.settings import settings
from docling.models.base_model import BasePageModel from docowling.models.base_model import BasePageModel
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -24,18 +24,18 @@ from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocu
from PIL import ImageDraw from PIL import ImageDraw
from pydantic import BaseModel, ConfigDict, TypeAdapter from pydantic import BaseModel, ConfigDict, TypeAdapter
from docling.datamodel.base_models import ( from docowling.datamodel.base_models import (
Cluster, Cluster,
ContainerElement, ContainerElement,
FigureElement, FigureElement,
Table, Table,
TextElement, TextElement,
) )
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type from docowling.datamodel.document import ConversionResult, layout_label_to_ds_type
from docling.datamodel.settings import settings from docowling.datamodel.settings import settings
from docling.utils.glm_utils import to_docling_document from docowling.utils.glm_utils import to_docling_document
from docling.utils.profiling import ProfilingScope, TimeRecorder from docowling.utils.profiling import ProfilingScope, TimeRecorder
from docling.utils.utils import create_hash from docowling.utils.utils import create_hash
class GlmOptions(BaseModel): class GlmOptions(BaseModel):

View File

@ -6,17 +6,17 @@ import numpy
import torch import torch
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import Cell, OcrCell, Page from docowling.datamodel.base_models import Cell, OcrCell, Page
from docling.datamodel.document import ConversionResult from docowling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import ( from docowling.datamodel.pipeline_options import (
AcceleratorDevice, AcceleratorDevice,
AcceleratorOptions, AcceleratorOptions,
EasyOcrOptions, EasyOcrOptions,
) )
from docling.datamodel.settings import settings from docowling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel from docowling.models.base_ocr_model import BaseOcrModel
from docling.utils.accelerator_utils import decide_device from docowling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder from docowling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -9,20 +9,20 @@ from docling_core.types.doc import CoordOrigin, DocItemLabel
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
from PIL import Image, ImageDraw, ImageFont from PIL import Image, ImageDraw, ImageFont
from docling.datamodel.base_models import ( from docowling.datamodel.base_models import (
BoundingBox, BoundingBox,
Cell, Cell,
Cluster, Cluster,
LayoutPrediction, LayoutPrediction,
Page, Page,
) )
from docling.datamodel.document import ConversionResult from docowling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions from docowling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.settings import settings from docowling.datamodel.settings import settings
from docling.models.base_model import BasePageModel from docowling.models.base_model import BasePageModel
from docling.utils.accelerator_utils import decide_device from docowling.utils.accelerator_utils import decide_device
from docling.utils.layout_postprocessor import LayoutPostprocessor from docowling.utils.layout_postprocessor import LayoutPostprocessor
from docling.utils.profiling import TimeRecorder from docowling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -4,12 +4,12 @@ from typing import Iterable, Optional, Tuple
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import OcrCell, Page from docowling.datamodel.base_models import OcrCell, Page
from docling.datamodel.document import ConversionResult from docowling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import OcrMacOptions from docowling.datamodel.pipeline_options import OcrMacOptions
from docling.datamodel.settings import settings from docowling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel from docowling.models.base_ocr_model import BaseOcrModel
from docling.utils.profiling import TimeRecorder from docowling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -4,7 +4,7 @@ from typing import Iterable, List
from pydantic import BaseModel from pydantic import BaseModel
from docling.datamodel.base_models import ( from docowling.datamodel.base_models import (
AssembledUnit, AssembledUnit,
ContainerElement, ContainerElement,
FigureElement, FigureElement,
@ -13,10 +13,10 @@ from docling.datamodel.base_models import (
Table, Table,
TextElement, TextElement,
) )
from docling.datamodel.document import ConversionResult from docowling.datamodel.document import ConversionResult
from docling.models.base_model import BasePageModel from docowling.models.base_model import BasePageModel
from docling.models.layout_model import LayoutModel from docowling.models.layout_model import LayoutModel
from docling.utils.profiling import TimeRecorder from docowling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -4,11 +4,11 @@ from typing import Iterable, Optional
from PIL import ImageDraw from PIL import ImageDraw
from pydantic import BaseModel from pydantic import BaseModel
from docling.datamodel.base_models import Page from docowling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult from docowling.datamodel.document import ConversionResult
from docling.datamodel.settings import settings from docowling.datamodel.settings import settings
from docling.models.base_model import BasePageModel from docowling.models.base_model import BasePageModel
from docling.utils.profiling import TimeRecorder from docowling.utils.profiling import TimeRecorder
class PagePreprocessingOptions(BaseModel): class PagePreprocessingOptions(BaseModel):

View File

@ -4,17 +4,17 @@ from typing import Iterable
import numpy import numpy
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import OcrCell, Page from docowling.datamodel.base_models import OcrCell, Page
from docling.datamodel.document import ConversionResult from docowling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import ( from docowling.datamodel.pipeline_options import (
AcceleratorDevice, AcceleratorDevice,
AcceleratorOptions, AcceleratorOptions,
RapidOcrOptions, RapidOcrOptions,
) )
from docling.datamodel.settings import settings from docowling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel from docowling.models.base_ocr_model import BaseOcrModel
from docling.utils.accelerator_utils import decide_device from docowling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder from docowling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -7,18 +7,18 @@ from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
from PIL import ImageDraw from PIL import ImageDraw
from docling.datamodel.base_models import Page, Table, TableStructurePrediction from docowling.datamodel.base_models import Page, Table, TableStructurePrediction
from docling.datamodel.document import ConversionResult from docowling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import ( from docowling.datamodel.pipeline_options import (
AcceleratorDevice, AcceleratorDevice,
AcceleratorOptions, AcceleratorOptions,
TableFormerMode, TableFormerMode,
TableStructureOptions, TableStructureOptions,
) )
from docling.datamodel.settings import settings from docowling.datamodel.settings import settings
from docling.models.base_model import BasePageModel from docowling.models.base_model import BasePageModel
from docling.utils.accelerator_utils import decide_device from docowling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder from docowling.utils.profiling import TimeRecorder
class TableStructureModel(BasePageModel): class TableStructureModel(BasePageModel):

View File

@ -9,12 +9,12 @@ from typing import Iterable, Optional, Tuple
import pandas as pd import pandas as pd
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import Cell, OcrCell, Page from docowling.datamodel.base_models import Cell, OcrCell, Page
from docling.datamodel.document import ConversionResult from docowling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import TesseractCliOcrOptions from docowling.datamodel.pipeline_options import TesseractCliOcrOptions
from docling.datamodel.settings import settings from docowling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel from docowling.models.base_ocr_model import BaseOcrModel
from docling.utils.profiling import TimeRecorder from docowling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -3,12 +3,12 @@ from typing import Iterable
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import Cell, OcrCell, Page from docowling.datamodel.base_models import Cell, OcrCell, Page
from docling.datamodel.document import ConversionResult from docowling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import TesseractOcrOptions from docowling.datamodel.pipeline_options import TesseractOcrOptions
from docling.datamodel.settings import settings from docowling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel from docowling.models.base_ocr_model import BaseOcrModel
from docling.utils.profiling import TimeRecorder from docowling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -7,20 +7,20 @@ from typing import Callable, Iterable, List
from docling_core.types.doc import DoclingDocument, NodeItem from docling_core.types.doc import DoclingDocument, NodeItem
from docling.backend.abstract_backend import AbstractDocumentBackend from docowling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend from docowling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import ( from docowling.datamodel.base_models import (
ConversionStatus, ConversionStatus,
DoclingComponentType, DoclingComponentType,
ErrorItem, ErrorItem,
Page, Page,
) )
from docling.datamodel.document import ConversionResult, InputDocument from docowling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import PipelineOptions from docowling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.settings import settings from docowling.datamodel.settings import settings
from docling.models.base_model import BaseEnrichmentModel from docowling.models.base_model import BaseEnrichmentModel
from docling.utils.profiling import ProfilingScope, TimeRecorder from docowling.utils.profiling import ProfilingScope, TimeRecorder
from docling.utils.utils import chunkify from docowling.utils.utils import chunkify
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -1,14 +1,14 @@
import logging import logging
from docling.backend.abstract_backend import ( from docowling.backend.abstract_backend import (
AbstractDocumentBackend, AbstractDocumentBackend,
DeclarativeDocumentBackend, DeclarativeDocumentBackend,
) )
from docling.datamodel.base_models import ConversionStatus from docowling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult from docowling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PipelineOptions from docowling.datamodel.pipeline_options import PipelineOptions
from docling.pipeline.base_pipeline import BasePipeline from docowling.pipeline.base_pipeline import BasePipeline
from docling.utils.profiling import ProfilingScope, TimeRecorder from docowling.utils.profiling import ProfilingScope, TimeRecorder
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -5,11 +5,11 @@ from typing import Optional
from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
from docling.backend.abstract_backend import AbstractDocumentBackend from docowling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend from docowling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import AssembledUnit, Page from docowling.datamodel.base_models import AssembledUnit, Page
from docling.datamodel.document import ConversionResult from docowling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import ( from docowling.datamodel.pipeline_options import (
EasyOcrOptions, EasyOcrOptions,
OcrMacOptions, OcrMacOptions,
PdfPipelineOptions, PdfPipelineOptions,
@ -17,22 +17,22 @@ from docling.datamodel.pipeline_options import (
TesseractCliOcrOptions, TesseractCliOcrOptions,
TesseractOcrOptions, TesseractOcrOptions,
) )
from docling.models.base_ocr_model import BaseOcrModel from docowling.models.base_ocr_model import BaseOcrModel
from docling.models.ds_glm_model import GlmModel, GlmOptions from docowling.models.ds_glm_model import GlmModel, GlmOptions
from docling.models.easyocr_model import EasyOcrModel from docowling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel from docowling.models.layout_model import LayoutModel
from docling.models.ocr_mac_model import OcrMacModel from docowling.models.ocr_mac_model import OcrMacModel
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions from docowling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
from docling.models.page_preprocessing_model import ( from docowling.models.page_preprocessing_model import (
PagePreprocessingModel, PagePreprocessingModel,
PagePreprocessingOptions, PagePreprocessingOptions,
) )
from docling.models.rapid_ocr_model import RapidOcrModel from docowling.models.rapid_ocr_model import RapidOcrModel
from docling.models.table_structure_model import TableStructureModel from docowling.models.table_structure_model import TableStructureModel
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel from docowling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
from docling.models.tesseract_ocr_model import TesseractOcrModel from docowling.models.tesseract_ocr_model import TesseractOcrModel
from docling.pipeline.base_pipeline import PaginatedPipeline from docowling.pipeline.base_pipeline import PaginatedPipeline
from docling.utils.profiling import ProfilingScope, TimeRecorder from docowling.utils.profiling import ProfilingScope, TimeRecorder
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -2,7 +2,7 @@ import logging
import torch import torch
from docling.datamodel.pipeline_options import AcceleratorDevice from docowling.datamodel.pipeline_options import AcceleratorDevice
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -4,8 +4,8 @@ from typing import Any, Dict, Iterable, List, Tuple, Union
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
from docling.datamodel.base_models import OcrCell from docowling.datamodel.base_models import OcrCell
from docling.datamodel.document import ConversionResult, Page from docowling.datamodel.document import ConversionResult, Page
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -7,7 +7,7 @@ from typing import Dict, List, Set, Tuple
from docling_core.types.doc import DocItemLabel, Size from docling_core.types.doc import DocItemLabel, Size
from rtree import index from rtree import index
from docling.datamodel.base_models import BoundingBox, Cell, Cluster, OcrCell from docowling.datamodel.base_models import BoundingBox, Cell, Cluster, OcrCell
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -6,10 +6,10 @@ from typing import TYPE_CHECKING, List
import numpy as np import numpy as np
from pydantic import BaseModel from pydantic import BaseModel
from docling.datamodel.settings import settings from docowling.datamodel.settings import settings
if TYPE_CHECKING: if TYPE_CHECKING:
from docling.datamodel.document import ConversionResult from docowling.datamodel.document import ConversionResult
class ProfilingScope(str, Enum): class ProfilingScope(str, Enum):

View File

@ -28,7 +28,7 @@ The `BaseChunker` base class API defines that any chunker should provide the fol
- If you are using the `docling` package, you can import as follows: - If you are using the `docling` package, you can import as follows:
```python ```python
from docling.chunking import HybridChunker from docowling.chunking import HybridChunker
``` ```
- If you are only using the `docling-core` package, you must make sure to install - If you are only using the `docling-core` package, you must make sure to install
the `chunking` extra, e.g. the `chunking` extra, e.g.

View File

@ -6,10 +6,10 @@ from typing import Iterable
import yaml import yaml
from docling.datamodel.base_models import ConversionStatus from docowling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult from docowling.datamodel.document import ConversionResult
from docling.datamodel.settings import settings from docowling.datamodel.settings import settings
from docling.document_converter import DocumentConverter from docowling.document_converter import DocumentConverter
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -3,13 +3,13 @@ import logging
import time import time
from pathlib import Path from pathlib import Path
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docowling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat from docowling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions from docowling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption from docowling.document_converter import DocumentConverter, PdfFormatOption
from docling.models.ocr_mac_model import OcrMacOptions from docowling.models.ocr_mac_model import OcrMacOptions
from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions from docowling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
from docling.models.tesseract_ocr_model import TesseractOcrOptions from docowling.models.tesseract_ocr_model import TesseractOcrOptions
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -10,11 +10,11 @@ from docling_core.types.doc import (
PictureItem, PictureItem,
) )
from docling.datamodel.base_models import InputFormat from docowling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions from docowling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption from docowling.document_converter import DocumentConverter, PdfFormatOption
from docling.models.base_model import BaseEnrichmentModel from docowling.models.base_model import BaseEnrichmentModel
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline from docowling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
class ExamplePictureClassifierPipelineOptions(PdfPipelineOptions): class ExamplePictureClassifierPipelineOptions(PdfPipelineOptions):

View File

@ -4,9 +4,9 @@ from pathlib import Path
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.datamodel.base_models import FigureElement, InputFormat, Table from docowling.datamodel.base_models import FigureElement, InputFormat, Table
from docling.datamodel.pipeline_options import PdfPipelineOptions from docowling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption from docowling.document_converter import DocumentConverter, PdfFormatOption
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -5,11 +5,11 @@ from pathlib import Path
import pandas as pd import pandas as pd
from docling.datamodel.base_models import InputFormat from docowling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions from docowling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption from docowling.document_converter import DocumentConverter, PdfFormatOption
from docling.utils.export import generate_multimodal_pages from docowling.utils.export import generate_multimodal_pages
from docling.utils.utils import create_hash from docowling.utils.utils import create_hash
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -4,7 +4,7 @@ from pathlib import Path
import pandas as pd import pandas as pd
from docling.document_converter import DocumentConverter from docowling.document_converter import DocumentConverter
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -1,8 +1,8 @@
from pathlib import Path from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat from docowling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import ( from docowling.datamodel.pipeline_options import (
EasyOcrOptions, EasyOcrOptions,
OcrMacOptions, OcrMacOptions,
PdfPipelineOptions, PdfPipelineOptions,
@ -10,7 +10,7 @@ from docling.datamodel.pipeline_options import (
TesseractCliOcrOptions, TesseractCliOcrOptions,
TesseractOcrOptions, TesseractOcrOptions,
) )
from docling.document_converter import DocumentConverter, PdfFormatOption from docowling.document_converter import DocumentConverter, PdfFormatOption
def main(): def main():

View File

@ -37,7 +37,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from docling.document_converter import DocumentConverter\n", "from docowling.document_converter import DocumentConverter\n",
"\n", "\n",
"DOC_SOURCE = \"../../tests/data/md/wiki.md\"\n", "DOC_SOURCE = \"../../tests/data/md/wiki.md\"\n",
"\n", "\n",
@ -68,7 +68,7 @@
"source": [ "source": [
"from transformers import AutoTokenizer\n", "from transformers import AutoTokenizer\n",
"\n", "\n",
"from docling.chunking import HybridChunker\n", "from docowling.chunking import HybridChunker\n",
"\n", "\n",
"EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n", "EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
"MAX_TOKENS = 64\n", "MAX_TOKENS = 64\n",
@ -404,7 +404,7 @@
" return tbl\n", " return tbl\n",
"\n", "\n",
"\n", "\n",
"db_uri = str(Path(mkdtemp()) / \"docling.db\")\n", "db_uri = str(Path(mkdtemp()) / \"docowling.db\")\n",
"index = make_lancedb_index(db_uri, doc.name, chunks, embed_model)\n", "index = make_lancedb_index(db_uri, doc.name, chunks, embed_model)\n",
"\n", "\n",
"sample_query = \"invent\"\n", "sample_query = \"invent\"\n",

View File

@ -81,8 +81,8 @@
"from docling_core.transforms.chunker import HierarchicalChunker\n", "from docling_core.transforms.chunker import HierarchicalChunker\n",
"from qdrant_client import QdrantClient\n", "from qdrant_client import QdrantClient\n",
"\n", "\n",
"from docling.datamodel.base_models import InputFormat\n", "from docowling.datamodel.base_models import InputFormat\n",
"from docling.document_converter import DocumentConverter" "from docowling.document_converter import DocumentConverter"
] ]
}, },
{ {

View File

@ -1,4 +1,4 @@
from docling.document_converter import DocumentConverter from docowling.document_converter import DocumentConverter
source = "https://arxiv.org/pdf/2408.09869" # document per local path or URL source = "https://arxiv.org/pdf/2408.09869" # document per local path or URL
converter = DocumentConverter() converter = DocumentConverter()

View File

@ -110,7 +110,7 @@
"EXPORT_TYPE = ExportType.DOC_CHUNKS\n", "EXPORT_TYPE = ExportType.DOC_CHUNKS\n",
"QUESTION = \"Which are the main AI models in Docling?\"\n", "QUESTION = \"Which are the main AI models in Docling?\"\n",
"TOP_K = 3\n", "TOP_K = 3\n",
"MILVUS_URI = str(Path(mkdtemp()) / \"docling.db\")" "MILVUS_URI = str(Path(mkdtemp()) / \"docowling.db\")"
] ]
}, },
{ {
@ -168,7 +168,7 @@
"from haystack.components.writers import DocumentWriter\n", "from haystack.components.writers import DocumentWriter\n",
"from milvus_haystack import MilvusDocumentStore, MilvusEmbeddingRetriever\n", "from milvus_haystack import MilvusDocumentStore, MilvusEmbeddingRetriever\n",
"\n", "\n",
"from docling.chunking import HybridChunker\n", "from docowling.chunking import HybridChunker\n",
"\n", "\n",
"document_store = MilvusDocumentStore(\n", "document_store = MilvusDocumentStore(\n",
" connection_args={\"uri\": MILVUS_URI},\n", " connection_args={\"uri\": MILVUS_URI},\n",
@ -329,7 +329,7 @@
} }
], ],
"source": [ "source": [
"from docling.chunking import DocChunk\n", "from docowling.chunking import DocChunk\n",
"\n", "\n",
"print(f\"Question:\\n{QUESTION}\\n\")\n", "print(f\"Question:\\n{QUESTION}\\n\")\n",
"print(f\"Answer:\\n{rag_res['answer_builder']['answers'][0].data.strip()}\\n\")\n", "print(f\"Answer:\\n{rag_res['answer_builder']['answers'][0].data.strip()}\\n\")\n",

View File

@ -83,7 +83,7 @@
"from langchain_core.document_loaders import BaseLoader\n", "from langchain_core.document_loaders import BaseLoader\n",
"from langchain_core.documents import Document as LCDocument\n", "from langchain_core.documents import Document as LCDocument\n",
"\n", "\n",
"from docling.document_converter import DocumentConverter\n", "from docowling.document_converter import DocumentConverter\n",
"\n", "\n",
"class DoclingPDFLoader(BaseLoader):\n", "class DoclingPDFLoader(BaseLoader):\n",
"\n", "\n",

View File

@ -117,7 +117,7 @@
"from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n", "from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n",
"\n", "\n",
"EMBED_MODEL = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")\n", "EMBED_MODEL = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")\n",
"MILVUS_URI = str(Path(mkdtemp()) / \"docling.db\")\n", "MILVUS_URI = str(Path(mkdtemp()) / \"docowling.db\")\n",
"GEN_MODEL = HuggingFaceInferenceAPI(\n", "GEN_MODEL = HuggingFaceInferenceAPI(\n",
" token=_get_env_from_colab_or_os(\"HF_TOKEN\"),\n", " token=_get_env_from_colab_or_os(\"HF_TOKEN\"),\n",
" model_name=\"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n", " model_name=\"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n",
@ -182,7 +182,7 @@
"node_parser = MarkdownNodeParser()\n", "node_parser = MarkdownNodeParser()\n",
"\n", "\n",
"vector_store = MilvusVectorStore(\n", "vector_store = MilvusVectorStore(\n",
" uri=str(Path(mkdtemp()) / \"docling.db\"), # or set as needed\n", " uri=str(Path(mkdtemp()) / \"docowling.db\"), # or set as needed\n",
" dim=embed_dim,\n", " dim=embed_dim,\n",
" overwrite=True,\n", " overwrite=True,\n",
")\n", ")\n",
@ -282,7 +282,7 @@
"node_parser = DoclingNodeParser()\n", "node_parser = DoclingNodeParser()\n",
"\n", "\n",
"vector_store = MilvusVectorStore(\n", "vector_store = MilvusVectorStore(\n",
" uri=str(Path(mkdtemp()) / \"docling.db\"), # or set as needed\n", " uri=str(Path(mkdtemp()) / \"docowling.db\"), # or set as needed\n",
" dim=embed_dim,\n", " dim=embed_dim,\n",
" overwrite=True,\n", " overwrite=True,\n",
")\n", ")\n",
@ -423,7 +423,7 @@
")\n", ")\n",
"\n", "\n",
"vector_store = MilvusVectorStore(\n", "vector_store = MilvusVectorStore(\n",
" uri=str(Path(mkdtemp()) / \"docling.db\"), # or set as needed\n", " uri=str(Path(mkdtemp()) / \"docowling.db\"), # or set as needed\n",
" dim=embed_dim,\n", " dim=embed_dim,\n",
" overwrite=True,\n", " overwrite=True,\n",
")\n", ")\n",

View File

@ -207,8 +207,8 @@
} }
], ],
"source": [ "source": [
"from docling.datamodel.document import ConversionResult\n", "from docowling.datamodel.document import ConversionResult\n",
"from docling.document_converter import DocumentConverter\n", "from docowling.document_converter import DocumentConverter\n",
"\n", "\n",
"# Instantiate the doc converter\n", "# Instantiate the doc converter\n",
"doc_converter = DocumentConverter()\n", "doc_converter = DocumentConverter()\n",

View File

@ -5,9 +5,9 @@ from pathlib import Path
import yaml import yaml
from docling.backend.md_backend import MarkdownDocumentBackend from docowling.backend.md_backend import MarkdownDocumentBackend
from docling.datamodel.base_models import InputFormat from docowling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -1,16 +1,16 @@
from pathlib import Path from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat from docowling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import ( from docowling.datamodel.pipeline_options import (
AcceleratorDevice, AcceleratorDevice,
AcceleratorOptions, AcceleratorOptions,
PdfPipelineOptions, PdfPipelineOptions,
TesseractCliOcrOptions, TesseractCliOcrOptions,
TesseractOcrOptions, TesseractOcrOptions,
) )
from docling.datamodel.settings import settings from docowling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption from docowling.document_converter import DocumentConverter, PdfFormatOption
def main(): def main():

View File

@ -4,15 +4,15 @@ from pathlib import Path
import yaml import yaml
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docowling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat from docowling.datamodel.base_models import InputFormat
from docling.document_converter import ( from docowling.document_converter import (
DocumentConverter, DocumentConverter,
PdfFormatOption, PdfFormatOption,
WordFormatOption, WordFormatOption,
) )
from docling.pipeline.simple_pipeline import SimplePipeline from docowling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline from docowling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -140,7 +140,7 @@ This is a collection of FAQ collected from the user questions on <https://github
Setting the OCR language in Docling is done via the OCR pipeline options: Setting the OCR language in Docling is done via the OCR pipeline options:
```py ```py
from docling.datamodel.pipeline_options import PdfPipelineOptions from docowling.datamodel.pipeline_options import PdfPipelineOptions
pipeline_options = PdfPipelineOptions() pipeline_options = PdfPipelineOptions()
pipeline_options.ocr_options.lang = ["fr", "de", "es", "en"] # example of languages for EasyOCR pipeline_options.ocr_options.lang = ["fr", "de", "es", "en"] # example of languages for EasyOCR

View File

@ -36,9 +36,9 @@ Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 archi
The Docling `DocumentConverter` allows you to choose the OCR engine with the `ocr_options` settings. For example: The Docling `DocumentConverter` allows you to choose the OCR engine with the `ocr_options` settings. For example:
```python ```python
from docling.datamodel.base_models import ConversionStatus, PipelineOptions from docowling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.pipeline_options import PipelineOptions, EasyOcrOptions, TesseractOcrOptions from docowling.datamodel.pipeline_options import PipelineOptions, EasyOcrOptions, TesseractOcrOptions
from docling.document_converter import DocumentConverter from docowling.document_converter import DocumentConverter
pipeline_options = PipelineOptions() pipeline_options = PipelineOptions()
pipeline_options.do_ocr = True pipeline_options.do_ocr = True

View File

@ -3,7 +3,7 @@
This page provides documentation for our command line tools. This page provides documentation for our command line tools.
::: mkdocs-click ::: mkdocs-click
:module: docling.cli.main :module: docowling.cli.main
:command: click_app :command: click_app
:prog_name: docling :prog_name: docling
:style: table :style: table

View File

@ -2,7 +2,7 @@
This is an automatically generated API reference of the main components of Docling. This is an automatically generated API reference of the main components of Docling.
::: docling.document_converter ::: docowling.document_converter
handler: python handler: python
options: options:
members: members:

View File

@ -8,7 +8,7 @@ can be enabled with `do_xyz = True`.
This is an automatically generated API reference of all the pipeline options available in Docling. This is an automatically generated API reference of all the pipeline options available in Docling.
::: docling.datamodel.pipeline_options ::: docowling.datamodel.pipeline_options
handler: python handler: python
options: options:
show_if_no_docstring: true show_if_no_docstring: true
@ -28,7 +28,7 @@ This is an automatic generated API reference of the all the pipeline options ava
signature_crossrefs: true signature_crossrefs: true
summary: true summary: true
<!-- ::: docling.document_converter.DocumentConverter <!-- ::: docowling.document_converter.DocumentConverter
handler: python handler: python
options: options:
show_if_no_docstring: true show_if_no_docstring: true

View File

@ -5,7 +5,7 @@
To convert individual PDF documents, use `convert()`, for example: To convert individual PDF documents, use `convert()`, for example:
```python ```python
from docling.document_converter import DocumentConverter from docowling.document_converter import DocumentConverter
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
converter = DocumentConverter() converter = DocumentConverter()
@ -39,9 +39,9 @@ This can improve output quality if you find that multiple columns in extracted t
```python ```python
from docling.datamodel.base_models import InputFormat from docowling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption from docowling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions from docowling.datamodel.pipeline_options import PdfPipelineOptions
pipeline_options = PdfPipelineOptions(do_table_structure=True) pipeline_options = PdfPipelineOptions(do_table_structure=True)
pipeline_options.table_structure_options.do_cell_matching = False # uses text cells predicted from table structure model pipeline_options.table_structure_options.do_cell_matching = False # uses text cells predicted from table structure model
@ -56,9 +56,9 @@ doc_converter = DocumentConverter(
Since docling 1.16.0: You can control which TableFormer mode you want to use. Choose between `TableFormerMode.FAST` (default) and `TableFormerMode.ACCURATE` (better, but slower) to receive better quality with difficult table structures. Since docling 1.16.0: You can control which TableFormer mode you want to use. Choose between `TableFormerMode.FAST` (default) and `TableFormerMode.ACCURATE` (better, but slower) to receive better quality with difficult table structures.
```python ```python
from docling.datamodel.base_models import InputFormat from docowling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption from docowling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode from docowling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
pipeline_options = PdfPipelineOptions(do_table_structure=True) pipeline_options = PdfPipelineOptions(do_table_structure=True)
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE # use more accurate TableFormer model pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE # use more accurate TableFormer model
@ -75,10 +75,10 @@ doc_converter = DocumentConverter(
By default, artifacts such as models are downloaded automatically upon first usage. If you would prefer to use a local path where the artifacts have been explicitly prefetched, you can do that as follows: By default, artifacts such as models are downloaded automatically upon first usage. If you would prefer to use a local path where the artifacts have been explicitly prefetched, you can do that as follows:
```python ```python
from docling.datamodel.base_models import InputFormat from docowling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions from docowling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption from docowling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline from docowling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
# # to explicitly prefetch: # # to explicitly prefetch:
# artifacts_path = StandardPdfPipeline.download_models_hf() # artifacts_path = StandardPdfPipeline.download_models_hf()
@ -99,7 +99,7 @@ You can limit the file size and number of pages which should be allowed to proce
```python ```python
from pathlib import Path from pathlib import Path
from docling.document_converter import DocumentConverter from docowling.document_converter import DocumentConverter
source = "https://arxiv.org/pdf/2408.09869" source = "https://arxiv.org/pdf/2408.09869"
converter = DocumentConverter() converter = DocumentConverter()
@ -112,8 +112,8 @@ You can convert PDFs from a binary stream instead of from the filesystem as foll
```python ```python
from io import BytesIO from io import BytesIO
from docling.datamodel.base_models import DocumentStream from docowling.datamodel.base_models import DocumentStream
from docling.document_converter import DocumentConverter from docowling.document_converter import DocumentConverter
buf = BytesIO(your_binary_stream) buf = BytesIO(your_binary_stream)
source = DocumentStream(name="my_doc.pdf", stream=buf) source = DocumentStream(name="my_doc.pdf", stream=buf)
@ -133,8 +133,8 @@ You can chunk a Docling document using a [chunker](concepts/chunking.md), such a
[this example](examples/hybrid_chunking.ipynb)): [this example](examples/hybrid_chunking.ipynb)):
```python ```python
from docling.document_converter import DocumentConverter from docowling.document_converter import DocumentConverter
from docling.chunking import HybridChunker from docowling.chunking import HybridChunker
conv_res = DocumentConverter().convert("https://arxiv.org/pdf/2206.01062") conv_res = DocumentConverter().convert("https://arxiv.org/pdf/2206.01062")
doc = conv_res.document doc = conv_res.document

View File

@ -46,17 +46,17 @@ Format options can include the pipeline class to use, the options to provide to
They are provided as format-specific types, such as `PdfFormatOption` or `WordFormatOption`, as seen below. They are provided as format-specific types, such as `PdfFormatOption` or `WordFormatOption`, as seen below.
```python ```python
from docling.document_converter import DocumentConverter from docowling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat from docowling.datamodel.base_models import InputFormat
from docling.document_converter import ( from docowling.document_converter import (
DocumentConverter, DocumentConverter,
PdfFormatOption, PdfFormatOption,
WordFormatOption, WordFormatOption,
) )
from docling.pipeline.simple_pipeline import SimplePipeline from docowling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline from docowling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.datamodel.pipeline_options import PdfPipelineOptions from docowling.datamodel.pipeline_options import PdfPipelineOptions
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docowling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
## Default initialization still works as before: ## Default initialization still works as before:
# doc_converter = DocumentConverter() # doc_converter = DocumentConverter()
@ -110,7 +110,7 @@ or `DocumentStream` objects, without constructing a `DocumentConversionInput` ob
```python ```python
... ...
from docling.datamodel.document import ConversionResult from docowling.datamodel.document import ConversionResult
## Convert a single file (from URL or local path) ## Convert a single file (from URL or local path)
conv_result: ConversionResult = doc_converter.convert("https://arxiv.org/pdf/2408.09869") # previously `convert_single` conv_result: ConversionResult = doc_converter.convert("https://arxiv.org/pdf/2408.09869") # previously `convert_single`

View File

@ -118,7 +118,7 @@ ocrmac = ["ocrmac"]
rapidocr = ["rapidocr-onnxruntime", "onnxruntime"] rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]
[tool.poetry.scripts] [tool.poetry.scripts]
docling = "docling.cli.main:app" docling = "docowling.cli.main:app"
[build-system] [build-system]
requires = ["poetry-core"] requires = ["poetry-core"]

View File

@ -2,9 +2,9 @@ import glob
import os import os
from pathlib import Path from pathlib import Path
from docling.backend.asciidoc_backend import AsciiDocBackend from docowling.backend.asciidoc_backend import AsciiDocBackend
from docling.datamodel.base_models import InputFormat from docowling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument from docowling.datamodel.document import InputDocument
def _get_backend(fname): def _get_backend(fname):

View File

@ -3,12 +3,12 @@ from pathlib import Path
import pytest import pytest
from docling_core.types.doc import BoundingBox from docling_core.types.doc import BoundingBox
from docling.backend.docling_parse_backend import ( from docowling.backend.docling_parse_backend import (
DoclingParseDocumentBackend, DoclingParseDocumentBackend,
DoclingParsePageBackend, DoclingParsePageBackend,
) )
from docling.datamodel.base_models import InputFormat from docowling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument from docowling.datamodel.document import InputDocument
@pytest.fixture @pytest.fixture

View File

@ -2,12 +2,12 @@ from pathlib import Path
import pytest import pytest
from docling.backend.docling_parse_v2_backend import ( from docowling.backend.docling_parse_v2_backend import (
DoclingParseV2DocumentBackend, DoclingParseV2DocumentBackend,
DoclingParseV2PageBackend, DoclingParseV2PageBackend,
) )
from docling.datamodel.base_models import BoundingBox, InputFormat from docowling.datamodel.base_models import BoundingBox, InputFormat
from docling.datamodel.document import InputDocument from docowling.datamodel.document import InputDocument
@pytest.fixture @pytest.fixture

View File

@ -2,14 +2,14 @@ import json
import os import os
from pathlib import Path from pathlib import Path
from docling.backend.html_backend import HTMLDocumentBackend from docowling.backend.html_backend import HTMLDocumentBackend
from docling.datamodel.base_models import InputFormat from docowling.datamodel.base_models import InputFormat
from docling.datamodel.document import ( from docowling.datamodel.document import (
ConversionResult, ConversionResult,
InputDocument, InputDocument,
SectionHeaderItem, SectionHeaderItem,
) )
from docling.document_converter import DocumentConverter from docowling.document_converter import DocumentConverter
GENERATE = False GENERATE = False

View File

@ -2,14 +2,14 @@ import json
import os import os
from pathlib import Path from pathlib import Path
from docling.backend.msword_backend import MsWordDocumentBackend from docowling.backend.msword_backend import MsWordDocumentBackend
from docling.datamodel.base_models import InputFormat from docowling.datamodel.base_models import InputFormat
from docling.datamodel.document import ( from docowling.datamodel.document import (
ConversionResult, ConversionResult,
InputDocument, InputDocument,
SectionHeaderItem, SectionHeaderItem,
) )
from docling.document_converter import DocumentConverter from docowling.document_converter import DocumentConverter
GENERATE = False GENERATE = False

View File

@ -2,14 +2,14 @@ import json
import os import os
from pathlib import Path from pathlib import Path
from docling.backend.msword_backend import MsWordDocumentBackend from docowling.backend.msword_backend import MsWordDocumentBackend
from docling.datamodel.base_models import InputFormat from docowling.datamodel.base_models import InputFormat
from docling.datamodel.document import ( from docowling.datamodel.document import (
ConversionResult, ConversionResult,
InputDocument, InputDocument,
SectionHeaderItem, SectionHeaderItem,
) )
from docling.document_converter import DocumentConverter from docowling.document_converter import DocumentConverter
GENERATE = False GENERATE = False

View File

@ -1,4 +1,4 @@
"""Test methods in module docling.backend.patent_uspto_backend.py.""" """Test methods in module docowling.backend.patent_uspto_backend.py."""
import json import json
import logging import logging
@ -12,14 +12,14 @@ import yaml
from docling_core.types import DoclingDocument from docling_core.types import DoclingDocument
from docling_core.types.doc import DocItemLabel, TableData, TextItem from docling_core.types.doc import DocItemLabel, TableData, TextItem
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend, XmlTable from docowling.backend.xml.uspto_backend import PatentUsptoDocumentBackend, XmlTable
from docling.datamodel.base_models import InputFormat from docowling.datamodel.base_models import InputFormat
from docling.datamodel.document import ( from docowling.datamodel.document import (
ConversionResult, ConversionResult,
InputDocument, InputDocument,
SectionHeaderItem, SectionHeaderItem,
) )
from docling.document_converter import DocumentConverter from docowling.document_converter import DocumentConverter
GENERATE: bool = True GENERATE: bool = True
DATA_PATH: Path = Path("./tests/data/uspto/") DATA_PATH: Path = Path("./tests/data/uspto/")

View File

@ -3,12 +3,12 @@ from pathlib import Path
import pytest import pytest
from docling_core.types.doc import BoundingBox from docling_core.types.doc import BoundingBox
from docling.backend.pypdfium2_backend import ( from docowling.backend.pypdfium2_backend import (
PyPdfiumDocumentBackend, PyPdfiumDocumentBackend,
PyPdfiumPageBackend, PyPdfiumPageBackend,
) )
from docling.datamodel.base_models import InputFormat from docowling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument from docowling.datamodel.document import InputDocument
@pytest.fixture @pytest.fixture

View File

@ -2,9 +2,9 @@ import json
import os import os
from pathlib import Path from pathlib import Path
from docling.datamodel.base_models import InputFormat from docowling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult from docowling.datamodel.document import ConversionResult
from docling.document_converter import DocumentConverter from docowling.document_converter import DocumentConverter
GENERATE = False GENERATE = False

View File

@ -6,9 +6,9 @@ from pathlib import Path
from docling_core.types.doc import DoclingDocument from docling_core.types.doc import DoclingDocument
from docling.datamodel.base_models import DocumentStream, InputFormat from docowling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.document import ConversionResult from docowling.datamodel.document import ConversionResult
from docling.document_converter import DocumentConverter from docowling.document_converter import DocumentConverter
GENERATE = False GENERATE = False

View File

@ -2,7 +2,7 @@ from pathlib import Path
from typer.testing import CliRunner from typer.testing import CliRunner
from docling.cli.main import app from docowling.cli.main import app
runner = CliRunner() runner = CliRunner()

View File

@ -1,10 +1,10 @@
from pathlib import Path from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat from docowling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult from docowling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions from docowling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption from docowling.document_converter import DocumentConverter, PdfFormatOption
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2 from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2

View File

@ -2,10 +2,10 @@ import sys
from pathlib import Path from pathlib import Path
from typing import List from typing import List
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat from docowling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult from docowling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import ( from docowling.datamodel.pipeline_options import (
EasyOcrOptions, EasyOcrOptions,
OcrMacOptions, OcrMacOptions,
OcrOptions, OcrOptions,
@ -14,7 +14,7 @@ from docling.datamodel.pipeline_options import (
TesseractCliOcrOptions, TesseractCliOcrOptions,
TesseractOcrOptions, TesseractOcrOptions,
) )
from docling.document_converter import DocumentConverter, PdfFormatOption from docowling.document_converter import DocumentConverter, PdfFormatOption
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2 from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2

View File

@ -1,9 +1,9 @@
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docowling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import DocumentStream, InputFormat from docowling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.document import InputDocument, _DocumentConversionInput from docowling.datamodel.document import InputDocument, _DocumentConversionInput
def test_in_doc_from_valid_path(): def test_in_doc_from_valid_path():
@ -40,7 +40,7 @@ def test_in_doc_from_invalid_buf():
def test_guess_format(tmp_path): def test_guess_format(tmp_path):
"""Test docling.datamodel.document._DocumentConversionInput.__guess_format""" """Test docowling.datamodel.document._DocumentConversionInput.__guess_format"""
dci = _DocumentConversionInput(path_or_stream_iterator=[]) dci = _DocumentConversionInput(path_or_stream_iterator=[])
temp_dir = tmp_path / "test_guess_format" temp_dir = tmp_path / "test_guess_format"
temp_dir.mkdir() temp_dir.mkdir()

View File

@ -3,10 +3,10 @@ from pathlib import Path
import pytest import pytest
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import DocumentStream, InputFormat from docowling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions from docowling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption from docowling.document_converter import DocumentConverter, PdfFormatOption
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2 from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2

View File

@ -3,8 +3,8 @@ from pathlib import Path
import pytest import pytest
from docling.datamodel.base_models import ConversionStatus, DocumentStream from docowling.datamodel.base_models import ConversionStatus, DocumentStream
from docling.document_converter import ConversionError, DocumentConverter from docowling.document_converter import ConversionError, DocumentConverter
def get_pdf_path(): def get_pdf_path():

View File

@ -3,9 +3,9 @@ from pathlib import Path
import pytest import pytest
from docling.datamodel.base_models import InputFormat from docowling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions from docowling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption from docowling.document_converter import DocumentConverter, PdfFormatOption
@pytest.fixture @pytest.fixture

View File

@ -3,16 +3,16 @@ from pathlib import Path
import pytest import pytest
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import ConversionStatus, InputFormat from docowling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult from docowling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import ( from docowling.datamodel.pipeline_options import (
AcceleratorDevice, AcceleratorDevice,
AcceleratorOptions, AcceleratorOptions,
PdfPipelineOptions, PdfPipelineOptions,
TableFormerMode, TableFormerMode,
) )
from docling.document_converter import DocumentConverter, PdfFormatOption from docowling.document_converter import DocumentConverter, PdfFormatOption
@pytest.fixture @pytest.fixture

View File

@ -8,8 +8,8 @@ from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocu
from pydantic import TypeAdapter from pydantic import TypeAdapter
from pydantic.json import pydantic_encoder from pydantic.json import pydantic_encoder
from docling.datamodel.base_models import ConversionStatus, Page from docowling.datamodel.base_models import ConversionStatus, Page
from docling.datamodel.document import ConversionResult from docowling.datamodel.document import ConversionResult
def levenshtein(str1: str, str2: str) -> int: def levenshtein(str1: str, str2: str) -> int: