diff --git a/CITATION.cff b/CITATION.cff
index 2a7ca30e..d67d6afc 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -2,14 +2,9 @@
# Visit https://bit.ly/cffinit to generate yours today!
cff-version: 1.2.0
-title: Docling
-message: 'If you use Docling, please consider citing as below.'
+title: Docowling
+message: 'If you use Docowling, please consider citing as below.'
type: software
authors:
- - name: Docling Team
-identifiers:
- - type: url
- value: 'https://arxiv.org/abs/2408.09869'
- description: 'arXiv:2408.09869'
-repository-code: 'https://github.com/DS4SD/docling'
+ - name: Docowling
license: MIT
diff --git a/Dockerfile b/Dockerfile
index c863f1c2..d6d64d12 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -17,7 +17,7 @@ ENV TORCH_HOME=/tmp/
COPY docs/examples/minimal.py /root/minimal.py
RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
-RUN python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; StandardPdfPipeline.download_models_hf(force=True);'
+RUN python -c 'from docowling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; StandardPdfPipeline.download_models_hf(force=True);'
# On container environments, always set a thread budget to avoid undesired thread congestion.
ENV OMP_NUM_THREADS=4
diff --git a/README.md b/README.md
index 5374f1e1..a44fc7e7 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
-
+
diff --git a/docling/__init__.py b/docowling/__init__.py
similarity index 100%
rename from docling/__init__.py
rename to docowling/__init__.py
diff --git a/docling/backend/__init__.py b/docowling/backend/__init__.py
similarity index 100%
rename from docling/backend/__init__.py
rename to docowling/backend/__init__.py
diff --git a/docling/backend/abstract_backend.py b/docowling/backend/abstract_backend.py
similarity index 92%
rename from docling/backend/abstract_backend.py
rename to docowling/backend/abstract_backend.py
index b47b11cd..ba0be70f 100644
--- a/docling/backend/abstract_backend.py
+++ b/docowling/backend/abstract_backend.py
@@ -6,8 +6,8 @@ from typing import TYPE_CHECKING, Set, Union
from docling_core.types.doc import DoclingDocument
if TYPE_CHECKING:
- from docling.datamodel.base_models import InputFormat
- from docling.datamodel.document import InputDocument
+ from docowling.datamodel.base_models import InputFormat
+ from docowling.datamodel.document import InputDocument
class AbstractDocumentBackend(ABC):
diff --git a/docling/backend/asciidoc_backend.py b/docowling/backend/asciidoc_backend.py
similarity index 98%
rename from docling/backend/asciidoc_backend.py
rename to docowling/backend/asciidoc_backend.py
index 829419af..3d0848fc 100644
--- a/docling/backend/asciidoc_backend.py
+++ b/docowling/backend/asciidoc_backend.py
@@ -16,9 +16,9 @@ from docling_core.types.doc import (
TableData,
)
-from docling.backend.abstract_backend import DeclarativeDocumentBackend
-from docling.datamodel.base_models import InputFormat
-from docling.datamodel.document import InputDocument
+from docowling.backend.abstract_backend import DeclarativeDocumentBackend
+from docowling.datamodel.base_models import InputFormat
+from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
diff --git a/docowling/backend/csv_backend.py b/docowling/backend/csv_backend.py
new file mode 100644
index 00000000..9b5b07b6
--- /dev/null
+++ b/docowling/backend/csv_backend.py
@@ -0,0 +1,150 @@
+"""CSV backend: parse a CSV file or stream into a single-table document."""
+
+import csv
+from io import BytesIO, StringIO
+from pathlib import Path
+from typing import Dict, List, Set, Tuple, Union
+
+from docling_core.types.doc import (
+    DoclingDocument,
+    DocumentOrigin,
+    GroupLabel,
+    TableCell,
+    TableData,
+)
+
+from docowling.backend.abstract_backend import DeclarativeDocumentBackend
+from docowling.datamodel.base_models import InputFormat
+from docowling.datamodel.document import InputDocument
+
+
+class CsvDocumentBackend(DeclarativeDocumentBackend):
+    """Declarative backend that turns CSV input into a one-table document.
+
+    All rows are parsed eagerly in ``__init__``; ``convert`` then emits a
+    "CSV Data" section group holding one table with a cell per CSV field.
+    """
+
+    def __init__(
+        self,
+        in_doc: "InputDocument",
+        path_or_stream: Union[BytesIO, StringIO, Path],
+    ):
+        """Parse the CSV content behind ``path_or_stream``.
+
+        Args:
+            in_doc: Input document descriptor, forwarded to the base class.
+            path_or_stream: File path, binary stream (decoded as UTF-8) or
+                text stream holding the CSV data.
+
+        Raises:
+            RuntimeError: If the source cannot be read or parsed, or has an
+                unsupported type.
+        """
+        super().__init__(in_doc, path_or_stream)
+        self.rows: List[List[str]] = []
+        self.valid = False
+        try:
+            self.rows = self._read_rows(self.path_or_stream)
+        except Exception as e:
+            raise RuntimeError(
+                f"CsvDocumentBackend could not load document with hash {self.document_hash}"
+            ) from e
+        self.valid = True
+
+    @staticmethod
+    def _read_rows(source: Union[BytesIO, StringIO, Path]) -> List[List[str]]:
+        """Return every CSV record in ``source`` as a list of string fields."""
+        if isinstance(source, Path):
+            # newline="" leaves line-ending handling to the csv module so
+            # newlines inside quoted fields survive; utf-8-sig additionally
+            # drops a leading byte-order mark when one is present.
+            with source.open(mode="r", encoding="utf-8-sig", newline="") as file:
+                return list(csv.reader(file))
+        if isinstance(source, BytesIO):
+            # getvalue() reads the whole buffer without moving its position.
+            text = source.getvalue().decode("utf-8-sig")
+            return list(csv.reader(StringIO(text, newline="")))
+        if isinstance(source, StringIO):
+            return list(csv.reader(source))
+        # Fail loudly instead of silently producing an empty document.
+        raise TypeError(f"Unsupported CSV source type: {type(source).__name__}")
+
+    def is_valid(self) -> bool:
+        """Return True when the CSV source was loaded successfully."""
+        return self.valid
+
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        """Report that this backend does not paginate its output."""
+        return False  # Typically, CSV files do not support pagination.
+
+    def unload(self):
+        """Drop the reference to the underlying path or stream."""
+        self.path_or_stream = None
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        """Return the set of input formats handled by this backend."""
+        return {InputFormat.CSV}
+
+    def convert(self) -> DoclingDocument:
+        """Build a ``DoclingDocument`` from the parsed CSV rows.
+
+        Raises:
+            RuntimeError: If the backend failed to initialize.
+        """
+        if not self.is_valid():
+            raise RuntimeError(
+                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
+            )
+
+        origin = DocumentOrigin(
+            filename=self.file.name or "file.csv",
+            mimetype="text/csv",
+            binary_hash=self.document_hash,
+        )
+        doc = DoclingDocument(name=self.file.stem or "file.csv", origin=origin)
+        return self._convert_csv_to_document(doc)
+
+    def _convert_csv_to_document(self, doc: DoclingDocument) -> DoclingDocument:
+        """Append the parsed rows to ``doc`` as one table and return ``doc``.
+
+        The table is as wide as the longest row; cells are created only for
+        the fields actually present in each row.
+        """
+        if not self.rows:
+            return doc  # No data to process
+
+        # Keep the table under a dedicated section group. A local variable
+        # suffices: a single flat table needs no per-level parent map.
+        section = doc.add_group(
+            parent=None,
+            label=GroupLabel.SECTION,
+            name="CSV Data",
+        )
+
+        table_cells = []
+        for row_idx, row in enumerate(self.rows):
+            for col_idx, cell in enumerate(row):
+                table_cells.append(
+                    TableCell(
+                        text=cell,
+                        row_span=1,
+                        col_span=1,
+                        start_row_offset_idx=row_idx,
+                        end_row_offset_idx=row_idx + 1,
+                        start_col_offset_idx=col_idx,
+                        end_col_offset_idx=col_idx + 1,
+                        col_header=False,
+                        row_header=False,
+                    )
+                )
+
+        table_data = TableData(
+            num_rows=len(self.rows),
+            num_cols=max(len(row) for row in self.rows),
+            table_cells=table_cells,
+        )
+        doc.add_table(data=table_data, parent=section)
+        return doc
\ No newline at end of file
diff --git a/docling/backend/docling_parse_backend.py b/docowling/backend/docling_parse_backend.py
similarity index 97%
rename from docling/backend/docling_parse_backend.py
rename to docowling/backend/docling_parse_backend.py
index bb1fe058..4bc0aba3 100644
--- a/docling/backend/docling_parse_backend.py
+++ b/docowling/backend/docling_parse_backend.py
@@ -10,9 +10,9 @@ from docling_parse.pdf_parsers import pdf_parser_v1
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage
-from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
-from docling.datamodel.base_models import Cell
-from docling.datamodel.document import InputDocument
+from docowling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
+from docowling.datamodel.base_models import Cell
+from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
diff --git a/docling/backend/docling_parse_v2_backend.py b/docowling/backend/docling_parse_v2_backend.py
similarity index 97%
rename from docling/backend/docling_parse_v2_backend.py
rename to docowling/backend/docling_parse_v2_backend.py
index 12d7df55..3eb448c0 100644
--- a/docling/backend/docling_parse_v2_backend.py
+++ b/docowling/backend/docling_parse_v2_backend.py
@@ -10,11 +10,11 @@ from docling_parse.pdf_parsers import pdf_parser_v2
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage
-from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
-from docling.datamodel.base_models import Cell, Size
+from docowling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
+from docowling.datamodel.base_models import Cell, Size
if TYPE_CHECKING:
- from docling.datamodel.document import InputDocument
+ from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
diff --git a/docling/backend/html_backend.py b/docowling/backend/html_backend.py
similarity index 98%
rename from docling/backend/html_backend.py
rename to docowling/backend/html_backend.py
index 9cd1e29b..46514c6e 100644
--- a/docling/backend/html_backend.py
+++ b/docowling/backend/html_backend.py
@@ -13,9 +13,9 @@ from docling_core.types.doc import (
TableData,
)
-from docling.backend.abstract_backend import DeclarativeDocumentBackend
-from docling.datamodel.base_models import InputFormat
-from docling.datamodel.document import InputDocument
+from docowling.backend.abstract_backend import DeclarativeDocumentBackend
+from docowling.datamodel.base_models import InputFormat
+from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
diff --git a/docling/backend/md_backend.py b/docowling/backend/md_backend.py
similarity index 98%
rename from docling/backend/md_backend.py
rename to docowling/backend/md_backend.py
index 2bcc6d7d..b7b7010c 100644
--- a/docling/backend/md_backend.py
+++ b/docowling/backend/md_backend.py
@@ -19,9 +19,9 @@ from docling_core.types.doc import (
)
from marko import Markdown
-from docling.backend.abstract_backend import DeclarativeDocumentBackend
-from docling.datamodel.base_models import InputFormat
-from docling.datamodel.document import InputDocument
+from docowling.backend.abstract_backend import DeclarativeDocumentBackend
+from docowling.datamodel.base_models import InputFormat
+from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
diff --git a/docling/backend/msexcel_backend.py b/docowling/backend/msexcel_backend.py
similarity index 98%
rename from docling/backend/msexcel_backend.py
rename to docowling/backend/msexcel_backend.py
index 508b0e8d..409ac67e 100644
--- a/docling/backend/msexcel_backend.py
+++ b/docowling/backend/msexcel_backend.py
@@ -18,9 +18,9 @@ from openpyxl.cell.cell import Cell
from openpyxl.drawing.image import Image
from openpyxl.worksheet.worksheet import Worksheet
-from docling.backend.abstract_backend import DeclarativeDocumentBackend
-from docling.datamodel.base_models import InputFormat
-from docling.datamodel.document import InputDocument
+from docowling.backend.abstract_backend import DeclarativeDocumentBackend
+from docowling.datamodel.base_models import InputFormat
+from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
diff --git a/docling/backend/mspowerpoint_backend.py b/docowling/backend/mspowerpoint_backend.py
similarity index 98%
rename from docling/backend/mspowerpoint_backend.py
rename to docowling/backend/mspowerpoint_backend.py
index f595e4bd..f28c4ac5 100644
--- a/docling/backend/mspowerpoint_backend.py
+++ b/docowling/backend/mspowerpoint_backend.py
@@ -20,12 +20,12 @@ from PIL import Image
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
-from docling.backend.abstract_backend import (
+from docowling.backend.abstract_backend import (
DeclarativeDocumentBackend,
PaginatedDocumentBackend,
)
-from docling.datamodel.base_models import InputFormat
-from docling.datamodel.document import InputDocument
+from docowling.datamodel.base_models import InputFormat
+from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
diff --git a/docling/backend/msword_backend.py b/docowling/backend/msword_backend.py
similarity index 99%
rename from docling/backend/msword_backend.py
rename to docowling/backend/msword_backend.py
index f59356e2..f19db738 100644
--- a/docling/backend/msword_backend.py
+++ b/docowling/backend/msword_backend.py
@@ -18,9 +18,9 @@ from lxml import etree
from lxml.etree import XPath
from PIL import Image, UnidentifiedImageError
-from docling.backend.abstract_backend import DeclarativeDocumentBackend
-from docling.datamodel.base_models import InputFormat
-from docling.datamodel.document import InputDocument
+from docowling.backend.abstract_backend import DeclarativeDocumentBackend
+from docowling.datamodel.base_models import InputFormat
+from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
diff --git a/docling/backend/pdf_backend.py b/docowling/backend/pdf_backend.py
similarity index 90%
rename from docling/backend/pdf_backend.py
rename to docowling/backend/pdf_backend.py
index cd7a0815..ad9b6621 100644
--- a/docling/backend/pdf_backend.py
+++ b/docowling/backend/pdf_backend.py
@@ -6,9 +6,9 @@ from typing import Iterable, Optional, Set, Union
from docling_core.types.doc import BoundingBox, Size
from PIL import Image
-from docling.backend.abstract_backend import PaginatedDocumentBackend
-from docling.datamodel.base_models import Cell, InputFormat
-from docling.datamodel.document import InputDocument
+from docowling.backend.abstract_backend import PaginatedDocumentBackend
+from docowling.datamodel.base_models import Cell, InputFormat
+from docowling.datamodel.document import InputDocument
class PdfPageBackend(ABC):
diff --git a/docling/backend/pypdfium2_backend.py b/docowling/backend/pypdfium2_backend.py
similarity index 97%
rename from docling/backend/pypdfium2_backend.py
rename to docowling/backend/pypdfium2_backend.py
index d24ba608..cee1adeb 100644
--- a/docling/backend/pypdfium2_backend.py
+++ b/docowling/backend/pypdfium2_backend.py
@@ -11,11 +11,11 @@ from PIL import Image, ImageDraw
from pypdfium2 import PdfTextPage
from pypdfium2._helpers.misc import PdfiumError
-from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
-from docling.datamodel.base_models import Cell
+from docowling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
+from docowling.datamodel.base_models import Cell
if TYPE_CHECKING:
- from docling.datamodel.document import InputDocument
+ from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
diff --git a/docling/backend/xml/__init__.py b/docowling/backend/xml/__init__.py
similarity index 100%
rename from docling/backend/xml/__init__.py
rename to docowling/backend/xml/__init__.py
diff --git a/docling/backend/xml/pubmed_backend.py b/docowling/backend/xml/pubmed_backend.py
old mode 100755
new mode 100644
similarity index 99%
rename from docling/backend/xml/pubmed_backend.py
rename to docowling/backend/xml/pubmed_backend.py
index acbcd4e1..b5d2bdb6
--- a/docling/backend/xml/pubmed_backend.py
+++ b/docowling/backend/xml/pubmed_backend.py
@@ -16,9 +16,9 @@ from docling_core.types.doc import (
from lxml import etree
from typing_extensions import TypedDict, override
-from docling.backend.abstract_backend import DeclarativeDocumentBackend
-from docling.datamodel.base_models import InputFormat
-from docling.datamodel.document import InputDocument
+from docowling.backend.abstract_backend import DeclarativeDocumentBackend
+from docowling.datamodel.base_models import InputFormat
+from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
diff --git a/docling/backend/xml/uspto_backend.py b/docowling/backend/xml/uspto_backend.py
similarity index 99%
rename from docling/backend/xml/uspto_backend.py
rename to docowling/backend/xml/uspto_backend.py
index ef253b21..14c2604b 100644
--- a/docling/backend/xml/uspto_backend.py
+++ b/docowling/backend/xml/uspto_backend.py
@@ -30,9 +30,9 @@ from docling_core.types.doc.document import LevelNumber
from pydantic import NonNegativeInt
from typing_extensions import Self, TypedDict, override
-from docling.backend.abstract_backend import DeclarativeDocumentBackend
-from docling.datamodel.base_models import InputFormat
-from docling.datamodel.document import InputDocument
+from docowling.backend.abstract_backend import DeclarativeDocumentBackend
+from docowling.datamodel.base_models import InputFormat
+from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
diff --git a/docling/chunking/__init__.py b/docowling/chunking/__init__.py
similarity index 100%
rename from docling/chunking/__init__.py
rename to docowling/chunking/__init__.py
diff --git a/docling/cli/__init__.py b/docowling/cli/__init__.py
similarity index 100%
rename from docling/cli/__init__.py
rename to docowling/cli/__init__.py
diff --git a/docling/cli/main.py b/docowling/cli/main.py
similarity index 96%
rename from docling/cli/main.py
rename to docowling/cli/main.py
index a83aecbf..05f90906 100644
--- a/docling/cli/main.py
+++ b/docowling/cli/main.py
@@ -14,18 +14,18 @@ from docling_core.types.doc import ImageRefMode
from docling_core.utils.file import resolve_source_to_path
from pydantic import TypeAdapter, ValidationError
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
-from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
-from docling.backend.pdf_backend import PdfDocumentBackend
-from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import (
+from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docowling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
+from docowling.backend.pdf_backend import PdfDocumentBackend
+from docowling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+from docowling.datamodel.base_models import (
ConversionStatus,
FormatToExtensions,
InputFormat,
OutputFormat,
)
-from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import (
+from docowling.datamodel.document import ConversionResult
+from docowling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
EasyOcrOptions,
@@ -39,8 +39,8 @@ from docling.datamodel.pipeline_options import (
TesseractCliOcrOptions,
TesseractOcrOptions,
)
-from docling.datamodel.settings import settings
-from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
+from docowling.datamodel.settings import settings
+from docowling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
diff --git a/docling/datamodel/__init__.py b/docowling/datamodel/__init__.py
similarity index 100%
rename from docling/datamodel/__init__.py
rename to docowling/datamodel/__init__.py
diff --git a/docling/datamodel/base_models.py b/docowling/datamodel/base_models.py
similarity index 97%
rename from docling/datamodel/base_models.py
rename to docowling/datamodel/base_models.py
index 5bd28ed6..a904bd1b 100644
--- a/docling/datamodel/base_models.py
+++ b/docowling/datamodel/base_models.py
@@ -15,7 +15,7 @@ from PIL.Image import Image
from pydantic import BaseModel, ConfigDict
if TYPE_CHECKING:
- from docling.backend.pdf_backend import PdfPageBackend
+ from docowling.backend.pdf_backend import PdfPageBackend
class ConversionStatus(str, Enum):
@@ -39,6 +39,7 @@ class InputFormat(str, Enum):
ASCIIDOC = "asciidoc"
MD = "md"
XLSX = "xlsx"
+ CSV = "csv"
XML_USPTO = "xml_uspto"
@@ -60,6 +61,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
InputFormat.XLSX: ["xlsx"],
+ InputFormat.CSV: ["csv"],
InputFormat.XML_USPTO: ["xml", "txt"],
}
@@ -88,6 +90,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
InputFormat.XLSX: [
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
],
+ InputFormat.CSV: ["text/csv"],
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
}
diff --git a/docling/datamodel/document.py b/docowling/datamodel/document.py
similarity index 97%
rename from docling/datamodel/document.py
rename to docowling/datamodel/document.py
index 4ed7d577..2e3f7b85 100644
--- a/docling/datamodel/document.py
+++ b/docowling/datamodel/document.py
@@ -47,11 +47,11 @@ from docling_core.utils.legacy import docling_document_to_legacy
from pydantic import BaseModel
from typing_extensions import deprecated
-from docling.backend.abstract_backend import (
+from docowling.backend.abstract_backend import (
AbstractDocumentBackend,
PaginatedDocumentBackend,
)
-from docling.datamodel.base_models import (
+from docowling.datamodel.base_models import (
AssembledUnit,
ConversionStatus,
DocumentStream,
@@ -62,12 +62,12 @@ from docling.datamodel.base_models import (
MimeTypeToFormat,
Page,
)
-from docling.datamodel.settings import DocumentLimits
-from docling.utils.profiling import ProfilingItem
-from docling.utils.utils import create_file_hash, create_hash
+from docowling.datamodel.settings import DocumentLimits
+from docowling.utils.profiling import ProfilingItem
+from docowling.utils.utils import create_file_hash, create_hash
if TYPE_CHECKING:
- from docling.document_converter import FormatOption
+ from docowling.document_converter import FormatOption
_log = logging.getLogger(__name__)
diff --git a/docling/datamodel/pipeline_options.py b/docowling/datamodel/pipeline_options.py
similarity index 100%
rename from docling/datamodel/pipeline_options.py
rename to docowling/datamodel/pipeline_options.py
diff --git a/docling/datamodel/settings.py b/docowling/datamodel/settings.py
similarity index 100%
rename from docling/datamodel/settings.py
rename to docowling/datamodel/settings.py
diff --git a/docling/document_converter.py b/docowling/document_converter.py
similarity index 87%
rename from docling/document_converter.py
rename to docowling/document_converter.py
index c9cbedd2..4106be79 100644
--- a/docling/document_converter.py
+++ b/docowling/document_converter.py
@@ -7,35 +7,36 @@ from typing import Dict, Iterable, Iterator, List, Optional, Type, Union
from pydantic import BaseModel, ConfigDict, model_validator, validate_call
-from docling.backend.abstract_backend import AbstractDocumentBackend
-from docling.backend.asciidoc_backend import AsciiDocBackend
-from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
-from docling.backend.html_backend import HTMLDocumentBackend
-from docling.backend.md_backend import MarkdownDocumentBackend
-from docling.backend.msexcel_backend import MsExcelDocumentBackend
-from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
-from docling.backend.msword_backend import MsWordDocumentBackend
-from docling.backend.xml.pubmed_backend import PubMedDocumentBackend
-from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
-from docling.datamodel.base_models import (
+from docowling.backend.abstract_backend import AbstractDocumentBackend
+from docowling.backend.asciidoc_backend import AsciiDocBackend
+from docowling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
+from docowling.backend.html_backend import HTMLDocumentBackend
+from docowling.backend.md_backend import MarkdownDocumentBackend
+from docowling.backend.msexcel_backend import MsExcelDocumentBackend
+from docowling.backend.csv_backend import CsvDocumentBackend
+from docowling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
+from docowling.backend.msword_backend import MsWordDocumentBackend
+from docowling.backend.xml.pubmed_backend import PubMedDocumentBackend
+from docowling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
+from docowling.datamodel.base_models import (
ConversionStatus,
DoclingComponentType,
DocumentStream,
ErrorItem,
InputFormat,
)
-from docling.datamodel.document import (
+from docowling.datamodel.document import (
ConversionResult,
InputDocument,
_DocumentConversionInput,
)
-from docling.datamodel.pipeline_options import PipelineOptions
-from docling.datamodel.settings import DocumentLimits, settings
-from docling.exceptions import ConversionError
-from docling.pipeline.base_pipeline import BasePipeline
-from docling.pipeline.simple_pipeline import SimplePipeline
-from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
-from docling.utils.utils import chunkify
+from docowling.datamodel.pipeline_options import PipelineOptions
+from docowling.datamodel.settings import DocumentLimits, settings
+from docowling.exceptions import ConversionError
+from docowling.pipeline.base_pipeline import BasePipeline
+from docowling.pipeline.simple_pipeline import SimplePipeline
+from docowling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
+from docowling.utils.utils import chunkify
_log = logging.getLogger(__name__)
@@ -58,6 +59,9 @@ class ExcelFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
+class CsvFormatOption(FormatOption):  # Format option pairing the CSV backend with SimplePipeline
+ pipeline_cls: Type = SimplePipeline  # pipeline class used for CSV input
+ backend: Type[AbstractDocumentBackend] = CsvDocumentBackend  # backend class that parses the CSV source
class WordFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline
@@ -109,6 +113,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
InputFormat.XLSX: FormatOption(
pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
),
+ InputFormat.CSV: FormatOption(
+ pipeline_cls=SimplePipeline, backend=CsvDocumentBackend
+ ),
InputFormat.DOCX: FormatOption(
pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
),
diff --git a/docling/exceptions.py b/docowling/exceptions.py
similarity index 100%
rename from docling/exceptions.py
rename to docowling/exceptions.py
diff --git a/docling/models/__init__.py b/docowling/models/__init__.py
similarity index 100%
rename from docling/models/__init__.py
rename to docowling/models/__init__.py
diff --git a/docling/models/base_model.py b/docowling/models/base_model.py
similarity index 84%
rename from docling/models/base_model.py
rename to docowling/models/base_model.py
index 1147896c..f6ae9fa0 100644
--- a/docling/models/base_model.py
+++ b/docowling/models/base_model.py
@@ -3,8 +3,8 @@ from typing import Any, Iterable
from docling_core.types.doc import DoclingDocument, NodeItem
-from docling.datamodel.base_models import Page
-from docling.datamodel.document import ConversionResult
+from docowling.datamodel.base_models import Page
+from docowling.datamodel.document import ConversionResult
class BasePageModel(ABC):
diff --git a/docling/models/base_ocr_model.py b/docowling/models/base_ocr_model.py
similarity index 95%
rename from docling/models/base_ocr_model.py
rename to docowling/models/base_ocr_model.py
index 38b5e52c..3b96f7b2 100644
--- a/docling/models/base_ocr_model.py
+++ b/docowling/models/base_ocr_model.py
@@ -10,11 +10,11 @@ from PIL import Image, ImageDraw
from rtree import index
from scipy.ndimage import find_objects, label
-from docling.datamodel.base_models import Cell, OcrCell, Page
-from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import OcrOptions
-from docling.datamodel.settings import settings
-from docling.models.base_model import BasePageModel
+from docowling.datamodel.base_models import Cell, OcrCell, Page
+from docowling.datamodel.document import ConversionResult
+from docowling.datamodel.pipeline_options import OcrOptions
+from docowling.datamodel.settings import settings
+from docowling.models.base_model import BasePageModel
_log = logging.getLogger(__name__)
diff --git a/docling/models/ds_glm_model.py b/docowling/models/ds_glm_model.py
similarity index 97%
rename from docling/models/ds_glm_model.py
rename to docowling/models/ds_glm_model.py
index 6f7de07a..0c04c7a1 100644
--- a/docling/models/ds_glm_model.py
+++ b/docowling/models/ds_glm_model.py
@@ -24,18 +24,18 @@ from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocu
from PIL import ImageDraw
from pydantic import BaseModel, ConfigDict, TypeAdapter
-from docling.datamodel.base_models import (
+from docowling.datamodel.base_models import (
Cluster,
ContainerElement,
FigureElement,
Table,
TextElement,
)
-from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
-from docling.datamodel.settings import settings
-from docling.utils.glm_utils import to_docling_document
-from docling.utils.profiling import ProfilingScope, TimeRecorder
-from docling.utils.utils import create_hash
+from docowling.datamodel.document import ConversionResult, layout_label_to_ds_type
+from docowling.datamodel.settings import settings
+from docowling.utils.glm_utils import to_docling_document
+from docowling.utils.profiling import ProfilingScope, TimeRecorder
+from docowling.utils.utils import create_hash
class GlmOptions(BaseModel):
diff --git a/docling/models/easyocr_model.py b/docowling/models/easyocr_model.py
similarity index 92%
rename from docling/models/easyocr_model.py
rename to docowling/models/easyocr_model.py
index bbe4fb05..74f70aed 100644
--- a/docling/models/easyocr_model.py
+++ b/docowling/models/easyocr_model.py
@@ -6,17 +6,17 @@ import numpy
import torch
from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling.datamodel.base_models import Cell, OcrCell, Page
-from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import (
+from docowling.datamodel.base_models import Cell, OcrCell, Page
+from docowling.datamodel.document import ConversionResult
+from docowling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
EasyOcrOptions,
)
-from docling.datamodel.settings import settings
-from docling.models.base_ocr_model import BaseOcrModel
-from docling.utils.accelerator_utils import decide_device
-from docling.utils.profiling import TimeRecorder
+from docowling.datamodel.settings import settings
+from docowling.models.base_ocr_model import BaseOcrModel
+from docowling.utils.accelerator_utils import decide_device
+from docowling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
diff --git a/docling/models/layout_model.py b/docowling/models/layout_model.py
similarity index 95%
rename from docling/models/layout_model.py
rename to docowling/models/layout_model.py
index 014cddd3..96f46496 100644
--- a/docling/models/layout_model.py
+++ b/docowling/models/layout_model.py
@@ -9,20 +9,20 @@ from docling_core.types.doc import CoordOrigin, DocItemLabel
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
from PIL import Image, ImageDraw, ImageFont
-from docling.datamodel.base_models import (
+from docowling.datamodel.base_models import (
BoundingBox,
Cell,
Cluster,
LayoutPrediction,
Page,
)
-from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
-from docling.datamodel.settings import settings
-from docling.models.base_model import BasePageModel
-from docling.utils.accelerator_utils import decide_device
-from docling.utils.layout_postprocessor import LayoutPostprocessor
-from docling.utils.profiling import TimeRecorder
+from docowling.datamodel.document import ConversionResult
+from docowling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
+from docowling.datamodel.settings import settings
+from docowling.models.base_model import BasePageModel
+from docowling.utils.accelerator_utils import decide_device
+from docowling.utils.layout_postprocessor import LayoutPostprocessor
+from docowling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
diff --git a/docling/models/ocr_mac_model.py b/docowling/models/ocr_mac_model.py
similarity index 92%
rename from docling/models/ocr_mac_model.py
rename to docowling/models/ocr_mac_model.py
index 38bcf1ca..965c9346 100644
--- a/docling/models/ocr_mac_model.py
+++ b/docowling/models/ocr_mac_model.py
@@ -4,12 +4,12 @@ from typing import Iterable, Optional, Tuple
from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling.datamodel.base_models import OcrCell, Page
-from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import OcrMacOptions
-from docling.datamodel.settings import settings
-from docling.models.base_ocr_model import BaseOcrModel
-from docling.utils.profiling import TimeRecorder
+from docowling.datamodel.base_models import OcrCell, Page
+from docowling.datamodel.document import ConversionResult
+from docowling.datamodel.pipeline_options import OcrMacOptions
+from docowling.datamodel.settings import settings
+from docowling.models.base_ocr_model import BaseOcrModel
+from docowling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
diff --git a/docling/models/page_assemble_model.py b/docowling/models/page_assemble_model.py
similarity index 96%
rename from docling/models/page_assemble_model.py
rename to docowling/models/page_assemble_model.py
index 3e202e20..42ced247 100644
--- a/docling/models/page_assemble_model.py
+++ b/docowling/models/page_assemble_model.py
@@ -4,7 +4,7 @@ from typing import Iterable, List
from pydantic import BaseModel
-from docling.datamodel.base_models import (
+from docowling.datamodel.base_models import (
AssembledUnit,
ContainerElement,
FigureElement,
@@ -13,10 +13,10 @@ from docling.datamodel.base_models import (
Table,
TextElement,
)
-from docling.datamodel.document import ConversionResult
-from docling.models.base_model import BasePageModel
-from docling.models.layout_model import LayoutModel
-from docling.utils.profiling import TimeRecorder
+from docowling.datamodel.document import ConversionResult
+from docowling.models.base_model import BasePageModel
+from docowling.models.layout_model import LayoutModel
+from docowling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
diff --git a/docling/models/page_preprocessing_model.py b/docowling/models/page_preprocessing_model.py
similarity index 90%
rename from docling/models/page_preprocessing_model.py
rename to docowling/models/page_preprocessing_model.py
index 63f1a4f6..54dc51e0 100644
--- a/docling/models/page_preprocessing_model.py
+++ b/docowling/models/page_preprocessing_model.py
@@ -4,11 +4,11 @@ from typing import Iterable, Optional
from PIL import ImageDraw
from pydantic import BaseModel
-from docling.datamodel.base_models import Page
-from docling.datamodel.document import ConversionResult
-from docling.datamodel.settings import settings
-from docling.models.base_model import BasePageModel
-from docling.utils.profiling import TimeRecorder
+from docowling.datamodel.base_models import Page
+from docowling.datamodel.document import ConversionResult
+from docowling.datamodel.settings import settings
+from docowling.models.base_model import BasePageModel
+from docowling.utils.profiling import TimeRecorder
class PagePreprocessingOptions(BaseModel):
diff --git a/docling/models/rapid_ocr_model.py b/docowling/models/rapid_ocr_model.py
similarity index 92%
rename from docling/models/rapid_ocr_model.py
rename to docowling/models/rapid_ocr_model.py
index 5882ffc7..c421da05 100644
--- a/docling/models/rapid_ocr_model.py
+++ b/docowling/models/rapid_ocr_model.py
@@ -4,17 +4,17 @@ from typing import Iterable
import numpy
from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling.datamodel.base_models import OcrCell, Page
-from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import (
+from docowling.datamodel.base_models import OcrCell, Page
+from docowling.datamodel.document import ConversionResult
+from docowling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
RapidOcrOptions,
)
-from docling.datamodel.settings import settings
-from docling.models.base_ocr_model import BaseOcrModel
-from docling.utils.accelerator_utils import decide_device
-from docling.utils.profiling import TimeRecorder
+from docowling.datamodel.settings import settings
+from docowling.models.base_ocr_model import BaseOcrModel
+from docowling.utils.accelerator_utils import decide_device
+from docowling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
diff --git a/docling/models/table_structure_model.py b/docowling/models/table_structure_model.py
similarity index 95%
rename from docling/models/table_structure_model.py
rename to docowling/models/table_structure_model.py
index ba306449..8225ef61 100644
--- a/docling/models/table_structure_model.py
+++ b/docowling/models/table_structure_model.py
@@ -7,18 +7,18 @@ from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
from PIL import ImageDraw
-from docling.datamodel.base_models import Page, Table, TableStructurePrediction
-from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import (
+from docowling.datamodel.base_models import Page, Table, TableStructurePrediction
+from docowling.datamodel.document import ConversionResult
+from docowling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
TableFormerMode,
TableStructureOptions,
)
-from docling.datamodel.settings import settings
-from docling.models.base_model import BasePageModel
-from docling.utils.accelerator_utils import decide_device
-from docling.utils.profiling import TimeRecorder
+from docowling.datamodel.settings import settings
+from docowling.models.base_model import BasePageModel
+from docowling.utils.accelerator_utils import decide_device
+from docowling.utils.profiling import TimeRecorder
class TableStructureModel(BasePageModel):
diff --git a/docling/models/tesseract_ocr_cli_model.py b/docowling/models/tesseract_ocr_cli_model.py
similarity index 94%
rename from docling/models/tesseract_ocr_cli_model.py
rename to docowling/models/tesseract_ocr_cli_model.py
index 16e1629d..9c981491 100644
--- a/docling/models/tesseract_ocr_cli_model.py
+++ b/docowling/models/tesseract_ocr_cli_model.py
@@ -9,12 +9,12 @@ from typing import Iterable, Optional, Tuple
import pandas as pd
from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling.datamodel.base_models import Cell, OcrCell, Page
-from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import TesseractCliOcrOptions
-from docling.datamodel.settings import settings
-from docling.models.base_ocr_model import BaseOcrModel
-from docling.utils.profiling import TimeRecorder
+from docowling.datamodel.base_models import Cell, OcrCell, Page
+from docowling.datamodel.document import ConversionResult
+from docowling.datamodel.pipeline_options import TesseractCliOcrOptions
+from docowling.datamodel.settings import settings
+from docowling.models.base_ocr_model import BaseOcrModel
+from docowling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
diff --git a/docling/models/tesseract_ocr_model.py b/docowling/models/tesseract_ocr_model.py
similarity index 94%
rename from docling/models/tesseract_ocr_model.py
rename to docowling/models/tesseract_ocr_model.py
index b2bd358b..862ce238 100644
--- a/docling/models/tesseract_ocr_model.py
+++ b/docowling/models/tesseract_ocr_model.py
@@ -3,12 +3,12 @@ from typing import Iterable
from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling.datamodel.base_models import Cell, OcrCell, Page
-from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import TesseractOcrOptions
-from docling.datamodel.settings import settings
-from docling.models.base_ocr_model import BaseOcrModel
-from docling.utils.profiling import TimeRecorder
+from docowling.datamodel.base_models import Cell, OcrCell, Page
+from docowling.datamodel.document import ConversionResult
+from docowling.datamodel.pipeline_options import TesseractOcrOptions
+from docowling.datamodel.settings import settings
+from docowling.models.base_ocr_model import BaseOcrModel
+from docowling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
diff --git a/docling/pipeline/__init__.py b/docowling/pipeline/__init__.py
similarity index 100%
rename from docling/pipeline/__init__.py
rename to docowling/pipeline/__init__.py
diff --git a/docling/pipeline/base_pipeline.py b/docowling/pipeline/base_pipeline.py
similarity index 93%
rename from docling/pipeline/base_pipeline.py
rename to docowling/pipeline/base_pipeline.py
index c75faaec..b9292c5b 100644
--- a/docling/pipeline/base_pipeline.py
+++ b/docowling/pipeline/base_pipeline.py
@@ -7,20 +7,20 @@ from typing import Callable, Iterable, List
from docling_core.types.doc import DoclingDocument, NodeItem
-from docling.backend.abstract_backend import AbstractDocumentBackend
-from docling.backend.pdf_backend import PdfDocumentBackend
-from docling.datamodel.base_models import (
+from docowling.backend.abstract_backend import AbstractDocumentBackend
+from docowling.backend.pdf_backend import PdfDocumentBackend
+from docowling.datamodel.base_models import (
ConversionStatus,
DoclingComponentType,
ErrorItem,
Page,
)
-from docling.datamodel.document import ConversionResult, InputDocument
-from docling.datamodel.pipeline_options import PipelineOptions
-from docling.datamodel.settings import settings
-from docling.models.base_model import BaseEnrichmentModel
-from docling.utils.profiling import ProfilingScope, TimeRecorder
-from docling.utils.utils import chunkify
+from docowling.datamodel.document import ConversionResult, InputDocument
+from docowling.datamodel.pipeline_options import PipelineOptions
+from docowling.datamodel.settings import settings
+from docowling.models.base_model import BaseEnrichmentModel
+from docowling.utils.profiling import ProfilingScope, TimeRecorder
+from docowling.utils.utils import chunkify
_log = logging.getLogger(__name__)
diff --git a/docling/pipeline/simple_pipeline.py b/docowling/pipeline/simple_pipeline.py
similarity index 84%
rename from docling/pipeline/simple_pipeline.py
rename to docowling/pipeline/simple_pipeline.py
index fb985231..98b97372 100644
--- a/docling/pipeline/simple_pipeline.py
+++ b/docowling/pipeline/simple_pipeline.py
@@ -1,14 +1,14 @@
import logging
-from docling.backend.abstract_backend import (
+from docowling.backend.abstract_backend import (
AbstractDocumentBackend,
DeclarativeDocumentBackend,
)
-from docling.datamodel.base_models import ConversionStatus
-from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import PipelineOptions
-from docling.pipeline.base_pipeline import BasePipeline
-from docling.utils.profiling import ProfilingScope, TimeRecorder
+from docowling.datamodel.base_models import ConversionStatus
+from docowling.datamodel.document import ConversionResult
+from docowling.datamodel.pipeline_options import PipelineOptions
+from docowling.pipeline.base_pipeline import BasePipeline
+from docowling.utils.profiling import ProfilingScope, TimeRecorder
_log = logging.getLogger(__name__)
diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docowling/pipeline/standard_pdf_pipeline.py
similarity index 87%
rename from docling/pipeline/standard_pdf_pipeline.py
rename to docowling/pipeline/standard_pdf_pipeline.py
index 2f8c1421..f72de68b 100644
--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docowling/pipeline/standard_pdf_pipeline.py
@@ -5,11 +5,11 @@ from typing import Optional
from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
-from docling.backend.abstract_backend import AbstractDocumentBackend
-from docling.backend.pdf_backend import PdfDocumentBackend
-from docling.datamodel.base_models import AssembledUnit, Page
-from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import (
+from docowling.backend.abstract_backend import AbstractDocumentBackend
+from docowling.backend.pdf_backend import PdfDocumentBackend
+from docowling.datamodel.base_models import AssembledUnit, Page
+from docowling.datamodel.document import ConversionResult
+from docowling.datamodel.pipeline_options import (
EasyOcrOptions,
OcrMacOptions,
PdfPipelineOptions,
@@ -17,22 +17,22 @@ from docling.datamodel.pipeline_options import (
TesseractCliOcrOptions,
TesseractOcrOptions,
)
-from docling.models.base_ocr_model import BaseOcrModel
-from docling.models.ds_glm_model import GlmModel, GlmOptions
-from docling.models.easyocr_model import EasyOcrModel
-from docling.models.layout_model import LayoutModel
-from docling.models.ocr_mac_model import OcrMacModel
-from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
-from docling.models.page_preprocessing_model import (
+from docowling.models.base_ocr_model import BaseOcrModel
+from docowling.models.ds_glm_model import GlmModel, GlmOptions
+from docowling.models.easyocr_model import EasyOcrModel
+from docowling.models.layout_model import LayoutModel
+from docowling.models.ocr_mac_model import OcrMacModel
+from docowling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
+from docowling.models.page_preprocessing_model import (
PagePreprocessingModel,
PagePreprocessingOptions,
)
-from docling.models.rapid_ocr_model import RapidOcrModel
-from docling.models.table_structure_model import TableStructureModel
-from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
-from docling.models.tesseract_ocr_model import TesseractOcrModel
-from docling.pipeline.base_pipeline import PaginatedPipeline
-from docling.utils.profiling import ProfilingScope, TimeRecorder
+from docowling.models.rapid_ocr_model import RapidOcrModel
+from docowling.models.table_structure_model import TableStructureModel
+from docowling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
+from docowling.models.tesseract_ocr_model import TesseractOcrModel
+from docowling.pipeline.base_pipeline import PaginatedPipeline
+from docowling.utils.profiling import ProfilingScope, TimeRecorder
_log = logging.getLogger(__name__)
diff --git a/docling/py.typed b/docowling/py.typed
similarity index 100%
rename from docling/py.typed
rename to docowling/py.typed
diff --git a/docling/utils/__init__.py b/docowling/utils/__init__.py
similarity index 100%
rename from docling/utils/__init__.py
rename to docowling/utils/__init__.py
diff --git a/docling/utils/accelerator_utils.py b/docowling/utils/accelerator_utils.py
similarity index 95%
rename from docling/utils/accelerator_utils.py
rename to docowling/utils/accelerator_utils.py
index 59b04796..572a37b8 100644
--- a/docling/utils/accelerator_utils.py
+++ b/docowling/utils/accelerator_utils.py
@@ -2,7 +2,7 @@ import logging
import torch
-from docling.datamodel.pipeline_options import AcceleratorDevice
+from docowling.datamodel.pipeline_options import AcceleratorDevice
_log = logging.getLogger(__name__)
diff --git a/docling/utils/export.py b/docowling/utils/export.py
similarity index 97%
rename from docling/utils/export.py
rename to docowling/utils/export.py
index 5b022f4a..98d7e302 100644
--- a/docling/utils/export.py
+++ b/docowling/utils/export.py
@@ -4,8 +4,8 @@ from typing import Any, Dict, Iterable, List, Tuple, Union
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
-from docling.datamodel.base_models import OcrCell
-from docling.datamodel.document import ConversionResult, Page
+from docowling.datamodel.base_models import OcrCell
+from docowling.datamodel.document import ConversionResult, Page
_log = logging.getLogger(__name__)
diff --git a/docling/utils/glm_utils.py b/docowling/utils/glm_utils.py
similarity index 100%
rename from docling/utils/glm_utils.py
rename to docowling/utils/glm_utils.py
diff --git a/docling/utils/layout_postprocessor.py b/docowling/utils/layout_postprocessor.py
similarity index 99%
rename from docling/utils/layout_postprocessor.py
rename to docowling/utils/layout_postprocessor.py
index 8cb6bc55..20e46613 100644
--- a/docling/utils/layout_postprocessor.py
+++ b/docowling/utils/layout_postprocessor.py
@@ -7,7 +7,7 @@ from typing import Dict, List, Set, Tuple
from docling_core.types.doc import DocItemLabel, Size
from rtree import index
-from docling.datamodel.base_models import BoundingBox, Cell, Cluster, OcrCell
+from docowling.datamodel.base_models import BoundingBox, Cell, Cluster, OcrCell
_log = logging.getLogger(__name__)
diff --git a/docling/utils/profiling.py b/docowling/utils/profiling.py
similarity index 93%
rename from docling/utils/profiling.py
rename to docowling/utils/profiling.py
index 0d09f17d..1b350fca 100644
--- a/docling/utils/profiling.py
+++ b/docowling/utils/profiling.py
@@ -6,10 +6,10 @@ from typing import TYPE_CHECKING, List
import numpy as np
from pydantic import BaseModel
-from docling.datamodel.settings import settings
+from docowling.datamodel.settings import settings
if TYPE_CHECKING:
- from docling.datamodel.document import ConversionResult
+ from docowling.datamodel.document import ConversionResult
class ProfilingScope(str, Enum):
diff --git a/docling/utils/utils.py b/docowling/utils/utils.py
similarity index 100%
rename from docling/utils/utils.py
rename to docowling/utils/utils.py
diff --git a/docs/concepts/chunking.md b/docs/concepts/chunking.md
index bed8bce3..6a17cc6d 100644
--- a/docs/concepts/chunking.md
+++ b/docs/concepts/chunking.md
@@ -28,7 +28,7 @@ The `BaseChunker` base class API defines that any chunker should provide the fol
- If you are using the `docling` package, you can import as follows:
```python
- from docling.chunking import HybridChunker
+ from docowling.chunking import HybridChunker
```
- If you are only using the `docling-core` package, you must ensure to install
the `chunking` extra, e.g.
diff --git a/docs/examples/batch_convert.py b/docs/examples/batch_convert.py
index f6ad92bd..b654664b 100644
--- a/docs/examples/batch_convert.py
+++ b/docs/examples/batch_convert.py
@@ -6,10 +6,10 @@ from typing import Iterable
import yaml
-from docling.datamodel.base_models import ConversionStatus
-from docling.datamodel.document import ConversionResult
-from docling.datamodel.settings import settings
-from docling.document_converter import DocumentConverter
+from docowling.datamodel.base_models import ConversionStatus
+from docowling.datamodel.document import ConversionResult
+from docowling.datamodel.settings import settings
+from docowling.document_converter import DocumentConverter
_log = logging.getLogger(__name__)
diff --git a/docs/examples/custom_convert.py b/docs/examples/custom_convert.py
index a7efa975..dd787b33 100644
--- a/docs/examples/custom_convert.py
+++ b/docs/examples/custom_convert.py
@@ -3,13 +3,13 @@ import logging
import time
from pathlib import Path
-from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import InputFormat
-from docling.datamodel.pipeline_options import PdfPipelineOptions
-from docling.document_converter import DocumentConverter, PdfFormatOption
-from docling.models.ocr_mac_model import OcrMacOptions
-from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
-from docling.models.tesseract_ocr_model import TesseractOcrOptions
+from docowling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+from docowling.datamodel.base_models import InputFormat
+from docowling.datamodel.pipeline_options import PdfPipelineOptions
+from docowling.document_converter import DocumentConverter, PdfFormatOption
+from docowling.models.ocr_mac_model import OcrMacOptions
+from docowling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
+from docowling.models.tesseract_ocr_model import TesseractOcrOptions
_log = logging.getLogger(__name__)
diff --git a/docs/examples/develop_picture_enrichment.py b/docs/examples/develop_picture_enrichment.py
index 7ad06e4a..70e95ed4 100644
--- a/docs/examples/develop_picture_enrichment.py
+++ b/docs/examples/develop_picture_enrichment.py
@@ -10,11 +10,11 @@ from docling_core.types.doc import (
PictureItem,
)
-from docling.datamodel.base_models import InputFormat
-from docling.datamodel.pipeline_options import PdfPipelineOptions
-from docling.document_converter import DocumentConverter, PdfFormatOption
-from docling.models.base_model import BaseEnrichmentModel
-from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
+from docowling.datamodel.base_models import InputFormat
+from docowling.datamodel.pipeline_options import PdfPipelineOptions
+from docowling.document_converter import DocumentConverter, PdfFormatOption
+from docowling.models.base_model import BaseEnrichmentModel
+from docowling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
class ExamplePictureClassifierPipelineOptions(PdfPipelineOptions):
diff --git a/docs/examples/export_figures.py b/docs/examples/export_figures.py
index b2ecc43f..2fe3cb93 100644
--- a/docs/examples/export_figures.py
+++ b/docs/examples/export_figures.py
@@ -4,9 +4,9 @@ from pathlib import Path
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
-from docling.datamodel.base_models import FigureElement, InputFormat, Table
-from docling.datamodel.pipeline_options import PdfPipelineOptions
-from docling.document_converter import DocumentConverter, PdfFormatOption
+from docowling.datamodel.base_models import FigureElement, InputFormat, Table
+from docowling.datamodel.pipeline_options import PdfPipelineOptions
+from docowling.document_converter import DocumentConverter, PdfFormatOption
_log = logging.getLogger(__name__)
diff --git a/docs/examples/export_multimodal.py b/docs/examples/export_multimodal.py
index 09885bd3..4b1e2f17 100644
--- a/docs/examples/export_multimodal.py
+++ b/docs/examples/export_multimodal.py
@@ -5,11 +5,11 @@ from pathlib import Path
import pandas as pd
-from docling.datamodel.base_models import InputFormat
-from docling.datamodel.pipeline_options import PdfPipelineOptions
-from docling.document_converter import DocumentConverter, PdfFormatOption
-from docling.utils.export import generate_multimodal_pages
-from docling.utils.utils import create_hash
+from docowling.datamodel.base_models import InputFormat
+from docowling.datamodel.pipeline_options import PdfPipelineOptions
+from docowling.document_converter import DocumentConverter, PdfFormatOption
+from docowling.utils.export import generate_multimodal_pages
+from docowling.utils.utils import create_hash
_log = logging.getLogger(__name__)
diff --git a/docs/examples/export_tables.py b/docs/examples/export_tables.py
index 68b9ce47..96ac7f95 100644
--- a/docs/examples/export_tables.py
+++ b/docs/examples/export_tables.py
@@ -4,7 +4,7 @@ from pathlib import Path
import pandas as pd
-from docling.document_converter import DocumentConverter
+from docowling.document_converter import DocumentConverter
_log = logging.getLogger(__name__)
diff --git a/docs/examples/full_page_ocr.py b/docs/examples/full_page_ocr.py
index 967910dc..9717844b 100644
--- a/docs/examples/full_page_ocr.py
+++ b/docs/examples/full_page_ocr.py
@@ -1,8 +1,8 @@
from pathlib import Path
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
-from docling.datamodel.base_models import InputFormat
-from docling.datamodel.pipeline_options import (
+from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docowling.datamodel.base_models import InputFormat
+from docowling.datamodel.pipeline_options import (
EasyOcrOptions,
OcrMacOptions,
PdfPipelineOptions,
@@ -10,7 +10,7 @@ from docling.datamodel.pipeline_options import (
TesseractCliOcrOptions,
TesseractOcrOptions,
)
-from docling.document_converter import DocumentConverter, PdfFormatOption
+from docowling.document_converter import DocumentConverter, PdfFormatOption
def main():
diff --git a/docs/examples/hybrid_chunking.ipynb b/docs/examples/hybrid_chunking.ipynb
index 6f097a8f..7d4e798d 100644
--- a/docs/examples/hybrid_chunking.ipynb
+++ b/docs/examples/hybrid_chunking.ipynb
@@ -37,7 +37,7 @@
"metadata": {},
"outputs": [],
"source": [
- "from docling.document_converter import DocumentConverter\n",
+ "from docowling.document_converter import DocumentConverter\n",
"\n",
"DOC_SOURCE = \"../../tests/data/md/wiki.md\"\n",
"\n",
@@ -68,7 +68,7 @@
"source": [
"from transformers import AutoTokenizer\n",
"\n",
- "from docling.chunking import HybridChunker\n",
+ "from docowling.chunking import HybridChunker\n",
"\n",
"EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
"MAX_TOKENS = 64\n",
@@ -404,7 +404,7 @@
" return tbl\n",
"\n",
"\n",
- "db_uri = str(Path(mkdtemp()) / \"docling.db\")\n",
+ "db_uri = str(Path(mkdtemp()) / \"docowling.db\")\n",
"index = make_lancedb_index(db_uri, doc.name, chunks, embed_model)\n",
"\n",
"sample_query = \"invent\"\n",
diff --git a/docs/examples/hybrid_rag_qdrant.ipynb b/docs/examples/hybrid_rag_qdrant.ipynb
index bbc8e575..4314eeda 100644
--- a/docs/examples/hybrid_rag_qdrant.ipynb
+++ b/docs/examples/hybrid_rag_qdrant.ipynb
@@ -81,8 +81,8 @@
"from docling_core.transforms.chunker import HierarchicalChunker\n",
"from qdrant_client import QdrantClient\n",
"\n",
- "from docling.datamodel.base_models import InputFormat\n",
- "from docling.document_converter import DocumentConverter"
+ "from docowling.datamodel.base_models import InputFormat\n",
+ "from docowling.document_converter import DocumentConverter"
]
},
{
diff --git a/docs/examples/minimal.py b/docs/examples/minimal.py
index 66bd2c85..38e6dcf6 100644
--- a/docs/examples/minimal.py
+++ b/docs/examples/minimal.py
@@ -1,4 +1,4 @@
-from docling.document_converter import DocumentConverter
+from docowling.document_converter import DocumentConverter
source = "https://arxiv.org/pdf/2408.09869" # document per local path or URL
converter = DocumentConverter()
diff --git a/docs/examples/rag_haystack.ipynb b/docs/examples/rag_haystack.ipynb
index e2dc380d..e88559eb 100644
--- a/docs/examples/rag_haystack.ipynb
+++ b/docs/examples/rag_haystack.ipynb
@@ -110,7 +110,7 @@
"EXPORT_TYPE = ExportType.DOC_CHUNKS\n",
"QUESTION = \"Which are the main AI models in Docling?\"\n",
"TOP_K = 3\n",
- "MILVUS_URI = str(Path(mkdtemp()) / \"docling.db\")"
+ "MILVUS_URI = str(Path(mkdtemp()) / \"docowling.db\")"
]
},
{
@@ -168,7 +168,7 @@
"from haystack.components.writers import DocumentWriter\n",
"from milvus_haystack import MilvusDocumentStore, MilvusEmbeddingRetriever\n",
"\n",
- "from docling.chunking import HybridChunker\n",
+ "from docowling.chunking import HybridChunker\n",
"\n",
"document_store = MilvusDocumentStore(\n",
" connection_args={\"uri\": MILVUS_URI},\n",
@@ -329,7 +329,7 @@
}
],
"source": [
- "from docling.chunking import DocChunk\n",
+ "from docowling.chunking import DocChunk\n",
"\n",
"print(f\"Question:\\n{QUESTION}\\n\")\n",
"print(f\"Answer:\\n{rag_res['answer_builder']['answers'][0].data.strip()}\\n\")\n",
diff --git a/docs/examples/rag_langchain.ipynb b/docs/examples/rag_langchain.ipynb
index 31ff009a..ea1ff330 100644
--- a/docs/examples/rag_langchain.ipynb
+++ b/docs/examples/rag_langchain.ipynb
@@ -83,7 +83,7 @@
"from langchain_core.document_loaders import BaseLoader\n",
"from langchain_core.documents import Document as LCDocument\n",
"\n",
- "from docling.document_converter import DocumentConverter\n",
+ "from docowling.document_converter import DocumentConverter\n",
"\n",
"class DoclingPDFLoader(BaseLoader):\n",
"\n",
diff --git a/docs/examples/rag_llamaindex.ipynb b/docs/examples/rag_llamaindex.ipynb
index 0252bc4f..9aa562ca 100644
--- a/docs/examples/rag_llamaindex.ipynb
+++ b/docs/examples/rag_llamaindex.ipynb
@@ -117,7 +117,7 @@
"from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n",
"\n",
"EMBED_MODEL = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")\n",
- "MILVUS_URI = str(Path(mkdtemp()) / \"docling.db\")\n",
+ "MILVUS_URI = str(Path(mkdtemp()) / \"docowling.db\")\n",
"GEN_MODEL = HuggingFaceInferenceAPI(\n",
" token=_get_env_from_colab_or_os(\"HF_TOKEN\"),\n",
" model_name=\"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n",
@@ -182,7 +182,7 @@
"node_parser = MarkdownNodeParser()\n",
"\n",
"vector_store = MilvusVectorStore(\n",
- " uri=str(Path(mkdtemp()) / \"docling.db\"), # or set as needed\n",
+ " uri=str(Path(mkdtemp()) / \"docowling.db\"), # or set as needed\n",
" dim=embed_dim,\n",
" overwrite=True,\n",
")\n",
@@ -282,7 +282,7 @@
"node_parser = DoclingNodeParser()\n",
"\n",
"vector_store = MilvusVectorStore(\n",
- " uri=str(Path(mkdtemp()) / \"docling.db\"), # or set as needed\n",
+ " uri=str(Path(mkdtemp()) / \"docowling.db\"), # or set as needed\n",
" dim=embed_dim,\n",
" overwrite=True,\n",
")\n",
@@ -423,7 +423,7 @@
")\n",
"\n",
"vector_store = MilvusVectorStore(\n",
- " uri=str(Path(mkdtemp()) / \"docling.db\"), # or set as needed\n",
+ " uri=str(Path(mkdtemp()) / \"docowling.db\"), # or set as needed\n",
" dim=embed_dim,\n",
" overwrite=True,\n",
")\n",
diff --git a/docs/examples/rag_weaviate.ipynb b/docs/examples/rag_weaviate.ipynb
index 7f897d9e..b9f48771 100644
--- a/docs/examples/rag_weaviate.ipynb
+++ b/docs/examples/rag_weaviate.ipynb
@@ -207,8 +207,8 @@
}
],
"source": [
- "from docling.datamodel.document import ConversionResult\n",
- "from docling.document_converter import DocumentConverter\n",
+ "from docowling.datamodel.document import ConversionResult\n",
+ "from docowling.document_converter import DocumentConverter\n",
"\n",
"# Instantiate the doc converter\n",
"doc_converter = DocumentConverter()\n",
diff --git a/docs/examples/run_md.py b/docs/examples/run_md.py
index 46be97e2..aa286dc3 100644
--- a/docs/examples/run_md.py
+++ b/docs/examples/run_md.py
@@ -5,9 +5,9 @@ from pathlib import Path
import yaml
-from docling.backend.md_backend import MarkdownDocumentBackend
-from docling.datamodel.base_models import InputFormat
-from docling.datamodel.document import InputDocument
+from docowling.backend.md_backend import MarkdownDocumentBackend
+from docowling.datamodel.base_models import InputFormat
+from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
diff --git a/docs/examples/run_with_accelerator.py b/docs/examples/run_with_accelerator.py
index 5985401d..9665313f 100644
--- a/docs/examples/run_with_accelerator.py
+++ b/docs/examples/run_with_accelerator.py
@@ -1,16 +1,16 @@
from pathlib import Path
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
-from docling.datamodel.base_models import InputFormat
-from docling.datamodel.pipeline_options import (
+from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docowling.datamodel.base_models import InputFormat
+from docowling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
PdfPipelineOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
-from docling.datamodel.settings import settings
-from docling.document_converter import DocumentConverter, PdfFormatOption
+from docowling.datamodel.settings import settings
+from docowling.document_converter import DocumentConverter, PdfFormatOption
def main():
diff --git a/docs/examples/run_with_formats.py b/docs/examples/run_with_formats.py
index 7bd27de5..72d8ba5e 100644
--- a/docs/examples/run_with_formats.py
+++ b/docs/examples/run_with_formats.py
@@ -4,15 +4,15 @@ from pathlib import Path
import yaml
-from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import InputFormat
-from docling.document_converter import (
+from docowling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+from docowling.datamodel.base_models import InputFormat
+from docowling.document_converter import (
DocumentConverter,
PdfFormatOption,
WordFormatOption,
)
-from docling.pipeline.simple_pipeline import SimplePipeline
-from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
+from docowling.pipeline.simple_pipeline import SimplePipeline
+from docowling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
_log = logging.getLogger(__name__)
diff --git a/docs/faq.md b/docs/faq.md
index 8c8e4793..529f7b93 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -140,7 +140,7 @@ This is a collection of FAQ collected from the user questions on int: