diff --git a/CITATION.cff b/CITATION.cff index 2a7ca30e..d67d6afc 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -2,14 +2,9 @@ # Visit https://bit.ly/cffinit to generate yours today! cff-version: 1.2.0 -title: Docling -message: 'If you use Docling, please consider citing as below.' +title: Docowling +message: 'If you use Docowling, please consider citing as below.' type: software authors: - - name: Docling Team -identifiers: - - type: url - value: 'https://arxiv.org/abs/2408.09869' - description: 'arXiv:2408.09869' -repository-code: 'https://github.com/DS4SD/docling' + - name: Docowling license: MIT diff --git a/Dockerfile b/Dockerfile index c863f1c2..d6d64d12 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,7 +17,7 @@ ENV TORCH_HOME=/tmp/ COPY docs/examples/minimal.py /root/minimal.py RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);' -RUN python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; StandardPdfPipeline.download_models_hf(force=True);' +RUN python -c 'from docowling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; StandardPdfPipeline.download_models_hf(force=True);' # On container environments, always set a thread budget to avoid undesired thread congestion. ENV OMP_NUM_THREADS=4 diff --git a/README.md b/README.md index 5374f1e1..a44fc7e7 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@

- + Docling

diff --git a/docling/__init__.py b/docowling/__init__.py similarity index 100% rename from docling/__init__.py rename to docowling/__init__.py diff --git a/docling/backend/__init__.py b/docowling/backend/__init__.py similarity index 100% rename from docling/backend/__init__.py rename to docowling/backend/__init__.py diff --git a/docling/backend/abstract_backend.py b/docowling/backend/abstract_backend.py similarity index 92% rename from docling/backend/abstract_backend.py rename to docowling/backend/abstract_backend.py index b47b11cd..ba0be70f 100644 --- a/docling/backend/abstract_backend.py +++ b/docowling/backend/abstract_backend.py @@ -6,8 +6,8 @@ from typing import TYPE_CHECKING, Set, Union from docling_core.types.doc import DoclingDocument if TYPE_CHECKING: - from docling.datamodel.base_models import InputFormat - from docling.datamodel.document import InputDocument + from docowling.datamodel.base_models import InputFormat + from docowling.datamodel.document import InputDocument class AbstractDocumentBackend(ABC): diff --git a/docling/backend/asciidoc_backend.py b/docowling/backend/asciidoc_backend.py similarity index 98% rename from docling/backend/asciidoc_backend.py rename to docowling/backend/asciidoc_backend.py index 829419af..3d0848fc 100644 --- a/docling/backend/asciidoc_backend.py +++ b/docowling/backend/asciidoc_backend.py @@ -16,9 +16,9 @@ from docling_core.types.doc import ( TableData, ) -from docling.backend.abstract_backend import DeclarativeDocumentBackend -from docling.datamodel.base_models import InputFormat -from docling.datamodel.document import InputDocument +from docowling.backend.abstract_backend import DeclarativeDocumentBackend +from docowling.datamodel.base_models import InputFormat +from docowling.datamodel.document import InputDocument _log = logging.getLogger(__name__) diff --git a/docowling/backend/csv_backend.py b/docowling/backend/csv_backend.py new file mode 100644 index 00000000..9b5b07b6 --- /dev/null +++ 
b/docowling/backend/csv_backend.py
@@ -0,0 +1,145 @@
+import csv
+from io import BytesIO, StringIO
+from pathlib import Path
+from typing import Dict, List, Set, Tuple, Union
+
+from docling_core.types.doc import (
+    DoclingDocument,
+    DocumentOrigin,
+    GroupLabel,
+    TableCell,
+    TableData,
+)
+
+from docowling.backend.abstract_backend import DeclarativeDocumentBackend
+from docowling.datamodel.base_models import InputFormat
+from docowling.datamodel.document import InputDocument
+
+
+class CsvDocumentBackend(DeclarativeDocumentBackend):
+    """Declarative backend that turns CSV content into a one-table document.
+
+    The input is parsed eagerly in ``__init__`` with the standard ``csv``
+    module; ``convert`` then emits a section group holding a single table
+    whose cells mirror the parsed rows.
+    """
+
+    def __init__(
+        self,
+        in_doc: "InputDocument",
+        path_or_stream: Union[BytesIO, StringIO, Path],
+    ):
+        """Parse the CSV content of ``path_or_stream`` into ``self.rows``.
+
+        Args:
+            in_doc: Input document descriptor, forwarded to the base class.
+            path_or_stream: Filesystem path or in-memory binary/text stream.
+                Paths and binary streams are decoded as UTF-8.
+
+        Raises:
+            RuntimeError: If the source cannot be read or decoded, or is of
+                an unsupported type.
+        """
+        super().__init__(in_doc, path_or_stream)
+        self.rows: List[List[str]] = []
+        self.valid = False
+        try:
+            self.rows = self._read_rows(self.path_or_stream)
+        except Exception as e:
+            raise RuntimeError(
+                f"CsvDocumentBackend could not load document with hash {self.document_hash}"
+            ) from e
+        self.valid = True
+
+    @staticmethod
+    def _read_rows(source: Union[BytesIO, StringIO, Path]) -> List[List[str]]:
+        """Return every CSV record in ``source`` as a list of cell strings."""
+        if isinstance(source, Path):
+            # newline="" leaves line-ending handling to the csv module, so
+            # quoted fields containing line breaks are parsed correctly.
+            with source.open(mode="r", encoding="utf-8", newline="") as file:
+                return list(csv.reader(file))
+        if isinstance(source, StringIO):
+            return list(csv.reader(source))
+        if isinstance(source, BytesIO):
+            # Binary streams carry raw bytes; decode before parsing.
+            text = source.getvalue().decode("utf-8")
+            return list(csv.reader(StringIO(text, newline="")))
+        # Fail loudly rather than report an empty but "valid" document.
+        raise TypeError(f"Unsupported CSV source type: {type(source).__name__}")
+
+    def is_valid(self) -> bool:
+        """Return whether the CSV content was loaded successfully."""
+        return self.valid
+
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        """Return False: this backend does not paginate."""
+        return False  # Typically, CSV files do not support pagination.
+
+    def unload(self):
+        """Drop the reference to the underlying path or stream."""
+        self.path_or_stream = None
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        """Return the input formats handled by this backend (CSV only)."""
+        return {InputFormat.CSV}
+
+    def convert(self) -> DoclingDocument:
+        """Build a ``DoclingDocument`` from the parsed rows.
+
+        Raises:
+            RuntimeError: If the backend is not in a valid state.
+        """
+        origin = DocumentOrigin(
+            filename=self.file.name or "file.csv",
+            mimetype="text/csv",
+            binary_hash=self.document_hash,
+        )
+        doc = DoclingDocument(name=self.file.stem or "file.csv", origin=origin)
+
+        if not self.is_valid():
+            raise RuntimeError(
+                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
+            )
+        return self._convert_csv_to_document(doc)
+
+    def _convert_csv_to_document(self, doc: DoclingDocument) -> DoclingDocument:
+        """Add a "CSV Data" section containing one table to ``doc``.
+
+        Rows may have different lengths: the table is as wide as the longest
+        row and shorter rows simply contribute fewer cells.
+        """
+        if not self.rows:
+            return doc  # No data to process
+
+        # A local suffices: nothing else reads the group afterwards.
+        section = doc.add_group(
+            parent=None,
+            label=GroupLabel.SECTION,
+            name="CSV Data",
+        )
+
+        table_data = TableData(
+            num_rows=len(self.rows),
+            num_cols=max(len(row) for row in self.rows),
+            table_cells=[
+                TableCell(
+                    text=cell,
+                    row_span=1,
+                    col_span=1,
+                    start_row_offset_idx=row_idx,
+                    end_row_offset_idx=row_idx + 1,
+                    start_col_offset_idx=col_idx,
+                    end_col_offset_idx=col_idx + 1,
+                    col_header=False,
+                    row_header=False,
+                )
+                for row_idx, row in enumerate(self.rows)
+                for col_idx, cell in enumerate(row)
+            ],
+        )
+
+        doc.add_table(data=table_data, parent=section)
+        return doc
\ No newline at end of file
diff --git a/docling/backend/docling_parse_backend.py b/docowling/backend/docling_parse_backend.py
similarity index 97%
rename from docling/backend/docling_parse_backend.py
rename to docowling/backend/docling_parse_backend.py
index bb1fe058..4bc0aba3 100644
--- a/docling/backend/docling_parse_backend.py
+++ b/docowling/backend/docling_parse_backend.py
@@ -10,9 +10,9 @@ from docling_parse.pdf_parsers import pdf_parser_v1
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage
 
-from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
-from docling.datamodel.base_models import Cell
-from docling.datamodel.document import InputDocument
+from docowling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
+from docowling.datamodel.base_models import Cell
+from docowling.datamodel.document import InputDocument
 
 _log = logging.getLogger(__name__)
 
diff --git a/docling/backend/docling_parse_v2_backend.py b/docowling/backend/docling_parse_v2_backend.py
similarity index 97%
rename from
docling/backend/docling_parse_v2_backend.py rename to docowling/backend/docling_parse_v2_backend.py index 12d7df55..3eb448c0 100644 --- a/docling/backend/docling_parse_v2_backend.py +++ b/docowling/backend/docling_parse_v2_backend.py @@ -10,11 +10,11 @@ from docling_parse.pdf_parsers import pdf_parser_v2 from PIL import Image, ImageDraw from pypdfium2 import PdfPage -from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend -from docling.datamodel.base_models import Cell, Size +from docowling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend +from docowling.datamodel.base_models import Cell, Size if TYPE_CHECKING: - from docling.datamodel.document import InputDocument + from docowling.datamodel.document import InputDocument _log = logging.getLogger(__name__) diff --git a/docling/backend/html_backend.py b/docowling/backend/html_backend.py similarity index 98% rename from docling/backend/html_backend.py rename to docowling/backend/html_backend.py index 9cd1e29b..46514c6e 100644 --- a/docling/backend/html_backend.py +++ b/docowling/backend/html_backend.py @@ -13,9 +13,9 @@ from docling_core.types.doc import ( TableData, ) -from docling.backend.abstract_backend import DeclarativeDocumentBackend -from docling.datamodel.base_models import InputFormat -from docling.datamodel.document import InputDocument +from docowling.backend.abstract_backend import DeclarativeDocumentBackend +from docowling.datamodel.base_models import InputFormat +from docowling.datamodel.document import InputDocument _log = logging.getLogger(__name__) diff --git a/docling/backend/md_backend.py b/docowling/backend/md_backend.py similarity index 98% rename from docling/backend/md_backend.py rename to docowling/backend/md_backend.py index 2bcc6d7d..b7b7010c 100644 --- a/docling/backend/md_backend.py +++ b/docowling/backend/md_backend.py @@ -19,9 +19,9 @@ from docling_core.types.doc import ( ) from marko import Markdown -from docling.backend.abstract_backend import 
DeclarativeDocumentBackend -from docling.datamodel.base_models import InputFormat -from docling.datamodel.document import InputDocument +from docowling.backend.abstract_backend import DeclarativeDocumentBackend +from docowling.datamodel.base_models import InputFormat +from docowling.datamodel.document import InputDocument _log = logging.getLogger(__name__) diff --git a/docling/backend/msexcel_backend.py b/docowling/backend/msexcel_backend.py similarity index 98% rename from docling/backend/msexcel_backend.py rename to docowling/backend/msexcel_backend.py index 508b0e8d..409ac67e 100644 --- a/docling/backend/msexcel_backend.py +++ b/docowling/backend/msexcel_backend.py @@ -18,9 +18,9 @@ from openpyxl.cell.cell import Cell from openpyxl.drawing.image import Image from openpyxl.worksheet.worksheet import Worksheet -from docling.backend.abstract_backend import DeclarativeDocumentBackend -from docling.datamodel.base_models import InputFormat -from docling.datamodel.document import InputDocument +from docowling.backend.abstract_backend import DeclarativeDocumentBackend +from docowling.datamodel.base_models import InputFormat +from docowling.datamodel.document import InputDocument _log = logging.getLogger(__name__) diff --git a/docling/backend/mspowerpoint_backend.py b/docowling/backend/mspowerpoint_backend.py similarity index 98% rename from docling/backend/mspowerpoint_backend.py rename to docowling/backend/mspowerpoint_backend.py index f595e4bd..f28c4ac5 100644 --- a/docling/backend/mspowerpoint_backend.py +++ b/docowling/backend/mspowerpoint_backend.py @@ -20,12 +20,12 @@ from PIL import Image from pptx import Presentation from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER -from docling.backend.abstract_backend import ( +from docowling.backend.abstract_backend import ( DeclarativeDocumentBackend, PaginatedDocumentBackend, ) -from docling.datamodel.base_models import InputFormat -from docling.datamodel.document import InputDocument +from 
docowling.datamodel.base_models import InputFormat +from docowling.datamodel.document import InputDocument _log = logging.getLogger(__name__) diff --git a/docling/backend/msword_backend.py b/docowling/backend/msword_backend.py similarity index 99% rename from docling/backend/msword_backend.py rename to docowling/backend/msword_backend.py index f59356e2..f19db738 100644 --- a/docling/backend/msword_backend.py +++ b/docowling/backend/msword_backend.py @@ -18,9 +18,9 @@ from lxml import etree from lxml.etree import XPath from PIL import Image, UnidentifiedImageError -from docling.backend.abstract_backend import DeclarativeDocumentBackend -from docling.datamodel.base_models import InputFormat -from docling.datamodel.document import InputDocument +from docowling.backend.abstract_backend import DeclarativeDocumentBackend +from docowling.datamodel.base_models import InputFormat +from docowling.datamodel.document import InputDocument _log = logging.getLogger(__name__) diff --git a/docling/backend/pdf_backend.py b/docowling/backend/pdf_backend.py similarity index 90% rename from docling/backend/pdf_backend.py rename to docowling/backend/pdf_backend.py index cd7a0815..ad9b6621 100644 --- a/docling/backend/pdf_backend.py +++ b/docowling/backend/pdf_backend.py @@ -6,9 +6,9 @@ from typing import Iterable, Optional, Set, Union from docling_core.types.doc import BoundingBox, Size from PIL import Image -from docling.backend.abstract_backend import PaginatedDocumentBackend -from docling.datamodel.base_models import Cell, InputFormat -from docling.datamodel.document import InputDocument +from docowling.backend.abstract_backend import PaginatedDocumentBackend +from docowling.datamodel.base_models import Cell, InputFormat +from docowling.datamodel.document import InputDocument class PdfPageBackend(ABC): diff --git a/docling/backend/pypdfium2_backend.py b/docowling/backend/pypdfium2_backend.py similarity index 97% rename from docling/backend/pypdfium2_backend.py rename to 
docowling/backend/pypdfium2_backend.py index d24ba608..cee1adeb 100644 --- a/docling/backend/pypdfium2_backend.py +++ b/docowling/backend/pypdfium2_backend.py @@ -11,11 +11,11 @@ from PIL import Image, ImageDraw from pypdfium2 import PdfTextPage from pypdfium2._helpers.misc import PdfiumError -from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend -from docling.datamodel.base_models import Cell +from docowling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend +from docowling.datamodel.base_models import Cell if TYPE_CHECKING: - from docling.datamodel.document import InputDocument + from docowling.datamodel.document import InputDocument _log = logging.getLogger(__name__) diff --git a/docling/backend/xml/__init__.py b/docowling/backend/xml/__init__.py similarity index 100% rename from docling/backend/xml/__init__.py rename to docowling/backend/xml/__init__.py diff --git a/docling/backend/xml/pubmed_backend.py b/docowling/backend/xml/pubmed_backend.py old mode 100755 new mode 100644 similarity index 99% rename from docling/backend/xml/pubmed_backend.py rename to docowling/backend/xml/pubmed_backend.py index acbcd4e1..b5d2bdb6 --- a/docling/backend/xml/pubmed_backend.py +++ b/docowling/backend/xml/pubmed_backend.py @@ -16,9 +16,9 @@ from docling_core.types.doc import ( from lxml import etree from typing_extensions import TypedDict, override -from docling.backend.abstract_backend import DeclarativeDocumentBackend -from docling.datamodel.base_models import InputFormat -from docling.datamodel.document import InputDocument +from docowling.backend.abstract_backend import DeclarativeDocumentBackend +from docowling.datamodel.base_models import InputFormat +from docowling.datamodel.document import InputDocument _log = logging.getLogger(__name__) diff --git a/docling/backend/xml/uspto_backend.py b/docowling/backend/xml/uspto_backend.py similarity index 99% rename from docling/backend/xml/uspto_backend.py rename to 
docowling/backend/xml/uspto_backend.py index ef253b21..14c2604b 100644 --- a/docling/backend/xml/uspto_backend.py +++ b/docowling/backend/xml/uspto_backend.py @@ -30,9 +30,9 @@ from docling_core.types.doc.document import LevelNumber from pydantic import NonNegativeInt from typing_extensions import Self, TypedDict, override -from docling.backend.abstract_backend import DeclarativeDocumentBackend -from docling.datamodel.base_models import InputFormat -from docling.datamodel.document import InputDocument +from docowling.backend.abstract_backend import DeclarativeDocumentBackend +from docowling.datamodel.base_models import InputFormat +from docowling.datamodel.document import InputDocument _log = logging.getLogger(__name__) diff --git a/docling/chunking/__init__.py b/docowling/chunking/__init__.py similarity index 100% rename from docling/chunking/__init__.py rename to docowling/chunking/__init__.py diff --git a/docling/cli/__init__.py b/docowling/cli/__init__.py similarity index 100% rename from docling/cli/__init__.py rename to docowling/cli/__init__.py diff --git a/docling/cli/main.py b/docowling/cli/main.py similarity index 96% rename from docling/cli/main.py rename to docowling/cli/main.py index a83aecbf..05f90906 100644 --- a/docling/cli/main.py +++ b/docowling/cli/main.py @@ -14,18 +14,18 @@ from docling_core.types.doc import ImageRefMode from docling_core.utils.file import resolve_source_to_path from pydantic import TypeAdapter, ValidationError -from docling.backend.docling_parse_backend import DoclingParseDocumentBackend -from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend -from docling.backend.pdf_backend import PdfDocumentBackend -from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend -from docling.datamodel.base_models import ( +from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend +from docowling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend +from 
docowling.backend.pdf_backend import PdfDocumentBackend +from docowling.backend.pypdfium2_backend import PyPdfiumDocumentBackend +from docowling.datamodel.base_models import ( ConversionStatus, FormatToExtensions, InputFormat, OutputFormat, ) -from docling.datamodel.document import ConversionResult -from docling.datamodel.pipeline_options import ( +from docowling.datamodel.document import ConversionResult +from docowling.datamodel.pipeline_options import ( AcceleratorDevice, AcceleratorOptions, EasyOcrOptions, @@ -39,8 +39,8 @@ from docling.datamodel.pipeline_options import ( TesseractCliOcrOptions, TesseractOcrOptions, ) -from docling.datamodel.settings import settings -from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption +from docowling.datamodel.settings import settings +from docowling.document_converter import DocumentConverter, FormatOption, PdfFormatOption warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch") warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr") diff --git a/docling/datamodel/__init__.py b/docowling/datamodel/__init__.py similarity index 100% rename from docling/datamodel/__init__.py rename to docowling/datamodel/__init__.py diff --git a/docling/datamodel/base_models.py b/docowling/datamodel/base_models.py similarity index 97% rename from docling/datamodel/base_models.py rename to docowling/datamodel/base_models.py index 5bd28ed6..a904bd1b 100644 --- a/docling/datamodel/base_models.py +++ b/docowling/datamodel/base_models.py @@ -15,7 +15,7 @@ from PIL.Image import Image from pydantic import BaseModel, ConfigDict if TYPE_CHECKING: - from docling.backend.pdf_backend import PdfPageBackend + from docowling.backend.pdf_backend import PdfPageBackend class ConversionStatus(str, Enum): @@ -39,6 +39,7 @@ class InputFormat(str, Enum): ASCIIDOC = "asciidoc" MD = "md" XLSX = "xlsx" + CSV = "csv" XML_USPTO = "xml_uspto" @@ -60,6 +61,7 @@ FormatToExtensions: 
Dict[InputFormat, List[str]] = { InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"], InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"], InputFormat.XLSX: ["xlsx"], + InputFormat.CSV: ["csv"], InputFormat.XML_USPTO: ["xml", "txt"], } @@ -88,6 +90,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = { InputFormat.XLSX: [ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" ], + InputFormat.CSV: ["text/csv"], InputFormat.XML_USPTO: ["application/xml", "text/plain"], } diff --git a/docling/datamodel/document.py b/docowling/datamodel/document.py similarity index 97% rename from docling/datamodel/document.py rename to docowling/datamodel/document.py index 4ed7d577..2e3f7b85 100644 --- a/docling/datamodel/document.py +++ b/docowling/datamodel/document.py @@ -47,11 +47,11 @@ from docling_core.utils.legacy import docling_document_to_legacy from pydantic import BaseModel from typing_extensions import deprecated -from docling.backend.abstract_backend import ( +from docowling.backend.abstract_backend import ( AbstractDocumentBackend, PaginatedDocumentBackend, ) -from docling.datamodel.base_models import ( +from docowling.datamodel.base_models import ( AssembledUnit, ConversionStatus, DocumentStream, @@ -62,12 +62,12 @@ from docling.datamodel.base_models import ( MimeTypeToFormat, Page, ) -from docling.datamodel.settings import DocumentLimits -from docling.utils.profiling import ProfilingItem -from docling.utils.utils import create_file_hash, create_hash +from docowling.datamodel.settings import DocumentLimits +from docowling.utils.profiling import ProfilingItem +from docowling.utils.utils import create_file_hash, create_hash if TYPE_CHECKING: - from docling.document_converter import FormatOption + from docowling.document_converter import FormatOption _log = logging.getLogger(__name__) diff --git a/docling/datamodel/pipeline_options.py b/docowling/datamodel/pipeline_options.py similarity index 100% rename from docling/datamodel/pipeline_options.py 
rename to docowling/datamodel/pipeline_options.py diff --git a/docling/datamodel/settings.py b/docowling/datamodel/settings.py similarity index 100% rename from docling/datamodel/settings.py rename to docowling/datamodel/settings.py diff --git a/docling/document_converter.py b/docowling/document_converter.py similarity index 87% rename from docling/document_converter.py rename to docowling/document_converter.py index c9cbedd2..4106be79 100644 --- a/docling/document_converter.py +++ b/docowling/document_converter.py @@ -7,35 +7,36 @@ from typing import Dict, Iterable, Iterator, List, Optional, Type, Union from pydantic import BaseModel, ConfigDict, model_validator, validate_call -from docling.backend.abstract_backend import AbstractDocumentBackend -from docling.backend.asciidoc_backend import AsciiDocBackend -from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend -from docling.backend.html_backend import HTMLDocumentBackend -from docling.backend.md_backend import MarkdownDocumentBackend -from docling.backend.msexcel_backend import MsExcelDocumentBackend -from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend -from docling.backend.msword_backend import MsWordDocumentBackend -from docling.backend.xml.pubmed_backend import PubMedDocumentBackend -from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend -from docling.datamodel.base_models import ( +from docowling.backend.abstract_backend import AbstractDocumentBackend +from docowling.backend.asciidoc_backend import AsciiDocBackend +from docowling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend +from docowling.backend.html_backend import HTMLDocumentBackend +from docowling.backend.md_backend import MarkdownDocumentBackend +from docowling.backend.msexcel_backend import MsExcelDocumentBackend +from docowling.backend.csv_backend import CsvDocumentBackend +from docowling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend +from 
docowling.backend.msword_backend import MsWordDocumentBackend +from docowling.backend.xml.pubmed_backend import PubMedDocumentBackend +from docowling.backend.xml.uspto_backend import PatentUsptoDocumentBackend +from docowling.datamodel.base_models import ( ConversionStatus, DoclingComponentType, DocumentStream, ErrorItem, InputFormat, ) -from docling.datamodel.document import ( +from docowling.datamodel.document import ( ConversionResult, InputDocument, _DocumentConversionInput, ) -from docling.datamodel.pipeline_options import PipelineOptions -from docling.datamodel.settings import DocumentLimits, settings -from docling.exceptions import ConversionError -from docling.pipeline.base_pipeline import BasePipeline -from docling.pipeline.simple_pipeline import SimplePipeline -from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline -from docling.utils.utils import chunkify +from docowling.datamodel.pipeline_options import PipelineOptions +from docowling.datamodel.settings import DocumentLimits, settings +from docowling.exceptions import ConversionError +from docowling.pipeline.base_pipeline import BasePipeline +from docowling.pipeline.simple_pipeline import SimplePipeline +from docowling.pipeline.standard_pdf_pipeline import StandardPdfPipeline +from docowling.utils.utils import chunkify _log = logging.getLogger(__name__) @@ -58,6 +59,9 @@ class ExcelFormatOption(FormatOption): pipeline_cls: Type = SimplePipeline backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend +class CsvFormatOption(FormatOption): + pipeline_cls: Type = SimplePipeline + backend: Type[AbstractDocumentBackend] = CsvDocumentBackend class WordFormatOption(FormatOption): pipeline_cls: Type = SimplePipeline @@ -109,6 +113,9 @@ def _get_default_option(format: InputFormat) -> FormatOption: InputFormat.XLSX: FormatOption( pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend ), + InputFormat.CSV: FormatOption( + pipeline_cls=SimplePipeline, backend=CsvDocumentBackend + ), 
InputFormat.DOCX: FormatOption( pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend ), diff --git a/docling/exceptions.py b/docowling/exceptions.py similarity index 100% rename from docling/exceptions.py rename to docowling/exceptions.py diff --git a/docling/models/__init__.py b/docowling/models/__init__.py similarity index 100% rename from docling/models/__init__.py rename to docowling/models/__init__.py diff --git a/docling/models/base_model.py b/docowling/models/base_model.py similarity index 84% rename from docling/models/base_model.py rename to docowling/models/base_model.py index 1147896c..f6ae9fa0 100644 --- a/docling/models/base_model.py +++ b/docowling/models/base_model.py @@ -3,8 +3,8 @@ from typing import Any, Iterable from docling_core.types.doc import DoclingDocument, NodeItem -from docling.datamodel.base_models import Page -from docling.datamodel.document import ConversionResult +from docowling.datamodel.base_models import Page +from docowling.datamodel.document import ConversionResult class BasePageModel(ABC): diff --git a/docling/models/base_ocr_model.py b/docowling/models/base_ocr_model.py similarity index 95% rename from docling/models/base_ocr_model.py rename to docowling/models/base_ocr_model.py index 38b5e52c..3b96f7b2 100644 --- a/docling/models/base_ocr_model.py +++ b/docowling/models/base_ocr_model.py @@ -10,11 +10,11 @@ from PIL import Image, ImageDraw from rtree import index from scipy.ndimage import find_objects, label -from docling.datamodel.base_models import Cell, OcrCell, Page -from docling.datamodel.document import ConversionResult -from docling.datamodel.pipeline_options import OcrOptions -from docling.datamodel.settings import settings -from docling.models.base_model import BasePageModel +from docowling.datamodel.base_models import Cell, OcrCell, Page +from docowling.datamodel.document import ConversionResult +from docowling.datamodel.pipeline_options import OcrOptions +from docowling.datamodel.settings import settings +from 
docowling.models.base_model import BasePageModel _log = logging.getLogger(__name__) diff --git a/docling/models/ds_glm_model.py b/docowling/models/ds_glm_model.py similarity index 97% rename from docling/models/ds_glm_model.py rename to docowling/models/ds_glm_model.py index 6f7de07a..0c04c7a1 100644 --- a/docling/models/ds_glm_model.py +++ b/docowling/models/ds_glm_model.py @@ -24,18 +24,18 @@ from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocu from PIL import ImageDraw from pydantic import BaseModel, ConfigDict, TypeAdapter -from docling.datamodel.base_models import ( +from docowling.datamodel.base_models import ( Cluster, ContainerElement, FigureElement, Table, TextElement, ) -from docling.datamodel.document import ConversionResult, layout_label_to_ds_type -from docling.datamodel.settings import settings -from docling.utils.glm_utils import to_docling_document -from docling.utils.profiling import ProfilingScope, TimeRecorder -from docling.utils.utils import create_hash +from docowling.datamodel.document import ConversionResult, layout_label_to_ds_type +from docowling.datamodel.settings import settings +from docowling.utils.glm_utils import to_docling_document +from docowling.utils.profiling import ProfilingScope, TimeRecorder +from docowling.utils.utils import create_hash class GlmOptions(BaseModel): diff --git a/docling/models/easyocr_model.py b/docowling/models/easyocr_model.py similarity index 92% rename from docling/models/easyocr_model.py rename to docowling/models/easyocr_model.py index bbe4fb05..74f70aed 100644 --- a/docling/models/easyocr_model.py +++ b/docowling/models/easyocr_model.py @@ -6,17 +6,17 @@ import numpy import torch from docling_core.types.doc import BoundingBox, CoordOrigin -from docling.datamodel.base_models import Cell, OcrCell, Page -from docling.datamodel.document import ConversionResult -from docling.datamodel.pipeline_options import ( +from docowling.datamodel.base_models import Cell, OcrCell, Page +from 
docowling.datamodel.document import ConversionResult +from docowling.datamodel.pipeline_options import ( AcceleratorDevice, AcceleratorOptions, EasyOcrOptions, ) -from docling.datamodel.settings import settings -from docling.models.base_ocr_model import BaseOcrModel -from docling.utils.accelerator_utils import decide_device -from docling.utils.profiling import TimeRecorder +from docowling.datamodel.settings import settings +from docowling.models.base_ocr_model import BaseOcrModel +from docowling.utils.accelerator_utils import decide_device +from docowling.utils.profiling import TimeRecorder _log = logging.getLogger(__name__) diff --git a/docling/models/layout_model.py b/docowling/models/layout_model.py similarity index 95% rename from docling/models/layout_model.py rename to docowling/models/layout_model.py index 014cddd3..96f46496 100644 --- a/docling/models/layout_model.py +++ b/docowling/models/layout_model.py @@ -9,20 +9,20 @@ from docling_core.types.doc import CoordOrigin, DocItemLabel from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor from PIL import Image, ImageDraw, ImageFont -from docling.datamodel.base_models import ( +from docowling.datamodel.base_models import ( BoundingBox, Cell, Cluster, LayoutPrediction, Page, ) -from docling.datamodel.document import ConversionResult -from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions -from docling.datamodel.settings import settings -from docling.models.base_model import BasePageModel -from docling.utils.accelerator_utils import decide_device -from docling.utils.layout_postprocessor import LayoutPostprocessor -from docling.utils.profiling import TimeRecorder +from docowling.datamodel.document import ConversionResult +from docowling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions +from docowling.datamodel.settings import settings +from docowling.models.base_model import BasePageModel +from docowling.utils.accelerator_utils import 
decide_device +from docowling.utils.layout_postprocessor import LayoutPostprocessor +from docowling.utils.profiling import TimeRecorder _log = logging.getLogger(__name__) diff --git a/docling/models/ocr_mac_model.py b/docowling/models/ocr_mac_model.py similarity index 92% rename from docling/models/ocr_mac_model.py rename to docowling/models/ocr_mac_model.py index 38bcf1ca..965c9346 100644 --- a/docling/models/ocr_mac_model.py +++ b/docowling/models/ocr_mac_model.py @@ -4,12 +4,12 @@ from typing import Iterable, Optional, Tuple from docling_core.types.doc import BoundingBox, CoordOrigin -from docling.datamodel.base_models import OcrCell, Page -from docling.datamodel.document import ConversionResult -from docling.datamodel.pipeline_options import OcrMacOptions -from docling.datamodel.settings import settings -from docling.models.base_ocr_model import BaseOcrModel -from docling.utils.profiling import TimeRecorder +from docowling.datamodel.base_models import OcrCell, Page +from docowling.datamodel.document import ConversionResult +from docowling.datamodel.pipeline_options import OcrMacOptions +from docowling.datamodel.settings import settings +from docowling.models.base_ocr_model import BaseOcrModel +from docowling.utils.profiling import TimeRecorder _log = logging.getLogger(__name__) diff --git a/docling/models/page_assemble_model.py b/docowling/models/page_assemble_model.py similarity index 96% rename from docling/models/page_assemble_model.py rename to docowling/models/page_assemble_model.py index 3e202e20..42ced247 100644 --- a/docling/models/page_assemble_model.py +++ b/docowling/models/page_assemble_model.py @@ -4,7 +4,7 @@ from typing import Iterable, List from pydantic import BaseModel -from docling.datamodel.base_models import ( +from docowling.datamodel.base_models import ( AssembledUnit, ContainerElement, FigureElement, @@ -13,10 +13,10 @@ from docling.datamodel.base_models import ( Table, TextElement, ) -from docling.datamodel.document import 
ConversionResult -from docling.models.base_model import BasePageModel -from docling.models.layout_model import LayoutModel -from docling.utils.profiling import TimeRecorder +from docowling.datamodel.document import ConversionResult +from docowling.models.base_model import BasePageModel +from docowling.models.layout_model import LayoutModel +from docowling.utils.profiling import TimeRecorder _log = logging.getLogger(__name__) diff --git a/docling/models/page_preprocessing_model.py b/docowling/models/page_preprocessing_model.py similarity index 90% rename from docling/models/page_preprocessing_model.py rename to docowling/models/page_preprocessing_model.py index 63f1a4f6..54dc51e0 100644 --- a/docling/models/page_preprocessing_model.py +++ b/docowling/models/page_preprocessing_model.py @@ -4,11 +4,11 @@ from typing import Iterable, Optional from PIL import ImageDraw from pydantic import BaseModel -from docling.datamodel.base_models import Page -from docling.datamodel.document import ConversionResult -from docling.datamodel.settings import settings -from docling.models.base_model import BasePageModel -from docling.utils.profiling import TimeRecorder +from docowling.datamodel.base_models import Page +from docowling.datamodel.document import ConversionResult +from docowling.datamodel.settings import settings +from docowling.models.base_model import BasePageModel +from docowling.utils.profiling import TimeRecorder class PagePreprocessingOptions(BaseModel): diff --git a/docling/models/rapid_ocr_model.py b/docowling/models/rapid_ocr_model.py similarity index 92% rename from docling/models/rapid_ocr_model.py rename to docowling/models/rapid_ocr_model.py index 5882ffc7..c421da05 100644 --- a/docling/models/rapid_ocr_model.py +++ b/docowling/models/rapid_ocr_model.py @@ -4,17 +4,17 @@ from typing import Iterable import numpy from docling_core.types.doc import BoundingBox, CoordOrigin -from docling.datamodel.base_models import OcrCell, Page -from docling.datamodel.document 
import ConversionResult -from docling.datamodel.pipeline_options import ( +from docowling.datamodel.base_models import OcrCell, Page +from docowling.datamodel.document import ConversionResult +from docowling.datamodel.pipeline_options import ( AcceleratorDevice, AcceleratorOptions, RapidOcrOptions, ) -from docling.datamodel.settings import settings -from docling.models.base_ocr_model import BaseOcrModel -from docling.utils.accelerator_utils import decide_device -from docling.utils.profiling import TimeRecorder +from docowling.datamodel.settings import settings +from docowling.models.base_ocr_model import BaseOcrModel +from docowling.utils.accelerator_utils import decide_device +from docowling.utils.profiling import TimeRecorder _log = logging.getLogger(__name__) diff --git a/docling/models/table_structure_model.py b/docowling/models/table_structure_model.py similarity index 95% rename from docling/models/table_structure_model.py rename to docowling/models/table_structure_model.py index ba306449..8225ef61 100644 --- a/docling/models/table_structure_model.py +++ b/docowling/models/table_structure_model.py @@ -7,18 +7,18 @@ from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor from PIL import ImageDraw -from docling.datamodel.base_models import Page, Table, TableStructurePrediction -from docling.datamodel.document import ConversionResult -from docling.datamodel.pipeline_options import ( +from docowling.datamodel.base_models import Page, Table, TableStructurePrediction +from docowling.datamodel.document import ConversionResult +from docowling.datamodel.pipeline_options import ( AcceleratorDevice, AcceleratorOptions, TableFormerMode, TableStructureOptions, ) -from docling.datamodel.settings import settings -from docling.models.base_model import BasePageModel -from docling.utils.accelerator_utils import decide_device -from docling.utils.profiling import TimeRecorder 
+from docowling.datamodel.settings import settings +from docowling.models.base_model import BasePageModel +from docowling.utils.accelerator_utils import decide_device +from docowling.utils.profiling import TimeRecorder class TableStructureModel(BasePageModel): diff --git a/docling/models/tesseract_ocr_cli_model.py b/docowling/models/tesseract_ocr_cli_model.py similarity index 94% rename from docling/models/tesseract_ocr_cli_model.py rename to docowling/models/tesseract_ocr_cli_model.py index 16e1629d..9c981491 100644 --- a/docling/models/tesseract_ocr_cli_model.py +++ b/docowling/models/tesseract_ocr_cli_model.py @@ -9,12 +9,12 @@ from typing import Iterable, Optional, Tuple import pandas as pd from docling_core.types.doc import BoundingBox, CoordOrigin -from docling.datamodel.base_models import Cell, OcrCell, Page -from docling.datamodel.document import ConversionResult -from docling.datamodel.pipeline_options import TesseractCliOcrOptions -from docling.datamodel.settings import settings -from docling.models.base_ocr_model import BaseOcrModel -from docling.utils.profiling import TimeRecorder +from docowling.datamodel.base_models import Cell, OcrCell, Page +from docowling.datamodel.document import ConversionResult +from docowling.datamodel.pipeline_options import TesseractCliOcrOptions +from docowling.datamodel.settings import settings +from docowling.models.base_ocr_model import BaseOcrModel +from docowling.utils.profiling import TimeRecorder _log = logging.getLogger(__name__) diff --git a/docling/models/tesseract_ocr_model.py b/docowling/models/tesseract_ocr_model.py similarity index 94% rename from docling/models/tesseract_ocr_model.py rename to docowling/models/tesseract_ocr_model.py index b2bd358b..862ce238 100644 --- a/docling/models/tesseract_ocr_model.py +++ b/docowling/models/tesseract_ocr_model.py @@ -3,12 +3,12 @@ from typing import Iterable from docling_core.types.doc import BoundingBox, CoordOrigin -from docling.datamodel.base_models import Cell, 
OcrCell, Page -from docling.datamodel.document import ConversionResult -from docling.datamodel.pipeline_options import TesseractOcrOptions -from docling.datamodel.settings import settings -from docling.models.base_ocr_model import BaseOcrModel -from docling.utils.profiling import TimeRecorder +from docowling.datamodel.base_models import Cell, OcrCell, Page +from docowling.datamodel.document import ConversionResult +from docowling.datamodel.pipeline_options import TesseractOcrOptions +from docowling.datamodel.settings import settings +from docowling.models.base_ocr_model import BaseOcrModel +from docowling.utils.profiling import TimeRecorder _log = logging.getLogger(__name__) diff --git a/docling/pipeline/__init__.py b/docowling/pipeline/__init__.py similarity index 100% rename from docling/pipeline/__init__.py rename to docowling/pipeline/__init__.py diff --git a/docling/pipeline/base_pipeline.py b/docowling/pipeline/base_pipeline.py similarity index 93% rename from docling/pipeline/base_pipeline.py rename to docowling/pipeline/base_pipeline.py index c75faaec..b9292c5b 100644 --- a/docling/pipeline/base_pipeline.py +++ b/docowling/pipeline/base_pipeline.py @@ -7,20 +7,20 @@ from typing import Callable, Iterable, List from docling_core.types.doc import DoclingDocument, NodeItem -from docling.backend.abstract_backend import AbstractDocumentBackend -from docling.backend.pdf_backend import PdfDocumentBackend -from docling.datamodel.base_models import ( +from docowling.backend.abstract_backend import AbstractDocumentBackend +from docowling.backend.pdf_backend import PdfDocumentBackend +from docowling.datamodel.base_models import ( ConversionStatus, DoclingComponentType, ErrorItem, Page, ) -from docling.datamodel.document import ConversionResult, InputDocument -from docling.datamodel.pipeline_options import PipelineOptions -from docling.datamodel.settings import settings -from docling.models.base_model import BaseEnrichmentModel -from docling.utils.profiling import 
ProfilingScope, TimeRecorder -from docling.utils.utils import chunkify +from docowling.datamodel.document import ConversionResult, InputDocument +from docowling.datamodel.pipeline_options import PipelineOptions +from docowling.datamodel.settings import settings +from docowling.models.base_model import BaseEnrichmentModel +from docowling.utils.profiling import ProfilingScope, TimeRecorder +from docowling.utils.utils import chunkify _log = logging.getLogger(__name__) diff --git a/docling/pipeline/simple_pipeline.py b/docowling/pipeline/simple_pipeline.py similarity index 84% rename from docling/pipeline/simple_pipeline.py rename to docowling/pipeline/simple_pipeline.py index fb985231..98b97372 100644 --- a/docling/pipeline/simple_pipeline.py +++ b/docowling/pipeline/simple_pipeline.py @@ -1,14 +1,14 @@ import logging -from docling.backend.abstract_backend import ( +from docowling.backend.abstract_backend import ( AbstractDocumentBackend, DeclarativeDocumentBackend, ) -from docling.datamodel.base_models import ConversionStatus -from docling.datamodel.document import ConversionResult -from docling.datamodel.pipeline_options import PipelineOptions -from docling.pipeline.base_pipeline import BasePipeline -from docling.utils.profiling import ProfilingScope, TimeRecorder +from docowling.datamodel.base_models import ConversionStatus +from docowling.datamodel.document import ConversionResult +from docowling.datamodel.pipeline_options import PipelineOptions +from docowling.pipeline.base_pipeline import BasePipeline +from docowling.utils.profiling import ProfilingScope, TimeRecorder _log = logging.getLogger(__name__) diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docowling/pipeline/standard_pdf_pipeline.py similarity index 87% rename from docling/pipeline/standard_pdf_pipeline.py rename to docowling/pipeline/standard_pdf_pipeline.py index 2f8c1421..f72de68b 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docowling/pipeline/standard_pdf_pipeline.py @@ 
-5,11 +5,11 @@ from typing import Optional from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem -from docling.backend.abstract_backend import AbstractDocumentBackend -from docling.backend.pdf_backend import PdfDocumentBackend -from docling.datamodel.base_models import AssembledUnit, Page -from docling.datamodel.document import ConversionResult -from docling.datamodel.pipeline_options import ( +from docowling.backend.abstract_backend import AbstractDocumentBackend +from docowling.backend.pdf_backend import PdfDocumentBackend +from docowling.datamodel.base_models import AssembledUnit, Page +from docowling.datamodel.document import ConversionResult +from docowling.datamodel.pipeline_options import ( EasyOcrOptions, OcrMacOptions, PdfPipelineOptions, @@ -17,22 +17,22 @@ from docling.datamodel.pipeline_options import ( TesseractCliOcrOptions, TesseractOcrOptions, ) -from docling.models.base_ocr_model import BaseOcrModel -from docling.models.ds_glm_model import GlmModel, GlmOptions -from docling.models.easyocr_model import EasyOcrModel -from docling.models.layout_model import LayoutModel -from docling.models.ocr_mac_model import OcrMacModel -from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions -from docling.models.page_preprocessing_model import ( +from docowling.models.base_ocr_model import BaseOcrModel +from docowling.models.ds_glm_model import GlmModel, GlmOptions +from docowling.models.easyocr_model import EasyOcrModel +from docowling.models.layout_model import LayoutModel +from docowling.models.ocr_mac_model import OcrMacModel +from docowling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions +from docowling.models.page_preprocessing_model import ( PagePreprocessingModel, PagePreprocessingOptions, ) -from docling.models.rapid_ocr_model import RapidOcrModel -from docling.models.table_structure_model import TableStructureModel -from docling.models.tesseract_ocr_cli_model import 
TesseractOcrCliModel -from docling.models.tesseract_ocr_model import TesseractOcrModel -from docling.pipeline.base_pipeline import PaginatedPipeline -from docling.utils.profiling import ProfilingScope, TimeRecorder +from docowling.models.rapid_ocr_model import RapidOcrModel +from docowling.models.table_structure_model import TableStructureModel +from docowling.models.tesseract_ocr_cli_model import TesseractOcrCliModel +from docowling.models.tesseract_ocr_model import TesseractOcrModel +from docowling.pipeline.base_pipeline import PaginatedPipeline +from docowling.utils.profiling import ProfilingScope, TimeRecorder _log = logging.getLogger(__name__) diff --git a/docling/py.typed b/docowling/py.typed similarity index 100% rename from docling/py.typed rename to docowling/py.typed diff --git a/docling/utils/__init__.py b/docowling/utils/__init__.py similarity index 100% rename from docling/utils/__init__.py rename to docowling/utils/__init__.py diff --git a/docling/utils/accelerator_utils.py b/docowling/utils/accelerator_utils.py similarity index 95% rename from docling/utils/accelerator_utils.py rename to docowling/utils/accelerator_utils.py index 59b04796..572a37b8 100644 --- a/docling/utils/accelerator_utils.py +++ b/docowling/utils/accelerator_utils.py @@ -2,7 +2,7 @@ import logging import torch -from docling.datamodel.pipeline_options import AcceleratorDevice +from docowling.datamodel.pipeline_options import AcceleratorDevice _log = logging.getLogger(__name__) diff --git a/docling/utils/export.py b/docowling/utils/export.py similarity index 97% rename from docling/utils/export.py rename to docowling/utils/export.py index 5b022f4a..98d7e302 100644 --- a/docling/utils/export.py +++ b/docowling/utils/export.py @@ -4,8 +4,8 @@ from typing import Any, Dict, Iterable, List, Tuple, Union from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table -from docling.datamodel.base_models import 
OcrCell -from docling.datamodel.document import ConversionResult, Page +from docowling.datamodel.base_models import OcrCell +from docowling.datamodel.document import ConversionResult, Page _log = logging.getLogger(__name__) diff --git a/docling/utils/glm_utils.py b/docowling/utils/glm_utils.py similarity index 100% rename from docling/utils/glm_utils.py rename to docowling/utils/glm_utils.py diff --git a/docling/utils/layout_postprocessor.py b/docowling/utils/layout_postprocessor.py similarity index 99% rename from docling/utils/layout_postprocessor.py rename to docowling/utils/layout_postprocessor.py index 8cb6bc55..20e46613 100644 --- a/docling/utils/layout_postprocessor.py +++ b/docowling/utils/layout_postprocessor.py @@ -7,7 +7,7 @@ from typing import Dict, List, Set, Tuple from docling_core.types.doc import DocItemLabel, Size from rtree import index -from docling.datamodel.base_models import BoundingBox, Cell, Cluster, OcrCell +from docowling.datamodel.base_models import BoundingBox, Cell, Cluster, OcrCell _log = logging.getLogger(__name__) diff --git a/docling/utils/profiling.py b/docowling/utils/profiling.py similarity index 93% rename from docling/utils/profiling.py rename to docowling/utils/profiling.py index 0d09f17d..1b350fca 100644 --- a/docling/utils/profiling.py +++ b/docowling/utils/profiling.py @@ -6,10 +6,10 @@ from typing import TYPE_CHECKING, List import numpy as np from pydantic import BaseModel -from docling.datamodel.settings import settings +from docowling.datamodel.settings import settings if TYPE_CHECKING: - from docling.datamodel.document import ConversionResult + from docowling.datamodel.document import ConversionResult class ProfilingScope(str, Enum): diff --git a/docling/utils/utils.py b/docowling/utils/utils.py similarity index 100% rename from docling/utils/utils.py rename to docowling/utils/utils.py diff --git a/docs/concepts/chunking.md b/docs/concepts/chunking.md index bed8bce3..6a17cc6d 100644 --- a/docs/concepts/chunking.md +++ 
b/docs/concepts/chunking.md @@ -28,7 +28,7 @@ The `BaseChunker` base class API defines that any chunker should provide the fol - If you are using the `docling` package, you can import as follows: ```python - from docling.chunking import HybridChunker + from docowling.chunking import HybridChunker ``` - If you are only using the `docling-core` package, you must ensure to install the `chunking` extra, e.g. diff --git a/docs/examples/batch_convert.py b/docs/examples/batch_convert.py index f6ad92bd..b654664b 100644 --- a/docs/examples/batch_convert.py +++ b/docs/examples/batch_convert.py @@ -6,10 +6,10 @@ from typing import Iterable import yaml -from docling.datamodel.base_models import ConversionStatus -from docling.datamodel.document import ConversionResult -from docling.datamodel.settings import settings -from docling.document_converter import DocumentConverter +from docowling.datamodel.base_models import ConversionStatus +from docowling.datamodel.document import ConversionResult +from docowling.datamodel.settings import settings +from docowling.document_converter import DocumentConverter _log = logging.getLogger(__name__) diff --git a/docs/examples/custom_convert.py b/docs/examples/custom_convert.py index a7efa975..dd787b33 100644 --- a/docs/examples/custom_convert.py +++ b/docs/examples/custom_convert.py @@ -3,13 +3,13 @@ import logging import time from pathlib import Path -from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend -from docling.datamodel.base_models import InputFormat -from docling.datamodel.pipeline_options import PdfPipelineOptions -from docling.document_converter import DocumentConverter, PdfFormatOption -from docling.models.ocr_mac_model import OcrMacOptions -from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions -from docling.models.tesseract_ocr_model import TesseractOcrOptions +from docowling.backend.pypdfium2_backend import PyPdfiumDocumentBackend +from docowling.datamodel.base_models import InputFormat +from 
docowling.datamodel.pipeline_options import PdfPipelineOptions +from docowling.document_converter import DocumentConverter, PdfFormatOption +from docowling.models.ocr_mac_model import OcrMacOptions +from docowling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions +from docowling.models.tesseract_ocr_model import TesseractOcrOptions _log = logging.getLogger(__name__) diff --git a/docs/examples/develop_picture_enrichment.py b/docs/examples/develop_picture_enrichment.py index 7ad06e4a..70e95ed4 100644 --- a/docs/examples/develop_picture_enrichment.py +++ b/docs/examples/develop_picture_enrichment.py @@ -10,11 +10,11 @@ from docling_core.types.doc import ( PictureItem, ) -from docling.datamodel.base_models import InputFormat -from docling.datamodel.pipeline_options import PdfPipelineOptions -from docling.document_converter import DocumentConverter, PdfFormatOption -from docling.models.base_model import BaseEnrichmentModel -from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline +from docowling.datamodel.base_models import InputFormat +from docowling.datamodel.pipeline_options import PdfPipelineOptions +from docowling.document_converter import DocumentConverter, PdfFormatOption +from docowling.models.base_model import BaseEnrichmentModel +from docowling.pipeline.standard_pdf_pipeline import StandardPdfPipeline class ExamplePictureClassifierPipelineOptions(PdfPipelineOptions): diff --git a/docs/examples/export_figures.py b/docs/examples/export_figures.py index b2ecc43f..2fe3cb93 100644 --- a/docs/examples/export_figures.py +++ b/docs/examples/export_figures.py @@ -4,9 +4,9 @@ from pathlib import Path from docling_core.types.doc import ImageRefMode, PictureItem, TableItem -from docling.datamodel.base_models import FigureElement, InputFormat, Table -from docling.datamodel.pipeline_options import PdfPipelineOptions -from docling.document_converter import DocumentConverter, PdfFormatOption +from docowling.datamodel.base_models import FigureElement, 
InputFormat, Table +from docowling.datamodel.pipeline_options import PdfPipelineOptions +from docowling.document_converter import DocumentConverter, PdfFormatOption _log = logging.getLogger(__name__) diff --git a/docs/examples/export_multimodal.py b/docs/examples/export_multimodal.py index 09885bd3..4b1e2f17 100644 --- a/docs/examples/export_multimodal.py +++ b/docs/examples/export_multimodal.py @@ -5,11 +5,11 @@ from pathlib import Path import pandas as pd -from docling.datamodel.base_models import InputFormat -from docling.datamodel.pipeline_options import PdfPipelineOptions -from docling.document_converter import DocumentConverter, PdfFormatOption -from docling.utils.export import generate_multimodal_pages -from docling.utils.utils import create_hash +from docowling.datamodel.base_models import InputFormat +from docowling.datamodel.pipeline_options import PdfPipelineOptions +from docowling.document_converter import DocumentConverter, PdfFormatOption +from docowling.utils.export import generate_multimodal_pages +from docowling.utils.utils import create_hash _log = logging.getLogger(__name__) diff --git a/docs/examples/export_tables.py b/docs/examples/export_tables.py index 68b9ce47..96ac7f95 100644 --- a/docs/examples/export_tables.py +++ b/docs/examples/export_tables.py @@ -4,7 +4,7 @@ from pathlib import Path import pandas as pd -from docling.document_converter import DocumentConverter +from docowling.document_converter import DocumentConverter _log = logging.getLogger(__name__) diff --git a/docs/examples/full_page_ocr.py b/docs/examples/full_page_ocr.py index 967910dc..9717844b 100644 --- a/docs/examples/full_page_ocr.py +++ b/docs/examples/full_page_ocr.py @@ -1,8 +1,8 @@ from pathlib import Path -from docling.backend.docling_parse_backend import DoclingParseDocumentBackend -from docling.datamodel.base_models import InputFormat -from docling.datamodel.pipeline_options import ( +from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend 
+from docowling.datamodel.base_models import InputFormat +from docowling.datamodel.pipeline_options import ( EasyOcrOptions, OcrMacOptions, PdfPipelineOptions, @@ -10,7 +10,7 @@ from docling.datamodel.pipeline_options import ( TesseractCliOcrOptions, TesseractOcrOptions, ) -from docling.document_converter import DocumentConverter, PdfFormatOption +from docowling.document_converter import DocumentConverter, PdfFormatOption def main(): diff --git a/docs/examples/hybrid_chunking.ipynb b/docs/examples/hybrid_chunking.ipynb index 6f097a8f..7d4e798d 100644 --- a/docs/examples/hybrid_chunking.ipynb +++ b/docs/examples/hybrid_chunking.ipynb @@ -37,7 +37,7 @@ "metadata": {}, "outputs": [], "source": [ - "from docling.document_converter import DocumentConverter\n", + "from docowling.document_converter import DocumentConverter\n", "\n", "DOC_SOURCE = \"../../tests/data/md/wiki.md\"\n", "\n", @@ -68,7 +68,7 @@ "source": [ "from transformers import AutoTokenizer\n", "\n", - "from docling.chunking import HybridChunker\n", + "from docowling.chunking import HybridChunker\n", "\n", "EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n", "MAX_TOKENS = 64\n", @@ -404,7 +404,7 @@ " return tbl\n", "\n", "\n", - "db_uri = str(Path(mkdtemp()) / \"docling.db\")\n", + "db_uri = str(Path(mkdtemp()) / \"docowling.db\")\n", "index = make_lancedb_index(db_uri, doc.name, chunks, embed_model)\n", "\n", "sample_query = \"invent\"\n", diff --git a/docs/examples/hybrid_rag_qdrant.ipynb b/docs/examples/hybrid_rag_qdrant.ipynb index bbc8e575..4314eeda 100644 --- a/docs/examples/hybrid_rag_qdrant.ipynb +++ b/docs/examples/hybrid_rag_qdrant.ipynb @@ -81,8 +81,8 @@ "from docling_core.transforms.chunker import HierarchicalChunker\n", "from qdrant_client import QdrantClient\n", "\n", - "from docling.datamodel.base_models import InputFormat\n", - "from docling.document_converter import DocumentConverter" + "from docowling.datamodel.base_models import InputFormat\n", + "from 
docowling.document_converter import DocumentConverter" ] }, { diff --git a/docs/examples/minimal.py b/docs/examples/minimal.py index 66bd2c85..38e6dcf6 100644 --- a/docs/examples/minimal.py +++ b/docs/examples/minimal.py @@ -1,4 +1,4 @@ -from docling.document_converter import DocumentConverter +from docowling.document_converter import DocumentConverter source = "https://arxiv.org/pdf/2408.09869" # document per local path or URL converter = DocumentConverter() diff --git a/docs/examples/rag_haystack.ipynb b/docs/examples/rag_haystack.ipynb index e2dc380d..e88559eb 100644 --- a/docs/examples/rag_haystack.ipynb +++ b/docs/examples/rag_haystack.ipynb @@ -110,7 +110,7 @@ "EXPORT_TYPE = ExportType.DOC_CHUNKS\n", "QUESTION = \"Which are the main AI models in Docling?\"\n", "TOP_K = 3\n", - "MILVUS_URI = str(Path(mkdtemp()) / \"docling.db\")" + "MILVUS_URI = str(Path(mkdtemp()) / \"docowling.db\")" ] }, { @@ -168,7 +168,7 @@ "from haystack.components.writers import DocumentWriter\n", "from milvus_haystack import MilvusDocumentStore, MilvusEmbeddingRetriever\n", "\n", - "from docling.chunking import HybridChunker\n", + "from docowling.chunking import HybridChunker\n", "\n", "document_store = MilvusDocumentStore(\n", " connection_args={\"uri\": MILVUS_URI},\n", @@ -329,7 +329,7 @@ } ], "source": [ - "from docling.chunking import DocChunk\n", + "from docowling.chunking import DocChunk\n", "\n", "print(f\"Question:\\n{QUESTION}\\n\")\n", "print(f\"Answer:\\n{rag_res['answer_builder']['answers'][0].data.strip()}\\n\")\n", diff --git a/docs/examples/rag_langchain.ipynb b/docs/examples/rag_langchain.ipynb index 31ff009a..ea1ff330 100644 --- a/docs/examples/rag_langchain.ipynb +++ b/docs/examples/rag_langchain.ipynb @@ -83,7 +83,7 @@ "from langchain_core.document_loaders import BaseLoader\n", "from langchain_core.documents import Document as LCDocument\n", "\n", - "from docling.document_converter import DocumentConverter\n", + "from docowling.document_converter import 
DocumentConverter\n", "\n", "class DoclingPDFLoader(BaseLoader):\n", "\n", diff --git a/docs/examples/rag_llamaindex.ipynb b/docs/examples/rag_llamaindex.ipynb index 0252bc4f..9aa562ca 100644 --- a/docs/examples/rag_llamaindex.ipynb +++ b/docs/examples/rag_llamaindex.ipynb @@ -117,7 +117,7 @@ "from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n", "\n", "EMBED_MODEL = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")\n", - "MILVUS_URI = str(Path(mkdtemp()) / \"docling.db\")\n", + "MILVUS_URI = str(Path(mkdtemp()) / \"docowling.db\")\n", "GEN_MODEL = HuggingFaceInferenceAPI(\n", " token=_get_env_from_colab_or_os(\"HF_TOKEN\"),\n", " model_name=\"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n", @@ -182,7 +182,7 @@ "node_parser = MarkdownNodeParser()\n", "\n", "vector_store = MilvusVectorStore(\n", - " uri=str(Path(mkdtemp()) / \"docling.db\"), # or set as needed\n", + " uri=str(Path(mkdtemp()) / \"docowling.db\"), # or set as needed\n", " dim=embed_dim,\n", " overwrite=True,\n", ")\n", @@ -282,7 +282,7 @@ "node_parser = DoclingNodeParser()\n", "\n", "vector_store = MilvusVectorStore(\n", - " uri=str(Path(mkdtemp()) / \"docling.db\"), # or set as needed\n", + " uri=str(Path(mkdtemp()) / \"docowling.db\"), # or set as needed\n", " dim=embed_dim,\n", " overwrite=True,\n", ")\n", @@ -423,7 +423,7 @@ ")\n", "\n", "vector_store = MilvusVectorStore(\n", - " uri=str(Path(mkdtemp()) / \"docling.db\"), # or set as needed\n", + " uri=str(Path(mkdtemp()) / \"docowling.db\"), # or set as needed\n", " dim=embed_dim,\n", " overwrite=True,\n", ")\n", diff --git a/docs/examples/rag_weaviate.ipynb b/docs/examples/rag_weaviate.ipynb index 7f897d9e..b9f48771 100644 --- a/docs/examples/rag_weaviate.ipynb +++ b/docs/examples/rag_weaviate.ipynb @@ -207,8 +207,8 @@ } ], "source": [ - "from docling.datamodel.document import ConversionResult\n", - "from docling.document_converter import DocumentConverter\n", + "from docowling.datamodel.document import 
ConversionResult\n", + "from docowling.document_converter import DocumentConverter\n", "\n", "# Instantiate the doc converter\n", "doc_converter = DocumentConverter()\n", diff --git a/docs/examples/run_md.py b/docs/examples/run_md.py index 46be97e2..aa286dc3 100644 --- a/docs/examples/run_md.py +++ b/docs/examples/run_md.py @@ -5,9 +5,9 @@ from pathlib import Path import yaml -from docling.backend.md_backend import MarkdownDocumentBackend -from docling.datamodel.base_models import InputFormat -from docling.datamodel.document import InputDocument +from docowling.backend.md_backend import MarkdownDocumentBackend +from docowling.datamodel.base_models import InputFormat +from docowling.datamodel.document import InputDocument _log = logging.getLogger(__name__) diff --git a/docs/examples/run_with_accelerator.py b/docs/examples/run_with_accelerator.py index 5985401d..9665313f 100644 --- a/docs/examples/run_with_accelerator.py +++ b/docs/examples/run_with_accelerator.py @@ -1,16 +1,16 @@ from pathlib import Path -from docling.backend.docling_parse_backend import DoclingParseDocumentBackend -from docling.datamodel.base_models import InputFormat -from docling.datamodel.pipeline_options import ( +from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend +from docowling.datamodel.base_models import InputFormat +from docowling.datamodel.pipeline_options import ( AcceleratorDevice, AcceleratorOptions, PdfPipelineOptions, TesseractCliOcrOptions, TesseractOcrOptions, ) -from docling.datamodel.settings import settings -from docling.document_converter import DocumentConverter, PdfFormatOption +from docowling.datamodel.settings import settings +from docowling.document_converter import DocumentConverter, PdfFormatOption def main(): diff --git a/docs/examples/run_with_formats.py b/docs/examples/run_with_formats.py index 7bd27de5..72d8ba5e 100644 --- a/docs/examples/run_with_formats.py +++ b/docs/examples/run_with_formats.py @@ -4,15 +4,15 @@ from pathlib import 
Path import yaml -from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend -from docling.datamodel.base_models import InputFormat -from docling.document_converter import ( +from docowling.backend.pypdfium2_backend import PyPdfiumDocumentBackend +from docowling.datamodel.base_models import InputFormat +from docowling.document_converter import ( DocumentConverter, PdfFormatOption, WordFormatOption, ) -from docling.pipeline.simple_pipeline import SimplePipeline -from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline +from docowling.pipeline.simple_pipeline import SimplePipeline +from docowling.pipeline.standard_pdf_pipeline import StandardPdfPipeline _log = logging.getLogger(__name__) diff --git a/docs/faq.md b/docs/faq.md index 8c8e4793..529f7b93 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -140,7 +140,7 @@ This is a collection of FAQ collected from the user questions on int: