MO-01 - Adding CSV backend support

This commit is contained in:
matheus 2024-12-28 14:14:46 -03:00
parent 447802b5d1
commit 4e17a51cf6
100 changed files with 460 additions and 350 deletions

View File

@ -2,14 +2,9 @@
# Visit https://bit.ly/cffinit to generate yours today!
cff-version: 1.2.0
title: Docling
message: 'If you use Docling, please consider citing as below.'
title: Docowling
message: 'If you use Docowling, please consider citing as below.'
type: software
authors:
- name: Docling Team
identifiers:
- type: url
value: 'https://arxiv.org/abs/2408.09869'
description: 'arXiv:2408.09869'
repository-code: 'https://github.com/DS4SD/docling'
- name: Docowling
license: MIT

View File

@ -17,7 +17,7 @@ ENV TORCH_HOME=/tmp/
COPY docs/examples/minimal.py /root/minimal.py
RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
RUN python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; StandardPdfPipeline.download_models_hf(force=True);'
RUN python -c 'from docowling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; StandardPdfPipeline.download_models_hf(force=True);'
# On container environments, always set a thread budget to avoid undesired thread congestion.
ENV OMP_NUM_THREADS=4

View File

@ -1,5 +1,5 @@
<p align="center">
<a href="https://github.com/ds4sd/docling">
<a href="https://github.com/mouraworks/docowling">
<img loading="lazy" alt="Docowling" src="https://github.com/mouraworks/docowling/blob/main/docs/assets/docowling.png" width="80%"/>
</a>
</p>

View File

@ -6,8 +6,8 @@ from typing import TYPE_CHECKING, Set, Union
from docling_core.types.doc import DoclingDocument
if TYPE_CHECKING:
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
from docowling.datamodel.base_models import InputFormat
from docowling.datamodel.document import InputDocument
class AbstractDocumentBackend(ABC):

View File

@ -16,9 +16,9 @@ from docling_core.types.doc import (
TableData,
)
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
from docowling.backend.abstract_backend import DeclarativeDocumentBackend
from docowling.datamodel.base_models import InputFormat
from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)

View File

@ -0,0 +1,105 @@
import csv
from io import BytesIO, StringIO
from pathlib import Path
from typing import Dict, List, Set, Tuple, Union

from docling_core.types.doc import (
    DoclingDocument,
    DocumentOrigin,
    GroupLabel,
    TableCell,
    TableData,
)

from docowling.backend.abstract_backend import DeclarativeDocumentBackend
from docowling.datamodel.base_models import InputFormat
from docowling.datamodel.document import InputDocument
class CsvDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend that converts a CSV source into a single-table document.

    The whole CSV is parsed eagerly in ``__init__`` and kept in ``self.rows``
    (one list of cell strings per CSV record). ``convert`` then emits one
    "CSV Data" section group containing one table with a cell per CSV field.
    """

    def __init__(
        self,
        in_doc: "InputDocument",
        path_or_stream: Union[BytesIO, StringIO, Path],
    ):
        """Parse the CSV source.

        Args:
            in_doc: The input document descriptor, forwarded to the base class.
            path_or_stream: A filesystem path, an in-memory text stream, or an
                in-memory byte stream holding UTF-8 encoded CSV data.

        Raises:
            RuntimeError: If the source cannot be read or parsed, or if its
                type is not supported. The original error is chained.
        """
        super().__init__(in_doc, path_or_stream)
        self.rows: List[List[str]] = []
        # Stays False unless parsing completes, so is_valid() is always defined.
        self.valid = False
        try:
            self.rows = self._read_rows(self.path_or_stream)
            self.valid = True
        except Exception as e:
            raise RuntimeError(
                f"CsvDocumentBackend could not load document with hash {self.document_hash}"
            ) from e

    @staticmethod
    def _read_rows(source: Union[BytesIO, StringIO, Path]) -> List[List[str]]:
        """Return every CSV record of *source* as a list of cell strings.

        Raises:
            TypeError: If *source* is not a ``Path``, ``BytesIO`` or ``StringIO``.
        """
        if isinstance(source, Path):
            # newline="" lets the csv module handle line breaks inside quoted
            # fields itself; utf-8-sig drops a leading BOM if one is present.
            with source.open(mode="r", encoding="utf-8-sig", newline="") as file:
                return list(csv.reader(file))
        if isinstance(source, BytesIO):
            text = source.getvalue().decode("utf-8-sig")
            return list(csv.reader(StringIO(text, newline="")))
        if isinstance(source, StringIO):
            return list(csv.reader(source))
        # Previously an unsupported source was silently treated as an empty,
        # valid document; fail loudly instead of losing the data.
        raise TypeError(
            f"Unsupported CSV source type: {type(source).__name__}"
        )

    def is_valid(self) -> bool:
        """Return True if the CSV source was parsed successfully."""
        return self.valid

    @classmethod
    def supports_pagination(cls) -> bool:
        """Return False: this backend does not paginate its input."""
        return False  # Typically, CSV files do not support pagination.

    def unload(self):
        """Drop the reference to the underlying path or stream."""
        self.path_or_stream = None

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the set of input formats handled by this backend."""
        return {InputFormat.CSV}

    def convert(self) -> DoclingDocument:
        """Build a ``DoclingDocument`` from the parsed CSV rows.

        Returns:
            A document named after the input file. It holds one table, or is
            empty when the CSV had no rows.

        Raises:
            RuntimeError: If the backend failed to initialise.
        """
        if not self.is_valid():
            raise RuntimeError(
                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
            )

        origin = DocumentOrigin(
            filename=self.file.name or "file.csv",
            mimetype="text/csv",
            binary_hash=self.document_hash,
        )
        doc = DoclingDocument(name=self.file.stem or "file.csv", origin=origin)
        return self._convert_csv_to_document(doc)

    def _convert_csv_to_document(self, doc: DoclingDocument) -> DoclingDocument:
        """Append the parsed rows to *doc* as one table inside a section group.

        Every CSV field becomes a 1x1 ``TableCell`` positioned by its record
        and field index. No row or column is marked as a header. The table
        width is the length of the longest record; shorter records simply
        contribute fewer cells.
        """
        if not self.rows:
            return doc  # No data to process

        # Create a section for the CSV data. A local is used on purpose: the
        # backend never initialised a ``parents`` mapping, so writing to
        # ``self.parents[0]`` raised AttributeError.
        section = doc.add_group(
            parent=None,
            label=GroupLabel.SECTION,
            name="CSV Data",
        )

        # Convert rows into table data
        table_cells = []
        for row_idx, row in enumerate(self.rows):
            for col_idx, cell in enumerate(row):
                table_cells.append(
                    TableCell(
                        text=cell,
                        row_span=1,
                        col_span=1,
                        start_row_offset_idx=row_idx,
                        end_row_offset_idx=row_idx + 1,
                        start_col_offset_idx=col_idx,
                        end_col_offset_idx=col_idx + 1,
                        col_header=False,
                        row_header=False,
                    )
                )

        table_data = TableData(
            num_rows=len(self.rows),
            num_cols=max(len(row) for row in self.rows),
            table_cells=table_cells,
        )
        doc.add_table(data=table_data, parent=section)
        return doc

View File

@ -10,9 +10,9 @@ from docling_parse.pdf_parsers import pdf_parser_v1
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Cell
from docling.datamodel.document import InputDocument
from docowling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docowling.datamodel.base_models import Cell
from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)

View File

@ -10,11 +10,11 @@ from docling_parse.pdf_parsers import pdf_parser_v2
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Cell, Size
from docowling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docowling.datamodel.base_models import Cell, Size
if TYPE_CHECKING:
from docling.datamodel.document import InputDocument
from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)

View File

@ -13,9 +13,9 @@ from docling_core.types.doc import (
TableData,
)
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
from docowling.backend.abstract_backend import DeclarativeDocumentBackend
from docowling.datamodel.base_models import InputFormat
from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)

View File

@ -19,9 +19,9 @@ from docling_core.types.doc import (
)
from marko import Markdown
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
from docowling.backend.abstract_backend import DeclarativeDocumentBackend
from docowling.datamodel.base_models import InputFormat
from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)

View File

@ -18,9 +18,9 @@ from openpyxl.cell.cell import Cell
from openpyxl.drawing.image import Image
from openpyxl.worksheet.worksheet import Worksheet
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
from docowling.backend.abstract_backend import DeclarativeDocumentBackend
from docowling.datamodel.base_models import InputFormat
from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)

View File

@ -20,12 +20,12 @@ from PIL import Image
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
from docling.backend.abstract_backend import (
from docowling.backend.abstract_backend import (
DeclarativeDocumentBackend,
PaginatedDocumentBackend,
)
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
from docowling.datamodel.base_models import InputFormat
from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)

View File

@ -18,9 +18,9 @@ from lxml import etree
from lxml.etree import XPath
from PIL import Image, UnidentifiedImageError
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
from docowling.backend.abstract_backend import DeclarativeDocumentBackend
from docowling.datamodel.base_models import InputFormat
from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)

View File

@ -6,9 +6,9 @@ from typing import Iterable, Optional, Set, Union
from docling_core.types.doc import BoundingBox, Size
from PIL import Image
from docling.backend.abstract_backend import PaginatedDocumentBackend
from docling.datamodel.base_models import Cell, InputFormat
from docling.datamodel.document import InputDocument
from docowling.backend.abstract_backend import PaginatedDocumentBackend
from docowling.datamodel.base_models import Cell, InputFormat
from docowling.datamodel.document import InputDocument
class PdfPageBackend(ABC):

View File

@ -11,11 +11,11 @@ from PIL import Image, ImageDraw
from pypdfium2 import PdfTextPage
from pypdfium2._helpers.misc import PdfiumError
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Cell
from docowling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docowling.datamodel.base_models import Cell
if TYPE_CHECKING:
from docling.datamodel.document import InputDocument
from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)

View File

@ -16,9 +16,9 @@ from docling_core.types.doc import (
from lxml import etree
from typing_extensions import TypedDict, override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
from docowling.backend.abstract_backend import DeclarativeDocumentBackend
from docowling.datamodel.base_models import InputFormat
from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)

View File

@ -30,9 +30,9 @@ from docling_core.types.doc.document import LevelNumber
from pydantic import NonNegativeInt
from typing_extensions import Self, TypedDict, override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
from docowling.backend.abstract_backend import DeclarativeDocumentBackend
from docowling.datamodel.base_models import InputFormat
from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)

View File

@ -14,18 +14,18 @@ from docling_core.types.doc import ImageRefMode
from docling_core.utils.file import resolve_source_to_path
from pydantic import TypeAdapter, ValidationError
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import (
from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docowling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docowling.backend.pdf_backend import PdfDocumentBackend
from docowling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docowling.datamodel.base_models import (
ConversionStatus,
FormatToExtensions,
InputFormat,
OutputFormat,
)
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
from docowling.datamodel.document import ConversionResult
from docowling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
EasyOcrOptions,
@ -39,8 +39,8 @@ from docling.datamodel.pipeline_options import (
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
from docowling.datamodel.settings import settings
from docowling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")

View File

@ -15,7 +15,7 @@ from PIL.Image import Image
from pydantic import BaseModel, ConfigDict
if TYPE_CHECKING:
from docling.backend.pdf_backend import PdfPageBackend
from docowling.backend.pdf_backend import PdfPageBackend
class ConversionStatus(str, Enum):
@ -39,6 +39,7 @@ class InputFormat(str, Enum):
ASCIIDOC = "asciidoc"
MD = "md"
XLSX = "xlsx"
CSV = "csv"
XML_USPTO = "xml_uspto"
@ -60,6 +61,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
InputFormat.XLSX: ["xlsx"],
InputFormat.CSV: ["csv"],
InputFormat.XML_USPTO: ["xml", "txt"],
}
@ -88,6 +90,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
InputFormat.XLSX: [
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
],
InputFormat.CSV: ["text/csv"],
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
}

View File

@ -47,11 +47,11 @@ from docling_core.utils.legacy import docling_document_to_legacy
from pydantic import BaseModel
from typing_extensions import deprecated
from docling.backend.abstract_backend import (
from docowling.backend.abstract_backend import (
AbstractDocumentBackend,
PaginatedDocumentBackend,
)
from docling.datamodel.base_models import (
from docowling.datamodel.base_models import (
AssembledUnit,
ConversionStatus,
DocumentStream,
@ -62,12 +62,12 @@ from docling.datamodel.base_models import (
MimeTypeToFormat,
Page,
)
from docling.datamodel.settings import DocumentLimits
from docling.utils.profiling import ProfilingItem
from docling.utils.utils import create_file_hash, create_hash
from docowling.datamodel.settings import DocumentLimits
from docowling.utils.profiling import ProfilingItem
from docowling.utils.utils import create_file_hash, create_hash
if TYPE_CHECKING:
from docling.document_converter import FormatOption
from docowling.document_converter import FormatOption
_log = logging.getLogger(__name__)

View File

@ -7,35 +7,36 @@ from typing import Dict, Iterable, Iterator, List, Optional, Type, Union
from pydantic import BaseModel, ConfigDict, model_validator, validate_call
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.asciidoc_backend import AsciiDocBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.xml.pubmed_backend import PubMedDocumentBackend
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
from docling.datamodel.base_models import (
from docowling.backend.abstract_backend import AbstractDocumentBackend
from docowling.backend.asciidoc_backend import AsciiDocBackend
from docowling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docowling.backend.html_backend import HTMLDocumentBackend
from docowling.backend.md_backend import MarkdownDocumentBackend
from docowling.backend.msexcel_backend import MsExcelDocumentBackend
from docowling.backend.csv_backend import CsvDocumentBackend
from docowling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docowling.backend.msword_backend import MsWordDocumentBackend
from docowling.backend.xml.pubmed_backend import PubMedDocumentBackend
from docowling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
from docowling.datamodel.base_models import (
ConversionStatus,
DoclingComponentType,
DocumentStream,
ErrorItem,
InputFormat,
)
from docling.datamodel.document import (
from docowling.datamodel.document import (
ConversionResult,
InputDocument,
_DocumentConversionInput,
)
from docling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.settings import DocumentLimits, settings
from docling.exceptions import ConversionError
from docling.pipeline.base_pipeline import BasePipeline
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.utils.utils import chunkify
from docowling.datamodel.pipeline_options import PipelineOptions
from docowling.datamodel.settings import DocumentLimits, settings
from docowling.exceptions import ConversionError
from docowling.pipeline.base_pipeline import BasePipeline
from docowling.pipeline.simple_pipeline import SimplePipeline
from docowling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docowling.utils.utils import chunkify
_log = logging.getLogger(__name__)
@ -58,6 +59,9 @@ class ExcelFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
# Format option for CSV input: declares SimplePipeline as the pipeline class
# and CsvDocumentBackend as the document backend.
class CsvFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = CsvDocumentBackend
class WordFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline
@ -109,6 +113,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
InputFormat.XLSX: FormatOption(
pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
),
InputFormat.CSV: FormatOption(
pipeline_cls=SimplePipeline, backend=CsvDocumentBackend
),
InputFormat.DOCX: FormatOption(
pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
),

View File

@ -3,8 +3,8 @@ from typing import Any, Iterable
from docling_core.types.doc import DoclingDocument, NodeItem
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docowling.datamodel.base_models import Page
from docowling.datamodel.document import ConversionResult
class BasePageModel(ABC):

View File

@ -10,11 +10,11 @@ from PIL import Image, ImageDraw
from rtree import index
from scipy.ndimage import find_objects, label
from docling.datamodel.base_models import Cell, OcrCell, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import OcrOptions
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docowling.datamodel.base_models import Cell, OcrCell, Page
from docowling.datamodel.document import ConversionResult
from docowling.datamodel.pipeline_options import OcrOptions
from docowling.datamodel.settings import settings
from docowling.models.base_model import BasePageModel
_log = logging.getLogger(__name__)

View File

@ -24,18 +24,18 @@ from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocu
from PIL import ImageDraw
from pydantic import BaseModel, ConfigDict, TypeAdapter
from docling.datamodel.base_models import (
from docowling.datamodel.base_models import (
Cluster,
ContainerElement,
FigureElement,
Table,
TextElement,
)
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
from docling.datamodel.settings import settings
from docling.utils.glm_utils import to_docling_document
from docling.utils.profiling import ProfilingScope, TimeRecorder
from docling.utils.utils import create_hash
from docowling.datamodel.document import ConversionResult, layout_label_to_ds_type
from docowling.datamodel.settings import settings
from docowling.utils.glm_utils import to_docling_document
from docowling.utils.profiling import ProfilingScope, TimeRecorder
from docowling.utils.utils import create_hash
class GlmOptions(BaseModel):

View File

@ -6,17 +6,17 @@ import numpy
import torch
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import Cell, OcrCell, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
from docowling.datamodel.base_models import Cell, OcrCell, Page
from docowling.datamodel.document import ConversionResult
from docowling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
EasyOcrOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder
from docowling.datamodel.settings import settings
from docowling.models.base_ocr_model import BaseOcrModel
from docowling.utils.accelerator_utils import decide_device
from docowling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)

View File

@ -9,20 +9,20 @@ from docling_core.types.doc import CoordOrigin, DocItemLabel
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
from PIL import Image, ImageDraw, ImageFont
from docling.datamodel.base_models import (
from docowling.datamodel.base_models import (
BoundingBox,
Cell,
Cluster,
LayoutPrediction,
Page,
)
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.layout_postprocessor import LayoutPostprocessor
from docling.utils.profiling import TimeRecorder
from docowling.datamodel.document import ConversionResult
from docowling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
from docowling.datamodel.settings import settings
from docowling.models.base_model import BasePageModel
from docowling.utils.accelerator_utils import decide_device
from docowling.utils.layout_postprocessor import LayoutPostprocessor
from docowling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)

View File

@ -4,12 +4,12 @@ from typing import Iterable, Optional, Tuple
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import OcrMacOptions
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.profiling import TimeRecorder
from docowling.datamodel.base_models import OcrCell, Page
from docowling.datamodel.document import ConversionResult
from docowling.datamodel.pipeline_options import OcrMacOptions
from docowling.datamodel.settings import settings
from docowling.models.base_ocr_model import BaseOcrModel
from docowling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)

View File

@ -4,7 +4,7 @@ from typing import Iterable, List
from pydantic import BaseModel
from docling.datamodel.base_models import (
from docowling.datamodel.base_models import (
AssembledUnit,
ContainerElement,
FigureElement,
@ -13,10 +13,10 @@ from docling.datamodel.base_models import (
Table,
TextElement,
)
from docling.datamodel.document import ConversionResult
from docling.models.base_model import BasePageModel
from docling.models.layout_model import LayoutModel
from docling.utils.profiling import TimeRecorder
from docowling.datamodel.document import ConversionResult
from docowling.models.base_model import BasePageModel
from docowling.models.layout_model import LayoutModel
from docowling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)

View File

@ -4,11 +4,11 @@ from typing import Iterable, Optional
from PIL import ImageDraw
from pydantic import BaseModel
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.utils.profiling import TimeRecorder
from docowling.datamodel.base_models import Page
from docowling.datamodel.document import ConversionResult
from docowling.datamodel.settings import settings
from docowling.models.base_model import BasePageModel
from docowling.utils.profiling import TimeRecorder
class PagePreprocessingOptions(BaseModel):

View File

@ -4,17 +4,17 @@ from typing import Iterable
import numpy
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
from docowling.datamodel.base_models import OcrCell, Page
from docowling.datamodel.document import ConversionResult
from docowling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
RapidOcrOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder
from docowling.datamodel.settings import settings
from docowling.models.base_ocr_model import BaseOcrModel
from docowling.utils.accelerator_utils import decide_device
from docowling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)

View File

@ -7,18 +7,18 @@ from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
from PIL import ImageDraw
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
from docowling.datamodel.base_models import Page, Table, TableStructurePrediction
from docowling.datamodel.document import ConversionResult
from docowling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
TableFormerMode,
TableStructureOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder
from docowling.datamodel.settings import settings
from docowling.models.base_model import BasePageModel
from docowling.utils.accelerator_utils import decide_device
from docowling.utils.profiling import TimeRecorder
class TableStructureModel(BasePageModel):

View File

@ -9,12 +9,12 @@ from typing import Iterable, Optional, Tuple
import pandas as pd
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import Cell, OcrCell, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.profiling import TimeRecorder
from docowling.datamodel.base_models import Cell, OcrCell, Page
from docowling.datamodel.document import ConversionResult
from docowling.datamodel.pipeline_options import TesseractCliOcrOptions
from docowling.datamodel.settings import settings
from docowling.models.base_ocr_model import BaseOcrModel
from docowling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)

View File

@ -3,12 +3,12 @@ from typing import Iterable
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import Cell, OcrCell, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import TesseractOcrOptions
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.profiling import TimeRecorder
from docowling.datamodel.base_models import Cell, OcrCell, Page
from docowling.datamodel.document import ConversionResult
from docowling.datamodel.pipeline_options import TesseractOcrOptions
from docowling.datamodel.settings import settings
from docowling.models.base_ocr_model import BaseOcrModel
from docowling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)

View File

@ -7,20 +7,20 @@ from typing import Callable, Iterable, List
from docling_core.types.doc import DoclingDocument, NodeItem
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import (
from docowling.backend.abstract_backend import AbstractDocumentBackend
from docowling.backend.pdf_backend import PdfDocumentBackend
from docowling.datamodel.base_models import (
ConversionStatus,
DoclingComponentType,
ErrorItem,
Page,
)
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.settings import settings
from docling.models.base_model import BaseEnrichmentModel
from docling.utils.profiling import ProfilingScope, TimeRecorder
from docling.utils.utils import chunkify
from docowling.datamodel.document import ConversionResult, InputDocument
from docowling.datamodel.pipeline_options import PipelineOptions
from docowling.datamodel.settings import settings
from docowling.models.base_model import BaseEnrichmentModel
from docowling.utils.profiling import ProfilingScope, TimeRecorder
from docowling.utils.utils import chunkify
_log = logging.getLogger(__name__)

View File

@ -1,14 +1,14 @@
import logging
from docling.backend.abstract_backend import (
from docowling.backend.abstract_backend import (
AbstractDocumentBackend,
DeclarativeDocumentBackend,
)
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PipelineOptions
from docling.pipeline.base_pipeline import BasePipeline
from docling.utils.profiling import ProfilingScope, TimeRecorder
from docowling.datamodel.base_models import ConversionStatus
from docowling.datamodel.document import ConversionResult
from docowling.datamodel.pipeline_options import PipelineOptions
from docowling.pipeline.base_pipeline import BasePipeline
from docowling.utils.profiling import ProfilingScope, TimeRecorder
_log = logging.getLogger(__name__)

View File

@ -5,11 +5,11 @@ from typing import Optional
from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import AssembledUnit, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
from docowling.backend.abstract_backend import AbstractDocumentBackend
from docowling.backend.pdf_backend import PdfDocumentBackend
from docowling.datamodel.base_models import AssembledUnit, Page
from docowling.datamodel.document import ConversionResult
from docowling.datamodel.pipeline_options import (
EasyOcrOptions,
OcrMacOptions,
PdfPipelineOptions,
@ -17,22 +17,22 @@ from docling.datamodel.pipeline_options import (
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.models.base_ocr_model import BaseOcrModel
from docling.models.ds_glm_model import GlmModel, GlmOptions
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
from docling.models.ocr_mac_model import OcrMacModel
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
from docling.models.page_preprocessing_model import (
from docowling.models.base_ocr_model import BaseOcrModel
from docowling.models.ds_glm_model import GlmModel, GlmOptions
from docowling.models.easyocr_model import EasyOcrModel
from docowling.models.layout_model import LayoutModel
from docowling.models.ocr_mac_model import OcrMacModel
from docowling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
from docowling.models.page_preprocessing_model import (
PagePreprocessingModel,
PagePreprocessingOptions,
)
from docling.models.rapid_ocr_model import RapidOcrModel
from docling.models.table_structure_model import TableStructureModel
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
from docling.models.tesseract_ocr_model import TesseractOcrModel
from docling.pipeline.base_pipeline import PaginatedPipeline
from docling.utils.profiling import ProfilingScope, TimeRecorder
from docowling.models.rapid_ocr_model import RapidOcrModel
from docowling.models.table_structure_model import TableStructureModel
from docowling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
from docowling.models.tesseract_ocr_model import TesseractOcrModel
from docowling.pipeline.base_pipeline import PaginatedPipeline
from docowling.utils.profiling import ProfilingScope, TimeRecorder
_log = logging.getLogger(__name__)

View File

@ -2,7 +2,7 @@ import logging
import torch
from docling.datamodel.pipeline_options import AcceleratorDevice
from docowling.datamodel.pipeline_options import AcceleratorDevice
_log = logging.getLogger(__name__)

View File

@ -4,8 +4,8 @@ from typing import Any, Dict, Iterable, List, Tuple, Union
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
from docling.datamodel.base_models import OcrCell
from docling.datamodel.document import ConversionResult, Page
from docowling.datamodel.base_models import OcrCell
from docowling.datamodel.document import ConversionResult, Page
_log = logging.getLogger(__name__)

View File

@ -7,7 +7,7 @@ from typing import Dict, List, Set, Tuple
from docling_core.types.doc import DocItemLabel, Size
from rtree import index
from docling.datamodel.base_models import BoundingBox, Cell, Cluster, OcrCell
from docowling.datamodel.base_models import BoundingBox, Cell, Cluster, OcrCell
_log = logging.getLogger(__name__)

View File

@ -6,10 +6,10 @@ from typing import TYPE_CHECKING, List
import numpy as np
from pydantic import BaseModel
from docling.datamodel.settings import settings
from docowling.datamodel.settings import settings
if TYPE_CHECKING:
from docling.datamodel.document import ConversionResult
from docowling.datamodel.document import ConversionResult
class ProfilingScope(str, Enum):

View File

@ -28,7 +28,7 @@ The `BaseChunker` base class API defines that any chunker should provide the fol
- If you are using the `docling` package, you can import as follows:
```python
from docling.chunking import HybridChunker
from docowling.chunking import HybridChunker
```
- If you are only using the `docling-core` package, you must make sure to install
the `chunking` extra, e.g.

View File

@ -6,10 +6,10 @@ from typing import Iterable
import yaml
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter
from docowling.datamodel.base_models import ConversionStatus
from docowling.datamodel.document import ConversionResult
from docowling.datamodel.settings import settings
from docowling.document_converter import DocumentConverter
_log = logging.getLogger(__name__)

View File

@ -3,13 +3,13 @@ import logging
import time
from pathlib import Path
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.models.ocr_mac_model import OcrMacOptions
from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
from docling.models.tesseract_ocr_model import TesseractOcrOptions
from docowling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docowling.datamodel.base_models import InputFormat
from docowling.datamodel.pipeline_options import PdfPipelineOptions
from docowling.document_converter import DocumentConverter, PdfFormatOption
from docowling.models.ocr_mac_model import OcrMacOptions
from docowling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
from docowling.models.tesseract_ocr_model import TesseractOcrOptions
_log = logging.getLogger(__name__)

View File

@ -10,11 +10,11 @@ from docling_core.types.doc import (
PictureItem,
)
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.models.base_model import BaseEnrichmentModel
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docowling.datamodel.base_models import InputFormat
from docowling.datamodel.pipeline_options import PdfPipelineOptions
from docowling.document_converter import DocumentConverter, PdfFormatOption
from docowling.models.base_model import BaseEnrichmentModel
from docowling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
class ExamplePictureClassifierPipelineOptions(PdfPipelineOptions):

View File

@ -4,9 +4,9 @@ from pathlib import Path
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docowling.datamodel.base_models import FigureElement, InputFormat, Table
from docowling.datamodel.pipeline_options import PdfPipelineOptions
from docowling.document_converter import DocumentConverter, PdfFormatOption
_log = logging.getLogger(__name__)

View File

@ -5,11 +5,11 @@ from pathlib import Path
import pandas as pd
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.utils.export import generate_multimodal_pages
from docling.utils.utils import create_hash
from docowling.datamodel.base_models import InputFormat
from docowling.datamodel.pipeline_options import PdfPipelineOptions
from docowling.document_converter import DocumentConverter, PdfFormatOption
from docowling.utils.export import generate_multimodal_pages
from docowling.utils.utils import create_hash
_log = logging.getLogger(__name__)

View File

@ -4,7 +4,7 @@ from pathlib import Path
import pandas as pd
from docling.document_converter import DocumentConverter
from docowling.document_converter import DocumentConverter
_log = logging.getLogger(__name__)

View File

@ -1,8 +1,8 @@
from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docowling.datamodel.base_models import InputFormat
from docowling.datamodel.pipeline_options import (
EasyOcrOptions,
OcrMacOptions,
PdfPipelineOptions,
@ -10,7 +10,7 @@ from docling.datamodel.pipeline_options import (
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docowling.document_converter import DocumentConverter, PdfFormatOption
def main():

View File

@ -37,7 +37,7 @@
"metadata": {},
"outputs": [],
"source": [
"from docling.document_converter import DocumentConverter\n",
"from docowling.document_converter import DocumentConverter\n",
"\n",
"DOC_SOURCE = \"../../tests/data/md/wiki.md\"\n",
"\n",
@ -68,7 +68,7 @@
"source": [
"from transformers import AutoTokenizer\n",
"\n",
"from docling.chunking import HybridChunker\n",
"from docowling.chunking import HybridChunker\n",
"\n",
"EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
"MAX_TOKENS = 64\n",
@ -404,7 +404,7 @@
" return tbl\n",
"\n",
"\n",
"db_uri = str(Path(mkdtemp()) / \"docling.db\")\n",
"db_uri = str(Path(mkdtemp()) / \"docowling.db\")\n",
"index = make_lancedb_index(db_uri, doc.name, chunks, embed_model)\n",
"\n",
"sample_query = \"invent\"\n",

View File

@ -81,8 +81,8 @@
"from docling_core.transforms.chunker import HierarchicalChunker\n",
"from qdrant_client import QdrantClient\n",
"\n",
"from docling.datamodel.base_models import InputFormat\n",
"from docling.document_converter import DocumentConverter"
"from docowling.datamodel.base_models import InputFormat\n",
"from docowling.document_converter import DocumentConverter"
]
},
{

View File

@ -1,4 +1,4 @@
from docling.document_converter import DocumentConverter
from docowling.document_converter import DocumentConverter
source = "https://arxiv.org/pdf/2408.09869" # document per local path or URL
converter = DocumentConverter()

View File

@ -110,7 +110,7 @@
"EXPORT_TYPE = ExportType.DOC_CHUNKS\n",
"QUESTION = \"Which are the main AI models in Docling?\"\n",
"TOP_K = 3\n",
"MILVUS_URI = str(Path(mkdtemp()) / \"docling.db\")"
"MILVUS_URI = str(Path(mkdtemp()) / \"docowling.db\")"
]
},
{
@ -168,7 +168,7 @@
"from haystack.components.writers import DocumentWriter\n",
"from milvus_haystack import MilvusDocumentStore, MilvusEmbeddingRetriever\n",
"\n",
"from docling.chunking import HybridChunker\n",
"from docowling.chunking import HybridChunker\n",
"\n",
"document_store = MilvusDocumentStore(\n",
" connection_args={\"uri\": MILVUS_URI},\n",
@ -329,7 +329,7 @@
}
],
"source": [
"from docling.chunking import DocChunk\n",
"from docowling.chunking import DocChunk\n",
"\n",
"print(f\"Question:\\n{QUESTION}\\n\")\n",
"print(f\"Answer:\\n{rag_res['answer_builder']['answers'][0].data.strip()}\\n\")\n",

View File

@ -83,7 +83,7 @@
"from langchain_core.document_loaders import BaseLoader\n",
"from langchain_core.documents import Document as LCDocument\n",
"\n",
"from docling.document_converter import DocumentConverter\n",
"from docowling.document_converter import DocumentConverter\n",
"\n",
"class DoclingPDFLoader(BaseLoader):\n",
"\n",

View File

@ -117,7 +117,7 @@
"from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n",
"\n",
"EMBED_MODEL = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")\n",
"MILVUS_URI = str(Path(mkdtemp()) / \"docling.db\")\n",
"MILVUS_URI = str(Path(mkdtemp()) / \"docowling.db\")\n",
"GEN_MODEL = HuggingFaceInferenceAPI(\n",
" token=_get_env_from_colab_or_os(\"HF_TOKEN\"),\n",
" model_name=\"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n",
@ -182,7 +182,7 @@
"node_parser = MarkdownNodeParser()\n",
"\n",
"vector_store = MilvusVectorStore(\n",
" uri=str(Path(mkdtemp()) / \"docling.db\"), # or set as needed\n",
" uri=str(Path(mkdtemp()) / \"docowling.db\"), # or set as needed\n",
" dim=embed_dim,\n",
" overwrite=True,\n",
")\n",
@ -282,7 +282,7 @@
"node_parser = DoclingNodeParser()\n",
"\n",
"vector_store = MilvusVectorStore(\n",
" uri=str(Path(mkdtemp()) / \"docling.db\"), # or set as needed\n",
" uri=str(Path(mkdtemp()) / \"docowling.db\"), # or set as needed\n",
" dim=embed_dim,\n",
" overwrite=True,\n",
")\n",
@ -423,7 +423,7 @@
")\n",
"\n",
"vector_store = MilvusVectorStore(\n",
" uri=str(Path(mkdtemp()) / \"docling.db\"), # or set as needed\n",
" uri=str(Path(mkdtemp()) / \"docowling.db\"), # or set as needed\n",
" dim=embed_dim,\n",
" overwrite=True,\n",
")\n",

View File

@ -207,8 +207,8 @@
}
],
"source": [
"from docling.datamodel.document import ConversionResult\n",
"from docling.document_converter import DocumentConverter\n",
"from docowling.datamodel.document import ConversionResult\n",
"from docowling.document_converter import DocumentConverter\n",
"\n",
"# Instantiate the doc converter\n",
"doc_converter = DocumentConverter()\n",

View File

@ -5,9 +5,9 @@ from pathlib import Path
import yaml
from docling.backend.md_backend import MarkdownDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
from docowling.backend.md_backend import MarkdownDocumentBackend
from docowling.datamodel.base_models import InputFormat
from docowling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)

View File

@ -1,16 +1,16 @@
from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docowling.datamodel.base_models import InputFormat
from docowling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
PdfPipelineOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption
from docowling.datamodel.settings import settings
from docowling.document_converter import DocumentConverter, PdfFormatOption
def main():

View File

@ -4,15 +4,15 @@ from pathlib import Path
import yaml
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
from docowling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docowling.datamodel.base_models import InputFormat
from docowling.document_converter import (
DocumentConverter,
PdfFormatOption,
WordFormatOption,
)
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docowling.pipeline.simple_pipeline import SimplePipeline
from docowling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
_log = logging.getLogger(__name__)

View File

@ -140,7 +140,7 @@ This is a collection of FAQ collected from the user questions on <https://github
Setting the OCR language in Docling is done via the OCR pipeline options:
```py
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docowling.datamodel.pipeline_options import PdfPipelineOptions
pipeline_options = PdfPipelineOptions()
pipeline_options.ocr_options.lang = ["fr", "de", "es", "en"] # example of languages for EasyOCR

View File

@ -36,9 +36,9 @@ Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 archi
The Docling `DocumentConverter` allows you to choose the OCR engine with the `ocr_options` settings. For example:
```python
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.pipeline_options import PipelineOptions, EasyOcrOptions, TesseractOcrOptions
from docling.document_converter import DocumentConverter
from docowling.datamodel.base_models import ConversionStatus, PipelineOptions
from docowling.datamodel.pipeline_options import PipelineOptions, EasyOcrOptions, TesseractOcrOptions
from docowling.document_converter import DocumentConverter
pipeline_options = PipelineOptions()
pipeline_options.do_ocr = True

View File

@ -3,7 +3,7 @@
This page provides documentation for our command line tools.
::: mkdocs-click
:module: docling.cli.main
:module: docowling.cli.main
:command: click_app
:prog_name: docling
:style: table

View File

@ -2,7 +2,7 @@
This is an automatically generated API reference of the main components of Docling.
::: docling.document_converter
::: docowling.document_converter
handler: python
options:
members:

View File

@ -8,7 +8,7 @@ can be enabled with `do_xyz = True`.
This is an automatically generated API reference of all the pipeline options available in Docling.
::: docling.datamodel.pipeline_options
::: docowling.datamodel.pipeline_options
handler: python
options:
show_if_no_docstring: true
@ -28,7 +28,7 @@ This is an automatic generated API reference of the all the pipeline options ava
signature_crossrefs: true
summary: true
<!-- ::: docling.document_converter.DocumentConverter
<!-- ::: docowling.document_converter.DocumentConverter
handler: python
options:
show_if_no_docstring: true

View File

@ -5,7 +5,7 @@
To convert individual PDF documents, use `convert()`, for example:
```python
from docling.document_converter import DocumentConverter
from docowling.document_converter import DocumentConverter
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
converter = DocumentConverter()
@ -39,9 +39,9 @@ This can improve output quality if you find that multiple columns in extracted t
```python
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docowling.datamodel.base_models import InputFormat
from docowling.document_converter import DocumentConverter, PdfFormatOption
from docowling.datamodel.pipeline_options import PdfPipelineOptions
pipeline_options = PdfPipelineOptions(do_table_structure=True)
pipeline_options.table_structure_options.do_cell_matching = False # uses text cells predicted from table structure model
@ -56,9 +56,9 @@ doc_converter = DocumentConverter(
Since docling 1.16.0: You can control which TableFormer mode you want to use. Choose between `TableFormerMode.FAST` (default) and `TableFormerMode.ACCURATE` (better, but slower) to receive better quality with difficult table structures.
```python
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
from docowling.datamodel.base_models import InputFormat
from docowling.document_converter import DocumentConverter, PdfFormatOption
from docowling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
pipeline_options = PdfPipelineOptions(do_table_structure=True)
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE # use more accurate TableFormer model
@ -75,10 +75,10 @@ doc_converter = DocumentConverter(
By default, artifacts such as models are downloaded automatically upon first usage. If you would prefer to use a local path where the artifacts have been explicitly prefetched, you can do that as follows:
```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docowling.datamodel.base_models import InputFormat
from docowling.datamodel.pipeline_options import PdfPipelineOptions
from docowling.document_converter import DocumentConverter, PdfFormatOption
from docowling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
# # to explicitly prefetch:
# artifacts_path = StandardPdfPipeline.download_models_hf()
@ -99,7 +99,7 @@ You can limit the file size and number of pages which should be allowed to proce
```python
from pathlib import Path
from docling.document_converter import DocumentConverter
from docowling.document_converter import DocumentConverter
source = "https://arxiv.org/pdf/2408.09869"
converter = DocumentConverter()
@ -112,8 +112,8 @@ You can convert PDFs from a binary stream instead of from the filesystem as foll
```python
from io import BytesIO
from docling.datamodel.base_models import DocumentStream
from docling.document_converter import DocumentConverter
from docowling.datamodel.base_models import DocumentStream
from docowling.document_converter import DocumentConverter
buf = BytesIO(your_binary_stream)
source = DocumentStream(name="my_doc.pdf", stream=buf)
@ -133,8 +133,8 @@ You can chunk a Docling document using a [chunker](concepts/chunking.md), such a
[this example](examples/hybrid_chunking.ipynb)):
```python
from docling.document_converter import DocumentConverter
from docling.chunking import HybridChunker
from docowling.document_converter import DocumentConverter
from docowling.chunking import HybridChunker
conv_res = DocumentConverter().convert("https://arxiv.org/pdf/2206.01062")
doc = conv_res.document

View File

@ -46,17 +46,17 @@ Format options can include the pipeline class to use, the options to provide to
They are provided as format-specific types, such as `PdfFormatOption` or `WordFormatOption`, as seen below.
```python
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
from docowling.document_converter import DocumentConverter
from docowling.datamodel.base_models import InputFormat
from docowling.document_converter import (
DocumentConverter,
PdfFormatOption,
WordFormatOption,
)
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docowling.pipeline.simple_pipeline import SimplePipeline
from docowling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docowling.datamodel.pipeline_options import PdfPipelineOptions
from docowling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
## Default initialization still works as before:
# doc_converter = DocumentConverter()
@ -110,7 +110,7 @@ or `DocumentStream` objects, without constructing a `DocumentConversionInput` ob
```python
...
from docling.datamodel.document import ConversionResult
from docowling.datamodel.document import ConversionResult
## Convert a single file (from URL or local path)
conv_result: ConversionResult = doc_converter.convert("https://arxiv.org/pdf/2408.09869") # previously `convert_single`

View File

@ -118,7 +118,7 @@ ocrmac = ["ocrmac"]
rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]
[tool.poetry.scripts]
docling = "docling.cli.main:app"
docling = "docowling.cli.main:app"
[build-system]
requires = ["poetry-core"]

View File

@ -2,9 +2,9 @@ import glob
import os
from pathlib import Path
from docling.backend.asciidoc_backend import AsciiDocBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
from docowling.backend.asciidoc_backend import AsciiDocBackend
from docowling.datamodel.base_models import InputFormat
from docowling.datamodel.document import InputDocument
def _get_backend(fname):

View File

@ -3,12 +3,12 @@ from pathlib import Path
import pytest
from docling_core.types.doc import BoundingBox
from docling.backend.docling_parse_backend import (
from docowling.backend.docling_parse_backend import (
DoclingParseDocumentBackend,
DoclingParsePageBackend,
)
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
from docowling.datamodel.base_models import InputFormat
from docowling.datamodel.document import InputDocument
@pytest.fixture

View File

@ -2,12 +2,12 @@ from pathlib import Path
import pytest
from docling.backend.docling_parse_v2_backend import (
from docowling.backend.docling_parse_v2_backend import (
DoclingParseV2DocumentBackend,
DoclingParseV2PageBackend,
)
from docling.datamodel.base_models import BoundingBox, InputFormat
from docling.datamodel.document import InputDocument
from docowling.datamodel.base_models import BoundingBox, InputFormat
from docowling.datamodel.document import InputDocument
@pytest.fixture

View File

@ -2,14 +2,14 @@ import json
import os
from pathlib import Path
from docling.backend.html_backend import HTMLDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import (
from docowling.backend.html_backend import HTMLDocumentBackend
from docowling.datamodel.base_models import InputFormat
from docowling.datamodel.document import (
ConversionResult,
InputDocument,
SectionHeaderItem,
)
from docling.document_converter import DocumentConverter
from docowling.document_converter import DocumentConverter
GENERATE = False

View File

@ -2,14 +2,14 @@ import json
import os
from pathlib import Path
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import (
from docowling.backend.msword_backend import MsWordDocumentBackend
from docowling.datamodel.base_models import InputFormat
from docowling.datamodel.document import (
ConversionResult,
InputDocument,
SectionHeaderItem,
)
from docling.document_converter import DocumentConverter
from docowling.document_converter import DocumentConverter
GENERATE = False

View File

@ -2,14 +2,14 @@ import json
import os
from pathlib import Path
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import (
from docowling.backend.msword_backend import MsWordDocumentBackend
from docowling.datamodel.base_models import InputFormat
from docowling.datamodel.document import (
ConversionResult,
InputDocument,
SectionHeaderItem,
)
from docling.document_converter import DocumentConverter
from docowling.document_converter import DocumentConverter
GENERATE = False

View File

@ -1,4 +1,4 @@
"""Test methods in module docling.backend.patent_uspto_backend.py."""
"""Test methods in module docowling.backend.patent_uspto_backend.py."""
import json
import logging
@ -12,14 +12,14 @@ import yaml
from docling_core.types import DoclingDocument
from docling_core.types.doc import DocItemLabel, TableData, TextItem
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend, XmlTable
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import (
from docowling.backend.xml.uspto_backend import PatentUsptoDocumentBackend, XmlTable
from docowling.datamodel.base_models import InputFormat
from docowling.datamodel.document import (
ConversionResult,
InputDocument,
SectionHeaderItem,
)
from docling.document_converter import DocumentConverter
from docowling.document_converter import DocumentConverter
GENERATE: bool = True
DATA_PATH: Path = Path("./tests/data/uspto/")

View File

@ -3,12 +3,12 @@ from pathlib import Path
import pytest
from docling_core.types.doc import BoundingBox
from docling.backend.pypdfium2_backend import (
from docowling.backend.pypdfium2_backend import (
PyPdfiumDocumentBackend,
PyPdfiumPageBackend,
)
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
from docowling.datamodel.base_models import InputFormat
from docowling.datamodel.document import InputDocument
@pytest.fixture

View File

@ -2,9 +2,9 @@ import json
import os
from pathlib import Path
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.document_converter import DocumentConverter
from docowling.datamodel.base_models import InputFormat
from docowling.datamodel.document import ConversionResult
from docowling.document_converter import DocumentConverter
GENERATE = False

View File

@ -6,9 +6,9 @@ from pathlib import Path
from docling_core.types.doc import DoclingDocument
from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.document import ConversionResult
from docling.document_converter import DocumentConverter
from docowling.datamodel.base_models import DocumentStream, InputFormat
from docowling.datamodel.document import ConversionResult
from docowling.document_converter import DocumentConverter
GENERATE = False

View File

@ -2,7 +2,7 @@ from pathlib import Path
from typer.testing import CliRunner
from docling.cli.main import app
from docowling.cli.main import app
runner = CliRunner()

View File

@ -1,10 +1,10 @@
from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docowling.datamodel.base_models import InputFormat
from docowling.datamodel.document import ConversionResult
from docowling.datamodel.pipeline_options import PdfPipelineOptions
from docowling.document_converter import DocumentConverter, PdfFormatOption
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2

View File

@ -2,10 +2,10 @@ import sys
from pathlib import Path
from typing import List
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docowling.datamodel.base_models import InputFormat
from docowling.datamodel.document import ConversionResult
from docowling.datamodel.pipeline_options import (
EasyOcrOptions,
OcrMacOptions,
OcrOptions,
@ -14,7 +14,7 @@ from docling.datamodel.pipeline_options import (
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docowling.document_converter import DocumentConverter, PdfFormatOption
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2

View File

@ -1,9 +1,9 @@
from io import BytesIO
from pathlib import Path
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.document import InputDocument, _DocumentConversionInput
from docowling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docowling.datamodel.base_models import DocumentStream, InputFormat
from docowling.datamodel.document import InputDocument, _DocumentConversionInput
def test_in_doc_from_valid_path():
@ -40,7 +40,7 @@ def test_in_doc_from_invalid_buf():
def test_guess_format(tmp_path):
"""Test docling.datamodel.document._DocumentConversionInput.__guess_format"""
"""Test docowling.datamodel.document._DocumentConversionInput.__guess_format"""
dci = _DocumentConversionInput(path_or_stream_iterator=[])
temp_dir = tmp_path / "test_guess_format"
temp_dir.mkdir()

View File

@ -3,10 +3,10 @@ from pathlib import Path
import pytest
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docowling.datamodel.base_models import DocumentStream, InputFormat
from docowling.datamodel.pipeline_options import PdfPipelineOptions
from docowling.document_converter import DocumentConverter, PdfFormatOption
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2

View File

@ -3,8 +3,8 @@ from pathlib import Path
import pytest
from docling.datamodel.base_models import ConversionStatus, DocumentStream
from docling.document_converter import ConversionError, DocumentConverter
from docowling.datamodel.base_models import ConversionStatus, DocumentStream
from docowling.document_converter import ConversionError, DocumentConverter
def get_pdf_path():

View File

@ -3,9 +3,9 @@ from pathlib import Path
import pytest
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docowling.datamodel.base_models import InputFormat
from docowling.datamodel.pipeline_options import PdfPipelineOptions
from docowling.document_converter import DocumentConverter, PdfFormatOption
@pytest.fixture

View File

@ -3,16 +3,16 @@ from pathlib import Path
import pytest
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docowling.datamodel.base_models import ConversionStatus, InputFormat
from docowling.datamodel.document import ConversionResult
from docowling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
PdfPipelineOptions,
TableFormerMode,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docowling.document_converter import DocumentConverter, PdfFormatOption
@pytest.fixture

View File

@ -8,8 +8,8 @@ from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocu
from pydantic import TypeAdapter
from pydantic.json import pydantic_encoder
from docling.datamodel.base_models import ConversionStatus, Page
from docling.datamodel.document import ConversionResult
from docowling.datamodel.base_models import ConversionStatus, Page
from docowling.datamodel.document import ConversionResult
def levenshtein(str1: str, str2: str) -> int: