mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-01 15:02:21 +00:00
MO-01 - Adding CSV backend support
This commit is contained in:
parent
447802b5d1
commit
4e17a51cf6
11
CITATION.cff
11
CITATION.cff
@ -2,14 +2,9 @@
|
||||
# Visit https://bit.ly/cffinit to generate yours today!
|
||||
|
||||
cff-version: 1.2.0
|
||||
title: Docling
|
||||
message: 'If you use Docling, please consider citing as below.'
|
||||
title: Docowling
|
||||
message: 'If you use Docowling, please consider citing as below.'
|
||||
type: software
|
||||
authors:
|
||||
- name: Docling Team
|
||||
identifiers:
|
||||
- type: url
|
||||
value: 'https://arxiv.org/abs/2408.09869'
|
||||
description: 'arXiv:2408.09869'
|
||||
repository-code: 'https://github.com/DS4SD/docling'
|
||||
- name: Docowling
|
||||
license: MIT
|
||||
|
@ -17,7 +17,7 @@ ENV TORCH_HOME=/tmp/
|
||||
COPY docs/examples/minimal.py /root/minimal.py
|
||||
|
||||
RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
|
||||
RUN python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; StandardPdfPipeline.download_models_hf(force=True);'
|
||||
RUN python -c 'from docowling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; StandardPdfPipeline.download_models_hf(force=True);'
|
||||
|
||||
# On container environments, always set a thread budget to avoid undesired thread congestion.
|
||||
ENV OMP_NUM_THREADS=4
|
||||
|
@ -1,5 +1,5 @@
|
||||
<p align="center">
|
||||
<a href="https://github.com/ds4sd/docling">
|
||||
<a href="https://github.com/mouraworks/docowling">
|
||||
<img loading="lazy" alt="Docling" src="https://github.com/mouraworks/docowling/blob/main/docs/assets/docowling.png" width="80%"/>
|
||||
</a>
|
||||
</p>
|
||||
|
@ -6,8 +6,8 @@ from typing import TYPE_CHECKING, Set, Union
|
||||
from docling_core.types.doc import DoclingDocument
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
from docowling.datamodel.base_models import InputFormat
|
||||
from docowling.datamodel.document import InputDocument
|
||||
|
||||
|
||||
class AbstractDocumentBackend(ABC):
|
@ -16,9 +16,9 @@ from docling_core.types.doc import (
|
||||
TableData,
|
||||
)
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
from docowling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docowling.datamodel.base_models import InputFormat
|
||||
from docowling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
105
docowling/backend/csv_backend.py
Normal file
105
docowling/backend/csv_backend.py
Normal file
@ -0,0 +1,105 @@
|
||||
import csv
|
||||
from io import StringIO
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Set, Tuple, Union
|
||||
|
||||
from docling_core.types.doc import (
|
||||
DoclingDocument,
|
||||
DocumentOrigin,
|
||||
GroupLabel,
|
||||
TableData,
|
||||
TableCell,
|
||||
)
|
||||
from docowling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docowling.datamodel.base_models import InputFormat
|
||||
from docowling.datamodel.document import InputDocument
|
||||
|
||||
|
||||
class CsvDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend that converts a CSV source into a one-table document.

    The whole CSV is parsed eagerly in the constructor and kept in
    ``self.rows``; ``convert()`` then emits a single "CSV Data" section group
    containing one table with one cell per CSV field.

    NOTE(review): ``self.file`` and ``self.document_hash`` are assumed to be
    set by the base-class constructor; that code is not visible here.
    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[StringIO, Path]):
        """Parse the CSV source.

        Args:
            in_doc: The input document descriptor, forwarded to the base class.
            path_or_stream: A filesystem path, a text stream, or any other
                object with a ``read()`` method returning ``str`` or ``bytes``
                (binary content is decoded as UTF-8).

        Raises:
            RuntimeError: If the source cannot be read or parsed; the original
                exception is chained as the cause.
        """
        super().__init__(in_doc, path_or_stream)
        self.rows: List[List[str]] = []
        # Group hierarchy created during conversion, keyed by nesting level.
        # Must exist before _convert_csv_to_document() assigns self.parents[0].
        self.parents: Dict[int, object] = {}
        self.valid = False

        try:
            self.rows = self._read_rows(self.path_or_stream)
            self.valid = True
        except Exception as e:
            self.valid = False
            raise RuntimeError(
                f"CsvDocumentBackend could not load document with hash {self.document_hash}"
            ) from e

    @staticmethod
    def _read_rows(source) -> List[List[str]]:
        """Return every CSV record of *source* as a list of string fields."""
        if isinstance(source, Path):
            # newline="" lets the csv module interpret line endings itself, so
            # quoted fields with embedded newlines survive; "utf-8-sig" reads
            # plain UTF-8 and additionally strips a leading BOM if present.
            with source.open(mode="r", encoding="utf-8-sig", newline="") as file:
                return list(csv.reader(file))

        if isinstance(source, StringIO):
            return list(csv.reader(source))

        # Any other stream (e.g. a binary one): read it fully and decode.
        read = getattr(source, "read", None)
        if read is None:
            # Fail loudly instead of silently producing an empty document.
            raise TypeError(
                f"Unsupported CSV source type: {type(source).__name__}"
            )
        content = read()
        if isinstance(content, (bytes, bytearray)):
            content = bytes(content).decode("utf-8-sig")
        return list(csv.reader(StringIO(content, newline="")))

    def is_valid(self) -> bool:
        """Return True when the CSV source was loaded successfully."""
        return self.valid

    @classmethod
    def supports_pagination(cls) -> bool:
        """Return False: a CSV is handled as one unpaginated table."""
        return False  # Typically, CSV files do not support pagination.

    def unload(self) -> None:
        """Drop the reference to the underlying path or stream."""
        self.path_or_stream = None

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the set of input formats handled by this backend."""
        return {InputFormat.CSV}

    def convert(self) -> DoclingDocument:
        """Build and return the DoclingDocument for the loaded CSV.

        Raises:
            RuntimeError: If the backend failed to initialise.
        """
        if not self.is_valid():
            raise RuntimeError(
                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
            )

        origin = DocumentOrigin(
            filename=self.file.name or "file.csv",
            mimetype="text/csv",
            binary_hash=self.document_hash,
        )
        doc = DoclingDocument(name=self.file.stem or "file.csv", origin=origin)
        return self._convert_csv_to_document(doc)

    def _convert_csv_to_document(self, doc: DoclingDocument) -> DoclingDocument:
        """Add the parsed rows to *doc* as a single table and return *doc*.

        A source without any field (empty file or blank lines only) leaves
        *doc* untouched. Rows may have different lengths: the table is as wide
        as the longest row and shorter rows simply contribute fewer cells.
        """
        if not any(self.rows):
            return doc  # No data to process

        # Create a section for the CSV data
        self.parents[0] = doc.add_group(
            parent=None,
            label=GroupLabel.SECTION,
            name="CSV Data",
        )

        # Convert rows into table data
        table_data = TableData(
            num_rows=len(self.rows),
            num_cols=max(len(row) for row in self.rows),
            table_cells=[],
        )
        table_data.table_cells.extend(
            TableCell(
                text=cell,
                row_span=1,
                col_span=1,
                start_row_offset_idx=row_idx,
                end_row_offset_idx=row_idx + 1,
                start_col_offset_idx=col_idx,
                end_col_offset_idx=col_idx + 1,
                col_header=False,
                row_header=False,
            )
            for row_idx, row in enumerate(self.rows)
            for col_idx, cell in enumerate(row)
        )

        doc.add_table(data=table_data, parent=self.parents[0])
        return doc
|
@ -10,9 +10,9 @@ from docling_parse.pdf_parsers import pdf_parser_v1
|
||||
from PIL import Image, ImageDraw
|
||||
from pypdfium2 import PdfPage
|
||||
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||
from docling.datamodel.base_models import Cell
|
||||
from docling.datamodel.document import InputDocument
|
||||
from docowling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||
from docowling.datamodel.base_models import Cell
|
||||
from docowling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
@ -10,11 +10,11 @@ from docling_parse.pdf_parsers import pdf_parser_v2
|
||||
from PIL import Image, ImageDraw
|
||||
from pypdfium2 import PdfPage
|
||||
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||
from docling.datamodel.base_models import Cell, Size
|
||||
from docowling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||
from docowling.datamodel.base_models import Cell, Size
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from docling.datamodel.document import InputDocument
|
||||
from docowling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
@ -13,9 +13,9 @@ from docling_core.types.doc import (
|
||||
TableData,
|
||||
)
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
from docowling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docowling.datamodel.base_models import InputFormat
|
||||
from docowling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
@ -19,9 +19,9 @@ from docling_core.types.doc import (
|
||||
)
|
||||
from marko import Markdown
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
from docowling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docowling.datamodel.base_models import InputFormat
|
||||
from docowling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
@ -18,9 +18,9 @@ from openpyxl.cell.cell import Cell
|
||||
from openpyxl.drawing.image import Image
|
||||
from openpyxl.worksheet.worksheet import Worksheet
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
from docowling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docowling.datamodel.base_models import InputFormat
|
||||
from docowling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
@ -20,12 +20,12 @@ from PIL import Image
|
||||
from pptx import Presentation
|
||||
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
|
||||
|
||||
from docling.backend.abstract_backend import (
|
||||
from docowling.backend.abstract_backend import (
|
||||
DeclarativeDocumentBackend,
|
||||
PaginatedDocumentBackend,
|
||||
)
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
from docowling.datamodel.base_models import InputFormat
|
||||
from docowling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
@ -18,9 +18,9 @@ from lxml import etree
|
||||
from lxml.etree import XPath
|
||||
from PIL import Image, UnidentifiedImageError
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
from docowling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docowling.datamodel.base_models import InputFormat
|
||||
from docowling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
@ -6,9 +6,9 @@ from typing import Iterable, Optional, Set, Union
|
||||
from docling_core.types.doc import BoundingBox, Size
|
||||
from PIL import Image
|
||||
|
||||
from docling.backend.abstract_backend import PaginatedDocumentBackend
|
||||
from docling.datamodel.base_models import Cell, InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
from docowling.backend.abstract_backend import PaginatedDocumentBackend
|
||||
from docowling.datamodel.base_models import Cell, InputFormat
|
||||
from docowling.datamodel.document import InputDocument
|
||||
|
||||
|
||||
class PdfPageBackend(ABC):
|
@ -11,11 +11,11 @@ from PIL import Image, ImageDraw
|
||||
from pypdfium2 import PdfTextPage
|
||||
from pypdfium2._helpers.misc import PdfiumError
|
||||
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||
from docling.datamodel.base_models import Cell
|
||||
from docowling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||
from docowling.datamodel.base_models import Cell
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from docling.datamodel.document import InputDocument
|
||||
from docowling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
6
docling/backend/xml/pubmed_backend.py → docowling/backend/xml/pubmed_backend.py
Executable file → Normal file
6
docling/backend/xml/pubmed_backend.py → docowling/backend/xml/pubmed_backend.py
Executable file → Normal file
@ -16,9 +16,9 @@ from docling_core.types.doc import (
|
||||
from lxml import etree
|
||||
from typing_extensions import TypedDict, override
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
from docowling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docowling.datamodel.base_models import InputFormat
|
||||
from docowling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
@ -30,9 +30,9 @@ from docling_core.types.doc.document import LevelNumber
|
||||
from pydantic import NonNegativeInt
|
||||
from typing_extensions import Self, TypedDict, override
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
from docowling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docowling.datamodel.base_models import InputFormat
|
||||
from docowling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
@ -14,18 +14,18 @@ from docling_core.types.doc import ImageRefMode
|
||||
from docling_core.utils.file import resolve_source_to_path
|
||||
from pydantic import TypeAdapter, ValidationError
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import (
|
||||
from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docowling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||
from docowling.backend.pdf_backend import PdfDocumentBackend
|
||||
from docowling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docowling.datamodel.base_models import (
|
||||
ConversionStatus,
|
||||
FormatToExtensions,
|
||||
InputFormat,
|
||||
OutputFormat,
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
from docowling.datamodel.document import ConversionResult
|
||||
from docowling.datamodel.pipeline_options import (
|
||||
AcceleratorDevice,
|
||||
AcceleratorOptions,
|
||||
EasyOcrOptions,
|
||||
@ -39,8 +39,8 @@ from docling.datamodel.pipeline_options import (
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
||||
from docowling.datamodel.settings import settings
|
||||
from docowling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
||||
|
||||
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
||||
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
@ -15,7 +15,7 @@ from PIL.Image import Image
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from docling.backend.pdf_backend import PdfPageBackend
|
||||
from docowling.backend.pdf_backend import PdfPageBackend
|
||||
|
||||
|
||||
class ConversionStatus(str, Enum):
|
||||
@ -39,6 +39,7 @@ class InputFormat(str, Enum):
|
||||
ASCIIDOC = "asciidoc"
|
||||
MD = "md"
|
||||
XLSX = "xlsx"
|
||||
CSV = "csv"
|
||||
XML_USPTO = "xml_uspto"
|
||||
|
||||
|
||||
@ -60,6 +61,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
||||
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
||||
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
||||
InputFormat.XLSX: ["xlsx"],
|
||||
InputFormat.CSV: ["csv"],
|
||||
InputFormat.XML_USPTO: ["xml", "txt"],
|
||||
}
|
||||
|
||||
@ -88,6 +90,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
|
||||
InputFormat.XLSX: [
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||
],
|
||||
InputFormat.CSV: ["text/csv"],
|
||||
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
|
||||
}
|
||||
|
@ -47,11 +47,11 @@ from docling_core.utils.legacy import docling_document_to_legacy
|
||||
from pydantic import BaseModel
|
||||
from typing_extensions import deprecated
|
||||
|
||||
from docling.backend.abstract_backend import (
|
||||
from docowling.backend.abstract_backend import (
|
||||
AbstractDocumentBackend,
|
||||
PaginatedDocumentBackend,
|
||||
)
|
||||
from docling.datamodel.base_models import (
|
||||
from docowling.datamodel.base_models import (
|
||||
AssembledUnit,
|
||||
ConversionStatus,
|
||||
DocumentStream,
|
||||
@ -62,12 +62,12 @@ from docling.datamodel.base_models import (
|
||||
MimeTypeToFormat,
|
||||
Page,
|
||||
)
|
||||
from docling.datamodel.settings import DocumentLimits
|
||||
from docling.utils.profiling import ProfilingItem
|
||||
from docling.utils.utils import create_file_hash, create_hash
|
||||
from docowling.datamodel.settings import DocumentLimits
|
||||
from docowling.utils.profiling import ProfilingItem
|
||||
from docowling.utils.utils import create_file_hash, create_hash
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from docling.document_converter import FormatOption
|
||||
from docowling.document_converter import FormatOption
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
@ -7,35 +7,36 @@ from typing import Dict, Iterable, Iterator, List, Optional, Type, Union
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
||||
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.asciidoc_backend import AsciiDocBackend
|
||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||
from docling.backend.html_backend import HTMLDocumentBackend
|
||||
from docling.backend.md_backend import MarkdownDocumentBackend
|
||||
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
||||
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docling.backend.xml.pubmed_backend import PubMedDocumentBackend
|
||||
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
|
||||
from docling.datamodel.base_models import (
|
||||
from docowling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docowling.backend.asciidoc_backend import AsciiDocBackend
|
||||
from docowling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||
from docowling.backend.html_backend import HTMLDocumentBackend
|
||||
from docowling.backend.md_backend import MarkdownDocumentBackend
|
||||
from docowling.backend.msexcel_backend import MsExcelDocumentBackend
|
||||
from docowling.backend.csv_backend import CsvDocumentBackend
|
||||
from docowling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
||||
from docowling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docowling.backend.xml.pubmed_backend import PubMedDocumentBackend
|
||||
from docowling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
|
||||
from docowling.datamodel.base_models import (
|
||||
ConversionStatus,
|
||||
DoclingComponentType,
|
||||
DocumentStream,
|
||||
ErrorItem,
|
||||
InputFormat,
|
||||
)
|
||||
from docling.datamodel.document import (
|
||||
from docowling.datamodel.document import (
|
||||
ConversionResult,
|
||||
InputDocument,
|
||||
_DocumentConversionInput,
|
||||
)
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.datamodel.settings import DocumentLimits, settings
|
||||
from docling.exceptions import ConversionError
|
||||
from docling.pipeline.base_pipeline import BasePipeline
|
||||
from docling.pipeline.simple_pipeline import SimplePipeline
|
||||
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
||||
from docling.utils.utils import chunkify
|
||||
from docowling.datamodel.pipeline_options import PipelineOptions
|
||||
from docowling.datamodel.settings import DocumentLimits, settings
|
||||
from docowling.exceptions import ConversionError
|
||||
from docowling.pipeline.base_pipeline import BasePipeline
|
||||
from docowling.pipeline.simple_pipeline import SimplePipeline
|
||||
from docowling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
||||
from docowling.utils.utils import chunkify
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@ -58,6 +59,9 @@ class ExcelFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
|
||||
|
||||
class CsvFormatOption(FormatOption):
    """Format option for CSV input: SimplePipeline with CsvDocumentBackend."""

    # Pipeline class used to process CSV documents.
    pipeline_cls: Type = SimplePipeline
    # Backend class that parses the CSV source.
    backend: Type[AbstractDocumentBackend] = CsvDocumentBackend
|
||||
|
||||
class WordFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
@ -109,6 +113,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
|
||||
InputFormat.XLSX: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
|
||||
),
|
||||
InputFormat.CSV: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=CsvDocumentBackend
|
||||
),
|
||||
InputFormat.DOCX: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
|
||||
),
|
@ -3,8 +3,8 @@ from typing import Any, Iterable
|
||||
|
||||
from docling_core.types.doc import DoclingDocument, NodeItem
|
||||
|
||||
from docling.datamodel.base_models import Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docowling.datamodel.base_models import Page
|
||||
from docowling.datamodel.document import ConversionResult
|
||||
|
||||
|
||||
class BasePageModel(ABC):
|
@ -10,11 +10,11 @@ from PIL import Image, ImageDraw
|
||||
from rtree import index
|
||||
from scipy.ndimage import find_objects, label
|
||||
|
||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import OcrOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docowling.datamodel.base_models import Cell, OcrCell, Page
|
||||
from docowling.datamodel.document import ConversionResult
|
||||
from docowling.datamodel.pipeline_options import OcrOptions
|
||||
from docowling.datamodel.settings import settings
|
||||
from docowling.models.base_model import BasePageModel
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
@ -24,18 +24,18 @@ from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocu
|
||||
from PIL import ImageDraw
|
||||
from pydantic import BaseModel, ConfigDict, TypeAdapter
|
||||
|
||||
from docling.datamodel.base_models import (
|
||||
from docowling.datamodel.base_models import (
|
||||
Cluster,
|
||||
ContainerElement,
|
||||
FigureElement,
|
||||
Table,
|
||||
TextElement,
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.utils.glm_utils import to_docling_document
|
||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
||||
from docling.utils.utils import create_hash
|
||||
from docowling.datamodel.document import ConversionResult, layout_label_to_ds_type
|
||||
from docowling.datamodel.settings import settings
|
||||
from docowling.utils.glm_utils import to_docling_document
|
||||
from docowling.utils.profiling import ProfilingScope, TimeRecorder
|
||||
from docowling.utils.utils import create_hash
|
||||
|
||||
|
||||
class GlmOptions(BaseModel):
|
@ -6,17 +6,17 @@ import numpy
|
||||
import torch
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
|
||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
from docowling.datamodel.base_models import Cell, OcrCell, Page
|
||||
from docowling.datamodel.document import ConversionResult
|
||||
from docowling.datamodel.pipeline_options import (
|
||||
AcceleratorDevice,
|
||||
AcceleratorOptions,
|
||||
EasyOcrOptions,
|
||||
)
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.utils.accelerator_utils import decide_device
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
from docowling.datamodel.settings import settings
|
||||
from docowling.models.base_ocr_model import BaseOcrModel
|
||||
from docowling.utils.accelerator_utils import decide_device
|
||||
from docowling.utils.profiling import TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
@ -9,20 +9,20 @@ from docling_core.types.doc import CoordOrigin, DocItemLabel
|
||||
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
|
||||
from docling.datamodel.base_models import (
|
||||
from docowling.datamodel.base_models import (
|
||||
BoundingBox,
|
||||
Cell,
|
||||
Cluster,
|
||||
LayoutPrediction,
|
||||
Page,
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docling.utils.accelerator_utils import decide_device
|
||||
from docling.utils.layout_postprocessor import LayoutPostprocessor
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
from docowling.datamodel.document import ConversionResult
|
||||
from docowling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
|
||||
from docowling.datamodel.settings import settings
|
||||
from docowling.models.base_model import BasePageModel
|
||||
from docowling.utils.accelerator_utils import decide_device
|
||||
from docowling.utils.layout_postprocessor import LayoutPostprocessor
|
||||
from docowling.utils.profiling import TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
@ -4,12 +4,12 @@ from typing import Iterable, Optional, Tuple
|
||||
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import OcrMacOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
from docowling.datamodel.base_models import OcrCell, Page
|
||||
from docowling.datamodel.document import ConversionResult
|
||||
from docowling.datamodel.pipeline_options import OcrMacOptions
|
||||
from docowling.datamodel.settings import settings
|
||||
from docowling.models.base_ocr_model import BaseOcrModel
|
||||
from docowling.utils.profiling import TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
@ -4,7 +4,7 @@ from typing import Iterable, List
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from docling.datamodel.base_models import (
|
||||
from docowling.datamodel.base_models import (
|
||||
AssembledUnit,
|
||||
ContainerElement,
|
||||
FigureElement,
|
||||
@ -13,10 +13,10 @@ from docling.datamodel.base_models import (
|
||||
Table,
|
||||
TextElement,
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docling.models.layout_model import LayoutModel
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
from docowling.datamodel.document import ConversionResult
|
||||
from docowling.models.base_model import BasePageModel
|
||||
from docowling.models.layout_model import LayoutModel
|
||||
from docowling.utils.profiling import TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
@ -4,11 +4,11 @@ from typing import Iterable, Optional
|
||||
from PIL import ImageDraw
|
||||
from pydantic import BaseModel
|
||||
|
||||
from docling.datamodel.base_models import Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
from docowling.datamodel.base_models import Page
|
||||
from docowling.datamodel.document import ConversionResult
|
||||
from docowling.datamodel.settings import settings
|
||||
from docowling.models.base_model import BasePageModel
|
||||
from docowling.utils.profiling import TimeRecorder
|
||||
|
||||
|
||||
class PagePreprocessingOptions(BaseModel):
|
@ -4,17 +4,17 @@ from typing import Iterable
|
||||
import numpy
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
from docowling.datamodel.base_models import OcrCell, Page
|
||||
from docowling.datamodel.document import ConversionResult
|
||||
from docowling.datamodel.pipeline_options import (
|
||||
AcceleratorDevice,
|
||||
AcceleratorOptions,
|
||||
RapidOcrOptions,
|
||||
)
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.utils.accelerator_utils import decide_device
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
from docowling.datamodel.settings import settings
|
||||
from docowling.models.base_ocr_model import BaseOcrModel
|
||||
from docowling.utils.accelerator_utils import decide_device
|
||||
from docowling.utils.profiling import TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
@ -7,18 +7,18 @@ from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
|
||||
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
|
||||
from PIL import ImageDraw
|
||||
|
||||
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
from docowling.datamodel.base_models import Page, Table, TableStructurePrediction
|
||||
from docowling.datamodel.document import ConversionResult
|
||||
from docowling.datamodel.pipeline_options import (
|
||||
AcceleratorDevice,
|
||||
AcceleratorOptions,
|
||||
TableFormerMode,
|
||||
TableStructureOptions,
|
||||
)
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docling.utils.accelerator_utils import decide_device
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
from docowling.datamodel.settings import settings
|
||||
from docowling.models.base_model import BasePageModel
|
||||
from docowling.utils.accelerator_utils import decide_device
|
||||
from docowling.utils.profiling import TimeRecorder
|
||||
|
||||
|
||||
class TableStructureModel(BasePageModel):
|
@ -9,12 +9,12 @@ from typing import Iterable, Optional, Tuple
|
||||
import pandas as pd
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
|
||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
from docowling.datamodel.base_models import Cell, OcrCell, Page
|
||||
from docowling.datamodel.document import ConversionResult
|
||||
from docowling.datamodel.pipeline_options import TesseractCliOcrOptions
|
||||
from docowling.datamodel.settings import settings
|
||||
from docowling.models.base_ocr_model import BaseOcrModel
|
||||
from docowling.utils.profiling import TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
@ -3,12 +3,12 @@ from typing import Iterable
|
||||
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
|
||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import TesseractOcrOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
from docowling.datamodel.base_models import Cell, OcrCell, Page
|
||||
from docowling.datamodel.document import ConversionResult
|
||||
from docowling.datamodel.pipeline_options import TesseractOcrOptions
|
||||
from docowling.datamodel.settings import settings
|
||||
from docowling.models.base_ocr_model import BaseOcrModel
|
||||
from docowling.utils.profiling import TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
@ -7,20 +7,20 @@ from typing import Callable, Iterable, List
|
||||
|
||||
from docling_core.types.doc import DoclingDocument, NodeItem
|
||||
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||
from docling.datamodel.base_models import (
|
||||
from docowling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docowling.backend.pdf_backend import PdfDocumentBackend
|
||||
from docowling.datamodel.base_models import (
|
||||
ConversionStatus,
|
||||
DoclingComponentType,
|
||||
ErrorItem,
|
||||
Page,
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult, InputDocument
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_model import BaseEnrichmentModel
|
||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
||||
from docling.utils.utils import chunkify
|
||||
from docowling.datamodel.document import ConversionResult, InputDocument
|
||||
from docowling.datamodel.pipeline_options import PipelineOptions
|
||||
from docowling.datamodel.settings import settings
|
||||
from docowling.models.base_model import BaseEnrichmentModel
|
||||
from docowling.utils.profiling import ProfilingScope, TimeRecorder
|
||||
from docowling.utils.utils import chunkify
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
@ -1,14 +1,14 @@
|
||||
import logging
|
||||
|
||||
from docling.backend.abstract_backend import (
|
||||
from docowling.backend.abstract_backend import (
|
||||
AbstractDocumentBackend,
|
||||
DeclarativeDocumentBackend,
|
||||
)
|
||||
from docling.datamodel.base_models import ConversionStatus
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.pipeline.base_pipeline import BasePipeline
|
||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
||||
from docowling.datamodel.base_models import ConversionStatus
|
||||
from docowling.datamodel.document import ConversionResult
|
||||
from docowling.datamodel.pipeline_options import PipelineOptions
|
||||
from docowling.pipeline.base_pipeline import BasePipeline
|
||||
from docowling.utils.profiling import ProfilingScope, TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
@ -5,11 +5,11 @@ from typing import Optional
|
||||
|
||||
from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
|
||||
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||
from docling.datamodel.base_models import AssembledUnit, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
from docowling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docowling.backend.pdf_backend import PdfDocumentBackend
|
||||
from docowling.datamodel.base_models import AssembledUnit, Page
|
||||
from docowling.datamodel.document import ConversionResult
|
||||
from docowling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
OcrMacOptions,
|
||||
PdfPipelineOptions,
|
||||
@ -17,22 +17,22 @@ from docling.datamodel.pipeline_options import (
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.models.ds_glm_model import GlmModel, GlmOptions
|
||||
from docling.models.easyocr_model import EasyOcrModel
|
||||
from docling.models.layout_model import LayoutModel
|
||||
from docling.models.ocr_mac_model import OcrMacModel
|
||||
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
|
||||
from docling.models.page_preprocessing_model import (
|
||||
from docowling.models.base_ocr_model import BaseOcrModel
|
||||
from docowling.models.ds_glm_model import GlmModel, GlmOptions
|
||||
from docowling.models.easyocr_model import EasyOcrModel
|
||||
from docowling.models.layout_model import LayoutModel
|
||||
from docowling.models.ocr_mac_model import OcrMacModel
|
||||
from docowling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
|
||||
from docowling.models.page_preprocessing_model import (
|
||||
PagePreprocessingModel,
|
||||
PagePreprocessingOptions,
|
||||
)
|
||||
from docling.models.rapid_ocr_model import RapidOcrModel
|
||||
from docling.models.table_structure_model import TableStructureModel
|
||||
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
||||
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
||||
from docling.pipeline.base_pipeline import PaginatedPipeline
|
||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
||||
from docowling.models.rapid_ocr_model import RapidOcrModel
|
||||
from docowling.models.table_structure_model import TableStructureModel
|
||||
from docowling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
||||
from docowling.models.tesseract_ocr_model import TesseractOcrModel
|
||||
from docowling.pipeline.base_pipeline import PaginatedPipeline
|
||||
from docowling.utils.profiling import ProfilingScope, TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
@ -2,7 +2,7 @@ import logging
|
||||
|
||||
import torch
|
||||
|
||||
from docling.datamodel.pipeline_options import AcceleratorDevice
|
||||
from docowling.datamodel.pipeline_options import AcceleratorDevice
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
@ -4,8 +4,8 @@ from typing import Any, Dict, Iterable, List, Tuple, Union
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
|
||||
|
||||
from docling.datamodel.base_models import OcrCell
|
||||
from docling.datamodel.document import ConversionResult, Page
|
||||
from docowling.datamodel.base_models import OcrCell
|
||||
from docowling.datamodel.document import ConversionResult, Page
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
@ -7,7 +7,7 @@ from typing import Dict, List, Set, Tuple
|
||||
from docling_core.types.doc import DocItemLabel, Size
|
||||
from rtree import index
|
||||
|
||||
from docling.datamodel.base_models import BoundingBox, Cell, Cluster, OcrCell
|
||||
from docowling.datamodel.base_models import BoundingBox, Cell, Cluster, OcrCell
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
@ -6,10 +6,10 @@ from typing import TYPE_CHECKING, List
|
||||
import numpy as np
|
||||
from pydantic import BaseModel
|
||||
|
||||
from docling.datamodel.settings import settings
|
||||
from docowling.datamodel.settings import settings
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docowling.datamodel.document import ConversionResult
|
||||
|
||||
|
||||
class ProfilingScope(str, Enum):
|
@ -28,7 +28,7 @@ The `BaseChunker` base class API defines that any chunker should provide the fol
|
||||
|
||||
- If you are using the `docling` package, you can import as follows:
|
||||
```python
|
||||
from docling.chunking import HybridChunker
|
||||
from docowling.chunking import HybridChunker
|
||||
```
|
||||
- If you are only using the `docling-core` package, you must make sure to install
|
||||
the `chunking` extra, e.g.
|
||||
|
@ -6,10 +6,10 @@ from typing import Iterable
|
||||
|
||||
import yaml
|
||||
|
||||
from docling.datamodel.base_models import ConversionStatus
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docowling.datamodel.base_models import ConversionStatus
|
||||
from docowling.datamodel.document import ConversionResult
|
||||
from docowling.datamodel.settings import settings
|
||||
from docowling.document_converter import DocumentConverter
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
@ -3,13 +3,13 @@ import logging
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.models.ocr_mac_model import OcrMacOptions
|
||||
from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
|
||||
from docling.models.tesseract_ocr_model import TesseractOcrOptions
|
||||
from docowling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docowling.datamodel.base_models import InputFormat
|
||||
from docowling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docowling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docowling.models.ocr_mac_model import OcrMacOptions
|
||||
from docowling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
|
||||
from docowling.models.tesseract_ocr_model import TesseractOcrOptions
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
@ -10,11 +10,11 @@ from docling_core.types.doc import (
|
||||
PictureItem,
|
||||
)
|
||||
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.models.base_model import BaseEnrichmentModel
|
||||
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
||||
from docowling.datamodel.base_models import InputFormat
|
||||
from docowling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docowling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docowling.models.base_model import BaseEnrichmentModel
|
||||
from docowling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
||||
|
||||
|
||||
class ExamplePictureClassifierPipelineOptions(PdfPipelineOptions):
|
||||
|
@ -4,9 +4,9 @@ from pathlib import Path
|
||||
|
||||
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
|
||||
|
||||
from docling.datamodel.base_models import FigureElement, InputFormat, Table
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docowling.datamodel.base_models import FigureElement, InputFormat, Table
|
||||
from docowling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docowling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
@ -5,11 +5,11 @@ from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.utils.export import generate_multimodal_pages
|
||||
from docling.utils.utils import create_hash
|
||||
from docowling.datamodel.base_models import InputFormat
|
||||
from docowling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docowling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docowling.utils.export import generate_multimodal_pages
|
||||
from docowling.utils.utils import create_hash
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
@ -4,7 +4,7 @@ from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docowling.document_converter import DocumentConverter
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
@ -1,8 +1,8 @@
|
||||
from pathlib import Path
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import (
|
||||
from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docowling.datamodel.base_models import InputFormat
|
||||
from docowling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
OcrMacOptions,
|
||||
PdfPipelineOptions,
|
||||
@ -10,7 +10,7 @@ from docling.datamodel.pipeline_options import (
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docowling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
|
||||
def main():
|
||||
|
@ -37,7 +37,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from docling.document_converter import DocumentConverter\n",
|
||||
"from docowling.document_converter import DocumentConverter\n",
|
||||
"\n",
|
||||
"DOC_SOURCE = \"../../tests/data/md/wiki.md\"\n",
|
||||
"\n",
|
||||
@ -68,7 +68,7 @@
|
||||
"source": [
|
||||
"from transformers import AutoTokenizer\n",
|
||||
"\n",
|
||||
"from docling.chunking import HybridChunker\n",
|
||||
"from docowling.chunking import HybridChunker\n",
|
||||
"\n",
|
||||
"EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
|
||||
"MAX_TOKENS = 64\n",
|
||||
@ -404,7 +404,7 @@
|
||||
" return tbl\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"db_uri = str(Path(mkdtemp()) / \"docling.db\")\n",
|
||||
"db_uri = str(Path(mkdtemp()) / \"docowling.db\")\n",
|
||||
"index = make_lancedb_index(db_uri, doc.name, chunks, embed_model)\n",
|
||||
"\n",
|
||||
"sample_query = \"invent\"\n",
|
||||
|
@ -81,8 +81,8 @@
|
||||
"from docling_core.transforms.chunker import HierarchicalChunker\n",
|
||||
"from qdrant_client import QdrantClient\n",
|
||||
"\n",
|
||||
"from docling.datamodel.base_models import InputFormat\n",
|
||||
"from docling.document_converter import DocumentConverter"
|
||||
"from docowling.datamodel.base_models import InputFormat\n",
|
||||
"from docowling.document_converter import DocumentConverter"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -1,4 +1,4 @@
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docowling.document_converter import DocumentConverter
|
||||
|
||||
source = "https://arxiv.org/pdf/2408.09869" # document per local path or URL
|
||||
converter = DocumentConverter()
|
||||
|
@ -110,7 +110,7 @@
|
||||
"EXPORT_TYPE = ExportType.DOC_CHUNKS\n",
|
||||
"QUESTION = \"Which are the main AI models in Docling?\"\n",
|
||||
"TOP_K = 3\n",
|
||||
"MILVUS_URI = str(Path(mkdtemp()) / \"docling.db\")"
|
||||
"MILVUS_URI = str(Path(mkdtemp()) / \"docowling.db\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -168,7 +168,7 @@
|
||||
"from haystack.components.writers import DocumentWriter\n",
|
||||
"from milvus_haystack import MilvusDocumentStore, MilvusEmbeddingRetriever\n",
|
||||
"\n",
|
||||
"from docling.chunking import HybridChunker\n",
|
||||
"from docowling.chunking import HybridChunker\n",
|
||||
"\n",
|
||||
"document_store = MilvusDocumentStore(\n",
|
||||
" connection_args={\"uri\": MILVUS_URI},\n",
|
||||
@ -329,7 +329,7 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from docling.chunking import DocChunk\n",
|
||||
"from docowling.chunking import DocChunk\n",
|
||||
"\n",
|
||||
"print(f\"Question:\\n{QUESTION}\\n\")\n",
|
||||
"print(f\"Answer:\\n{rag_res['answer_builder']['answers'][0].data.strip()}\\n\")\n",
|
||||
|
@ -83,7 +83,7 @@
|
||||
"from langchain_core.document_loaders import BaseLoader\n",
|
||||
"from langchain_core.documents import Document as LCDocument\n",
|
||||
"\n",
|
||||
"from docling.document_converter import DocumentConverter\n",
|
||||
"from docowling.document_converter import DocumentConverter\n",
|
||||
"\n",
|
||||
"class DoclingPDFLoader(BaseLoader):\n",
|
||||
"\n",
|
||||
|
@ -117,7 +117,7 @@
|
||||
"from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n",
|
||||
"\n",
|
||||
"EMBED_MODEL = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")\n",
|
||||
"MILVUS_URI = str(Path(mkdtemp()) / \"docling.db\")\n",
|
||||
"MILVUS_URI = str(Path(mkdtemp()) / \"docowling.db\")\n",
|
||||
"GEN_MODEL = HuggingFaceInferenceAPI(\n",
|
||||
" token=_get_env_from_colab_or_os(\"HF_TOKEN\"),\n",
|
||||
" model_name=\"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n",
|
||||
@ -182,7 +182,7 @@
|
||||
"node_parser = MarkdownNodeParser()\n",
|
||||
"\n",
|
||||
"vector_store = MilvusVectorStore(\n",
|
||||
" uri=str(Path(mkdtemp()) / \"docling.db\"), # or set as needed\n",
|
||||
" uri=str(Path(mkdtemp()) / \"docowling.db\"), # or set as needed\n",
|
||||
" dim=embed_dim,\n",
|
||||
" overwrite=True,\n",
|
||||
")\n",
|
||||
@ -282,7 +282,7 @@
|
||||
"node_parser = DoclingNodeParser()\n",
|
||||
"\n",
|
||||
"vector_store = MilvusVectorStore(\n",
|
||||
" uri=str(Path(mkdtemp()) / \"docling.db\"), # or set as needed\n",
|
||||
" uri=str(Path(mkdtemp()) / \"docowling.db\"), # or set as needed\n",
|
||||
" dim=embed_dim,\n",
|
||||
" overwrite=True,\n",
|
||||
")\n",
|
||||
@ -423,7 +423,7 @@
|
||||
")\n",
|
||||
"\n",
|
||||
"vector_store = MilvusVectorStore(\n",
|
||||
" uri=str(Path(mkdtemp()) / \"docling.db\"), # or set as needed\n",
|
||||
" uri=str(Path(mkdtemp()) / \"docowling.db\"), # or set as needed\n",
|
||||
" dim=embed_dim,\n",
|
||||
" overwrite=True,\n",
|
||||
")\n",
|
||||
|
@ -207,8 +207,8 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from docling.datamodel.document import ConversionResult\n",
|
||||
"from docling.document_converter import DocumentConverter\n",
|
||||
"from docowling.datamodel.document import ConversionResult\n",
|
||||
"from docowling.document_converter import DocumentConverter\n",
|
||||
"\n",
|
||||
"# Instantiate the doc converter\n",
|
||||
"doc_converter = DocumentConverter()\n",
|
||||
|
@ -5,9 +5,9 @@ from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
||||
from docling.backend.md_backend import MarkdownDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
from docowling.backend.md_backend import MarkdownDocumentBackend
|
||||
from docowling.datamodel.base_models import InputFormat
|
||||
from docowling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
@ -1,16 +1,16 @@
|
||||
from pathlib import Path
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import (
|
||||
from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docowling.datamodel.base_models import InputFormat
|
||||
from docowling.datamodel.pipeline_options import (
|
||||
AcceleratorDevice,
|
||||
AcceleratorOptions,
|
||||
PdfPipelineOptions,
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docowling.datamodel.settings import settings
|
||||
from docowling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
|
||||
def main():
|
||||
|
@ -4,15 +4,15 @@ from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.document_converter import (
|
||||
from docowling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docowling.datamodel.base_models import InputFormat
|
||||
from docowling.document_converter import (
|
||||
DocumentConverter,
|
||||
PdfFormatOption,
|
||||
WordFormatOption,
|
||||
)
|
||||
from docling.pipeline.simple_pipeline import SimplePipeline
|
||||
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
||||
from docowling.pipeline.simple_pipeline import SimplePipeline
|
||||
from docowling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
@ -140,7 +140,7 @@ This is a collection of FAQ collected from the user questions on <https://github
|
||||
Setting the OCR language in Docling is done via the OCR pipeline options:
|
||||
|
||||
```py
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docowling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
pipeline_options.ocr_options.lang = ["fr", "de", "es", "en"] # example of languages for EasyOCR
|
||||
|
@ -36,9 +36,9 @@ Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 archi
|
||||
The Docling `DocumentConverter` allows you to choose the OCR engine with the `ocr_options` settings. For example:
|
||||
|
||||
```python
|
||||
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
|
||||
from docling.datamodel.pipeline_options import PipelineOptions, EasyOcrOptions, TesseractOcrOptions
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docowling.datamodel.base_models import ConversionStatus, PipelineOptions
|
||||
from docowling.datamodel.pipeline_options import PipelineOptions, EasyOcrOptions, TesseractOcrOptions
|
||||
from docowling.document_converter import DocumentConverter
|
||||
|
||||
pipeline_options = PipelineOptions()
|
||||
pipeline_options.do_ocr = True
|
||||
|
@ -3,7 +3,7 @@
|
||||
This page provides documentation for our command line tools.
|
||||
|
||||
::: mkdocs-click
|
||||
:module: docling.cli.main
|
||||
:module: docowling.cli.main
|
||||
:command: click_app
|
||||
:prog_name: docling
|
||||
:style: table
|
||||
|
@ -2,7 +2,7 @@
|
||||
|
||||
This is an automatically generated API reference of the main components of Docling.
|
||||
|
||||
::: docling.document_converter
|
||||
::: docowling.document_converter
|
||||
handler: python
|
||||
options:
|
||||
members:
|
||||
|
@ -8,7 +8,7 @@ can be enabled with `do_xyz = True`.
|
||||
This is an automatically generated API reference of all the pipeline options available in Docling.
|
||||
|
||||
|
||||
::: docling.datamodel.pipeline_options
|
||||
::: docowling.datamodel.pipeline_options
|
||||
handler: python
|
||||
options:
|
||||
show_if_no_docstring: true
|
||||
@ -28,7 +28,7 @@ This is an automatic generated API reference of the all the pipeline options ava
|
||||
signature_crossrefs: true
|
||||
summary: true
|
||||
|
||||
<!-- ::: docling.document_converter.DocumentConverter
|
||||
<!-- ::: docowling.document_converter.DocumentConverter
|
||||
handler: python
|
||||
options:
|
||||
show_if_no_docstring: true
|
||||
|
@ -5,7 +5,7 @@
|
||||
To convert individual PDF documents, use `convert()`, for example:
|
||||
|
||||
```python
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docowling.document_converter import DocumentConverter
|
||||
|
||||
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
|
||||
converter = DocumentConverter()
|
||||
@ -39,9 +39,9 @@ This can improve output quality if you find that multiple columns in extracted t
|
||||
|
||||
|
||||
```python
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docowling.datamodel.base_models import InputFormat
|
||||
from docowling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docowling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
|
||||
pipeline_options = PdfPipelineOptions(do_table_structure=True)
|
||||
pipeline_options.table_structure_options.do_cell_matching = False # uses text cells predicted from table structure model
|
||||
@ -56,9 +56,9 @@ doc_converter = DocumentConverter(
|
||||
Since docling 1.16.0: You can control which TableFormer mode you want to use. Choose between `TableFormerMode.FAST` (default) and `TableFormerMode.ACCURATE` (better, but slower) to receive better quality with difficult table structures.
|
||||
|
||||
```python
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
|
||||
from docowling.datamodel.base_models import InputFormat
|
||||
from docowling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docowling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
|
||||
|
||||
pipeline_options = PdfPipelineOptions(do_table_structure=True)
|
||||
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE # use more accurate TableFormer model
|
||||
@ -75,10 +75,10 @@ doc_converter = DocumentConverter(
|
||||
By default, artifacts such as models are downloaded automatically upon first usage. If you would prefer to use a local path where the artifacts have been explicitly prefetched, you can do that as follows:
|
||||
|
||||
```python
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
||||
from docowling.datamodel.base_models import InputFormat
|
||||
from docowling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docowling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docowling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
||||
|
||||
# # to explicitly prefetch:
|
||||
# artifacts_path = StandardPdfPipeline.download_models_hf()
|
||||
@ -99,7 +99,7 @@ You can limit the file size and number of pages which should be allowed to proce
|
||||
|
||||
```python
|
||||
from pathlib import Path
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docowling.document_converter import DocumentConverter
|
||||
|
||||
source = "https://arxiv.org/pdf/2408.09869"
|
||||
converter = DocumentConverter()
|
||||
@ -112,8 +112,8 @@ You can convert PDFs from a binary stream instead of from the filesystem as foll
|
||||
|
||||
```python
|
||||
from io import BytesIO
|
||||
from docling.datamodel.base_models import DocumentStream
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docowling.datamodel.base_models import DocumentStream
|
||||
from docowling.document_converter import DocumentConverter
|
||||
|
||||
buf = BytesIO(your_binary_stream)
|
||||
source = DocumentStream(name="my_doc.pdf", stream=buf)
|
||||
@ -133,8 +133,8 @@ You can chunk a Docling document using a [chunker](concepts/chunking.md), such a
|
||||
[this example](examples/hybrid_chunking.ipynb)):
|
||||
|
||||
```python
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docling.chunking import HybridChunker
|
||||
from docowling.document_converter import DocumentConverter
|
||||
from docowling.chunking import HybridChunker
|
||||
|
||||
conv_res = DocumentConverter().convert("https://arxiv.org/pdf/2206.01062")
|
||||
doc = conv_res.document
|
||||
|
16
docs/v2.md
16
docs/v2.md
@ -46,17 +46,17 @@ Format options can include the pipeline class to use, the options to provide to
|
||||
They are provided as format-specific types, such as `PdfFormatOption` or `WordFormatOption`, as seen below.
|
||||
|
||||
```python
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.document_converter import (
|
||||
from docowling.document_converter import DocumentConverter
|
||||
from docowling.datamodel.base_models import InputFormat
|
||||
from docowling.document_converter import (
|
||||
DocumentConverter,
|
||||
PdfFormatOption,
|
||||
WordFormatOption,
|
||||
)
|
||||
from docling.pipeline.simple_pipeline import SimplePipeline
|
||||
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docowling.pipeline.simple_pipeline import SimplePipeline
|
||||
from docowling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
||||
from docowling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docowling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
|
||||
## Default initialization still works as before:
|
||||
# doc_converter = DocumentConverter()
|
||||
@ -110,7 +110,7 @@ or `DocumentStream` objects, without constructing a `DocumentConversionInput` ob
|
||||
|
||||
```python
|
||||
...
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docowling.datamodel.document import ConversionResult
|
||||
## Convert a single file (from URL or local path)
|
||||
conv_result: ConversionResult = doc_converter.convert("https://arxiv.org/pdf/2408.09869") # previously `convert_single`
|
||||
|
||||
|
@ -118,7 +118,7 @@ ocrmac = ["ocrmac"]
|
||||
rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]
|
||||
|
||||
[tool.poetry.scripts]
|
||||
docling = "docling.cli.main:app"
|
||||
docling = "docowling.cli.main:app"
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core"]
|
||||
|
@ -2,9 +2,9 @@ import glob
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from docling.backend.asciidoc_backend import AsciiDocBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
from docowling.backend.asciidoc_backend import AsciiDocBackend
|
||||
from docowling.datamodel.base_models import InputFormat
|
||||
from docowling.datamodel.document import InputDocument
|
||||
|
||||
|
||||
def _get_backend(fname):
|
||||
|
@ -3,12 +3,12 @@ from pathlib import Path
|
||||
import pytest
|
||||
from docling_core.types.doc import BoundingBox
|
||||
|
||||
from docling.backend.docling_parse_backend import (
|
||||
from docowling.backend.docling_parse_backend import (
|
||||
DoclingParseDocumentBackend,
|
||||
DoclingParsePageBackend,
|
||||
)
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
from docowling.datamodel.base_models import InputFormat
|
||||
from docowling.datamodel.document import InputDocument
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
@ -2,12 +2,12 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from docling.backend.docling_parse_v2_backend import (
|
||||
from docowling.backend.docling_parse_v2_backend import (
|
||||
DoclingParseV2DocumentBackend,
|
||||
DoclingParseV2PageBackend,
|
||||
)
|
||||
from docling.datamodel.base_models import BoundingBox, InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
from docowling.datamodel.base_models import BoundingBox, InputFormat
|
||||
from docowling.datamodel.document import InputDocument
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
@ -2,14 +2,14 @@ import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from docling.backend.html_backend import HTMLDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import (
|
||||
from docowling.backend.html_backend import HTMLDocumentBackend
|
||||
from docowling.datamodel.base_models import InputFormat
|
||||
from docowling.datamodel.document import (
|
||||
ConversionResult,
|
||||
InputDocument,
|
||||
SectionHeaderItem,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docowling.document_converter import DocumentConverter
|
||||
|
||||
GENERATE = False
|
||||
|
||||
|
@ -2,14 +2,14 @@ import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import (
|
||||
from docowling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docowling.datamodel.base_models import InputFormat
|
||||
from docowling.datamodel.document import (
|
||||
ConversionResult,
|
||||
InputDocument,
|
||||
SectionHeaderItem,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docowling.document_converter import DocumentConverter
|
||||
|
||||
GENERATE = False
|
||||
|
||||
|
@ -2,14 +2,14 @@ import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import (
|
||||
from docowling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docowling.datamodel.base_models import InputFormat
|
||||
from docowling.datamodel.document import (
|
||||
ConversionResult,
|
||||
InputDocument,
|
||||
SectionHeaderItem,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docowling.document_converter import DocumentConverter
|
||||
|
||||
GENERATE = False
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
"""Test methods in module docling.backend.patent_uspto_backend.py."""
|
||||
"""Test methods in module docowling.backend.patent_uspto_backend.py."""
|
||||
|
||||
import json
|
||||
import logging
|
||||
@ -12,14 +12,14 @@ import yaml
|
||||
from docling_core.types import DoclingDocument
|
||||
from docling_core.types.doc import DocItemLabel, TableData, TextItem
|
||||
|
||||
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend, XmlTable
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import (
|
||||
from docowling.backend.xml.uspto_backend import PatentUsptoDocumentBackend, XmlTable
|
||||
from docowling.datamodel.base_models import InputFormat
|
||||
from docowling.datamodel.document import (
|
||||
ConversionResult,
|
||||
InputDocument,
|
||||
SectionHeaderItem,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docowling.document_converter import DocumentConverter
|
||||
|
||||
GENERATE: bool = True
|
||||
DATA_PATH: Path = Path("./tests/data/uspto/")
|
||||
|
@ -3,12 +3,12 @@ from pathlib import Path
|
||||
import pytest
|
||||
from docling_core.types.doc import BoundingBox
|
||||
|
||||
from docling.backend.pypdfium2_backend import (
|
||||
from docowling.backend.pypdfium2_backend import (
|
||||
PyPdfiumDocumentBackend,
|
||||
PyPdfiumPageBackend,
|
||||
)
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
from docowling.datamodel.base_models import InputFormat
|
||||
from docowling.datamodel.document import InputDocument
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
@ -2,9 +2,9 @@ import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docowling.datamodel.base_models import InputFormat
|
||||
from docowling.datamodel.document import ConversionResult
|
||||
from docowling.document_converter import DocumentConverter
|
||||
|
||||
GENERATE = False
|
||||
|
||||
|
@ -6,9 +6,9 @@ from pathlib import Path
|
||||
|
||||
from docling_core.types.doc import DoclingDocument
|
||||
|
||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docowling.datamodel.base_models import DocumentStream, InputFormat
|
||||
from docowling.datamodel.document import ConversionResult
|
||||
from docowling.document_converter import DocumentConverter
|
||||
|
||||
GENERATE = False
|
||||
|
||||
|
@ -2,7 +2,7 @@ from pathlib import Path
|
||||
|
||||
from typer.testing import CliRunner
|
||||
|
||||
from docling.cli.main import app
|
||||
from docowling.cli.main import app
|
||||
|
||||
runner = CliRunner()
|
||||
|
||||
|
@ -1,10 +1,10 @@
|
||||
from pathlib import Path
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docowling.datamodel.base_models import InputFormat
|
||||
from docowling.datamodel.document import ConversionResult
|
||||
from docowling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docowling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
|
||||
|
||||
|
@ -2,10 +2,10 @@ import sys
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docowling.datamodel.base_models import InputFormat
|
||||
from docowling.datamodel.document import ConversionResult
|
||||
from docowling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
OcrMacOptions,
|
||||
OcrOptions,
|
||||
@ -14,7 +14,7 @@ from docling.datamodel.pipeline_options import (
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docowling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
|
||||
|
||||
|
@ -1,9 +1,9 @@
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||
from docling.datamodel.document import InputDocument, _DocumentConversionInput
|
||||
from docowling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docowling.datamodel.base_models import DocumentStream, InputFormat
|
||||
from docowling.datamodel.document import InputDocument, _DocumentConversionInput
|
||||
|
||||
|
||||
def test_in_doc_from_valid_path():
|
||||
@ -40,7 +40,7 @@ def test_in_doc_from_invalid_buf():
|
||||
|
||||
|
||||
def test_guess_format(tmp_path):
|
||||
"""Test docling.datamodel.document._DocumentConversionInput.__guess_format"""
|
||||
"""Test docowling.datamodel.document._DocumentConversionInput.__guess_format"""
|
||||
dci = _DocumentConversionInput(path_or_stream_iterator=[])
|
||||
temp_dir = tmp_path / "test_guess_format"
|
||||
temp_dir.mkdir()
|
||||
|
@ -3,10 +3,10 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docowling.datamodel.base_models import DocumentStream, InputFormat
|
||||
from docowling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docowling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
|
||||
|
||||
|
@ -3,8 +3,8 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from docling.datamodel.base_models import ConversionStatus, DocumentStream
|
||||
from docling.document_converter import ConversionError, DocumentConverter
|
||||
from docowling.datamodel.base_models import ConversionStatus, DocumentStream
|
||||
from docowling.document_converter import ConversionError, DocumentConverter
|
||||
|
||||
|
||||
def get_pdf_path():
|
||||
|
@ -3,9 +3,9 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docowling.datamodel.base_models import InputFormat
|
||||
from docowling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docowling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
@ -3,16 +3,16 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
from docowling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docowling.datamodel.base_models import ConversionStatus, InputFormat
|
||||
from docowling.datamodel.document import ConversionResult
|
||||
from docowling.datamodel.pipeline_options import (
|
||||
AcceleratorDevice,
|
||||
AcceleratorOptions,
|
||||
PdfPipelineOptions,
|
||||
TableFormerMode,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docowling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
@ -8,8 +8,8 @@ from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocu
|
||||
from pydantic import TypeAdapter
|
||||
from pydantic.json import pydantic_encoder
|
||||
|
||||
from docling.datamodel.base_models import ConversionStatus, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docowling.datamodel.base_models import ConversionStatus, Page
|
||||
from docowling.datamodel.document import ConversionResult
|
||||
|
||||
|
||||
def levenshtein(str1: str, str2: str) -> int:
|
||||
|
Loading…
Reference in New Issue
Block a user