mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Merge from simplify-conv-api
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
commit
d0fccb9342
@ -13,7 +13,7 @@ from docling_core.utils.file import resolve_file_source
|
|||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
EasyOcrOptions,
|
EasyOcrOptions,
|
||||||
PdfPipelineOptions,
|
PdfPipelineOptions,
|
||||||
@ -231,12 +231,9 @@ def convert(
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
# Define input files
|
|
||||||
input = DocumentConversionInput.from_paths(input_doc_paths)
|
|
||||||
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
conv_results = doc_converter.convert_batch(input)
|
conv_results = doc_converter.convert_all(input_doc_paths)
|
||||||
|
|
||||||
output.mkdir(parents=True, exist_ok=True)
|
output.mkdir(parents=True, exist_ok=True)
|
||||||
export_documents(
|
export_documents(
|
||||||
|
@ -19,6 +19,7 @@ from docling_core.types.experimental import (
|
|||||||
DocItemLabel,
|
DocItemLabel,
|
||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
)
|
)
|
||||||
|
from docling_core.utils.file import resolve_file_source
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from typing_extensions import deprecated
|
from typing_extensions import deprecated
|
||||||
|
|
||||||
@ -158,8 +159,7 @@ class DocumentFormat(str, Enum):
|
|||||||
V1 = "v1"
|
V1 = "v1"
|
||||||
|
|
||||||
|
|
||||||
@deprecated("Use `ConversionResult` instead.")
|
class ConversionResult(BaseModel):
|
||||||
class ConvertedDocument(BaseModel):
|
|
||||||
input: InputDocument
|
input: InputDocument
|
||||||
|
|
||||||
status: ConversionStatus = ConversionStatus.PENDING # failure, success
|
status: ConversionStatus = ConversionStatus.PENDING # failure, success
|
||||||
@ -471,20 +471,16 @@ class ConvertedDocument(BaseModel):
|
|||||||
yield element, cropped_im
|
yield element, cropped_im
|
||||||
|
|
||||||
|
|
||||||
class ConversionResult(ConvertedDocument):
|
class _DocumentConversionInput(BaseModel):
|
||||||
pass
|
|
||||||
|
|
||||||
|
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
|
||||||
class DocumentConversionInput(BaseModel):
|
|
||||||
|
|
||||||
_path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
|
|
||||||
limits: Optional[DocumentLimits] = DocumentLimits()
|
limits: Optional[DocumentLimits] = DocumentLimits()
|
||||||
|
|
||||||
def docs(
|
def docs(
|
||||||
self, format_options: Dict[InputFormat, "FormatOption"]
|
self, format_options: Dict[InputFormat, "FormatOption"]
|
||||||
) -> Iterable[InputDocument]:
|
) -> Iterable[InputDocument]:
|
||||||
|
for item in self.path_or_stream_iterator:
|
||||||
for obj in self._path_or_stream_iterator:
|
obj = resolve_file_source(item) if isinstance(item, str) else item
|
||||||
format = self._guess_format(obj)
|
format = self._guess_format(obj)
|
||||||
if format not in format_options.keys():
|
if format not in format_options.keys():
|
||||||
_log.debug(
|
_log.debug(
|
||||||
@ -510,6 +506,8 @@ class DocumentConversionInput(BaseModel):
|
|||||||
limits=self.limits,
|
limits=self.limits,
|
||||||
backend=backend,
|
backend=backend,
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
|
raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
|
||||||
|
|
||||||
def _guess_format(self, obj):
|
def _guess_format(self, obj):
|
||||||
content = None
|
content = None
|
||||||
@ -545,21 +543,3 @@ class DocumentConversionInput(BaseModel):
|
|||||||
return "text/html"
|
return "text/html"
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
|
|
||||||
paths = [Path(p) for p in paths]
|
|
||||||
|
|
||||||
doc_input = cls(limits=limits)
|
|
||||||
doc_input._path_or_stream_iterator = paths
|
|
||||||
|
|
||||||
return doc_input
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_streams(
|
|
||||||
cls, streams: Iterable[DocumentStream], limits: Optional[DocumentLimits] = None
|
|
||||||
):
|
|
||||||
doc_input = cls(limits=limits)
|
|
||||||
doc_input._path_or_stream_iterator = streams
|
|
||||||
|
|
||||||
return doc_input
|
|
||||||
|
@ -75,19 +75,4 @@ class PdfPipelineOptions(PipelineOptions):
|
|||||||
Field(EasyOcrOptions(), discriminator="kind")
|
Field(EasyOcrOptions(), discriminator="kind")
|
||||||
)
|
)
|
||||||
|
|
||||||
keep_page_images: Annotated[
|
|
||||||
bool,
|
|
||||||
Field(
|
|
||||||
deprecated="`keep_page_images` is depreacted, set the value of `images_scale` instead"
|
|
||||||
),
|
|
||||||
] = False # False: page images are removed in the assemble step
|
|
||||||
images_scale: Optional[float] = None # if set, the scale for generated images
|
images_scale: Optional[float] = None # if set, the scale for generated images
|
||||||
|
|
||||||
@model_validator(mode="after")
|
|
||||||
def set_page_images_from_deprecated(self) -> "PdfPipelineOptions":
|
|
||||||
with warnings.catch_warnings():
|
|
||||||
warnings.simplefilter("ignore", DeprecationWarning)
|
|
||||||
default_scale = 1.0
|
|
||||||
if self.keep_page_images and self.images_scale is None:
|
|
||||||
self.images_scale = default_scale
|
|
||||||
return self
|
|
||||||
|
@ -1,33 +1,24 @@
|
|||||||
import logging
|
import logging
|
||||||
import tempfile
|
import sys
|
||||||
import time
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, Iterable, List, Optional, Type
|
from typing import Dict, Iterable, List, Optional, Type
|
||||||
|
|
||||||
import requests
|
from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
||||||
from pydantic import (
|
|
||||||
AnyHttpUrl,
|
|
||||||
BaseModel,
|
|
||||||
ConfigDict,
|
|
||||||
TypeAdapter,
|
|
||||||
ValidationError,
|
|
||||||
model_validator,
|
|
||||||
)
|
|
||||||
from typing_extensions import deprecated
|
|
||||||
|
|
||||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.backend.html_backend import HTMLDocumentBackend
|
from docling.backend.html_backend import HTMLDocumentBackend
|
||||||
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
||||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
|
||||||
from docling.datamodel.document import (
|
from docling.datamodel.document import (
|
||||||
ConversionResult,
|
ConversionResult,
|
||||||
DocumentConversionInput,
|
|
||||||
InputDocument,
|
InputDocument,
|
||||||
|
_DocumentConversionInput,
|
||||||
)
|
)
|
||||||
from docling.datamodel.pipeline_options import PipelineOptions
|
from docling.datamodel.pipeline_options import PipelineOptions
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import DocumentLimits, settings
|
||||||
from docling.pipeline.base_model_pipeline import AbstractModelPipeline
|
from docling.pipeline.base_model_pipeline import AbstractModelPipeline
|
||||||
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
|
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
|
||||||
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
|
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
|
||||||
@ -118,16 +109,56 @@ class DocumentConverter:
|
|||||||
Type[AbstractModelPipeline], AbstractModelPipeline
|
Type[AbstractModelPipeline], AbstractModelPipeline
|
||||||
] = {}
|
] = {}
|
||||||
|
|
||||||
@deprecated("Use convert_batch instead.")
|
@validate_call(config=ConfigDict(strict=True))
|
||||||
def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]:
|
def convert(
|
||||||
yield from self.convert_batch(input=input)
|
self,
|
||||||
|
source: Path | str | DocumentStream, # TODO review naming
|
||||||
|
raises_on_error: bool = True,
|
||||||
|
max_num_pages: int = sys.maxsize,
|
||||||
|
max_file_size: int = sys.maxsize,
|
||||||
|
) -> ConversionResult:
|
||||||
|
|
||||||
def convert_batch(
|
all_res = self.convert_all(
|
||||||
self, input: DocumentConversionInput, raise_on_error: bool = False
|
source=[source],
|
||||||
|
raises_on_error=raises_on_error,
|
||||||
|
max_num_pages=max_num_pages,
|
||||||
|
max_file_size=max_file_size,
|
||||||
|
)
|
||||||
|
return next(all_res)
|
||||||
|
|
||||||
|
@validate_call(config=ConfigDict(strict=True))
|
||||||
|
def convert_all(
|
||||||
|
self,
|
||||||
|
source: Iterable[Path | str | DocumentStream], # TODO review naming
|
||||||
|
raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
|
||||||
|
max_num_pages: int = sys.maxsize,
|
||||||
|
max_file_size: int = sys.maxsize,
|
||||||
) -> Iterable[ConversionResult]:
|
) -> Iterable[ConversionResult]:
|
||||||
|
limits = DocumentLimits(
|
||||||
|
max_num_pages=max_num_pages,
|
||||||
|
max_file_size=max_file_size,
|
||||||
|
)
|
||||||
|
conv_input = _DocumentConversionInput(
|
||||||
|
path_or_stream_iterator=source,
|
||||||
|
limit=limits,
|
||||||
|
)
|
||||||
|
conv_res_iter = self._convert(conv_input)
|
||||||
|
for conv_res in conv_res_iter:
|
||||||
|
if raises_on_error and conv_res.status not in {
|
||||||
|
ConversionStatus.SUCCESS,
|
||||||
|
ConversionStatus.PARTIAL_SUCCESS,
|
||||||
|
}:
|
||||||
|
raise RuntimeError(
|
||||||
|
f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
yield conv_res
|
||||||
|
|
||||||
|
def _convert(
|
||||||
|
self, conv_input: _DocumentConversionInput
|
||||||
|
) -> Iterable[ConversionResult]:
|
||||||
for input_batch in chunkify(
|
for input_batch in chunkify(
|
||||||
input.docs(self.format_to_options),
|
conv_input.docs(self.format_to_options),
|
||||||
settings.perf.doc_batch_size, # pass format_options
|
settings.perf.doc_batch_size, # pass format_options
|
||||||
):
|
):
|
||||||
_log.info(f"Going to convert document batch...")
|
_log.info(f"Going to convert document batch...")
|
||||||
@ -142,58 +173,6 @@ class DocumentConverter:
|
|||||||
if item is not None:
|
if item is not None:
|
||||||
yield item
|
yield item
|
||||||
|
|
||||||
def convert_single(
|
|
||||||
self, source: Path | AnyHttpUrl | str, raise_on_error: bool = False
|
|
||||||
) -> ConversionResult:
|
|
||||||
"""Convert a single document.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
source (Path | AnyHttpUrl | str): The PDF input source. Can be a path or URL.
|
|
||||||
|
|
||||||
Raises:
|
|
||||||
ValueError: If source is of unexpected type.
|
|
||||||
RuntimeError: If conversion fails.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
ConversionResult: The conversion result object.
|
|
||||||
"""
|
|
||||||
with tempfile.TemporaryDirectory() as temp_dir:
|
|
||||||
try:
|
|
||||||
http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
|
|
||||||
res = requests.get(http_url, stream=True)
|
|
||||||
res.raise_for_status()
|
|
||||||
fname = None
|
|
||||||
# try to get filename from response header
|
|
||||||
if cont_disp := res.headers.get("Content-Disposition"):
|
|
||||||
for par in cont_disp.strip().split(";"):
|
|
||||||
# currently only handling directive "filename" (not "*filename")
|
|
||||||
if (split := par.split("=")) and split[0].strip() == "filename":
|
|
||||||
fname = "=".join(split[1:]).strip().strip("'\"") or None
|
|
||||||
break
|
|
||||||
# otherwise, use name from URL:
|
|
||||||
if fname is None:
|
|
||||||
fname = Path(http_url.path).name or self._default_download_filename
|
|
||||||
local_path = Path(temp_dir) / fname
|
|
||||||
with open(local_path, "wb") as f:
|
|
||||||
for chunk in res.iter_content(chunk_size=1024): # using 1-KB chunks
|
|
||||||
f.write(chunk)
|
|
||||||
except ValidationError:
|
|
||||||
try:
|
|
||||||
local_path = TypeAdapter(Path).validate_python(source)
|
|
||||||
except ValidationError:
|
|
||||||
raise ValueError(
|
|
||||||
f"Unexpected file path type encountered: {type(source)}"
|
|
||||||
)
|
|
||||||
conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
|
|
||||||
conv_res_iter = self.convert_batch(conv_inp)
|
|
||||||
conv_res: ConversionResult = next(conv_res_iter)
|
|
||||||
if conv_res.status not in {
|
|
||||||
ConversionStatus.SUCCESS,
|
|
||||||
ConversionStatus.PARTIAL_SUCCESS,
|
|
||||||
}:
|
|
||||||
raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
|
|
||||||
return conv_res
|
|
||||||
|
|
||||||
def _get_pipeline(self, doc: InputDocument) -> Optional[AbstractModelPipeline]:
|
def _get_pipeline(self, doc: InputDocument) -> Optional[AbstractModelPipeline]:
|
||||||
fopt = self.format_to_options.get(doc.format)
|
fopt = self.format_to_options.get(doc.format)
|
||||||
|
|
||||||
|
@ -14,23 +14,26 @@ from docling_core.types import Ref
|
|||||||
from docling_core.types.experimental import BoundingBox, CoordOrigin
|
from docling_core.types.experimental import BoundingBox, CoordOrigin
|
||||||
from docling_core.types.experimental.document import DoclingDocument
|
from docling_core.types.experimental.document import DoclingDocument
|
||||||
from PIL import ImageDraw
|
from PIL import ImageDraw
|
||||||
|
from pydantic import BaseModel, ConfigDict
|
||||||
|
|
||||||
from docling.datamodel.base_models import Cluster
|
from docling.datamodel.base_models import Cluster
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
|
|
||||||
|
|
||||||
class GlmModel:
|
class GlmOptions(BaseModel):
|
||||||
def __init__(self, config):
|
model_config = ConfigDict(protected_namespaces=())
|
||||||
self.config = config
|
|
||||||
self.create_legacy_output = config.get("create_legacy_output", True)
|
create_legacy_output: bool = True
|
||||||
|
model_names: str = "" # e.g. "language;term;reference"
|
||||||
|
|
||||||
|
|
||||||
|
class GlmModel:
|
||||||
|
def __init__(self, options: GlmOptions):
|
||||||
|
self.options = options
|
||||||
|
self.create_legacy_output = self.options.create_legacy_output
|
||||||
|
|
||||||
self.model_names = self.config.get(
|
|
||||||
"model_names", ""
|
|
||||||
) # "language;term;reference"
|
|
||||||
load_pretrained_nlp_models()
|
load_pretrained_nlp_models()
|
||||||
# model = init_nlp_model(model_names="language;term;reference")
|
self.model = init_nlp_model(model_names=self.options.model_names)
|
||||||
model = init_nlp_model(model_names=self.model_names)
|
|
||||||
self.model = model
|
|
||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
self, conv_res: ConversionResult
|
self, conv_res: ConversionResult
|
||||||
|
@ -2,6 +2,7 @@ import copy
|
|||||||
import logging
|
import logging
|
||||||
import random
|
import random
|
||||||
import time
|
import time
|
||||||
|
from pathlib import Path
|
||||||
from typing import Iterable, List
|
from typing import Iterable, List
|
||||||
|
|
||||||
from docling_core.types.experimental import CoordOrigin
|
from docling_core.types.experimental import CoordOrigin
|
||||||
@ -43,11 +44,8 @@ class LayoutModel(AbstractPageModel):
|
|||||||
FIGURE_LABEL = DocItemLabel.PICTURE
|
FIGURE_LABEL = DocItemLabel.PICTURE
|
||||||
FORMULA_LABEL = DocItemLabel.FORMULA
|
FORMULA_LABEL = DocItemLabel.FORMULA
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, artifacts_path: Path):
|
||||||
self.config = config
|
self.layout_predictor = LayoutPredictor(artifacts_path) # TODO temporary
|
||||||
self.layout_predictor = LayoutPredictor(
|
|
||||||
config["artifacts_path"]
|
|
||||||
) # TODO temporary
|
|
||||||
|
|
||||||
def postprocess(self, clusters: List[Cluster], cells: List[Cell], page_height):
|
def postprocess(self, clusters: List[Cluster], cells: List[Cell], page_height):
|
||||||
MIN_INTERSECTION = 0.2
|
MIN_INTERSECTION = 0.2
|
||||||
|
@ -2,6 +2,8 @@ import logging
|
|||||||
import re
|
import re
|
||||||
from typing import Iterable, List
|
from typing import Iterable, List
|
||||||
|
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from docling.datamodel.base_models import (
|
from docling.datamodel.base_models import (
|
||||||
AssembledUnit,
|
AssembledUnit,
|
||||||
FigureElement,
|
FigureElement,
|
||||||
@ -16,9 +18,13 @@ from docling.models.layout_model import LayoutModel
|
|||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class PageAssembleOptions(BaseModel):
|
||||||
|
keep_images: bool = False
|
||||||
|
|
||||||
|
|
||||||
class PageAssembleModel(AbstractPageModel):
|
class PageAssembleModel(AbstractPageModel):
|
||||||
def __init__(self, config):
|
def __init__(self, options: PageAssembleOptions):
|
||||||
self.config = config
|
self.options = options
|
||||||
|
|
||||||
def sanitize_text(self, lines):
|
def sanitize_text(self, lines):
|
||||||
if len(lines) <= 1:
|
if len(lines) <= 1:
|
||||||
@ -147,7 +153,7 @@ class PageAssembleModel(AbstractPageModel):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Remove page images (can be disabled)
|
# Remove page images (can be disabled)
|
||||||
if self.config["images_scale"] is None:
|
if not self.options.keep_images:
|
||||||
page._image_cache = {}
|
page._image_cache = {}
|
||||||
|
|
||||||
# Unload backend
|
# Unload backend
|
||||||
|
@ -1,14 +1,19 @@
|
|||||||
from typing import Iterable
|
from typing import Iterable, Optional
|
||||||
|
|
||||||
from PIL import ImageDraw
|
from PIL import ImageDraw
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from docling.datamodel.base_models import Page
|
from docling.datamodel.base_models import Page
|
||||||
from docling.models.abstract_model import AbstractPageModel
|
from docling.models.abstract_model import AbstractPageModel
|
||||||
|
|
||||||
|
|
||||||
|
class PagePreprocessingOptions(BaseModel):
|
||||||
|
images_scale: Optional[float]
|
||||||
|
|
||||||
|
|
||||||
class PagePreprocessingModel(AbstractPageModel):
|
class PagePreprocessingModel(AbstractPageModel):
|
||||||
def __init__(self, config):
|
def __init__(self, options: PagePreprocessingOptions):
|
||||||
self.config = config
|
self.options = options
|
||||||
|
|
||||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||||
for page in page_batch:
|
for page in page_batch:
|
||||||
@ -23,7 +28,7 @@ class PagePreprocessingModel(AbstractPageModel):
|
|||||||
scale=1.0
|
scale=1.0
|
||||||
) # puts the page image on the image cache at default scale
|
) # puts the page image on the image cache at default scale
|
||||||
|
|
||||||
images_scale = self.config["images_scale"]
|
images_scale = self.options.images_scale
|
||||||
# user requested scales
|
# user requested scales
|
||||||
if images_scale is not None:
|
if images_scale is not None:
|
||||||
page._default_image_scale = images_scale
|
page._default_image_scale = images_scale
|
||||||
|
@ -10,19 +10,21 @@ from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredic
|
|||||||
from PIL import ImageDraw
|
from PIL import ImageDraw
|
||||||
|
|
||||||
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
|
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
|
||||||
from docling.datamodel.pipeline_options import TableFormerMode
|
from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
|
||||||
from docling.models.abstract_model import AbstractPageModel
|
from docling.models.abstract_model import AbstractPageModel
|
||||||
|
|
||||||
|
|
||||||
class TableStructureModel(AbstractPageModel):
|
class TableStructureModel(AbstractPageModel):
|
||||||
def __init__(self, config):
|
def __init__(
|
||||||
self.config = config
|
self, enabled: bool, artifacts_path: Path, options: TableStructureOptions
|
||||||
self.do_cell_matching = config["do_cell_matching"]
|
):
|
||||||
self.mode = config["mode"]
|
self.options = options
|
||||||
|
self.do_cell_matching = self.options.do_cell_matching
|
||||||
|
self.mode = self.options.mode
|
||||||
|
|
||||||
self.enabled = config["enabled"]
|
self.enabled = enabled
|
||||||
if self.enabled:
|
if self.enabled:
|
||||||
artifacts_path: Path = config["artifacts_path"]
|
artifacts_path: Path = artifacts_path
|
||||||
|
|
||||||
if self.mode == TableFormerMode.ACCURATE:
|
if self.mode == TableFormerMode.ACCURATE:
|
||||||
artifacts_path = artifacts_path / "fat"
|
artifacts_path = artifacts_path / "fat"
|
||||||
|
@ -13,11 +13,14 @@ from docling.datamodel.pipeline_options import (
|
|||||||
TesseractOcrOptions,
|
TesseractOcrOptions,
|
||||||
)
|
)
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
from docling.models.base_ocr_model import BaseOcrModel
|
||||||
from docling.models.ds_glm_model import GlmModel
|
from docling.models.ds_glm_model import GlmModel, GlmOptions
|
||||||
from docling.models.easyocr_model import EasyOcrModel
|
from docling.models.easyocr_model import EasyOcrModel
|
||||||
from docling.models.layout_model import LayoutModel
|
from docling.models.layout_model import LayoutModel
|
||||||
from docling.models.page_assemble_model import PageAssembleModel
|
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
|
||||||
from docling.models.page_preprocessing_model import PagePreprocessingModel
|
from docling.models.page_preprocessing_model import (
|
||||||
|
PagePreprocessingModel,
|
||||||
|
PagePreprocessingOptions,
|
||||||
|
)
|
||||||
from docling.models.table_structure_model import TableStructureModel
|
from docling.models.table_structure_model import TableStructureModel
|
||||||
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
||||||
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
||||||
@ -32,57 +35,50 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
|
|||||||
|
|
||||||
def __init__(self, pipeline_options: PdfPipelineOptions):
|
def __init__(self, pipeline_options: PdfPipelineOptions):
|
||||||
super().__init__(pipeline_options)
|
super().__init__(pipeline_options)
|
||||||
|
self.pipeline_options: PdfPipelineOptions
|
||||||
|
|
||||||
if not pipeline_options.artifacts_path:
|
if not pipeline_options.artifacts_path:
|
||||||
artifacts_path = self.download_models_hf()
|
artifacts_path = self.download_models_hf()
|
||||||
|
|
||||||
self.artifacts_path = Path(artifacts_path)
|
self.artifacts_path = Path(artifacts_path)
|
||||||
self.glm_model = GlmModel(
|
self.glm_model = GlmModel(
|
||||||
config={"create_legacy_output": pipeline_options.create_legacy_output}
|
options=GlmOptions(
|
||||||
|
create_legacy_output=pipeline_options.create_legacy_output
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
ocr_model: BaseOcrModel
|
if (ocr_model := self.get_ocr_model()) is None:
|
||||||
if isinstance(pipeline_options.ocr_options, EasyOcrOptions):
|
|
||||||
ocr_model = EasyOcrModel(
|
|
||||||
enabled=pipeline_options.do_ocr,
|
|
||||||
options=pipeline_options.ocr_options,
|
|
||||||
)
|
|
||||||
elif isinstance(pipeline_options.ocr_options, TesseractCliOcrOptions):
|
|
||||||
ocr_model = TesseractOcrCliModel(
|
|
||||||
enabled=pipeline_options.do_ocr,
|
|
||||||
options=pipeline_options.ocr_options,
|
|
||||||
)
|
|
||||||
elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions):
|
|
||||||
ocr_model = TesseractOcrModel(
|
|
||||||
enabled=pipeline_options.do_ocr,
|
|
||||||
options=pipeline_options.ocr_options,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
|
f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
|
||||||
)
|
)
|
||||||
|
|
||||||
self.model_pipe = [
|
self.model_pipe = [
|
||||||
|
# Pre-processing
|
||||||
PagePreprocessingModel(
|
PagePreprocessingModel(
|
||||||
config={"images_scale": pipeline_options.images_scale}
|
options=PagePreprocessingOptions(
|
||||||
|
images_scale=pipeline_options.images_scale
|
||||||
|
)
|
||||||
),
|
),
|
||||||
|
# OCR
|
||||||
ocr_model,
|
ocr_model,
|
||||||
|
# Layout model
|
||||||
LayoutModel(
|
LayoutModel(
|
||||||
config={
|
artifacts_path=artifacts_path
|
||||||
"artifacts_path": artifacts_path
|
/ StandardPdfModelPipeline._layout_model_path
|
||||||
/ StandardPdfModelPipeline._layout_model_path
|
|
||||||
}
|
|
||||||
),
|
),
|
||||||
|
# Table structure model
|
||||||
TableStructureModel(
|
TableStructureModel(
|
||||||
config={
|
enabled=pipeline_options.do_table_structure,
|
||||||
"artifacts_path": artifacts_path
|
artifacts_path=artifacts_path
|
||||||
/ StandardPdfModelPipeline._table_model_path,
|
/ StandardPdfModelPipeline._table_model_path,
|
||||||
"enabled": pipeline_options.do_table_structure,
|
options=pipeline_options.table_structure_options,
|
||||||
"do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
|
),
|
||||||
"mode": pipeline_options.table_structure_options.mode,
|
# Page assemble
|
||||||
}
|
PageAssembleModel(
|
||||||
|
options=PageAssembleOptions(
|
||||||
|
keep_images=pipeline_options.images_scale is not None
|
||||||
|
)
|
||||||
),
|
),
|
||||||
PageAssembleModel(config={"images_scale": pipeline_options.images_scale}),
|
|
||||||
]
|
]
|
||||||
|
|
||||||
self.enrichment_pipe = [
|
self.enrichment_pipe = [
|
||||||
@ -104,6 +100,24 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
|
|||||||
|
|
||||||
return Path(download_path)
|
return Path(download_path)
|
||||||
|
|
||||||
|
def get_ocr_model(self) -> Optional[BaseOcrModel]:
|
||||||
|
if isinstance(self.pipeline_options.ocr_options, EasyOcrOptions):
|
||||||
|
return EasyOcrModel(
|
||||||
|
enabled=self.pipeline_options.do_ocr,
|
||||||
|
options=self.pipeline_options.ocr_options,
|
||||||
|
)
|
||||||
|
elif isinstance(self.pipeline_options.ocr_options, TesseractCliOcrOptions):
|
||||||
|
return TesseractOcrCliModel(
|
||||||
|
enabled=self.pipeline_options.do_ocr,
|
||||||
|
options=self.pipeline_options.ocr_options,
|
||||||
|
)
|
||||||
|
elif isinstance(self.pipeline_options.ocr_options, TesseractOcrOptions):
|
||||||
|
return TesseractOcrModel(
|
||||||
|
enabled=self.pipeline_options.do_ocr,
|
||||||
|
options=self.pipeline_options.ocr_options,
|
||||||
|
)
|
||||||
|
return None
|
||||||
|
|
||||||
def initialize_page(self, doc: InputDocument, page: Page) -> Page:
|
def initialize_page(self, doc: InputDocument, page: Page) -> Page:
|
||||||
page._backend = doc._backend.load_page(page.page_no)
|
page._backend = doc._backend.load_page(page.page_no)
|
||||||
page.size = page._backend.get_size()
|
page.size = page._backend.get_size()
|
||||||
|
@ -7,7 +7,7 @@ from typing import Iterable
|
|||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
from docling.datamodel.base_models import ConversionStatus
|
from docling.datamodel.base_models import ConversionStatus
|
||||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.document_converter import DocumentConverter
|
from docling.document_converter import DocumentConverter
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
@ -125,18 +125,19 @@ def main():
|
|||||||
|
|
||||||
doc_converter = DocumentConverter()
|
doc_converter = DocumentConverter()
|
||||||
|
|
||||||
input = DocumentConversionInput.from_paths(input_doc_paths)
|
|
||||||
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
conv_results = doc_converter.convert_batch(input)
|
conv_results = doc_converter.convert_all(
|
||||||
|
input_doc_paths,
|
||||||
|
raises_on_error=False, # to let conversion run through all and examine results at the end
|
||||||
|
)
|
||||||
success_count, partial_success_count, failure_count = export_documents(
|
success_count, partial_success_count, failure_count = export_documents(
|
||||||
conv_results, output_dir=Path("./scratch")
|
conv_results, output_dir=Path("./scratch")
|
||||||
)
|
)
|
||||||
|
|
||||||
end_time = time.time() - start_time
|
end_time = time.time() - start_time
|
||||||
|
|
||||||
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
|
_log.info(f"Document conversion complete in {end_time:.2f} seconds.")
|
||||||
|
|
||||||
if failure_count > 0:
|
if failure_count > 0:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
|
@ -5,9 +5,14 @@ from pathlib import Path
|
|||||||
from typing import Iterable
|
from typing import Iterable
|
||||||
|
|
||||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docling.datamodel.pipeline_options import (
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
PdfPipelineOptions,
|
||||||
|
TesseractCliOcrOptions,
|
||||||
|
TesseractOcrOptions,
|
||||||
|
)
|
||||||
|
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
||||||
|
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -60,9 +65,7 @@ def export_documents(
|
|||||||
def main():
|
def main():
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
|
||||||
input_doc_paths = [
|
input_doc_path = Path("./tests/data/2206.01062.pdf")
|
||||||
Path("./tests/data/2206.01062.pdf"),
|
|
||||||
]
|
|
||||||
|
|
||||||
###########################################################################
|
###########################################################################
|
||||||
|
|
||||||
@ -147,24 +150,13 @@ def main():
|
|||||||
|
|
||||||
###########################################################################
|
###########################################################################
|
||||||
|
|
||||||
# Define input files
|
|
||||||
input = DocumentConversionInput.from_paths(input_doc_paths)
|
|
||||||
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
conv_results = doc_converter.convert_batch(input)
|
conv_result = doc_converter.convert(input_doc_path)
|
||||||
success_count, failure_count = export_documents(
|
|
||||||
conv_results, output_dir=Path("./scratch")
|
|
||||||
)
|
|
||||||
|
|
||||||
end_time = time.time() - start_time
|
end_time = time.time() - start_time
|
||||||
|
|
||||||
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
|
_log.info(f"Document converted in {end_time:.2f} seconds.")
|
||||||
|
|
||||||
if failure_count > 0:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
@ -2,13 +2,7 @@ import logging
|
|||||||
import time
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from docling.datamodel.base_models import (
|
from docling.datamodel.base_models import FigureElement, InputFormat, Table
|
||||||
ConversionStatus,
|
|
||||||
FigureElement,
|
|
||||||
InputFormat,
|
|
||||||
Table,
|
|
||||||
)
|
|
||||||
from docling.datamodel.document import DocumentConversionInput
|
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
|
||||||
@ -20,13 +14,9 @@ IMAGE_RESOLUTION_SCALE = 2.0
|
|||||||
def main():
|
def main():
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
|
||||||
input_doc_paths = [
|
input_doc_path = Path("./tests/data/2206.01062.pdf")
|
||||||
Path("./tests/data/2206.01062.pdf"),
|
|
||||||
]
|
|
||||||
output_dir = Path("./scratch")
|
output_dir = Path("./scratch")
|
||||||
|
|
||||||
input_files = DocumentConversionInput.from_paths(input_doc_paths)
|
|
||||||
|
|
||||||
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
|
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
|
||||||
# will destroy them for cleaning up memory.
|
# will destroy them for cleaning up memory.
|
||||||
# This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
|
# This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
|
||||||
@ -42,46 +32,29 @@ def main():
|
|||||||
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
conv_results = doc_converter.convert_batch(input_files)
|
conv_res = doc_converter.convert(input_doc_path)
|
||||||
|
|
||||||
success_count = 0
|
|
||||||
failure_count = 0
|
|
||||||
output_dir.mkdir(parents=True, exist_ok=True)
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
for conv_res in conv_results:
|
doc_filename = conv_res.input.file.stem
|
||||||
if conv_res.status != ConversionStatus.SUCCESS:
|
|
||||||
_log.info(f"Document {conv_res.input.file} failed to convert.")
|
|
||||||
failure_count += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
doc_filename = conv_res.input.file.stem
|
# Export page images
|
||||||
|
for page in conv_res.pages:
|
||||||
|
page_no = page.page_no + 1
|
||||||
|
page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
|
||||||
|
with page_image_filename.open("wb") as fp:
|
||||||
|
page.image.save(fp, format="PNG")
|
||||||
|
|
||||||
# Export page images
|
# Export figures and tables
|
||||||
for page in conv_res.pages:
|
for element, image in conv_res.render_element_images(
|
||||||
page_no = page.page_no + 1
|
element_types=(FigureElement, Table)
|
||||||
page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
|
):
|
||||||
with page_image_filename.open("wb") as fp:
|
element_image_filename = output_dir / f"{doc_filename}-element-{element.id}.png"
|
||||||
page.image.save(fp, format="PNG")
|
with element_image_filename.open("wb") as fp:
|
||||||
|
image.save(fp, "PNG")
|
||||||
# Export figures and tables
|
|
||||||
for element, image in conv_res.render_element_images(
|
|
||||||
element_types=(FigureElement, Table)
|
|
||||||
):
|
|
||||||
element_image_filename = (
|
|
||||||
output_dir / f"{doc_filename}-element-{element.id}.png"
|
|
||||||
)
|
|
||||||
with element_image_filename.open("wb") as fp:
|
|
||||||
image.save(fp, "PNG")
|
|
||||||
|
|
||||||
success_count += 1
|
|
||||||
|
|
||||||
end_time = time.time() - start_time
|
end_time = time.time() - start_time
|
||||||
|
|
||||||
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
|
_log.info(f"Document converted and figures exported in {end_time:.2f} seconds.")
|
||||||
|
|
||||||
if failure_count > 0:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
@ -5,8 +5,7 @@ from pathlib import Path
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import DocumentConversionInput
|
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
from docling.utils.export import generate_multimodal_pages
|
from docling.utils.export import generate_multimodal_pages
|
||||||
@ -19,13 +18,9 @@ IMAGE_RESOLUTION_SCALE = 2.0
|
|||||||
def main():
|
def main():
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
|
||||||
input_doc_paths = [
|
input_doc_path = Path("./tests/data/2206.01062.pdf")
|
||||||
Path("./tests/data/2206.01062.pdf"),
|
|
||||||
]
|
|
||||||
output_dir = Path("./scratch")
|
output_dir = Path("./scratch")
|
||||||
|
|
||||||
input_files = DocumentConversionInput.from_paths(input_doc_paths)
|
|
||||||
|
|
||||||
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
|
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
|
||||||
# will destroy them for cleaning up memory.
|
# will destroy them for cleaning up memory.
|
||||||
# This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
|
# This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
|
||||||
@ -41,53 +36,45 @@ def main():
|
|||||||
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
converted_docs = doc_converter.convert_batch(input_files)
|
conv_res = doc_converter.convert(input_doc_path)
|
||||||
|
|
||||||
success_count = 0
|
|
||||||
failure_count = 0
|
|
||||||
output_dir.mkdir(parents=True, exist_ok=True)
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
for doc in converted_docs:
|
|
||||||
if doc.status != ConversionStatus.SUCCESS:
|
|
||||||
_log.info(f"Document {doc.input.file} failed to convert.")
|
|
||||||
failure_count += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
rows = []
|
rows = []
|
||||||
for (
|
for (
|
||||||
content_text,
|
content_text,
|
||||||
content_md,
|
content_md,
|
||||||
content_dt,
|
content_dt,
|
||||||
page_cells,
|
page_cells,
|
||||||
page_segments,
|
page_segments,
|
||||||
page,
|
page,
|
||||||
) in generate_multimodal_pages(doc):
|
) in generate_multimodal_pages(conv_res):
|
||||||
|
|
||||||
dpi = page._default_image_scale * 72
|
dpi = page._default_image_scale * 72
|
||||||
|
|
||||||
rows.append(
|
rows.append(
|
||||||
{
|
{
|
||||||
"document": doc.input.file.name,
|
"document": conv_res.input.file.name,
|
||||||
"hash": doc.input.document_hash,
|
"hash": conv_res.input.document_hash,
|
||||||
"page_hash": page.page_hash,
|
"page_hash": page.page_hash,
|
||||||
"image": {
|
"image": {
|
||||||
"width": page.image.width,
|
"width": page.image.width,
|
||||||
"height": page.image.height,
|
"height": page.image.height,
|
||||||
"bytes": page.image.tobytes(),
|
"bytes": page.image.tobytes(),
|
||||||
},
|
},
|
||||||
"cells": page_cells,
|
"cells": page_cells,
|
||||||
"contents": content_text,
|
"contents": content_text,
|
||||||
"contents_md": content_md,
|
"contents_md": content_md,
|
||||||
"contents_dt": content_dt,
|
"contents_dt": content_dt,
|
||||||
"segments": page_segments,
|
"segments": page_segments,
|
||||||
"extra": {
|
"extra": {
|
||||||
"page_num": page.page_no + 1,
|
"page_num": page.page_no + 1,
|
||||||
"width_in_points": page.size.width,
|
"width_in_points": page.size.width,
|
||||||
"height_in_points": page.size.height,
|
"height_in_points": page.size.height,
|
||||||
"dpi": dpi,
|
"dpi": dpi,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
success_count += 1
|
|
||||||
|
|
||||||
# Generate one parquet from all documents
|
# Generate one parquet from all documents
|
||||||
df = pd.json_normalize(rows)
|
df = pd.json_normalize(rows)
|
||||||
@ -97,12 +84,9 @@ def main():
|
|||||||
|
|
||||||
end_time = time.time() - start_time
|
end_time = time.time() - start_time
|
||||||
|
|
||||||
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
|
_log.info(
|
||||||
|
f"Document converted and multimodal pages generated in {end_time:.2f} seconds."
|
||||||
if failure_count > 0:
|
)
|
||||||
raise RuntimeError(
|
|
||||||
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
|
|
||||||
)
|
|
||||||
|
|
||||||
# This block demonstrates how the file can be opened with the HF datasets library
|
# This block demonstrates how the file can be opened with the HF datasets library
|
||||||
# from datasets import Dataset
|
# from datasets import Dataset
|
||||||
|
@ -4,8 +4,6 @@ from pathlib import Path
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from docling.datamodel.base_models import ConversionStatus
|
|
||||||
from docling.datamodel.document import DocumentConversionInput
|
|
||||||
from docling.document_converter import DocumentConverter
|
from docling.document_converter import DocumentConverter
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
@ -14,59 +12,39 @@ _log = logging.getLogger(__name__)
|
|||||||
def main():
|
def main():
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
|
||||||
input_doc_paths = [
|
input_doc_path = Path("./tests/data/2206.01062.pdf")
|
||||||
Path("./tests/data/2206.01062.pdf"),
|
|
||||||
]
|
|
||||||
output_dir = Path("./scratch")
|
output_dir = Path("./scratch")
|
||||||
|
|
||||||
input_files = DocumentConversionInput.from_paths(input_doc_paths)
|
|
||||||
|
|
||||||
doc_converter = DocumentConverter()
|
doc_converter = DocumentConverter()
|
||||||
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
conv_results = doc_converter.convert_batch(input_files)
|
conv_res = doc_converter.convert(input_doc_path)
|
||||||
|
|
||||||
success_count = 0
|
|
||||||
failure_count = 0
|
|
||||||
output_dir.mkdir(parents=True, exist_ok=True)
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
for conv_res in conv_results:
|
|
||||||
if conv_res.status != ConversionStatus.SUCCESS:
|
|
||||||
_log.info(f"Document {conv_res.input.file} failed to convert.")
|
|
||||||
failure_count += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
doc_filename = conv_res.input.file.stem
|
doc_filename = conv_res.input.file.stem
|
||||||
|
|
||||||
# Export tables
|
# Export tables
|
||||||
for table_ix, table in enumerate(conv_res.legacy_output.tables):
|
for table_ix, table in enumerate(conv_res.legacy_output.tables):
|
||||||
table_df: pd.DataFrame = table.export_to_dataframe()
|
table_df: pd.DataFrame = table.export_to_dataframe()
|
||||||
print(f"## Table {table_ix}")
|
print(f"## Table {table_ix}")
|
||||||
print(table_df.to_markdown())
|
print(table_df.to_markdown())
|
||||||
|
|
||||||
# Save the table as csv
|
# Save the table as csv
|
||||||
element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
|
element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
|
||||||
_log.info(f"Saving CSV table to {element_csv_filename}")
|
_log.info(f"Saving CSV table to {element_csv_filename}")
|
||||||
table_df.to_csv(element_csv_filename)
|
table_df.to_csv(element_csv_filename)
|
||||||
|
|
||||||
# Save the table as html
|
# Save the table as html
|
||||||
element_html_filename = (
|
element_html_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.html"
|
||||||
output_dir / f"{doc_filename}-table-{table_ix+1}.html"
|
_log.info(f"Saving HTML table to {element_html_filename}")
|
||||||
)
|
with element_html_filename.open("w") as fp:
|
||||||
_log.info(f"Saving HTML table to {element_html_filename}")
|
fp.write(table.export_to_html())
|
||||||
with element_html_filename.open("w") as fp:
|
|
||||||
fp.write(table.export_to_html())
|
|
||||||
|
|
||||||
success_count += 1
|
|
||||||
|
|
||||||
end_time = time.time() - start_time
|
end_time = time.time() - start_time
|
||||||
|
|
||||||
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
|
_log.info(f"Document converted and tables exported in {end_time:.2f} seconds.")
|
||||||
|
|
||||||
if failure_count > 0:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
@ -2,7 +2,7 @@ from docling.document_converter import DocumentConverter
|
|||||||
|
|
||||||
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
|
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
|
||||||
converter = DocumentConverter()
|
converter = DocumentConverter()
|
||||||
result = converter.convert_single(source)
|
result = converter.convert(source)
|
||||||
print(result.output.export_to_markdown()) # output: ## Docling Technical Report [...]"
|
print(result.output.export_to_markdown()) # output: ## Docling Technical Report [...]"
|
||||||
# if the legacy output is needed, use this version
|
# if the legacy output is needed, use this version
|
||||||
# print(result.render_as_markdown_v1()) # output: ## Docling Technical Report [...]"
|
# print(result.render_as_markdown_v1()) # output: ## Docling Technical Report [...]"
|
||||||
|
@ -4,7 +4,6 @@ from pathlib import Path
|
|||||||
|
|
||||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import DocumentConversionInput
|
|
||||||
from docling.document_converter import (
|
from docling.document_converter import (
|
||||||
DocumentConverter,
|
DocumentConverter,
|
||||||
PdfFormatOption,
|
PdfFormatOption,
|
||||||
@ -25,7 +24,6 @@ input_paths = [
|
|||||||
Path("tests/data/2206.01062.pdf"),
|
Path("tests/data/2206.01062.pdf"),
|
||||||
# Path("tests/data/2305.03393v1-pg9-img.png"),
|
# Path("tests/data/2305.03393v1-pg9-img.png"),
|
||||||
]
|
]
|
||||||
input = DocumentConversionInput.from_paths(input_paths)
|
|
||||||
|
|
||||||
## for defaults use:
|
## for defaults use:
|
||||||
# doc_converter = DocumentConverter()
|
# doc_converter = DocumentConverter()
|
||||||
@ -50,12 +48,36 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal
|
|||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
conv_results = doc_converter.convert_batch(input)
|
doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
|
||||||
|
pdf=None,
|
||||||
|
docx=WordFormatOption(
|
||||||
|
pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend
|
||||||
|
),
|
||||||
|
formats=[
|
||||||
|
InputFormat.PDF,
|
||||||
|
# InputFormat.IMAGE,
|
||||||
|
InputFormat.DOCX,
|
||||||
|
InputFormat.HTML,
|
||||||
|
InputFormat.PPTX,
|
||||||
|
], # whitelist formats, other files are ignored.
|
||||||
|
format_options={
|
||||||
|
InputFormat.PDF: PdfFormatOption(
|
||||||
|
pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend
|
||||||
|
), # PdfFormatOption(backend=PyPdfiumDocumentBackend),
|
||||||
|
InputFormat.DOCX: WordFormatOption(
|
||||||
|
pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend
|
||||||
|
),
|
||||||
|
# InputFormat.IMAGE: PdfFormatOption(),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
conv_results = doc_converter.convert_all(input_paths)
|
||||||
|
|
||||||
for res in conv_results:
|
for res in conv_results:
|
||||||
out_path = Path("./scratch")
|
out_path = Path("./scratch")
|
||||||
print(
|
print(
|
||||||
f"Document {res.input.file.name} converted with status {res.status}."
|
f"Document {res.input.file.name} converted."
|
||||||
f"\nSaved markdown output to: {str(out_path)}"
|
f"\nSaved markdown output to: {str(out_path)}"
|
||||||
)
|
)
|
||||||
# print(res.experimental.export_to_markdown())
|
# print(res.experimental.export_to_markdown())
|
||||||
|
@ -48,7 +48,7 @@ def test_e2e_conversions():
|
|||||||
for pdf_path in pdf_paths:
|
for pdf_path in pdf_paths:
|
||||||
print(f"converting {pdf_path}")
|
print(f"converting {pdf_path}")
|
||||||
|
|
||||||
doc_result: ConversionResult = converter.convert_single(pdf_path)
|
doc_result: ConversionResult = converter.convert(pdf_path)
|
||||||
|
|
||||||
verify_conversion_result_v1(
|
verify_conversion_result_v1(
|
||||||
input_path=pdf_path, doc_result=doc_result, generate=GENERATE_V1
|
input_path=pdf_path, doc_result=doc_result, generate=GENERATE_V1
|
||||||
|
@ -89,7 +89,7 @@ def test_e2e_conversions():
|
|||||||
for pdf_path in pdf_paths:
|
for pdf_path in pdf_paths:
|
||||||
print(f"converting {pdf_path}")
|
print(f"converting {pdf_path}")
|
||||||
|
|
||||||
doc_result: ConversionResult = converter.convert_single(pdf_path)
|
doc_result: ConversionResult = converter.convert(pdf_path)
|
||||||
|
|
||||||
# Save conversions
|
# Save conversions
|
||||||
# save_output(pdf_path, doc_result, None)
|
# save_output(pdf_path, doc_result, None)
|
||||||
|
@ -5,7 +5,6 @@ import pytest
|
|||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
|
||||||
@ -37,39 +36,24 @@ def converter():
|
|||||||
return converter
|
return converter
|
||||||
|
|
||||||
|
|
||||||
def test_convert_single(converter: DocumentConverter):
|
def test_convert_path(converter: DocumentConverter):
|
||||||
|
|
||||||
pdf_path = get_pdf_path()
|
pdf_path = get_pdf_path()
|
||||||
print(f"converting {pdf_path}")
|
print(f"converting {pdf_path}")
|
||||||
|
|
||||||
doc_result: ConversionResult = converter.convert_single(pdf_path)
|
doc_result = converter.convert(pdf_path)
|
||||||
verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
|
verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
|
||||||
verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)
|
verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)
|
||||||
|
|
||||||
|
|
||||||
def test_batch_path(converter: DocumentConverter):
|
def test_convert_stream(converter: DocumentConverter):
|
||||||
|
|
||||||
pdf_path = get_pdf_path()
|
|
||||||
print(f"converting {pdf_path}")
|
|
||||||
|
|
||||||
conv_input = DocumentConversionInput.from_paths([pdf_path])
|
|
||||||
|
|
||||||
results = converter.convert_batch(conv_input)
|
|
||||||
for doc_result in results:
|
|
||||||
verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
|
|
||||||
verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)
|
|
||||||
|
|
||||||
|
|
||||||
def test_batch_bytes(converter: DocumentConverter):
|
|
||||||
|
|
||||||
pdf_path = get_pdf_path()
|
pdf_path = get_pdf_path()
|
||||||
print(f"converting {pdf_path}")
|
print(f"converting {pdf_path}")
|
||||||
|
|
||||||
buf = BytesIO(pdf_path.open("rb").read())
|
buf = BytesIO(pdf_path.open("rb").read())
|
||||||
docs = [DocumentStream(name=pdf_path.name, stream=buf)]
|
stream = DocumentStream(name=pdf_path.name, stream=buf)
|
||||||
conv_input = DocumentConversionInput.from_streams(docs)
|
|
||||||
|
|
||||||
results = converter.convert_batch(conv_input)
|
doc_result = converter.convert(stream)
|
||||||
for doc_result in results:
|
verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
|
||||||
verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
|
verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)
|
||||||
verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)
|
|
||||||
|
@ -39,6 +39,6 @@ def test_e2e_conversions(test_doc_path):
|
|||||||
for converter in get_converters_with_table_options():
|
for converter in get_converters_with_table_options():
|
||||||
print(f"converting {test_doc_path}")
|
print(f"converting {test_doc_path}")
|
||||||
|
|
||||||
doc_result: ConversionResult = converter.convert_single(test_doc_path)
|
doc_result: ConversionResult = converter.convert(test_doc_path)
|
||||||
|
|
||||||
assert doc_result.status == ConversionStatus.SUCCESS
|
assert doc_result.status == ConversionStatus.SUCCESS
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
import json
|
import json
|
||||||
|
import warnings
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
@ -234,8 +235,10 @@ def verify_conversion_result_v1(
|
|||||||
|
|
||||||
doc_pred_pages: List[Page] = doc_result.pages
|
doc_pred_pages: List[Page] = doc_result.pages
|
||||||
doc_pred: DsDocument = doc_result.legacy_output
|
doc_pred: DsDocument = doc_result.legacy_output
|
||||||
doc_pred_md = doc_result.render_as_markdown()
|
with warnings.catch_warnings():
|
||||||
doc_pred_dt = doc_result.render_as_doctags()
|
warnings.simplefilter("ignore", DeprecationWarning)
|
||||||
|
doc_pred_md = doc_result.render_as_markdown()
|
||||||
|
doc_pred_dt = doc_result.render_as_doctags()
|
||||||
|
|
||||||
engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"
|
engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"
|
||||||
gt_subpath = input_path.parent / "groundtruth" / "docling_v1" / input_path.name
|
gt_subpath = input_path.parent / "groundtruth" / "docling_v1" / input_path.name
|
||||||
|
Loading…
Reference in New Issue
Block a user