mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
More renaming, design enrichment interface
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
051beae203
commit
304d16029a
@ -270,11 +270,12 @@ conv_input = DocumentConversionInput.from_paths(
|
|||||||
### Convert from binary PDF streams
|
### Convert from binary PDF streams
|
||||||
|
|
||||||
You can convert PDFs from a binary stream instead of from the filesystem as follows:
|
You can convert PDFs from a binary stream instead of from the filesystem as follows:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
buf = BytesIO(your_binary_stream)
|
buf = BytesIO(your_binary_stream)
|
||||||
docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
|
docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
|
||||||
conv_input = DocumentConversionInput.from_streams(docs)
|
conv_input = DocumentConversionInput.from_streams(docs)
|
||||||
results = doc_converter.convert(conv_input)
|
results = doc_converter.convert_batch(conv_input)
|
||||||
```
|
```
|
||||||
### Limit resource usage
|
### Limit resource usage
|
||||||
|
|
||||||
|
@ -94,21 +94,21 @@ def export_documents(
|
|||||||
fname = output_dir / f"{doc_filename}.txt"
|
fname = output_dir / f"{doc_filename}.txt"
|
||||||
with fname.open("w") as fp:
|
with fname.open("w") as fp:
|
||||||
_log.info(f"writing Text output to {fname}")
|
_log.info(f"writing Text output to {fname}")
|
||||||
fp.write(conv_res.render_as_text_v1())
|
fp.write(conv_res.render_as_text())
|
||||||
|
|
||||||
# Export Markdown format:
|
# Export Markdown format:
|
||||||
if export_md:
|
if export_md:
|
||||||
fname = output_dir / f"{doc_filename}.md"
|
fname = output_dir / f"{doc_filename}.md"
|
||||||
with fname.open("w") as fp:
|
with fname.open("w") as fp:
|
||||||
_log.info(f"writing Markdown output to {fname}")
|
_log.info(f"writing Markdown output to {fname}")
|
||||||
fp.write(conv_res.render_as_markdown_v1())
|
fp.write(conv_res.render_as_markdown())
|
||||||
|
|
||||||
# Export Document Tags format:
|
# Export Document Tags format:
|
||||||
if export_doctags:
|
if export_doctags:
|
||||||
fname = output_dir / f"{doc_filename}.doctags"
|
fname = output_dir / f"{doc_filename}.doctags"
|
||||||
with fname.open("w") as fp:
|
with fname.open("w") as fp:
|
||||||
_log.info(f"writing Doc Tags output to {fname}")
|
_log.info(f"writing Doc Tags output to {fname}")
|
||||||
fp.write(conv_res.render_as_doctags_v1())
|
fp.write(conv_res.render_as_doctags())
|
||||||
|
|
||||||
else:
|
else:
|
||||||
_log.warning(f"Document {conv_res.input.file} failed to convert.")
|
_log.warning(f"Document {conv_res.input.file} failed to convert.")
|
||||||
@ -236,7 +236,7 @@ def convert(
|
|||||||
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
conv_results = doc_converter.convert(input)
|
conv_results = doc_converter.convert_batch(input)
|
||||||
|
|
||||||
output.mkdir(parents=True, exist_ok=True)
|
output.mkdir(parents=True, exist_ok=True)
|
||||||
export_documents(
|
export_documents(
|
||||||
|
@ -351,11 +351,11 @@ class ConvertedDocument(BaseModel):
|
|||||||
return ds_doc
|
return ds_doc
|
||||||
|
|
||||||
@deprecated("Use output.export_to_dict() instead.")
|
@deprecated("Use output.export_to_dict() instead.")
|
||||||
def render_as_dict_v1(self):
|
def render_as_dict(self):
|
||||||
return self.legacy_output.model_dump(by_alias=True, exclude_none=True)
|
return self.legacy_output.model_dump(by_alias=True, exclude_none=True)
|
||||||
|
|
||||||
@deprecated("Use output.export_to_markdown() instead.")
|
@deprecated("Use output.export_to_markdown() instead.")
|
||||||
def render_as_markdown_v1(
|
def render_as_markdown(
|
||||||
self,
|
self,
|
||||||
delim: str = "\n\n",
|
delim: str = "\n\n",
|
||||||
main_text_start: int = 0,
|
main_text_start: int = 0,
|
||||||
@ -381,7 +381,7 @@ class ConvertedDocument(BaseModel):
|
|||||||
)
|
)
|
||||||
|
|
||||||
@deprecated("Use output.export_to_text() instead.")
|
@deprecated("Use output.export_to_text() instead.")
|
||||||
def render_as_text_v1(
|
def render_as_text(
|
||||||
self,
|
self,
|
||||||
delim: str = "\n\n",
|
delim: str = "\n\n",
|
||||||
main_text_start: int = 0,
|
main_text_start: int = 0,
|
||||||
@ -402,7 +402,7 @@ class ConvertedDocument(BaseModel):
|
|||||||
)
|
)
|
||||||
|
|
||||||
@deprecated("Use output.export_to_document_tokens() instead.")
|
@deprecated("Use output.export_to_document_tokens() instead.")
|
||||||
def render_as_doctags_v1(
|
def render_as_doctags(
|
||||||
self,
|
self,
|
||||||
delim: str = "\n\n",
|
delim: str = "\n\n",
|
||||||
main_text_start: int = 0,
|
main_text_start: int = 0,
|
||||||
@ -501,11 +501,12 @@ class DocumentConversionInput(BaseModel):
|
|||||||
mime = filetype.guess_mime(str(obj))
|
mime = filetype.guess_mime(str(obj))
|
||||||
elif isinstance(obj, DocumentStream):
|
elif isinstance(obj, DocumentStream):
|
||||||
mime = filetype.guess_mime(obj.stream.read(8192))
|
mime = filetype.guess_mime(obj.stream.read(8192))
|
||||||
else:
|
|
||||||
1 == 1 # alert!!
|
|
||||||
if mime is None:
|
if mime is None:
|
||||||
|
# TODO improve this.
|
||||||
|
|
||||||
if obj.suffix == ".html":
|
if obj.suffix == ".html":
|
||||||
mime = "text/html"
|
mime = "text/html"
|
||||||
|
|
||||||
format = MimeTypeToFormat.get(mime)
|
format = MimeTypeToFormat.get(mime)
|
||||||
return format
|
return format
|
||||||
|
|
||||||
|
@ -14,6 +14,7 @@ from pydantic import (
|
|||||||
field_validator,
|
field_validator,
|
||||||
model_validator,
|
model_validator,
|
||||||
)
|
)
|
||||||
|
from typing_extensions import deprecated
|
||||||
|
|
||||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
@ -28,7 +29,7 @@ from docling.datamodel.document import (
|
|||||||
)
|
)
|
||||||
from docling.datamodel.pipeline_options import PipelineOptions
|
from docling.datamodel.pipeline_options import PipelineOptions
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.pipeline.base_model_pipeline import BaseModelPipeline
|
from docling.pipeline.base_model_pipeline import AbstractModelPipeline
|
||||||
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
|
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
|
||||||
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
|
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
|
||||||
from docling.utils.utils import chunkify
|
from docling.utils.utils import chunkify
|
||||||
@ -37,7 +38,7 @@ _log = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class FormatOption(BaseModel):
|
class FormatOption(BaseModel):
|
||||||
pipeline_cls: Type[BaseModelPipeline]
|
pipeline_cls: Type[AbstractModelPipeline]
|
||||||
pipeline_options: Optional[PipelineOptions] = None
|
pipeline_options: Optional[PipelineOptions] = None
|
||||||
backend: Type[AbstractDocumentBackend]
|
backend: Type[AbstractDocumentBackend]
|
||||||
|
|
||||||
@ -114,11 +115,17 @@ class DocumentConverter:
|
|||||||
_log.info(f"Requested format {f} will use default options.")
|
_log.info(f"Requested format {f} will use default options.")
|
||||||
self.format_to_options[f] = _format_to_default_options[f]
|
self.format_to_options[f] = _format_to_default_options[f]
|
||||||
|
|
||||||
self.initialized_pipelines: Dict[Type[BaseModelPipeline], BaseModelPipeline] = (
|
self.initialized_pipelines: Dict[
|
||||||
{}
|
Type[AbstractModelPipeline], AbstractModelPipeline
|
||||||
)
|
] = {}
|
||||||
|
|
||||||
|
@deprecated("Use convert_batch instead.")
|
||||||
def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]:
|
def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]:
|
||||||
|
yield from self.convert_batch(input=input)
|
||||||
|
|
||||||
|
def convert_batch(
|
||||||
|
self, input: DocumentConversionInput, raise_on_error: bool = False
|
||||||
|
) -> Iterable[ConversionResult]:
|
||||||
|
|
||||||
for input_batch in chunkify(
|
for input_batch in chunkify(
|
||||||
input.docs(self.format_to_options),
|
input.docs(self.format_to_options),
|
||||||
@ -136,7 +143,9 @@ class DocumentConverter:
|
|||||||
if item is not None:
|
if item is not None:
|
||||||
yield item
|
yield item
|
||||||
|
|
||||||
def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult:
|
def convert_single(
|
||||||
|
self, source: Path | AnyHttpUrl | str, raise_on_error: bool = False
|
||||||
|
) -> ConversionResult:
|
||||||
"""Convert a single document.
|
"""Convert a single document.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@ -177,7 +186,7 @@ class DocumentConverter:
|
|||||||
f"Unexpected file path type encountered: {type(source)}"
|
f"Unexpected file path type encountered: {type(source)}"
|
||||||
)
|
)
|
||||||
conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
|
conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
|
||||||
conv_res_iter = self.convert(conv_inp)
|
conv_res_iter = self.convert_batch(conv_inp)
|
||||||
conv_res: ConversionResult = next(conv_res_iter)
|
conv_res: ConversionResult = next(conv_res_iter)
|
||||||
if conv_res.status not in {
|
if conv_res.status not in {
|
||||||
ConversionStatus.SUCCESS,
|
ConversionStatus.SUCCESS,
|
||||||
@ -186,7 +195,7 @@ class DocumentConverter:
|
|||||||
raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
|
raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
|
||||||
return conv_res
|
return conv_res
|
||||||
|
|
||||||
def _get_pipeline(self, doc: InputDocument) -> Optional[BaseModelPipeline]:
|
def _get_pipeline(self, doc: InputDocument) -> Optional[AbstractModelPipeline]:
|
||||||
fopt = self.format_to_options.get(doc.format)
|
fopt = self.format_to_options.get(doc.format)
|
||||||
|
|
||||||
if fopt is None:
|
if fopt is None:
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import Iterable
|
from typing import Any, Iterable
|
||||||
|
|
||||||
|
from docling_core.types.experimental import DoclingDocument, NodeItem
|
||||||
|
|
||||||
from docling.datamodel.base_models import Page
|
from docling.datamodel.base_models import Page
|
||||||
|
|
||||||
@ -8,3 +10,11 @@ class AbstractPageModel(ABC):
|
|||||||
@abstractmethod
|
@abstractmethod
|
||||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class AbstractEnrichmentModel(ABC):
|
||||||
|
@abstractmethod
|
||||||
|
def __call__(
|
||||||
|
self, doc: DoclingDocument, elements: Iterable[NodeItem]
|
||||||
|
) -> Iterable[Any]:
|
||||||
|
pass
|
||||||
|
@ -21,7 +21,7 @@ from docling.utils.utils import chunkify
|
|||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class BaseModelPipeline(ABC):
|
class AbstractModelPipeline(ABC):
|
||||||
def __init__(self, pipeline_options: PipelineOptions):
|
def __init__(self, pipeline_options: PipelineOptions):
|
||||||
self.pipeline_options = pipeline_options
|
self.pipeline_options = pipeline_options
|
||||||
self.model_pipe: List[Callable] = []
|
self.model_pipe: List[Callable] = []
|
||||||
@ -31,7 +31,7 @@ class BaseModelPipeline(ABC):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def assemble_document(
|
def _assemble_document(
|
||||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
self, in_doc: InputDocument, conv_res: ConversionResult
|
||||||
) -> ConversionResult:
|
) -> ConversionResult:
|
||||||
pass
|
pass
|
||||||
@ -47,9 +47,9 @@ class BaseModelPipeline(ABC):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class PaginatedModelPipeline(BaseModelPipeline): # TODO this is a bad name.
|
class PaginatedModelPipeline(AbstractModelPipeline): # TODO this is a bad name.
|
||||||
|
|
||||||
def apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
def _apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||||
for model in self.model_pipe:
|
for model in self.model_pipe:
|
||||||
page_batch = model(page_batch)
|
page_batch = model(page_batch)
|
||||||
|
|
||||||
@ -83,7 +83,7 @@ class PaginatedModelPipeline(BaseModelPipeline): # TODO this is a bad name.
|
|||||||
)
|
)
|
||||||
|
|
||||||
# 2. Run pipeline stages
|
# 2. Run pipeline stages
|
||||||
pipeline_pages = self.apply_on_pages(init_pages)
|
pipeline_pages = self._apply_on_pages(init_pages)
|
||||||
|
|
||||||
for p in pipeline_pages: # Must exhaust!
|
for p in pipeline_pages: # Must exhaust!
|
||||||
pass
|
pass
|
||||||
@ -91,7 +91,7 @@ class PaginatedModelPipeline(BaseModelPipeline): # TODO this is a bad name.
|
|||||||
end_pb_time = time.time() - start_pb_time
|
end_pb_time = time.time() - start_pb_time
|
||||||
_log.info(f"Finished converting page batch time={end_pb_time:.3f}")
|
_log.info(f"Finished converting page batch time={end_pb_time:.3f}")
|
||||||
|
|
||||||
conv_res = self.assemble_document(in_doc, conv_res)
|
conv_res = self._assemble_document(in_doc, conv_res)
|
||||||
|
|
||||||
status = ConversionStatus.SUCCESS
|
status = ConversionStatus.SUCCESS
|
||||||
for page in conv_res.pages:
|
for page in conv_res.pages:
|
||||||
|
@ -1,4 +1,7 @@
|
|||||||
import logging
|
import logging
|
||||||
|
from typing import Iterable
|
||||||
|
|
||||||
|
from docling_core.types.experimental import NodeItem
|
||||||
|
|
||||||
from docling.backend.abstract_backend import (
|
from docling.backend.abstract_backend import (
|
||||||
AbstractDocumentBackend,
|
AbstractDocumentBackend,
|
||||||
@ -7,19 +10,19 @@ from docling.backend.abstract_backend import (
|
|||||||
from docling.datamodel.base_models import ConversionStatus
|
from docling.datamodel.base_models import ConversionStatus
|
||||||
from docling.datamodel.document import ConversionResult, InputDocument
|
from docling.datamodel.document import ConversionResult, InputDocument
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
|
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
|
||||||
from docling.pipeline.base_model_pipeline import BaseModelPipeline
|
from docling.pipeline.base_model_pipeline import AbstractModelPipeline
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class SimpleModelPipeline(BaseModelPipeline):
|
class SimpleModelPipeline(AbstractModelPipeline):
|
||||||
"""SimpleModelPipeline.
|
"""SimpleModelPipeline.
|
||||||
|
|
||||||
This class is used at the moment for formats / backends
|
This class is used at the moment for formats / backends
|
||||||
which produce straight DoclingDocument output.
|
which produce straight DoclingDocument output.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, pipeline_options: PdfPipelineOptions):
|
def __init__(self, pipeline_options: PipelineOptions):
|
||||||
super().__init__(pipeline_options)
|
super().__init__(pipeline_options)
|
||||||
|
|
||||||
def execute(self, in_doc: InputDocument) -> ConversionResult:
|
def execute(self, in_doc: InputDocument) -> ConversionResult:
|
||||||
@ -45,16 +48,21 @@ class SimpleModelPipeline(BaseModelPipeline):
|
|||||||
# a DoclingDocument straight.
|
# a DoclingDocument straight.
|
||||||
|
|
||||||
conv_res.output = in_doc._backend.convert()
|
conv_res.output = in_doc._backend.convert()
|
||||||
|
|
||||||
# Do other stuff with conv_res.experimental
|
# Do other stuff with conv_res.experimental
|
||||||
|
|
||||||
conv_res = self.assemble_document(in_doc, conv_res)
|
conv_res = self._assemble_document(in_doc, conv_res)
|
||||||
|
|
||||||
conv_res.status = ConversionStatus.SUCCESS
|
conv_res.status = ConversionStatus.SUCCESS
|
||||||
|
|
||||||
return conv_res
|
return conv_res
|
||||||
|
|
||||||
def assemble_document(
|
# def _apply_on_elements(self, element_batch: Iterable[NodeItem]) -> Iterable[Any]:
|
||||||
|
# for model in self.model_pipe:
|
||||||
|
# element_batch = model(element_batch)
|
||||||
|
#
|
||||||
|
# yield from element_batch
|
||||||
|
|
||||||
|
def _assemble_document(
|
||||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
self, in_doc: InputDocument, conv_res: ConversionResult
|
||||||
) -> ConversionResult:
|
) -> ConversionResult:
|
||||||
return conv_res
|
return conv_res
|
||||||
|
@ -83,6 +83,10 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
|
|||||||
PageAssembleModel(config={"images_scale": pipeline_options.images_scale}),
|
PageAssembleModel(config={"images_scale": pipeline_options.images_scale}),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
self.enrichment_pipe = [
|
||||||
|
# Other models working on `NodeItem` elements in the DoclingDocument
|
||||||
|
]
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def download_models_hf(
|
def download_models_hf(
|
||||||
local_dir: Optional[Path] = None, force: bool = False
|
local_dir: Optional[Path] = None, force: bool = False
|
||||||
@ -104,7 +108,7 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
|
|||||||
|
|
||||||
return page
|
return page
|
||||||
|
|
||||||
def assemble_document(
|
def _assemble_document(
|
||||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
self, in_doc: InputDocument, conv_res: ConversionResult
|
||||||
) -> ConversionResult:
|
) -> ConversionResult:
|
||||||
all_elements = []
|
all_elements = []
|
||||||
|
@ -36,25 +36,25 @@ def export_documents(
|
|||||||
with (output_dir / f"{doc_filename}.legacy.json").open(
|
with (output_dir / f"{doc_filename}.legacy.json").open(
|
||||||
"w", encoding="utf-8"
|
"w", encoding="utf-8"
|
||||||
) as fp:
|
) as fp:
|
||||||
fp.write(json.dumps(conv_res.render_as_dict_v1()))
|
fp.write(json.dumps(conv_res.render_as_dict()))
|
||||||
|
|
||||||
# Export Text format:
|
# Export Text format:
|
||||||
with (output_dir / f"{doc_filename}.legacy.txt").open(
|
with (output_dir / f"{doc_filename}.legacy.txt").open(
|
||||||
"w", encoding="utf-8"
|
"w", encoding="utf-8"
|
||||||
) as fp:
|
) as fp:
|
||||||
fp.write(conv_res.render_as_text_v1())
|
fp.write(conv_res.render_as_text())
|
||||||
|
|
||||||
# Export Markdown format:
|
# Export Markdown format:
|
||||||
with (output_dir / f"{doc_filename}.legacy.md").open(
|
with (output_dir / f"{doc_filename}.legacy.md").open(
|
||||||
"w", encoding="utf-8"
|
"w", encoding="utf-8"
|
||||||
) as fp:
|
) as fp:
|
||||||
fp.write(conv_res.render_as_markdown_v1())
|
fp.write(conv_res.render_as_markdown())
|
||||||
|
|
||||||
# Export Document Tags format:
|
# Export Document Tags format:
|
||||||
with (output_dir / f"{doc_filename}.legacy.doctags.txt").open(
|
with (output_dir / f"{doc_filename}.legacy.doctags.txt").open(
|
||||||
"w", encoding="utf-8"
|
"w", encoding="utf-8"
|
||||||
) as fp:
|
) as fp:
|
||||||
fp.write(conv_res.render_as_doctags_v1())
|
fp.write(conv_res.render_as_doctags())
|
||||||
|
|
||||||
if USE_V2:
|
if USE_V2:
|
||||||
# Export Docling document format to JSON (experimental):
|
# Export Docling document format to JSON (experimental):
|
||||||
@ -129,7 +129,7 @@ def main():
|
|||||||
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
conv_results = doc_converter.convert(input)
|
conv_results = doc_converter.convert_batch(input)
|
||||||
success_count, partial_success_count, failure_count = export_documents(
|
success_count, partial_success_count, failure_count = export_documents(
|
||||||
conv_results, output_dir=Path("./scratch")
|
conv_results, output_dir=Path("./scratch")
|
||||||
)
|
)
|
||||||
|
@ -39,17 +39,17 @@ def export_documents(
|
|||||||
|
|
||||||
# Export Text format:
|
# Export Text format:
|
||||||
with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
|
with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
|
||||||
fp.write(conv_res.render_as_text_v1())
|
fp.write(conv_res.render_as_text())
|
||||||
|
|
||||||
# Export Markdown format:
|
# Export Markdown format:
|
||||||
with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
|
with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
|
||||||
fp.write(conv_res.render_as_markdown_v1())
|
fp.write(conv_res.render_as_markdown())
|
||||||
|
|
||||||
# Export Document Tags format:
|
# Export Document Tags format:
|
||||||
with (output_dir / f"{doc_filename}.doctags").open(
|
with (output_dir / f"{doc_filename}.doctags").open(
|
||||||
"w", encoding="utf-8"
|
"w", encoding="utf-8"
|
||||||
) as fp:
|
) as fp:
|
||||||
fp.write(conv_res.render_as_doctags_v1())
|
fp.write(conv_res.render_as_doctags())
|
||||||
|
|
||||||
else:
|
else:
|
||||||
_log.info(f"Document {conv_res.input.file} failed to convert.")
|
_log.info(f"Document {conv_res.input.file} failed to convert.")
|
||||||
@ -157,7 +157,7 @@ def main():
|
|||||||
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
conv_results = doc_converter.convert(input)
|
conv_results = doc_converter.convert_batch(input)
|
||||||
success_count, failure_count = export_documents(
|
success_count, failure_count = export_documents(
|
||||||
conv_results, output_dir=Path("./scratch")
|
conv_results, output_dir=Path("./scratch")
|
||||||
)
|
)
|
||||||
|
@ -42,7 +42,7 @@ def main():
|
|||||||
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
conv_results = doc_converter.convert(input_files)
|
conv_results = doc_converter.convert_batch(input_files)
|
||||||
|
|
||||||
success_count = 0
|
success_count = 0
|
||||||
failure_count = 0
|
failure_count = 0
|
||||||
|
@ -41,7 +41,7 @@ def main():
|
|||||||
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
converted_docs = doc_converter.convert(input_files)
|
converted_docs = doc_converter.convert_batch(input_files)
|
||||||
|
|
||||||
success_count = 0
|
success_count = 0
|
||||||
failure_count = 0
|
failure_count = 0
|
||||||
|
@ -25,7 +25,7 @@ def main():
|
|||||||
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
conv_results = doc_converter.convert(input_files)
|
conv_results = doc_converter.convert_batch(input_files)
|
||||||
|
|
||||||
success_count = 0
|
success_count = 0
|
||||||
failure_count = 0
|
failure_count = 0
|
||||||
|
@ -50,7 +50,7 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal
|
|||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
conv_results = doc_converter.convert(input)
|
conv_results = doc_converter.convert_batch(input)
|
||||||
|
|
||||||
for res in conv_results:
|
for res in conv_results:
|
||||||
out_path = Path("./scratch") / f"{res.input.file.name}.experimental.md"
|
out_path = Path("./scratch") / f"{res.input.file.name}.experimental.md"
|
||||||
|
@ -39,11 +39,11 @@ def save_output(pdf_path: Path, doc_result: ConversionResult, engine: str):
|
|||||||
|
|
||||||
doctags_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.doctags.txt")
|
doctags_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.doctags.txt")
|
||||||
with open(doctags_fn, "w") as fd:
|
with open(doctags_fn, "w") as fd:
|
||||||
fd.write(doc_result.render_as_doctags_v1())
|
fd.write(doc_result.render_as_doctags())
|
||||||
|
|
||||||
md_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.md")
|
md_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.md")
|
||||||
with open(md_fn, "w") as fd:
|
with open(md_fn, "w") as fd:
|
||||||
fd.write(doc_result.render_as_markdown_v1())
|
fd.write(doc_result.render_as_markdown())
|
||||||
|
|
||||||
|
|
||||||
def get_pdf_paths():
|
def get_pdf_paths():
|
||||||
|
@ -54,7 +54,7 @@ def test_batch_path(converter: DocumentConverter):
|
|||||||
|
|
||||||
conv_input = DocumentConversionInput.from_paths([pdf_path])
|
conv_input = DocumentConversionInput.from_paths([pdf_path])
|
||||||
|
|
||||||
results = converter.convert(conv_input)
|
results = converter.convert_batch(conv_input)
|
||||||
for doc_result in results:
|
for doc_result in results:
|
||||||
verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
|
verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
|
||||||
verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)
|
verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)
|
||||||
@ -69,7 +69,7 @@ def test_batch_bytes(converter: DocumentConverter):
|
|||||||
docs = [DocumentStream(name=pdf_path.name, stream=buf)]
|
docs = [DocumentStream(name=pdf_path.name, stream=buf)]
|
||||||
conv_input = DocumentConversionInput.from_streams(docs)
|
conv_input = DocumentConversionInput.from_streams(docs)
|
||||||
|
|
||||||
results = converter.convert(conv_input)
|
results = converter.convert_batch(conv_input)
|
||||||
for doc_result in results:
|
for doc_result in results:
|
||||||
verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
|
verify_conversion_result_v1(input_path=pdf_path, doc_result=doc_result)
|
||||||
verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)
|
verify_conversion_result_v2(input_path=pdf_path, doc_result=doc_result)
|
||||||
|
@ -198,8 +198,8 @@ def verify_conversion_result_v1(
|
|||||||
|
|
||||||
doc_pred_pages: List[Page] = doc_result.pages
|
doc_pred_pages: List[Page] = doc_result.pages
|
||||||
doc_pred: DsDocument = doc_result.legacy_output
|
doc_pred: DsDocument = doc_result.legacy_output
|
||||||
doc_pred_md = doc_result.render_as_markdown_v1()
|
doc_pred_md = doc_result.render_as_markdown()
|
||||||
doc_pred_dt = doc_result.render_as_doctags_v1()
|
doc_pred_dt = doc_result.render_as_doctags()
|
||||||
|
|
||||||
engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"
|
engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"
|
||||||
gt_subpath = input_path.parent / "groundtruth" / "docling_v1" / input_path.name
|
gt_subpath = input_path.parent / "groundtruth" / "docling_v1" / input_path.name
|
||||||
|
Loading…
Reference in New Issue
Block a user