mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-13 07:08:19 +00:00
feat!: simplify conversion API (#139)
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
@@ -7,7 +7,7 @@ from typing import Iterable
|
||||
import yaml
|
||||
|
||||
from docling.datamodel.base_models import ConversionStatus
|
||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
@@ -125,18 +125,19 @@ def main():
|
||||
|
||||
doc_converter = DocumentConverter()
|
||||
|
||||
input = DocumentConversionInput.from_paths(input_doc_paths)
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
conv_results = doc_converter.convert_batch(input)
|
||||
conv_results = doc_converter.convert_all(
|
||||
input_doc_paths,
|
||||
raises_on_error=False, # to let conversion run through all and examine results at the end
|
||||
)
|
||||
success_count, partial_success_count, failure_count = export_documents(
|
||||
conv_results, output_dir=Path("./scratch")
|
||||
)
|
||||
|
||||
end_time = time.time() - start_time
|
||||
|
||||
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
|
||||
_log.info(f"Document conversion complete in {end_time:.2f} seconds.")
|
||||
|
||||
if failure_count > 0:
|
||||
raise RuntimeError(
|
||||
|
||||
@@ -5,7 +5,7 @@ from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
PdfPipelineOptions,
|
||||
TesseractCliOcrOptions,
|
||||
@@ -65,9 +65,7 @@ def export_documents(
|
||||
def main():
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
input_doc_paths = [
|
||||
Path("./tests/data/2206.01062.pdf"),
|
||||
]
|
||||
input_doc_path = Path("./tests/data/2206.01062.pdf")
|
||||
|
||||
###########################################################################
|
||||
|
||||
@@ -152,24 +150,13 @@ def main():
|
||||
|
||||
###########################################################################
|
||||
|
||||
# Define input files
|
||||
input = DocumentConversionInput.from_paths(input_doc_paths)
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
conv_results = doc_converter.convert_batch(input)
|
||||
success_count, failure_count = export_documents(
|
||||
conv_results, output_dir=Path("./scratch")
|
||||
)
|
||||
conv_result = doc_converter.convert(input_doc_path)
|
||||
|
||||
end_time = time.time() - start_time
|
||||
|
||||
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
|
||||
|
||||
if failure_count > 0:
|
||||
raise RuntimeError(
|
||||
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
|
||||
)
|
||||
_log.info(f"Document converted in {end_time:.2f} seconds.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -2,13 +2,7 @@ import logging
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from docling.datamodel.base_models import (
|
||||
ConversionStatus,
|
||||
FigureElement,
|
||||
InputFormat,
|
||||
Table,
|
||||
)
|
||||
from docling.datamodel.document import DocumentConversionInput
|
||||
from docling.datamodel.base_models import FigureElement, InputFormat, Table
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
@@ -20,13 +14,9 @@ IMAGE_RESOLUTION_SCALE = 2.0
|
||||
def main():
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
input_doc_paths = [
|
||||
Path("./tests/data/2206.01062.pdf"),
|
||||
]
|
||||
input_doc_path = Path("./tests/data/2206.01062.pdf")
|
||||
output_dir = Path("./scratch")
|
||||
|
||||
input_files = DocumentConversionInput.from_paths(input_doc_paths)
|
||||
|
||||
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
|
||||
# will destroy them for cleaning up memory.
|
||||
# This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
|
||||
@@ -42,46 +32,29 @@ def main():
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
conv_results = doc_converter.convert_batch(input_files)
|
||||
conv_res = doc_converter.convert(input_doc_path)
|
||||
|
||||
success_count = 0
|
||||
failure_count = 0
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
for conv_res in conv_results:
|
||||
if conv_res.status != ConversionStatus.SUCCESS:
|
||||
_log.info(f"Document {conv_res.input.file} failed to convert.")
|
||||
failure_count += 1
|
||||
continue
|
||||
doc_filename = conv_res.input.file.stem
|
||||
|
||||
doc_filename = conv_res.input.file.stem
|
||||
# Export page images
|
||||
for page in conv_res.pages:
|
||||
page_no = page.page_no + 1
|
||||
page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
|
||||
with page_image_filename.open("wb") as fp:
|
||||
page.image.save(fp, format="PNG")
|
||||
|
||||
# Export page images
|
||||
for page in conv_res.pages:
|
||||
page_no = page.page_no + 1
|
||||
page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
|
||||
with page_image_filename.open("wb") as fp:
|
||||
page.image.save(fp, format="PNG")
|
||||
|
||||
# Export figures and tables
|
||||
for element, image in conv_res.render_element_images(
|
||||
element_types=(FigureElement, Table)
|
||||
):
|
||||
element_image_filename = (
|
||||
output_dir / f"{doc_filename}-element-{element.id}.png"
|
||||
)
|
||||
with element_image_filename.open("wb") as fp:
|
||||
image.save(fp, "PNG")
|
||||
|
||||
success_count += 1
|
||||
# Export figures and tables
|
||||
for element, image in conv_res.render_element_images(
|
||||
element_types=(FigureElement, Table)
|
||||
):
|
||||
element_image_filename = output_dir / f"{doc_filename}-element-{element.id}.png"
|
||||
with element_image_filename.open("wb") as fp:
|
||||
image.save(fp, "PNG")
|
||||
|
||||
end_time = time.time() - start_time
|
||||
|
||||
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
|
||||
|
||||
if failure_count > 0:
|
||||
raise RuntimeError(
|
||||
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
|
||||
)
|
||||
_log.info(f"Document converted and figures exported in {end_time:.2f} seconds.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -5,8 +5,7 @@ from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||
from docling.datamodel.document import DocumentConversionInput
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.utils.export import generate_multimodal_pages
|
||||
@@ -19,13 +18,9 @@ IMAGE_RESOLUTION_SCALE = 2.0
|
||||
def main():
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
input_doc_paths = [
|
||||
Path("./tests/data/2206.01062.pdf"),
|
||||
]
|
||||
input_doc_path = Path("./tests/data/2206.01062.pdf")
|
||||
output_dir = Path("./scratch")
|
||||
|
||||
input_files = DocumentConversionInput.from_paths(input_doc_paths)
|
||||
|
||||
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
|
||||
# will destroy them for cleaning up memory.
|
||||
# This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
|
||||
@@ -41,53 +36,45 @@ def main():
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
converted_docs = doc_converter.convert_batch(input_files)
|
||||
conv_res = doc_converter.convert(input_doc_path)
|
||||
|
||||
success_count = 0
|
||||
failure_count = 0
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
for doc in converted_docs:
|
||||
if doc.status != ConversionStatus.SUCCESS:
|
||||
_log.info(f"Document {doc.input.file} failed to convert.")
|
||||
failure_count += 1
|
||||
continue
|
||||
|
||||
rows = []
|
||||
for (
|
||||
content_text,
|
||||
content_md,
|
||||
content_dt,
|
||||
page_cells,
|
||||
page_segments,
|
||||
page,
|
||||
) in generate_multimodal_pages(doc):
|
||||
rows = []
|
||||
for (
|
||||
content_text,
|
||||
content_md,
|
||||
content_dt,
|
||||
page_cells,
|
||||
page_segments,
|
||||
page,
|
||||
) in generate_multimodal_pages(conv_res):
|
||||
|
||||
dpi = page._default_image_scale * 72
|
||||
dpi = page._default_image_scale * 72
|
||||
|
||||
rows.append(
|
||||
{
|
||||
"document": doc.input.file.name,
|
||||
"hash": doc.input.document_hash,
|
||||
"page_hash": page.page_hash,
|
||||
"image": {
|
||||
"width": page.image.width,
|
||||
"height": page.image.height,
|
||||
"bytes": page.image.tobytes(),
|
||||
},
|
||||
"cells": page_cells,
|
||||
"contents": content_text,
|
||||
"contents_md": content_md,
|
||||
"contents_dt": content_dt,
|
||||
"segments": page_segments,
|
||||
"extra": {
|
||||
"page_num": page.page_no + 1,
|
||||
"width_in_points": page.size.width,
|
||||
"height_in_points": page.size.height,
|
||||
"dpi": dpi,
|
||||
},
|
||||
}
|
||||
)
|
||||
success_count += 1
|
||||
rows.append(
|
||||
{
|
||||
"document": conv_res.input.file.name,
|
||||
"hash": conv_res.input.document_hash,
|
||||
"page_hash": page.page_hash,
|
||||
"image": {
|
||||
"width": page.image.width,
|
||||
"height": page.image.height,
|
||||
"bytes": page.image.tobytes(),
|
||||
},
|
||||
"cells": page_cells,
|
||||
"contents": content_text,
|
||||
"contents_md": content_md,
|
||||
"contents_dt": content_dt,
|
||||
"segments": page_segments,
|
||||
"extra": {
|
||||
"page_num": page.page_no + 1,
|
||||
"width_in_points": page.size.width,
|
||||
"height_in_points": page.size.height,
|
||||
"dpi": dpi,
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
# Generate one parquet from all documents
|
||||
df = pd.json_normalize(rows)
|
||||
@@ -97,12 +84,9 @@ def main():
|
||||
|
||||
end_time = time.time() - start_time
|
||||
|
||||
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
|
||||
|
||||
if failure_count > 0:
|
||||
raise RuntimeError(
|
||||
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
|
||||
)
|
||||
_log.info(
|
||||
f"Document converted and multimodal pages generated in {end_time:.2f} seconds."
|
||||
)
|
||||
|
||||
# This block demonstrates how the file can be opened with the HF datasets library
|
||||
# from datasets import Dataset
|
||||
|
||||
@@ -4,8 +4,6 @@ from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from docling.datamodel.base_models import ConversionStatus
|
||||
from docling.datamodel.document import DocumentConversionInput
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
@@ -14,59 +12,39 @@ _log = logging.getLogger(__name__)
|
||||
def main():
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
input_doc_paths = [
|
||||
Path("./tests/data/2206.01062.pdf"),
|
||||
]
|
||||
input_doc_path = Path("./tests/data/2206.01062.pdf")
|
||||
output_dir = Path("./scratch")
|
||||
|
||||
input_files = DocumentConversionInput.from_paths(input_doc_paths)
|
||||
|
||||
doc_converter = DocumentConverter()
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
conv_results = doc_converter.convert_batch(input_files)
|
||||
conv_res = doc_converter.convert(input_doc_path)
|
||||
|
||||
success_count = 0
|
||||
failure_count = 0
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
for conv_res in conv_results:
|
||||
if conv_res.status != ConversionStatus.SUCCESS:
|
||||
_log.info(f"Document {conv_res.input.file} failed to convert.")
|
||||
failure_count += 1
|
||||
continue
|
||||
|
||||
doc_filename = conv_res.input.file.stem
|
||||
doc_filename = conv_res.input.file.stem
|
||||
|
||||
# Export tables
|
||||
for table_ix, table in enumerate(conv_res.legacy_output.tables):
|
||||
table_df: pd.DataFrame = table.export_to_dataframe()
|
||||
print(f"## Table {table_ix}")
|
||||
print(table_df.to_markdown())
|
||||
# Export tables
|
||||
for table_ix, table in enumerate(conv_res.legacy_output.tables):
|
||||
table_df: pd.DataFrame = table.export_to_dataframe()
|
||||
print(f"## Table {table_ix}")
|
||||
print(table_df.to_markdown())
|
||||
|
||||
# Save the table as csv
|
||||
element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
|
||||
_log.info(f"Saving CSV table to {element_csv_filename}")
|
||||
table_df.to_csv(element_csv_filename)
|
||||
# Save the table as csv
|
||||
element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
|
||||
_log.info(f"Saving CSV table to {element_csv_filename}")
|
||||
table_df.to_csv(element_csv_filename)
|
||||
|
||||
# Save the table as html
|
||||
element_html_filename = (
|
||||
output_dir / f"{doc_filename}-table-{table_ix+1}.html"
|
||||
)
|
||||
_log.info(f"Saving HTML table to {element_html_filename}")
|
||||
with element_html_filename.open("w") as fp:
|
||||
fp.write(table.export_to_html())
|
||||
|
||||
success_count += 1
|
||||
# Save the table as html
|
||||
element_html_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.html"
|
||||
_log.info(f"Saving HTML table to {element_html_filename}")
|
||||
with element_html_filename.open("w") as fp:
|
||||
fp.write(table.export_to_html())
|
||||
|
||||
end_time = time.time() - start_time
|
||||
|
||||
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
|
||||
|
||||
if failure_count > 0:
|
||||
raise RuntimeError(
|
||||
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
|
||||
)
|
||||
_log.info(f"Document converted and tables exported in {end_time:.2f} seconds.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -2,7 +2,7 @@ from docling.document_converter import DocumentConverter
|
||||
|
||||
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
|
||||
converter = DocumentConverter()
|
||||
result = converter.convert_single(source)
|
||||
result = converter.convert(source)
|
||||
print(result.output.export_to_markdown()) # output: ## Docling Technical Report [...]"
|
||||
# if the legacy output is needed, use this version
|
||||
# print(result.render_as_markdown_v1()) # output: ## Docling Technical Report [...]"
|
||||
|
||||
@@ -6,7 +6,6 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import DocumentConversionInput
|
||||
from docling.document_converter import (
|
||||
DocumentConverter,
|
||||
FormatOption,
|
||||
@@ -28,7 +27,6 @@ input_paths = [
|
||||
Path("tests/data/2206.01062.pdf"),
|
||||
# Path("tests/data/2305.03393v1-pg9-img.png"),
|
||||
]
|
||||
input = DocumentConversionInput.from_paths(input_paths)
|
||||
|
||||
## for defaults use:
|
||||
# doc_converter = DocumentConverter()
|
||||
@@ -52,12 +50,12 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal
|
||||
},
|
||||
)
|
||||
|
||||
conv_results = doc_converter.convert_batch(input)
|
||||
conv_results = doc_converter.convert_all(input_paths)
|
||||
|
||||
for res in conv_results:
|
||||
out_path = Path("./scratch")
|
||||
print(
|
||||
f"Document {res.input.file.name} converted with status {res.status}."
|
||||
f"Document {res.input.file.name} converted."
|
||||
f"\nSaved markdown output to: {str(out_path)}"
|
||||
)
|
||||
# print(res.experimental.export_to_markdown())
|
||||
|
||||
Reference in New Issue
Block a user