Merge remaining changes from main

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2024-10-15 10:52:16 +02:00
parent dac82ca7f2
commit fa5d972291
22 changed files with 1515 additions and 12 deletions

View File

@@ -1,139 +0,0 @@
import json
import logging
import time
from pathlib import Path
from typing import Iterable

import yaml

from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult
from docling.document_converter import DocumentConverter

_log = logging.getLogger(__name__)

USE_V2 = True
USE_LEGACY = False


def export_documents(
    conv_results: Iterable[ConversionResult],
    output_dir: Path,
):
    output_dir.mkdir(parents=True, exist_ok=True)

    success_count = 0
    failure_count = 0
    partial_success_count = 0

    for conv_res in conv_results:
        if conv_res.status == ConversionStatus.SUCCESS:
            success_count += 1
            doc_filename = conv_res.input.file.stem

            if USE_V2:
                # Export Docling document format to JSON:
                with (output_dir / f"{doc_filename}.json").open("w") as fp:
                    fp.write(json.dumps(conv_res.document.export_to_dict()))

                # Export Docling document format to YAML:
                with (output_dir / f"{doc_filename}.yaml").open("w") as fp:
                    fp.write(yaml.safe_dump(conv_res.document.export_to_dict()))

                # Export Docling document format to doctags:
                with (output_dir / f"{doc_filename}.doctags.txt").open("w") as fp:
                    fp.write(conv_res.document.export_to_document_tokens())

                # Export Docling document format to markdown:
                with (output_dir / f"{doc_filename}.md").open("w") as fp:
                    fp.write(conv_res.document.export_to_markdown())

                # Export Docling document format to text:
                with (output_dir / f"{doc_filename}.txt").open("w") as fp:
                    fp.write(conv_res.document.export_to_markdown(strict_text=True))

            if USE_LEGACY:
                # Export Deep Search document JSON format:
                with (output_dir / f"{doc_filename}.legacy.json").open(
                    "w", encoding="utf-8"
                ) as fp:
                    fp.write(json.dumps(conv_res.legacy_document.export_to_dict()))

                # Export Text format:
                with (output_dir / f"{doc_filename}.legacy.txt").open(
                    "w", encoding="utf-8"
                ) as fp:
                    fp.write(
                        conv_res.legacy_document.export_to_markdown(strict_text=True)
                    )

                # Export Markdown format:
                with (output_dir / f"{doc_filename}.legacy.md").open(
                    "w", encoding="utf-8"
                ) as fp:
                    fp.write(conv_res.legacy_document.export_to_markdown())

                # Export Document Tags format:
                with (output_dir / f"{doc_filename}.legacy.doctags.txt").open(
                    "w", encoding="utf-8"
                ) as fp:
                    fp.write(conv_res.legacy_document.export_to_doctags())

        elif conv_res.status == ConversionStatus.PARTIAL_SUCCESS:
            _log.info(
                f"Document {conv_res.input.file} was partially converted with the following errors:"
            )
            for item in conv_res.errors:
                _log.info(f"\t{item.error_message}")
            partial_success_count += 1
        else:
            _log.info(f"Document {conv_res.input.file} failed to convert.")
            failure_count += 1

    _log.info(
        f"Processed {success_count + partial_success_count + failure_count} docs, "
        f"of which {failure_count} failed "
        f"and {partial_success_count} were partially converted."
    )
    return success_count, partial_success_count, failure_count


def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_paths = [
        Path("./tests/data/2206.01062.pdf"),
        Path("./tests/data/2203.01017v2.pdf"),
        Path("./tests/data/2305.03393v1.pdf"),
        Path("./tests/data/redp5110.pdf"),
        Path("./tests/data/redp5695.pdf"),
    ]

    # buf = BytesIO(Path("./tests/data/2206.01062.pdf").open("rb").read())
    # docs = [DocumentStream(name="my_doc.pdf", stream=buf)]
    # input = DocumentConversionInput.from_streams(docs)

    doc_converter = DocumentConverter()

    start_time = time.time()

    conv_results = doc_converter.convert_all(
        input_doc_paths,
        raises_on_error=False,  # to let conversion run through all and examine results at the end
    )
    success_count, partial_success_count, failure_count = export_documents(
        conv_results, output_dir=Path("./scratch")
    )

    end_time = time.time() - start_time

    _log.info(f"Document conversion complete in {end_time:.2f} seconds.")

    if failure_count > 0:
        raise RuntimeError(
            f"The example failed converting {failure_count} of {len(input_doc_paths)} documents."
        )


if __name__ == "__main__":
    main()
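
As a quick sanity check of the V2 exports written above, the JSON file round-trips with the standard library alone. A minimal sketch, assuming the script's default ./scratch output directory and the first input stem used in main():

import json
from pathlib import Path

# Load one of the JSON exports produced by export_documents() above.
# The path and filename stem are assumptions matching the script's defaults.
doc_dict = json.loads((Path("./scratch") / "2206.01062.json").read_text())
print(sorted(doc_dict.keys()))  # top-level keys of the exported Docling document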

View File

@@ -1,138 +0,0 @@
import json
import logging
import time
from pathlib import Path
from typing import Iterable

from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    TesseractCliOcrOptions,
    TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline

_log = logging.getLogger(__name__)


def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_path = Path("./tests/data/2206.01062.pdf")

    ###########################################################################

    # The following sections contain a combination of PipelineOptions
    # and PDF Backends for various configurations.
    # Uncomment one section at a time to see the differences in the output.

    # PyPdfium without EasyOCR
    # --------------------
    # pipeline_options = PipelineOptions()
    # pipeline_options.do_ocr = False
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = False

    # doc_converter = DocumentConverter(
    #     pipeline_options=pipeline_options,
    #     pdf_backend=PyPdfiumDocumentBackend,
    # )

    # PyPdfium with EasyOCR
    # -----------------
    # pipeline_options = PipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True

    # doc_converter = DocumentConverter(
    #     pipeline_options=pipeline_options,
    #     pdf_backend=PyPdfiumDocumentBackend,
    # )

    # Docling Parse without EasyOCR
    # -------------------------
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = False
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True
    pipeline_options.do_dummy_picture_classifer = True

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    # Docling Parse with EasyOCR
    # ----------------------
    # pipeline_options = PipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True

    # doc_converter = DocumentConverter(
    #     pipeline_options=pipeline_options,
    #     pdf_backend=DoclingParseDocumentBackend,
    # )

    # Docling Parse with Tesseract
    # ----------------------
    # pipeline_options = PipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True
    # pipeline_options.ocr_options = TesseractOcrOptions()

    # doc_converter = DocumentConverter(
    #     pipeline_options=pipeline_options,
    #     pdf_backend=DoclingParseDocumentBackend,
    # )

    # Docling Parse with Tesseract CLI
    # ----------------------
    # pipeline_options = PipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True
    # pipeline_options.ocr_options = TesseractCliOcrOptions()

    # doc_converter = DocumentConverter(
    #     pipeline_options=pipeline_options,
    #     pdf_backend=DoclingParseDocumentBackend,
    # )

    ###########################################################################

    start_time = time.time()

    conv_result = doc_converter.convert(input_doc_path)

    end_time = time.time() - start_time

    _log.info(f"Document converted in {end_time:.2f} seconds.")

    ## Export results

    output_dir = Path("./scratch")
    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_result.input.file.stem

    # Export Docling document JSON format:
    with (output_dir / f"{doc_filename}.json").open("w", encoding="utf-8") as fp:
        fp.write(json.dumps(conv_result.document.export_to_dict()))

    # Export Text format:
    with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
        fp.write(conv_result.document.export_to_text())

    # Export Markdown format:
    with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
        fp.write(conv_result.document.export_to_markdown())

    # Export Document Tags format:
    with (output_dir / f"{doc_filename}.doctags").open("w", encoding="utf-8") as fp:
        fp.write(conv_result.document.export_to_document_tokens())


if __name__ == "__main__":
    main()
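
Note that the commented-out variants above still use the older DocumentConverter(pipeline_options=..., pdf_backend=...) signature. As a sketch only, here is the Tesseract variant rewritten in the same format_options style as the active Docling Parse section, using just the names already imported in this file (assuming PdfFormatOption accepts these pipeline options the same way as above):

pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.ocr_options = TesseractOcrOptions()  # or TesseractCliOcrOptions()

doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)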

View File

@@ -1,61 +0,0 @@
import logging
import time
from pathlib import Path

from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

_log = logging.getLogger(__name__)

IMAGE_RESOLUTION_SCALE = 2.0


def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_path = Path("./tests/data/2206.01062.pdf")
    output_dir = Path("./scratch")

    # Important: For operating with page images, we must keep them, otherwise
    # the DocumentConverter will discard them to free memory. This is done by
    # setting PdfPipelineOptions.images_scale, which also defines the scale of
    # the images. scale=1 corresponds to a standard 72 DPI image.
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    start_time = time.time()

    conv_res = doc_converter.convert(input_doc_path)

    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_res.input.file.stem

    # Export page images
    for page in conv_res.pages:
        page_no = page.page_no + 1
        page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
        with page_image_filename.open("wb") as fp:
            page.image.save(fp, format="PNG")

    # Export figures and tables
    for element, image in conv_res.render_element_images(
        element_types=(FigureElement, Table)
    ):
        element_image_filename = output_dir / f"{doc_filename}-element-{element.id}.png"
        with element_image_filename.open("wb") as fp:
            image.save(fp, "PNG")

    end_time = time.time() - start_time

    _log.info(f"Document converted and figures exported in {end_time:.2f} seconds.")


if __name__ == "__main__":
    main()
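
Since scale=1 corresponds to 72 DPI, IMAGE_RESOLUTION_SCALE = 2.0 renders pages at 144 DPI. A minimal sketch checking the pixel size of one exported page image with Pillow (the filename follows the pattern written by the loop above and is an assumption):

from PIL import Image

# First page of the default input document, as written by the page-export loop.
im = Image.open("./scratch/2206.01062-1.png")
print(im.size)  # pixel dimensions at scale 2.0, i.e. 144 DPI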

View File

@@ -1,108 +0,0 @@
import datetime
import logging
import time
from pathlib import Path

import pandas as pd

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.utils.export import generate_multimodal_pages
from docling.utils.utils import create_hash

_log = logging.getLogger(__name__)

IMAGE_RESOLUTION_SCALE = 2.0


def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_path = Path("./tests/data/2206.01062.pdf")
    output_dir = Path("./scratch")

    # Important: For operating with page images, we must keep them, otherwise
    # the DocumentConverter will discard them to free memory. This is done by
    # setting PdfPipelineOptions.images_scale, which also defines the scale of
    # the images. scale=1 corresponds to a standard 72 DPI image.
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    start_time = time.time()

    conv_res = doc_converter.convert(input_doc_path)

    output_dir.mkdir(parents=True, exist_ok=True)

    rows = []
    for (
        content_text,
        content_md,
        content_dt,
        page_cells,
        page_segments,
        page,
    ) in generate_multimodal_pages(conv_res):
        dpi = page._default_image_scale * 72

        rows.append(
            {
                "document": conv_res.input.file.name,
                "hash": conv_res.input.document_hash,
                "page_hash": create_hash(
                    conv_res.input.document_hash + ":" + str(page.page_no - 1)
                ),
                "image": {
                    "width": page.image.width,
                    "height": page.image.height,
                    "bytes": page.image.tobytes(),
                },
                "cells": page_cells,
                "contents": content_text,
                "contents_md": content_md,
                "contents_dt": content_dt,
                "segments": page_segments,
                "extra": {
                    "page_num": page.page_no + 1,
                    "width_in_points": page.size.width,
                    "height_in_points": page.size.height,
                    "dpi": dpi,
                },
            }
        )

    # Generate one parquet from all documents
    df = pd.json_normalize(rows)
    now = datetime.datetime.now()
    output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
    df.to_parquet(output_filename)

    end_time = time.time() - start_time

    _log.info(
        f"Document converted and multimodal pages generated in {end_time:.2f} seconds."
    )

    # This block demonstrates how the file can be opened with the HF datasets library
    # from datasets import Dataset
    # from PIL import Image
    # multimodal_df = pd.read_parquet(output_filename)

    # # Convert pandas DataFrame to Hugging Face Dataset and load bytes into image
    # dataset = Dataset.from_pandas(multimodal_df)
    # def transforms(examples):
    #     examples["image"] = Image.frombytes('RGB', (examples["image.width"], examples["image.height"]), examples["image.bytes"], 'raw')
    #     return examples
    # dataset = dataset.map(transforms)


if __name__ == "__main__":
    main()
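
Because pd.json_normalize flattens the nested dicts into dot-separated columns (image.width, image.bytes, extra.dpi, ...), the parquet can be read back and a page image reconstructed directly. A minimal sketch, assuming the page images were RGB when serialized with tobytes():

from pathlib import Path

import pandas as pd
from PIL import Image

# Pick the most recent multimodal parquet written by the script above.
output_filename = sorted(Path("./scratch").glob("multimodal_*.parquet"))[-1]
df = pd.read_parquet(output_filename)

# Rebuild the first page image from its raw bytes and stored dimensions.
row = df.iloc[0]
im = Image.frombytes(
    "RGB",
    (int(row["image.width"]), int(row["image.height"])),
    row["image.bytes"],
    "raw",
)
print(im.size)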

View File

@@ -1,51 +0,0 @@
import logging
import time
from pathlib import Path

import pandas as pd

from docling.document_converter import DocumentConverter

_log = logging.getLogger(__name__)


def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_path = Path("./tests/data/2206.01062.pdf")
    output_dir = Path("./scratch")

    doc_converter = DocumentConverter()

    start_time = time.time()

    conv_res = doc_converter.convert(input_doc_path)

    output_dir.mkdir(parents=True, exist_ok=True)

    doc_filename = conv_res.input.file.stem

    # Export tables
    for table_ix, table in enumerate(conv_res.document.tables):
        table_df: pd.DataFrame = table.export_to_dataframe()
        print(f"## Table {table_ix}")
        print(table_df.to_markdown())

        # Save the table as csv
        element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
        _log.info(f"Saving CSV table to {element_csv_filename}")
        table_df.to_csv(element_csv_filename)

        # Save the table as html
        element_html_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.html"
        _log.info(f"Saving HTML table to {element_html_filename}")
        with element_html_filename.open("w") as fp:
            fp.write(table.export_to_html())

    end_time = time.time() - start_time

    _log.info(f"Document converted and tables exported in {end_time:.2f} seconds.")


if __name__ == "__main__":
    main()
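
Two small notes on the table export: DataFrame.to_markdown() relies on the optional tabulate package being installed, and to_csv() writes the DataFrame index as the first column. A minimal sketch of reading one exported table back (the filename pattern matches the script; the concrete file is an assumption):

import pandas as pd

# Read the first exported table back; index_col=0 drops the index column
# that to_csv() wrote above.
table_df = pd.read_csv("./scratch/2206.01062-table-1.csv", index_col=0)
print(table_df.head())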

View File

@@ -1,10 +0,0 @@
from docling.document_converter import DocumentConverter

source = "https://arxiv.org/pdf/2408.09869"  # PDF path or URL
converter = DocumentConverter()
result = converter.convert(source)
print(
    result.document.export_to_markdown()
)  # output: "## Docling Technical Report [...]"
# if the legacy output is needed, use this version:
# print(result.legacy_document.export_to_markdown())  # output: "## Docling Technical Report [...]"

View File

@@ -1,67 +0,0 @@
import json
import logging
from pathlib import Path

from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
)
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline

_log = logging.getLogger(__name__)

USE_EXPERIMENTAL = False

input_paths = [
    Path("tests/data/wiki_duck.html"),
    Path("tests/data/word_sample.docx"),
    Path("tests/data/lorem_ipsum.docx"),
    Path("tests/data/powerpoint_sample.pptx"),
    Path("tests/data/2305.03393v1-pg9-img.png"),
    Path("tests/data/2206.01062.pdf"),
]

## for defaults use:
# doc_converter = DocumentConverter()

## to customize use:
doc_converter = (
    DocumentConverter(  # all of the below is optional, has internal defaults.
        allowed_formats=[
            InputFormat.PDF,
            InputFormat.IMAGE,
            InputFormat.DOCX,
            InputFormat.HTML,
            InputFormat.PPTX,
        ],  # whitelist formats, non-matching files are ignored.
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
            ),
            InputFormat.DOCX: WordFormatOption(
                pipeline_cls=SimplePipeline  # , backend=MsWordDocumentBackend
            ),
        },
    )
)

conv_results = doc_converter.convert_all(input_paths)

for res in conv_results:
    out_path = Path("./scratch")
    out_path.mkdir(parents=True, exist_ok=True)  # ensure the output directory exists
    print(
        f"Document {res.input.file.name} converted."
        f"\nSaved markdown output to: {str(out_path)}"
    )
    # print(res.document.export_to_markdown())

    # Export Docling document format to markdown:
    with (out_path / f"{res.input.file.name}.md").open("w") as fp:
        fp.write(res.document.export_to_markdown())

    with (out_path / f"{res.input.file.name}.json").open("w") as fp:
        fp.write(json.dumps(res.document.export_to_dict()))