create a single parquet output

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2024-08-30 16:24:42 +02:00
parent 3e789dfbdd
commit 6b84adebfa

View File

@@ -1,3 +1,4 @@
import datetime
import logging import logging
import time import time
from pathlib import Path from pathlib import Path
@@ -18,7 +19,7 @@ def main():
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
input_doc_paths = [ input_doc_paths = [
Path("./test/data/2206.01062.pdf"), Path("./tests/data/2206.01062.pdf"),
] ]
output_dir = Path("./scratch") output_dir = Path("./scratch")
@@ -37,10 +38,13 @@ def main():
converted_docs = doc_converter.convert(input_files) converted_docs = doc_converter.convert(input_files)
success_count = 0
failure_count = 0
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
for doc in converted_docs: for doc in converted_docs:
if doc.status != ConversionStatus.SUCCESS: if doc.status != ConversionStatus.SUCCESS:
_log.info(f"Document {doc.input.file} failed to convert.") _log.info(f"Document {doc.input.file} failed to convert.")
failure_count += 1
continue continue
doc_filename = doc.input.file.stem doc_filename = doc.input.file.stem
@@ -73,15 +77,23 @@ def main():
}, },
} }
) )
df = pd.json_normalize(rows) success_count += 1
output_filename = output_dir / f"{doc_filename}.parquet" # Generate one parquet from all documents
df.to_parquet(output_filename) df = pd.json_normalize(rows)
now = datetime.datetime.now()
output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
df.to_parquet(output_filename)
end_time = time.time() - start_time end_time = time.time() - start_time
_log.info(f"All documents were converted in {end_time:.2f} seconds.") _log.info(f"All documents were converted in {end_time:.2f} seconds.")
if failure_count > 0:
raise RuntimeError(
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
)
if __name__ == "__main__": if __name__ == "__main__":
main() main()