mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
create a single parquet output
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
3e789dfbdd
commit
6b84adebfa
@ -1,3 +1,4 @@
|
|||||||
|
import datetime
|
||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@ -18,7 +19,7 @@ def main():
|
|||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
|
||||||
input_doc_paths = [
|
input_doc_paths = [
|
||||||
Path("./test/data/2206.01062.pdf"),
|
Path("./tests/data/2206.01062.pdf"),
|
||||||
]
|
]
|
||||||
output_dir = Path("./scratch")
|
output_dir = Path("./scratch")
|
||||||
|
|
||||||
@ -37,10 +38,13 @@ def main():
|
|||||||
|
|
||||||
converted_docs = doc_converter.convert(input_files)
|
converted_docs = doc_converter.convert(input_files)
|
||||||
|
|
||||||
|
success_count = 0
|
||||||
|
failure_count = 0
|
||||||
output_dir.mkdir(parents=True, exist_ok=True)
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
for doc in converted_docs:
|
for doc in converted_docs:
|
||||||
if doc.status != ConversionStatus.SUCCESS:
|
if doc.status != ConversionStatus.SUCCESS:
|
||||||
_log.info(f"Document {doc.input.file} failed to convert.")
|
_log.info(f"Document {doc.input.file} failed to convert.")
|
||||||
|
failure_count += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
doc_filename = doc.input.file.stem
|
doc_filename = doc.input.file.stem
|
||||||
@ -73,15 +77,23 @@ def main():
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
df = pd.json_normalize(rows)
|
success_count += 1
|
||||||
|
|
||||||
output_filename = output_dir / f"{doc_filename}.parquet"
|
# Generate one parquet from all documents
|
||||||
df.to_parquet(output_filename)
|
df = pd.json_normalize(rows)
|
||||||
|
now = datetime.datetime.now()
|
||||||
|
output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
|
||||||
|
df.to_parquet(output_filename)
|
||||||
|
|
||||||
end_time = time.time() - start_time
|
end_time = time.time() - start_time
|
||||||
|
|
||||||
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
|
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
|
||||||
|
|
||||||
|
if failure_count > 0:
|
||||||
|
raise RuntimeError(
|
||||||
|
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
Loading…
Reference in New Issue
Block a user