Mirror of https://github.com/DS4SD/docling.git (synced 2025-08-02 15:32:30 +00:00)

Commit fce6bb14db: Merge remote-tracking branch 'origin/dev/add-r2l-tests' into multiple-updates
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
.github/workflows/checks.yml (vendored, 2 changed lines)
@@ -28,7 +28,7 @@ jobs:
        run: |
          for file in docs/examples/*.py; do
            # Skip batch_convert.py
-           if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment).py ]]; then
+           if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models).py ]]; then
              echo "Skipping $file"
              continue
            fi

@@ -11,7 +11,7 @@ from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage

 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
-from docling.datamodel.base_models import Cell, Size
+from docling.datamodel.base_models import Cell, Size, TextDirection

 if TYPE_CHECKING:
     from docling.datamodel.document import InputDocument

@@ -97,6 +97,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
 y0 = cell_data[cells_header.index("y0")]
 x1 = cell_data[cells_header.index("x1")]
 y1 = cell_data[cells_header.index("y1")]
+ltr = cell_data[cells_header.index("left_to_right")]

 if x1 < x0:
     x0, x1 = x1, x0

@@ -116,6 +117,11 @@ class DoclingParseV2PageBackend(PdfPageBackend):
     t=y1 * page_size.height / parser_height,
     coord_origin=CoordOrigin.BOTTOMLEFT,
 ).to_top_left_origin(page_size.height),
+text_direction=(
+    TextDirection.LEFT_TO_RIGHT
+    if ltr
+    else TextDirection.RIGHT_TO_LEFT
+),
 )
 )
 cell_counter += 1

@@ -120,10 +120,16 @@ class ErrorItem(BaseModel):
     error_message: str


+class TextDirection(str, Enum):
+    LEFT_TO_RIGHT = "left_to_right"
+    RIGHT_TO_LEFT = "right_to_left"
+
+
 class Cell(BaseModel):
     id: int
     text: str
     bbox: BoundingBox
+    text_direction: TextDirection = TextDirection.LEFT_TO_RIGHT


 class OcrCell(Cell):

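For orientation, a minimal sketch of what this hunk introduces, using only the names added above:

    from docling.datamodel.base_models import TextDirection

    # str-valued enum: members compare and serialize as plain strings, which keeps
    # pydantic models such as Cell JSON-friendly.
    assert TextDirection.RIGHT_TO_LEFT.value == "right_to_left"
    assert TextDirection("left_to_right") is TextDirection.LEFT_TO_RIGHT

Because text_direction defaults to LEFT_TO_RIGHT, existing code that builds Cell objects without the new field keeps working unchanged.
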
@@ -264,6 +264,7 @@ class GlmModel:
         glm_doc = self.model.apply_on_doc(ds_doc_dict)

         docling_doc: DoclingDocument = to_docling_document(glm_doc)  # Experimental
+        1 == 1

         # DEBUG code:
         def draw_clusters_and_cells(ds_document, page_no, show: bool = False):

@@ -11,6 +11,7 @@ from docling.datamodel.base_models import (
     Page,
     PageElement,
     Table,
+    TextDirection,
     TextElement,
 )
 from docling.datamodel.document import ConversionResult

@@ -75,12 +76,23 @@ class PageAssembleModel(BasePageModel):
 for cluster in page.predictions.layout.clusters:
     # _log.info("Cluster label seen:", cluster.label)
     if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
-        textlines = [
-            cell.text.replace("\x02", "-").strip()
-            for cell in cluster.cells
-            if len(cell.text.strip()) > 0
-        ]
+        textlines = []
+
+        dominant_text_direction = TextDirection.LEFT_TO_RIGHT
+
+        # Naive code: dominant text direction == direction of first cell.
+        for cell in cluster.cells:
+            dominant_text_direction = cell.text_direction
+            break
+
+        for cell in cluster.cells:
+            text = cell.text.replace("\x02", "-").strip()
+            if text:
+                # if dominant_text_direction == TextDirection.RIGHT_TO_LEFT:
+                #     textlines.insert(0, text)  # Prepend RTL text
+                # else:
+                textlines.append(text)  # Append LTR text

         text = self.sanitize_text(textlines)
         text_el = TextElement(
             label=cluster.label,

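The comment in the added code flags the first-cell heuristic as naive. Purely as an illustration (not part of this commit), a counting-based variant using only names that appear in the diff (cluster.cells, cell.text_direction, TextDirection) might look like:

    from collections import Counter

    from docling.datamodel.base_models import TextDirection

    def dominant_direction(cells):
        # Tally the direction of every cell in the cluster and pick the most
        # common one; fall back to left-to-right for clusters without cells.
        counts = Counter(cell.text_direction for cell in cells)
        if not counts:
            return TextDirection.LEFT_TO_RIGHT
        return counts.most_common(1)[0][0]
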
@@ -49,6 +49,8 @@ def export_documents(
         with (output_dir / f"{doc_filename}.md").open("w") as fp:
             fp.write(conv_res.document.export_to_markdown())

+        conv_res.document.save_as_html(output_dir / f"{doc_filename}.html")
+
         # Export Docling document format to text:
         with (output_dir / f"{doc_filename}.txt").open("w") as fp:
             fp.write(conv_res.document.export_to_markdown(strict_text=True))

@@ -103,10 +105,13 @@ def main():
     logging.basicConfig(level=logging.INFO)

     input_doc_paths = [
-        Path("./tests/data/2206.01062.pdf"),
-        Path("./tests/data/2203.01017v2.pdf"),
-        Path("./tests/data/2305.03393v1.pdf"),
-        Path("./tests/data/redp5110_sampled.pdf"),
+        Path("./tests/data/pdf/right_to_left_01.pdf"),
+        Path("./tests/data/pdf/right_to_left_02.pdf"),
+        Path("./tests/data/pdf/right_to_left_03.pdf"),
+        Path("./tests/data/pdf/2206.01062.pdf"),
+        Path("./tests/data/pdf/2203.01017v2.pdf"),
+        Path("./tests/data/pdf/2305.03393v1.pdf"),
+        Path("./tests/data/pdf/redp5110_sampled.pdf"),
     ]

     # buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())

docs/examples/rapidocr_with_custom_models.py (new file, 58 lines)
@@ -0,0 +1,58 @@
+import os
+
+from huggingface_hub import snapshot_download
+
+from docling.datamodel.pipeline_options import PdfPipelineOptions, RapidOcrOptions
+from docling.document_converter import (
+    ConversionResult,
+    DocumentConverter,
+    InputFormat,
+    PdfFormatOption,
+)
+
+
+def main():
+    # Source document to convert
+    source = "https://arxiv.org/pdf/2408.09869v4"
+
+    # Download RapidOCR models from HuggingFace
+    print("Downloading RapidOCR models")
+    download_path = snapshot_download(repo_id="SWHL/RapidOCR")
+
+    # Setup RapidOcrOptions for English detection
+    det_model_path = os.path.join(
+        download_path, "PP-OCRv4", "en_PP-OCRv3_det_infer.onnx"
+    )
+    rec_model_path = os.path.join(
+        download_path, "PP-OCRv4", "ch_PP-OCRv4_rec_server_infer.onnx"
+    )
+    cls_model_path = os.path.join(
+        download_path, "PP-OCRv3", "ch_ppocr_mobile_v2.0_cls_train.onnx"
+    )
+    ocr_options = RapidOcrOptions(
+        det_model_path=det_model_path,
+        rec_model_path=rec_model_path,
+        cls_model_path=cls_model_path,
+    )
+
+    pipeline_options = PdfPipelineOptions(
+        ocr_options=ocr_options,
+    )
+
+    # Convert the document
+    converter = DocumentConverter(
+        format_options={
+            InputFormat.PDF: PdfFormatOption(
+                pipeline_options=pipeline_options,
+            ),
+        },
+    )
+
+    conversion_result: ConversionResult = converter.convert(source=source)
+    doc = conversion_result.document
+    md = doc.export_to_markdown()
+    print(md)
+
+
+if __name__ == "__main__":
+    main()

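The converter assembled in this example is not tied to the remote URL; as a small usage note (the path below is just one of the repository's test documents), the same object can also convert a local file:

    local_result = converter.convert("tests/data/pdf/2206.01062.pdf")
    print(local_result.document.export_to_markdown())
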
@@ -77,6 +77,7 @@ nav:
     - "Multimodal export": examples/export_multimodal.py
     - "Force full page OCR": examples/full_page_ocr.py
     - "Automatic OCR language detection with tesseract": examples/tesseract_lang_detection.py
+    - "RapidOCR with custom OCR models": examples/rapidocr_with_custom_models.py
     - "Accelerator options": examples/run_with_accelerator.py
     - "Simple translation": examples/translate.py
     - examples/backend_xml_rag.ipynb

poetry.lock (generated, 48 changed lines)
@@ -927,45 +927,23 @@ name = "docling-parse"
 version = "3.2.0"
 description = "Simple package to extract text with coordinates from programmatic PDFs"
 optional = false
-python-versions = "<4.0,>=3.9"
-files = [
-    {file = "docling_parse-3.2.0-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:16a43ab04f13f93602baf2b35dd96d4f2ac646732270e329d00ad6e1c714b013"},
-    {file = "docling_parse-3.2.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:5fd77a9bb0cee57e9bee9de2f417c83ed50e49145c5ddb736801b68e004471e1"},
-    {file = "docling_parse-3.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:87dc9071aa9fabd586c36599d983cda55f9eab0ba7618009f52f8f537598a70c"},
-    {file = "docling_parse-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5dc044ebad99808dd7a82fa2cf5beaf90f744522225bf6dce62200c810817c31"},
-    {file = "docling_parse-3.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:e5048626ca165b24a35f417a58d3c4b3f936bc9d2271fc633153a98e788548cf"},
-    {file = "docling_parse-3.2.0-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:f3616a3fb3dacc87307b6794dc81e6aec09f59afbb42f487b0100f39571e7442"},
-    {file = "docling_parse-3.2.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:9df9d892d8206cc2a0091bb1e2e3ce6f3ab881342980c53bc7f4bf7c831bf057"},
-    {file = "docling_parse-3.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5e90e9c2b21e732b3e1af697abb5e7bca9c23a3be8e25a6cc4d92221d526953c"},
-    {file = "docling_parse-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:71595399283ecacb255cf1768d1840177f7c33aedd58195530505f9aa8cd5a24"},
-    {file = "docling_parse-3.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:0f2a7801d6fa843cd3243aab38cd944e2f4ff386fb5e6fb7b27be1dfa69845c7"},
-    {file = "docling_parse-3.2.0-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:a83458692c607a393b2f5858d47509342274920553775db5c8d0072ad6aaa0fa"},
-    {file = "docling_parse-3.2.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:2d17bf3dffbc2fb565c5fa8347ae7715fc091f4f94228b4ece4f8ab5c3fb428a"},
-    {file = "docling_parse-3.2.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f54881ebcd87384b29f7a6475b308034a9ecba0dfa85dd1d2569ef59e2f37e97"},
-    {file = "docling_parse-3.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:27de3b28946a615f16e6824bede965f8df5f3b2552e17560415922c79fa8546f"},
-    {file = "docling_parse-3.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:20265d7d51b3f4a3bb03e3de694f395d0403bde92915eb5df32f4d67adf93477"},
-    {file = "docling_parse-3.2.0-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:feecbef5b8593dcd9a35ceb06b29feb879515b5b900bcaa5d9be0c7a3a0ca599"},
-    {file = "docling_parse-3.2.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:6fc65d623e80d8d63f4a6f542408b7f88c0dc8b2842c2523858536d4607d33d5"},
-    {file = "docling_parse-3.2.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c06851f88b5edaa7115871100608b9d68bc804b28434b90e57879e178098aed2"},
-    {file = "docling_parse-3.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a2fdb3993f7affc73ce19f1202a2f28f3d8cf1716163d9e978ee9c834312c31"},
-    {file = "docling_parse-3.2.0-cp313-cp313-win_amd64.whl", hash = "sha256:1b9e32989ff58e0bac85d6cffb3a523dd2373d350b26d46e1f8daff0110595fa"},
-    {file = "docling_parse-3.2.0-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:ab9ee7a74585b69a1b81f34e71d77332f85e013b5d8b6477fd9bbec779f239cf"},
-    {file = "docling_parse-3.2.0-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:ecb90d4b29f179b9afbe643c943f24b9eb065c8bf0af739bee1330e75973fcea"},
-    {file = "docling_parse-3.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b7c0f7d8f7e103305a3c4bc1ae00faca9a8b16a2a3e34042cff1b9e13f922d9"},
-    {file = "docling_parse-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8687ed6c3a299a46646e369f0bd072286372ecede54c8916b40acc6be6347f1a"},
-    {file = "docling_parse-3.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:491589373188c2683fbaa0d366e4806258bc6bb8ab8d1d596144516662d92e90"},
-    {file = "docling_parse-3.2.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:0d73b0f920bd0940ea9daefb8c9d02aad86f62422c4e320e66dae74f07ef9888"},
-    {file = "docling_parse-3.2.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:68bcf3dcd80afbbce128e7cb2d319842a6fa0b719ad61e4c6872ca32c5063a88"},
-    {file = "docling_parse-3.2.0.tar.gz", hash = "sha256:d45e34860a2f845d1726d87af77e8deff17e0c6fb876707a7cf390492d408c2f"},
-]
+python-versions = "^3.9"
+files = []
+develop = false

 [package.dependencies]
-docling-core = ">=2.14.0,<3.0.0"
-pillow = ">=10.4.0,<11.0.0"
-pydantic = ">=2.10.5,<3.0.0"
+docling-core = "^2.14.0"
+pillow = "^10.4.0"
+pydantic = "^2.10.5"
 pywin32 = {version = ">=305", markers = "sys_platform == \"win32\""}
 tabulate = ">=0.9.0,<1.0.0"

+[package.source]
+type = "git"
+url = "https://github.com/DS4SD/docling-parse.git"
+reference = "93e281576e740345d0161ad5da1b1fff815df8e4"
+resolved_reference = "93e281576e740345d0161ad5da1b1fff815df8e4"
+
 [[package]]
 name = "docutils"
 version = "0.21.2"

@@ -7837,4 +7815,4 @@ tesserocr = ["tesserocr"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "907c7cef6722358ac30193f07f9cc15684daf1b75b6c400104e87f3b22137632"
+content-hash = "dcab39f8f6cc4a2b24e25774c147dce5eb9da775309d69a9304b72c47725b021"

@@ -29,7 +29,8 @@ pydantic = "^2.0.0"
 docling-core = {extras = ["chunking"], version = "^2.17.0"}
 docling-ibm-models = "^3.3.0"
 deepsearch-glm = "^1.0.0"
-docling-parse = "^3.1.0"
+# docling-parse = "^3.1.0"
+docling-parse = { git = "https://github.com/DS4SD/docling-parse.git", rev = "93e281576e740345d0161ad5da1b1fff815df8e4"}
 filetype = "^1.2.0"
 pypdfium2 = "^4.30.0"
 pydantic-settings = "^2.3.0"

tests/data/pdf/right_to_left_01.pdf (new binary file, not shown)
tests/data/pdf/right_to_left_02.pdf (new binary file, not shown)
tests/data/pdf/right_to_left_03.pdf (new binary file, not shown)

@@ -20,7 +20,7 @@ def _get_backend(fname):

 def test_asciidocs_examples():

-    fnames = sorted(glob.glob("./tests/data/*.asciidoc"))
+    fnames = sorted(glob.glob("./tests/data/asciidoc/*.asciidoc"))

     for fname in fnames:
         print(f"reading {fname}")

@@ -13,7 +13,7 @@ from docling.datamodel.document import InputDocument

 @pytest.fixture
 def test_doc_path():
-    return Path("./tests/data/2206.01062.pdf")
+    return Path("./tests/data/pdf/2206.01062.pdf")


 def _get_backend(pdf_doc):

@@ -28,7 +28,7 @@ def _get_backend(pdf_doc):


 def test_text_cell_counts():
-    pdf_doc = Path("./tests/data/redp5110_sampled.pdf")
+    pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf")

     doc_backend = _get_backend(pdf_doc)

@@ -12,7 +12,7 @@ from docling.datamodel.document import InputDocument

 @pytest.fixture
 def test_doc_path():
-    return Path("./tests/data/2206.01062.pdf")
+    return Path("./tests/data/pdf/2206.01062.pdf")


 def _get_backend(pdf_doc):

@@ -26,8 +26,19 @@ def _get_backend(pdf_doc):
     return doc_backend


+def test_cell_ordering():
+    pdf_doc = Path("tests/data/pdf/right_to_left_01.pdf")
+    doc_backend = _get_backend(pdf_doc)
+
+    for page_index in range(0, doc_backend.page_count()):
+        page_backend: DoclingParseV2PageBackend = doc_backend.load_page(page_index)
+        cells = list(page_backend.get_text_cells())
+
+        1 == 1
+
+
 def test_text_cell_counts():
-    pdf_doc = Path("./tests/data/redp5110_sampled.pdf")
+    pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf")

     doc_backend = _get_backend(pdf_doc)

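The new test_cell_ordering test currently ends in a 1 == 1 placeholder. Purely as a hypothetical sketch of assertions it could grow into once the expected ordering is settled (not part of this commit; TextDirection would need to be imported):

    # Illustrative only: every extracted cell carries a direction, and a
    # right-to-left sample document should yield at least one RTL cell.
    assert all(hasattr(c, "text_direction") for c in cells)
    assert any(c.text_direction == TextDirection.RIGHT_TO_LEFT for c in cells)
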
@@ -13,7 +13,7 @@ from docling.datamodel.document import InputDocument

 @pytest.fixture
 def test_doc_path():
-    return Path("./tests/data/2206.01062.pdf")
+    return Path("./tests/data/pdf/2206.01062.pdf")


 def _get_backend(pdf_doc):

@@ -28,7 +28,7 @@ def _get_backend(pdf_doc):


 def test_text_cell_counts():
-    pdf_doc = Path("./tests/data/redp5110_sampled.pdf")
+    pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf")

     doc_backend = _get_backend(pdf_doc)

@@ -18,7 +18,7 @@ def test_cli_version():


 def test_cli_convert(tmp_path):
-    source = "./tests/data/2305.03393v1-pg9.pdf"
+    source = "./tests/data/pdf/2305.03393v1-pg9.pdf"
     output = tmp_path / "out"
     output.mkdir()
     result = runner.invoke(app, [source, "--output", str(output)])

@@ -36,7 +36,7 @@ def get_converter():


 def test_code_and_formula_conversion():
-    pdf_path = Path("tests/data/code_and_formula.pdf")
+    pdf_path = Path("tests/data/pdf/code_and_formula.pdf")
     converter = get_converter()

     print(f"converting {pdf_path}")

@@ -37,7 +37,7 @@ def get_converter():


 def test_picture_classifier():
-    pdf_path = Path("tests/data/picture_classification.pdf")
+    pdf_path = Path("tests/data/pdf/picture_classification.pdf")
     converter = get_converter()

     print(f"converting {pdf_path}")

@@ -15,7 +15,7 @@ GENERATE_V2 = False
 def get_pdf_paths():

     # Define the directory you want to search
-    directory = Path("./tests/data")
+    directory = Path("./tests/data/pdf/")

     # List all PDF files in the directory and its subdirectories
     pdf_files = sorted(directory.rglob("*.pdf"))

@@ -9,7 +9,7 @@ from docling.datamodel.settings import DocumentLimits

 def test_in_doc_from_valid_path():

-    test_doc_path = Path("./tests/data/2206.01062.pdf")
+    test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
     doc = _make_input_doc(test_doc_path)
     assert doc.valid == True

@@ -24,7 +24,7 @@ def test_in_doc_from_invalid_path():

 def test_in_doc_from_valid_buf():

-    buf = BytesIO(Path("./tests/data/2206.01062.pdf").open("rb").read())
+    buf = BytesIO(Path("./tests/data/pdf/2206.01062.pdf").open("rb").read())
     stream = DocumentStream(name="my_doc.pdf", stream=buf)

     doc = _make_input_doc_from_stream(stream)

@@ -41,7 +41,7 @@ def test_in_doc_from_invalid_buf():


 def test_in_doc_with_page_range():
-    test_doc_path = Path("./tests/data/2206.01062.pdf")
+    test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
     limits = DocumentLimits()
     limits.page_range = (1, 10)

@@ -81,10 +81,10 @@ def test_guess_format(tmp_path):
     temp_dir.mkdir()

     # Valid PDF
-    buf = BytesIO(Path("./tests/data/2206.01062.pdf").open("rb").read())
+    buf = BytesIO(Path("./tests/data/pdf/2206.01062.pdf").open("rb").read())
     stream = DocumentStream(name="my_doc.pdf", stream=buf)
     assert dci._guess_format(stream) == InputFormat.PDF
-    doc_path = Path("./tests/data/2206.01062.pdf")
+    doc_path = Path("./tests/data/pdf/2206.01062.pdf")
     assert dci._guess_format(doc_path) == InputFormat.PDF

     # Valid MS Office

@@ -15,7 +15,7 @@ GENERATE = False

 def get_pdf_path():

-    pdf_path = Path("./tests/data/2305.03393v1-pg9.pdf")
+    pdf_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf")
     return pdf_path

@@ -9,7 +9,7 @@ from docling.document_converter import ConversionError, DocumentConverter

 def get_pdf_path():

-    pdf_path = Path("./tests/data/2305.03393v1-pg9.pdf")
+    pdf_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf")
     return pdf_path

@@ -16,7 +16,7 @@ def test_doc_paths():
         Path("tests/data/docx/lorem_ipsum.docx"),
         Path("tests/data/pptx/powerpoint_sample.pptx"),
         Path("tests/data/2305.03393v1-pg9-img.png"),
-        Path("tests/data/2206.01062.pdf"),
+        Path("tests/data/pdf/2206.01062.pdf"),
     ]

@@ -17,7 +17,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption

 @pytest.fixture
 def test_doc_path():
-    return Path("./tests/data/2206.01062.pdf")
+    return Path("./tests/data/pdf/2206.01062.pdf")


 def get_converters_with_table_options():

@@ -249,7 +249,13 @@ def verify_conversion_result_v1(
     doc_pred_dt = doc_result.legacy_document.export_to_document_tokens()

     engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"

     gt_subpath = input_path.parent / "groundtruth" / "docling_v1" / input_path.name
+    if str(input_path.parent).endswith("pdf"):
+        gt_subpath = (
+            input_path.parent.parent / "groundtruth" / "docling_v1" / input_path.name
+        )
+
     pages_path = gt_subpath.with_suffix(f"{engine_suffix}.pages.json")
     json_path = gt_subpath.with_suffix(f"{engine_suffix}.json")
     md_path = gt_subpath.with_suffix(f"{engine_suffix}.md")

@@ -325,7 +331,13 @@ def verify_conversion_result_v2(
     doc_pred_dt = doc_result.document.export_to_document_tokens()

     engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"

     gt_subpath = input_path.parent / "groundtruth" / "docling_v2" / input_path.name
+    if str(input_path.parent).endswith("pdf"):
+        gt_subpath = (
+            input_path.parent.parent / "groundtruth" / "docling_v2" / input_path.name
+        )
+
     pages_path = gt_subpath.with_suffix(f"{engine_suffix}.pages.json")
     json_path = gt_subpath.with_suffix(f"{engine_suffix}.json")
     md_path = gt_subpath.with_suffix(f"{engine_suffix}.md")

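The effect of the added pdf-subfolder handling can be seen with plain pathlib (paths taken from the tests themselves):

    from pathlib import Path

    input_path = Path("tests/data/pdf/2206.01062.pdf")
    gt_subpath = input_path.parent / "groundtruth" / "docling_v2" / input_path.name
    if str(input_path.parent).endswith("pdf"):
        # Inputs moved into tests/data/pdf/, but ground truth stays under tests/data/.
        gt_subpath = (
            input_path.parent.parent / "groundtruth" / "docling_v2" / input_path.name
        )

    print(gt_subpath)  # tests/data/groundtruth/docling_v2/2206.01062.pdf
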