mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 07:22:14 +00:00
This change enables users to extend the document conversion process with custom logic through plugins. - Introduced a PluginManager to handle preprocessing and postprocessing plugins in DocumentConverter. - Updated DocumentConverter to accept and initialize a list of plugins. - Implemented plugin execution within the document processing pipeline, enabling custom modifications before and after conversion. - Updated ConversionResult to include metadata about the plugins used during conversion. - Updated the CLI to accept plugin paths and load them dynamically. - Expanded documentation with examples for creating and using plugins. - Added test cases to verify plugin integration and ensure correct execution in various scenarios. Signed-off-by: Ayoub El Bouchtili <Ayoub.elbouchtili@gmail.com>
181 lines
6.2 KiB
Python
181 lines
6.2 KiB
Python
import pytest
|
|
from docling.plugins.manager import PluginManager
|
|
from docling.plugins.base import DoclingPlugin
|
|
from docling.plugins.models import PluginMetadata
|
|
from docling.datamodel.document import InputDocument, ConversionResult
|
|
from docling.datamodel.base_models import InputFormat
|
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
|
|
|
class BasicTestPlugin(DoclingPlugin):
|
|
def __init__(self, name="TestPlugin"):
|
|
super().__init__(
|
|
name=name,
|
|
metadata=PluginMetadata(
|
|
version="0.1.0",
|
|
description="Test plugin",
|
|
author="Test Author"
|
|
)
|
|
)
|
|
|
|
def preprocess(self, input_doc: InputDocument) -> InputDocument:
|
|
return input_doc
|
|
|
|
class PreprocessOnlyPlugin(DoclingPlugin):
|
|
def __init__(self):
|
|
super().__init__(
|
|
name="PreprocessPlugin",
|
|
metadata=PluginMetadata(
|
|
version="0.1.0",
|
|
description="Preprocess only plugin",
|
|
author="Test Author"
|
|
)
|
|
)
|
|
|
|
def preprocess(self, input_doc: InputDocument) -> InputDocument:
|
|
input_doc._test_flag = True
|
|
return input_doc
|
|
|
|
class PostprocessOnlyPlugin(DoclingPlugin):
|
|
def __init__(self):
|
|
super().__init__(
|
|
name="PostprocessPlugin",
|
|
metadata=PluginMetadata(
|
|
version="0.1.0",
|
|
description="Postprocess only plugin",
|
|
author="Test Author"
|
|
)
|
|
)
|
|
|
|
def postprocess(self, result: ConversionResult) -> ConversionResult:
|
|
result._test_flag = True
|
|
return result
|
|
|
|
class ErrorPlugin(DoclingPlugin):
|
|
def __init__(self):
|
|
super().__init__(
|
|
name="ErrorPlugin",
|
|
metadata=PluginMetadata(
|
|
version="0.1.0",
|
|
description="Error plugin",
|
|
author="Test Author"
|
|
)
|
|
)
|
|
|
|
def preprocess(self, input_doc: InputDocument) -> InputDocument:
|
|
raise ValueError("Test error")
|
|
|
|
@pytest.fixture
|
|
def plugin_manager():
|
|
return PluginManager()
|
|
|
|
@pytest.fixture
|
|
def input_document():
|
|
return InputDocument(
|
|
path_or_stream="test.pdf",
|
|
format=InputFormat.PDF,
|
|
backend=DoclingParseDocumentBackend
|
|
)
|
|
|
|
@pytest.fixture
|
|
def conversion_result(input_document):
|
|
return ConversionResult(input=input_document)
|
|
|
|
def test_plugin_name_validation(plugin_manager):
|
|
# Test empty name
|
|
with pytest.raises(ValueError, match="Plugin name cannot be empty or whitespace"):
|
|
plugin_manager._validate_plugin_name("")
|
|
|
|
# Test whitespace name
|
|
with pytest.raises(ValueError, match="Plugin name cannot be empty or whitespace"):
|
|
plugin_manager._validate_plugin_name(" ")
|
|
|
|
# Test invalid characters
|
|
with pytest.raises(ValueError, match="Plugin name must start with a letter"):
|
|
plugin_manager._validate_plugin_name("123plugin")
|
|
|
|
# Test valid names
|
|
plugin_manager._validate_plugin_name("validPlugin123")
|
|
plugin_manager._validate_plugin_name("valid_plugin")
|
|
plugin_manager._validate_plugin_name("valid-plugin")
|
|
|
|
def test_plugin_validation(plugin_manager):
|
|
# Test None plugin
|
|
with pytest.raises(ValueError, match="Plugin cannot be None"):
|
|
plugin_manager.register_plugin(None)
|
|
|
|
# Test invalid plugin type
|
|
with pytest.raises(ValueError, match="Plugin must be an instance of DoclingPlugin"):
|
|
plugin_manager.register_plugin("not a plugin")
|
|
|
|
# Test duplicate plugin name
|
|
plugin = BasicTestPlugin()
|
|
plugin_manager.register_plugin(plugin)
|
|
with pytest.raises(ValueError, match="already registered"):
|
|
plugin_manager.register_plugin(BasicTestPlugin())
|
|
|
|
# Test invalid metadata
|
|
invalid_plugin = BasicTestPlugin(name="InvalidMetadataPlugin")
|
|
invalid_plugin.metadata.version = "invalid" # Invalid semver
|
|
with pytest.raises(ValueError, match="Invalid metadata"):
|
|
plugin_manager.register_plugin(invalid_plugin)
|
|
|
|
def test_preprocess_execution(plugin_manager, input_document):
|
|
plugin = PreprocessOnlyPlugin()
|
|
plugin_manager.register_plugin(plugin)
|
|
|
|
processed_doc = plugin_manager.execute_preprocessors(input_document)
|
|
|
|
assert hasattr(processed_doc, '_test_flag')
|
|
assert processed_doc._test_flag is True
|
|
|
|
def test_postprocess_execution(plugin_manager, conversion_result):
|
|
plugin = PostprocessOnlyPlugin()
|
|
plugin_manager.register_plugin(plugin)
|
|
|
|
processed_result = plugin_manager.execute_postprocessors(conversion_result)
|
|
|
|
assert hasattr(processed_result, '_test_flag')
|
|
assert processed_result._test_flag is True
|
|
|
|
def test_plugin_execution_error_handling(plugin_manager, input_document):
|
|
plugin = ErrorPlugin()
|
|
plugin_manager.register_plugin(plugin)
|
|
|
|
with pytest.raises(RuntimeError, match="Error in preprocessor ErrorPlugin"):
|
|
plugin_manager.execute_preprocessors(input_document)
|
|
|
|
def test_none_input_validation(plugin_manager):
|
|
with pytest.raises(ValueError, match="Input document cannot be None"):
|
|
plugin_manager.execute_preprocessors(None)
|
|
|
|
with pytest.raises(ValueError, match="Conversion result cannot be None"):
|
|
plugin_manager.execute_postprocessors(None)
|
|
|
|
def test_multiple_plugins_execution_order(plugin_manager, input_document):
|
|
class OrderTestPlugin(DoclingPlugin):
|
|
def __init__(self, name, order_list):
|
|
super().__init__(
|
|
name=name,
|
|
metadata=PluginMetadata(
|
|
version="0.1.0",
|
|
description="Order test plugin",
|
|
author="Test Author"
|
|
)
|
|
)
|
|
self.order_list = order_list
|
|
|
|
def preprocess(self, input_doc: InputDocument) -> InputDocument:
|
|
self.order_list.append(self.name)
|
|
return input_doc
|
|
|
|
execution_order = []
|
|
plugin1 = OrderTestPlugin("Plugin1", execution_order)
|
|
plugin2 = OrderTestPlugin("Plugin2", execution_order)
|
|
|
|
plugin_manager.register_plugin(plugin1)
|
|
plugin_manager.register_plugin(plugin2)
|
|
|
|
plugin_manager.execute_preprocessors(input_document)
|
|
|
|
assert execution_order == ["Plugin1", "Plugin2"]
|