mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-25 19:44:34 +00:00
Add documentation
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
86c2a7fc1e
commit
32905ab959
21
README.md
21
README.md
@ -43,7 +43,9 @@ python examples/convert.py
|
||||
```
|
||||
The output of the above command will be written to `./scratch`.
|
||||
|
||||
### Enable or disable pipeline features
|
||||
### Adjust pipeline features
|
||||
|
||||
**Control pipeline options**
|
||||
|
||||
You can control if table structure recognition or OCR should be performed by arguments passed to `DocumentConverter`
|
||||
```python
|
||||
@ -54,6 +56,23 @@ doc_converter = DocumentConverter(
|
||||
)
|
||||
```
|
||||
|
||||
**Control table extraction options**
|
||||
|
||||
You can control if table structure recognition should map the recognized structure back to PDF cells (default) or use text cells from the structure prediction itself.
|
||||
This can improve output quality if you find that multiple columns in extracted tables are erroneously merged into one.
|
||||
|
||||
|
||||
```python
|
||||
|
||||
pipeline_options = PipelineOptions(do_table_structure=True)
|
||||
pipeline_options.table_structure_options.do_cell_matching = True
|
||||
|
||||
doc_converter = DocumentConverter(
|
||||
artifacts_path=artifacts_path,
|
||||
pipeline_options=pipeline_options, # Controls if OCR is applied (ignores programmatic content)
|
||||
)
|
||||
```
|
||||
|
||||
### Impose limits on the document size
|
||||
|
||||
You can limit the file size and number of pages which should be allowed to process per document.
|
||||
|
@ -19,18 +19,6 @@ class PageAssembleModel:
|
||||
def __init__(self, config):
|
||||
self.config = config
|
||||
|
||||
# self.line_wrap_pattern = re.compile(r'(?<=[^\W_])- \n(?=\w)')
|
||||
|
||||
# def sanitize_text_poor(self, lines):
|
||||
# text = '\n'.join(lines)
|
||||
#
|
||||
# # treat line wraps.
|
||||
# sanitized_text = self.line_wrap_pattern.sub('', text)
|
||||
#
|
||||
# sanitized_text = sanitized_text.replace('\n', ' ')
|
||||
#
|
||||
# return sanitized_text
|
||||
|
||||
def sanitize_text(self, lines):
|
||||
if len(lines) <= 1:
|
||||
return " ".join(lines)
|
||||
|
Loading…
Reference in New Issue
Block a user