diff --git a/README.md b/README.md index f70c0153..b89b03d4 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,9 @@ python examples/convert.py ``` The output of the above command will be written to `./scratch`. -### Enable or disable pipeline features +### Adjust pipeline features + +**Control pipeline options** You can control if table structure recognition or OCR should be performed by arguments passed to `DocumentConverter` ```python @@ -54,6 +56,23 @@ doc_converter = DocumentConverter( ) ``` +**Control table extraction options** + +You can control whether table structure recognition maps the recognized structure back to PDF cells (default) or uses text cells from the structure prediction itself. +The latter can improve output quality if you find that multiple columns in extracted tables are erroneously merged into one. + + +```python + +pipeline_options = PipelineOptions(do_table_structure=True) +pipeline_options.table_structure_options.do_cell_matching = True + +doc_converter = DocumentConverter( + artifacts_path=artifacts_path, + pipeline_options=pipeline_options, # Carries the table structure options configured above +) +``` + ### Impose limits on the document size You can limit the file size and number of pages which should be allowed to process per document. diff --git a/docling/models/page_assemble_model.py b/docling/models/page_assemble_model.py index 4ed0832d..2b9db544 100644 --- a/docling/models/page_assemble_model.py +++ b/docling/models/page_assemble_model.py @@ -19,18 +19,6 @@ class PageAssembleModel: def __init__(self, config): self.config = config - # self.line_wrap_pattern = re.compile(r'(?<=[^\W_])- \n(?=\w)') - - # def sanitize_text_poor(self, lines): - # text = '\n'.join(lines) - # - # # treat line wraps. - # sanitized_text = self.line_wrap_pattern.sub('', text) - # - # sanitized_text = sanitized_text.replace('\n', ' ') - # - # return sanitized_text - def sanitize_text(self, lines): if len(lines) <= 1: return " ".join(lines)