Merge branch 'cau/input-format-abstraction' of github.com:DS4SD/docling into cau/input-format-abstraction

This commit is contained in:
Michele Dolfi 2024-10-11 12:59:11 +02:00
commit 786b89efd9
12 changed files with 80 additions and 37 deletions

View File

@ -203,7 +203,7 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
if not success: if not success:
raise RuntimeError( raise RuntimeError(
f"docling-parse could not load document {document_hash}." f"docling-parse could not load document with hash {document_hash}."
) )
def page_count(self) -> int: def page_count(self) -> int:

View File

@ -21,7 +21,7 @@ _log = logging.getLogger(__name__)
class HTMLDocumentBackend(DeclarativeDocumentBackend): class HTMLDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
print("About to init HTML backend...") _log.debug("About to init HTML backend...")
super().__init__(path_or_stream, document_hash) super().__init__(path_or_stream, document_hash)
self.soup = None self.soup = None
# HTML file: # HTML file:
@ -36,16 +36,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
try: try:
if isinstance(self.path_or_stream, BytesIO): if isinstance(self.path_or_stream, BytesIO):
text_stream = byte_stream.getvalue().decode("utf-8") text_stream = self.path_or_stream.getvalue().decode("utf-8")
print(text_stream)
self.soup = BeautifulSoup(text_stream, "html.parser") self.soup = BeautifulSoup(text_stream, "html.parser")
if isinstance(self.path_or_stream, Path): if isinstance(self.path_or_stream, Path):
with open(self.path_or_stream, "r", encoding="utf-8") as f: with open(self.path_or_stream, "r", encoding="utf-8") as f:
html_content = f.read() html_content = f.read()
self.soup = BeautifulSoup(html_content, "html.parser") self.soup = BeautifulSoup(html_content, "html.parser")
except Exception as e: except Exception as e:
_log.error("could not parse html: {}".format(e)) raise RuntimeError(
return doc f"Could not initialize HTML backend for file with hash {document_hash}."
) from e
def is_valid(self) -> bool: def is_valid(self) -> bool:
return True return True
@ -66,7 +66,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def convert(self) -> DoclingDocument: def convert(self) -> DoclingDocument:
# access self.path_or_stream to load stuff # access self.path_or_stream to load stuff
doc = DoclingDocument(description=DescriptionItem(), name="dummy") doc = DoclingDocument(description=DescriptionItem(), name="dummy")
print("Trying to convert HTML...") _log.debug("Trying to convert HTML...")
# Replace <br> tags with newline characters # Replace <br> tags with newline characters
for br in self.soup.body.find_all("br"): for br in self.soup.body.find_all("br"):
br.replace_with("\n") br.replace_with("\n")
@ -93,7 +93,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def analyse_element(self, element, idx, doc): def analyse_element(self, element, idx, doc):
""" """
if element.name!=None: if element.name!=None:
print("\t"*self.level, idx, "\t", f"{element.name} ({self.level})") _log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
""" """
if element.name in self.labels: if element.name in self.labels:
@ -323,7 +323,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
doc.add_table(data=data, parent=self.parents[self.level]) doc.add_table(data=data, parent=self.parents[self.level])
def get_list_text(list_element, level=0): def get_list_text(self, list_element, level=0):
"""Recursively extract text from <ul> or <ol> with proper indentation.""" """Recursively extract text from <ul> or <ol> with proper indentation."""
result = [] result = []
bullet_char = "*" # Default bullet character for unordered lists bullet_char = "*" # Default bullet character for unordered lists
@ -335,7 +335,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
# Handle nested lists # Handle nested lists
nested_list = li.find(["ul", "ol"]) nested_list = li.find(["ul", "ol"])
if nested_list: if nested_list:
result.extend(get_list_text(nested_list, level + 1)) result.extend(self.get_list_text(nested_list, level + 1))
elif list_element.name == "ul": # For unordered lists, use bullet points elif list_element.name == "ul": # For unordered lists, use bullet points
for li in list_element.find_all("li", recursive=False): for li in list_element.find_all("li", recursive=False):
# Add bullet points for unordered lists # Add bullet points for unordered lists
@ -345,7 +345,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
# Handle nested lists # Handle nested lists
nested_list = li.find(["ul", "ol"]) nested_list = li.find(["ul", "ol"])
if nested_list: if nested_list:
result.extend(get_list_text(nested_list, level + 1)) result.extend(self.get_list_text(nested_list, level + 1))
return result return result

View File

@ -39,12 +39,14 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
self.path_or_stream = path_or_stream self.path_or_stream = path_or_stream
self.pptx_obj = None self.pptx_obj = None
self.valid = True self.valid = False
try: try:
self.pptx_obj = Presentation(self.path_or_stream) self.pptx_obj = Presentation(self.path_or_stream)
self.valid = True
except Exception: except Exception:
_log.error("could not parse pptx") raise RuntimeError(
self.valid = False f"MsPowerpointDocumentBackend could not load document with hash {document_hash}"
) from e
return return

View File

@ -34,6 +34,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# self.initialise(path_or_stream) # self.initialise(path_or_stream)
# Word file: # Word file:
self.path_or_stream = path_or_stream self.path_or_stream = path_or_stream
self.valid = False
# Initialise the parents for the hierarchy # Initialise the parents for the hierarchy
self.max_levels = 10 self.max_levels = 10
self.level_at_new_list = None self.level_at_new_list = None
@ -50,6 +51,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
"indents": [None], "indents": [None],
} }
self.docx_obj = None
try:
self.docx_obj = docx.Document(self.path_or_stream)
self.valid = True
except Exception as e:
raise RuntimeError(
f"MsPowerpointDocumentBackend could not load document with hash {document_hash}"
) from e
def is_valid(self) -> bool: def is_valid(self) -> bool:
return True return True
@ -69,15 +79,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
def convert(self) -> DoclingDocument: def convert(self) -> DoclingDocument:
# Parses the DOCX into a structured document model. # Parses the DOCX into a structured document model.
doc = DoclingDocument(description=DescriptionItem(), name="dummy") doc = DoclingDocument(description=DescriptionItem(), name="dummy")
docx_obj = None
try:
docx_obj = docx.Document(self.path_or_stream)
except Exception:
_log.error("could not parse docx")
return doc
# self.initialise() # self.initialise()
doc = self.walk_linear(docx_obj.element.body, docx_obj, doc) doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
return doc return doc
def update_history(self, name, level, numid, ilevel): def update_history(self, name, level, numid, ilevel):

View File

@ -238,7 +238,7 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
self._pdoc = pdfium.PdfDocument(path_or_stream) self._pdoc = pdfium.PdfDocument(path_or_stream)
except PdfiumError as e: except PdfiumError as e:
raise RuntimeError( raise RuntimeError(
f"pypdfium could not load document {document_hash}" f"pypdfium could not load document with hash {document_hash}"
) from e ) from e
def page_count(self) -> int: def page_count(self) -> int:

View File

@ -1,4 +1,5 @@
import logging import logging
import re
from enum import Enum from enum import Enum
from io import BytesIO from io import BytesIO
from pathlib import Path, PurePath from pathlib import Path, PurePath
@ -171,7 +172,7 @@ class ConvertedDocument(BaseModel):
pages: List[Page] = [] pages: List[Page] = []
assembled: AssembledUnit = AssembledUnit() assembled: AssembledUnit = AssembledUnit()
legacy_output: DsDocument = _EMPTY_LEGACY_DOC legacy_output: Optional[DsDocument] = None # _EMPTY_LEGACY_DOC
output: DoclingDocument = _EMPTY_DOCLING_DOC output: DoclingDocument = _EMPTY_DOCLING_DOC
def _to_legacy_document(self) -> DsDocument: def _to_legacy_document(self) -> DsDocument:
@ -497,19 +498,40 @@ class DocumentConversionInput(BaseModel):
) )
def _guess_format(self, obj): def _guess_format(self, obj):
content = None
if isinstance(obj, Path): if isinstance(obj, Path):
mime = filetype.guess_mime(str(obj)) mime = filetype.guess_mime(str(obj))
elif isinstance(obj, DocumentStream): if mime is None:
mime = filetype.guess_mime(obj.stream.read(8192)) with obj.open("rb") as f:
if mime is None: content = f.read(1024) # Read first 1KB
# TODO improve this.
if obj.suffix == ".html": elif isinstance(obj, DocumentStream):
mime = "text/html" obj.stream.seek(0)
content = obj.stream.read(8192)
obj.stream.seek(0)
mime = filetype.guess_mime(content)
if mime is None:
mime = self._detect_html_xhtml(content)
format = MimeTypeToFormat.get(mime) format = MimeTypeToFormat.get(mime)
return format return format
def _detect_html_xhtml(self, content):
    """Sniff the leading chunk of a document for HTML or XHTML markup.

    Used by ``_guess_format`` as a fallback when ``filetype`` could not
    determine a MIME type from the content.

    Args:
        content: The first bytes of the document. ``str`` input is
            accepted as well; ``None`` or empty input yields ``None``
            instead of raising.

    Returns:
        ``"application/xhtml+xml"`` when the content starts with an XML
        declaration and mentions ``xhtml`` within its first 1000
        characters, ``"text/html"`` when it starts with an HTML doctype
        or an ``<html``/``<head``/``<body`` tag, otherwise ``None``.
    """
    if not content:
        return None
    if isinstance(content, str):
        content = content.encode("utf-8", errors="ignore")

    # The markers we look for are pure ASCII, so any non-ASCII bytes
    # (BOM, localized text) can be dropped without affecting detection.
    text = bytes(content).decode("ascii", errors="ignore").lower()

    # Drop comments so leading ones cannot hide the first real tag.
    text = re.sub(r"<!--(.*?)-->", "", text, flags=re.DOTALL).lstrip()

    if text.startswith("<?xml") and "xhtml" in text[:1000]:
        return "application/xhtml+xml"

    if re.match(r"<!doctype\s+html|<html|<head|<body", text):
        return "text/html"

    return None
@classmethod @classmethod
def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None): def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
paths = [Path(p) for p in paths] paths = [Path(p) for p in paths]

View File

@ -59,7 +59,10 @@ class TesseractOcrOptions(OcrOptions):
) )
class PipelineOptions(BaseModel): ... class PipelineOptions(BaseModel):
create_legacy_output: bool = (
True # This default will be set to False in a future version of docling
)
class PdfPipelineOptions(PipelineOptions): class PdfPipelineOptions(PipelineOptions):

View File

@ -22,6 +22,8 @@ from docling.datamodel.document import ConversionResult
class GlmModel: class GlmModel:
def __init__(self, config): def __init__(self, config):
self.config = config self.config = config
self.create_legacy_output = config.get("create_legacy_output", True)
self.model_names = self.config.get( self.model_names = self.config.get(
"model_names", "" "model_names", ""
) # "language;term;reference" ) # "language;term;reference"
@ -42,7 +44,10 @@ class GlmModel:
) )
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
legacy_doc = DsLegacyDocument.model_validate(ds_doc_dict) legacy_doc: DsLegacyDocument = None
if self.create_legacy_output:
legacy_doc = DsLegacyDocument.model_validate(ds_doc_dict)
# DEBUG code: # DEBUG code:
def draw_clusters_and_cells(ds_document, page_no): def draw_clusters_and_cells(ds_document, page_no):
@ -92,4 +97,4 @@ class GlmModel:
# draw_clusters_and_cells(ds_doc, 0) # draw_clusters_and_cells(ds_doc, 0)
# draw_clusters_and_cells(exported_doc, 0) # draw_clusters_and_cells(exported_doc, 0)
return (legacy_doc, docling_doc) return (docling_doc, legacy_doc)

View File

@ -41,7 +41,9 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
artifacts_path = self.download_models_hf() artifacts_path = self.download_models_hf()
self.artifacts_path = Path(artifacts_path) self.artifacts_path = Path(artifacts_path)
self.glm_model = GlmModel(config={}) self.glm_model = GlmModel(
config={"create_legacy_output": pipeline_options.create_legacy_output}
)
if ocr_model := self.get_ocr_model() is None: if ocr_model := self.get_ocr_model() is None:
raise RuntimeError( raise RuntimeError(
@ -140,7 +142,7 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
elements=all_elements, headers=all_headers, body=all_body elements=all_elements, headers=all_headers, body=all_body
) )
conv_res.legacy_output, conv_res.output = self.glm_model(conv_res) conv_res.output, conv_res.legacy_output = self.glm_model(conv_res)
return conv_res return conv_res

View File

@ -120,7 +120,7 @@ def main():
] ]
# buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read()) # buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
# docs = [DocumentStream(filename="my_doc.pdf", stream=buf)] # docs = [DocumentStream(name="my_doc.pdf", stream=buf)]
# input = DocumentConversionInput.from_streams(docs) # input = DocumentConversionInput.from_streams(docs)
doc_converter = DocumentConverter() doc_converter = DocumentConverter()

View File

@ -1,3 +1,4 @@
import json
import logging import logging
from pathlib import Path from pathlib import Path
@ -38,6 +39,7 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal
InputFormat.PDF, InputFormat.PDF,
# InputFormat.IMAGE, # InputFormat.IMAGE,
InputFormat.DOCX, InputFormat.DOCX,
InputFormat.HTML,
], # whitelist formats, other files are ignored. ], # whitelist formats, other files are ignored.
format_options={ format_options={
InputFormat.PDF: PdfFormatOption( InputFormat.PDF: PdfFormatOption(
@ -53,12 +55,15 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal
conv_results = doc_converter.convert_batch(input) conv_results = doc_converter.convert_batch(input)
for res in conv_results: for res in conv_results:
out_path = Path("./scratch") / f"{res.input.file.name}.experimental.md" out_path = Path("./scratch")
print( print(
f"Document {res.input.file.name} converted with status {res.status}." f"Document {res.input.file.name} converted with status {res.status}."
f"\nSaved markdown output to: {str(out_path)}" f"\nSaved markdown output to: {str(out_path)}"
) )
# print(res.experimental.export_to_markdown()) # print(res.experimental.export_to_markdown())
# Export Docling document format to markdown (experimental): # Export Docling document format to markdown (experimental):
with out_path.open("w") as fp: with (out_path / f"{res.input.file.name}.md").open("w") as fp:
fp.write(res.output.export_to_markdown()) fp.write(res.output.export_to_markdown())
with (out_path / f"{res.input.file.name}.json").open("w") as fp:
fp.write(json.dumps(res.output.export_to_dict()))

Binary file not shown.