mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 12:34:22 +00:00
Merge branch 'cau/input-format-abstraction' of github.com:DS4SD/docling into cau/input-format-abstraction
This commit is contained in:
commit
786b89efd9
@ -203,7 +203,7 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
|
|||||||
|
|
||||||
if not success:
|
if not success:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"docling-parse could not load document {document_hash}."
|
f"docling-parse could not load document with hash {document_hash}."
|
||||||
)
|
)
|
||||||
|
|
||||||
def page_count(self) -> int:
|
def page_count(self) -> int:
|
||||||
|
@ -21,7 +21,7 @@ _log = logging.getLogger(__name__)
|
|||||||
|
|
||||||
class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||||
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
||||||
print("About to init HTML backend...")
|
_log.debug("About to init HTML backend...")
|
||||||
super().__init__(path_or_stream, document_hash)
|
super().__init__(path_or_stream, document_hash)
|
||||||
self.soup = None
|
self.soup = None
|
||||||
# HTML file:
|
# HTML file:
|
||||||
@ -36,16 +36,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
if isinstance(self.path_or_stream, BytesIO):
|
if isinstance(self.path_or_stream, BytesIO):
|
||||||
text_stream = byte_stream.getvalue().decode("utf-8")
|
text_stream = self.path_or_stream.getvalue().decode("utf-8")
|
||||||
print(text_stream)
|
|
||||||
self.soup = BeautifulSoup(text_stream, "html.parser")
|
self.soup = BeautifulSoup(text_stream, "html.parser")
|
||||||
if isinstance(self.path_or_stream, Path):
|
if isinstance(self.path_or_stream, Path):
|
||||||
with open(self.path_or_stream, "r", encoding="utf-8") as f:
|
with open(self.path_or_stream, "r", encoding="utf-8") as f:
|
||||||
html_content = f.read()
|
html_content = f.read()
|
||||||
self.soup = BeautifulSoup(html_content, "html.parser")
|
self.soup = BeautifulSoup(html_content, "html.parser")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
_log.error("could not parse html: {}".format(e))
|
raise RuntimeError(
|
||||||
return doc
|
f"Could not initialize HTML backend for file with hash {document_hash}."
|
||||||
|
) from e
|
||||||
|
|
||||||
def is_valid(self) -> bool:
|
def is_valid(self) -> bool:
|
||||||
return True
|
return True
|
||||||
@ -66,7 +66,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
def convert(self) -> DoclingDocument:
|
def convert(self) -> DoclingDocument:
|
||||||
# access self.path_or_stream to load stuff
|
# access self.path_or_stream to load stuff
|
||||||
doc = DoclingDocument(description=DescriptionItem(), name="dummy")
|
doc = DoclingDocument(description=DescriptionItem(), name="dummy")
|
||||||
print("Trying to convert HTML...")
|
_log.debug("Trying to convert HTML...")
|
||||||
# Replace <br> tags with newline characters
|
# Replace <br> tags with newline characters
|
||||||
for br in self.soup.body.find_all("br"):
|
for br in self.soup.body.find_all("br"):
|
||||||
br.replace_with("\n")
|
br.replace_with("\n")
|
||||||
@ -93,7 +93,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
def analyse_element(self, element, idx, doc):
|
def analyse_element(self, element, idx, doc):
|
||||||
"""
|
"""
|
||||||
if element.name!=None:
|
if element.name!=None:
|
||||||
print("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
|
_log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if element.name in self.labels:
|
if element.name in self.labels:
|
||||||
@ -323,7 +323,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
doc.add_table(data=data, parent=self.parents[self.level])
|
doc.add_table(data=data, parent=self.parents[self.level])
|
||||||
|
|
||||||
def get_list_text(list_element, level=0):
|
def get_list_text(self, list_element, level=0):
|
||||||
"""Recursively extract text from <ul> or <ol> with proper indentation."""
|
"""Recursively extract text from <ul> or <ol> with proper indentation."""
|
||||||
result = []
|
result = []
|
||||||
bullet_char = "*" # Default bullet character for unordered lists
|
bullet_char = "*" # Default bullet character for unordered lists
|
||||||
@ -335,7 +335,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
# Handle nested lists
|
# Handle nested lists
|
||||||
nested_list = li.find(["ul", "ol"])
|
nested_list = li.find(["ul", "ol"])
|
||||||
if nested_list:
|
if nested_list:
|
||||||
result.extend(get_list_text(nested_list, level + 1))
|
result.extend(self.get_list_text(nested_list, level + 1))
|
||||||
elif list_element.name == "ul": # For unordered lists, use bullet points
|
elif list_element.name == "ul": # For unordered lists, use bullet points
|
||||||
for li in list_element.find_all("li", recursive=False):
|
for li in list_element.find_all("li", recursive=False):
|
||||||
# Add bullet points for unordered lists
|
# Add bullet points for unordered lists
|
||||||
@ -345,7 +345,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
# Handle nested lists
|
# Handle nested lists
|
||||||
nested_list = li.find(["ul", "ol"])
|
nested_list = li.find(["ul", "ol"])
|
||||||
if nested_list:
|
if nested_list:
|
||||||
result.extend(get_list_text(nested_list, level + 1))
|
result.extend(self.get_list_text(nested_list, level + 1))
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
@ -39,12 +39,14 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|||||||
self.path_or_stream = path_or_stream
|
self.path_or_stream = path_or_stream
|
||||||
|
|
||||||
self.pptx_obj = None
|
self.pptx_obj = None
|
||||||
self.valid = True
|
self.valid = False
|
||||||
try:
|
try:
|
||||||
self.pptx_obj = Presentation(self.path_or_stream)
|
self.pptx_obj = Presentation(self.path_or_stream)
|
||||||
|
self.valid = True
|
||||||
except Exception:
|
except Exception:
|
||||||
_log.error("could not parse pptx")
|
raise RuntimeError(
|
||||||
self.valid = False
|
f"MsPowerpointDocumentBackend could not load document with hash {document_hash}"
|
||||||
|
) from e
|
||||||
|
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -34,6 +34,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
# self.initialise(path_or_stream)
|
# self.initialise(path_or_stream)
|
||||||
# Word file:
|
# Word file:
|
||||||
self.path_or_stream = path_or_stream
|
self.path_or_stream = path_or_stream
|
||||||
|
self.valid = False
|
||||||
# Initialise the parents for the hierarchy
|
# Initialise the parents for the hierarchy
|
||||||
self.max_levels = 10
|
self.max_levels = 10
|
||||||
self.level_at_new_list = None
|
self.level_at_new_list = None
|
||||||
@ -50,6 +51,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
"indents": [None],
|
"indents": [None],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
self.docx_obj = None
|
||||||
|
try:
|
||||||
|
self.docx_obj = docx.Document(self.path_or_stream)
|
||||||
|
self.valid = True
|
||||||
|
except Exception as e:
|
||||||
|
raise RuntimeError(
|
||||||
|
f"MsPowerpointDocumentBackend could not load document with hash {document_hash}"
|
||||||
|
) from e
|
||||||
|
|
||||||
def is_valid(self) -> bool:
|
def is_valid(self) -> bool:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@ -69,15 +79,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
def convert(self) -> DoclingDocument:
|
def convert(self) -> DoclingDocument:
|
||||||
# Parses the DOCX into a structured document model.
|
# Parses the DOCX into a structured document model.
|
||||||
doc = DoclingDocument(description=DescriptionItem(), name="dummy")
|
doc = DoclingDocument(description=DescriptionItem(), name="dummy")
|
||||||
docx_obj = None
|
|
||||||
try:
|
|
||||||
docx_obj = docx.Document(self.path_or_stream)
|
|
||||||
except Exception:
|
|
||||||
_log.error("could not parse docx")
|
|
||||||
return doc
|
|
||||||
|
|
||||||
# self.initialise()
|
# self.initialise()
|
||||||
doc = self.walk_linear(docx_obj.element.body, docx_obj, doc)
|
doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def update_history(self, name, level, numid, ilevel):
|
def update_history(self, name, level, numid, ilevel):
|
||||||
|
@ -238,7 +238,7 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
|
|||||||
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
||||||
except PdfiumError as e:
|
except PdfiumError as e:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"pypdfium could not load document {document_hash}"
|
f"pypdfium could not load document with hash {document_hash}"
|
||||||
) from e
|
) from e
|
||||||
|
|
||||||
def page_count(self) -> int:
|
def page_count(self) -> int:
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
import logging
|
import logging
|
||||||
|
import re
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path, PurePath
|
from pathlib import Path, PurePath
|
||||||
@ -171,7 +172,7 @@ class ConvertedDocument(BaseModel):
|
|||||||
pages: List[Page] = []
|
pages: List[Page] = []
|
||||||
assembled: AssembledUnit = AssembledUnit()
|
assembled: AssembledUnit = AssembledUnit()
|
||||||
|
|
||||||
legacy_output: DsDocument = _EMPTY_LEGACY_DOC
|
legacy_output: Optional[DsDocument] = None # _EMPTY_LEGACY_DOC
|
||||||
output: DoclingDocument = _EMPTY_DOCLING_DOC
|
output: DoclingDocument = _EMPTY_DOCLING_DOC
|
||||||
|
|
||||||
def _to_legacy_document(self) -> DsDocument:
|
def _to_legacy_document(self) -> DsDocument:
|
||||||
@ -497,19 +498,40 @@ class DocumentConversionInput(BaseModel):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def _guess_format(self, obj):
|
def _guess_format(self, obj):
|
||||||
|
content = None
|
||||||
if isinstance(obj, Path):
|
if isinstance(obj, Path):
|
||||||
mime = filetype.guess_mime(str(obj))
|
mime = filetype.guess_mime(str(obj))
|
||||||
elif isinstance(obj, DocumentStream):
|
if mime is None:
|
||||||
mime = filetype.guess_mime(obj.stream.read(8192))
|
with obj.open("rb") as f:
|
||||||
if mime is None:
|
content = f.read(1024) # Read first 1KB
|
||||||
# TODO improve this.
|
|
||||||
|
|
||||||
if obj.suffix == ".html":
|
elif isinstance(obj, DocumentStream):
|
||||||
mime = "text/html"
|
obj.stream.seek(0)
|
||||||
|
content = obj.stream.read(8192)
|
||||||
|
obj.stream.seek(0)
|
||||||
|
mime = filetype.guess_mime(content)
|
||||||
|
|
||||||
|
if mime is None:
|
||||||
|
mime = self._detect_html_xhtml(content)
|
||||||
|
|
||||||
format = MimeTypeToFormat.get(mime)
|
format = MimeTypeToFormat.get(mime)
|
||||||
return format
|
return format
|
||||||
|
|
||||||
|
def _detect_html_xhtml(self, content):
|
||||||
|
content_str = content.decode("ascii", errors="ignore").lower()
|
||||||
|
# Remove XML comments
|
||||||
|
content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
|
||||||
|
content_str = content_str.lstrip()
|
||||||
|
|
||||||
|
if re.match(r"<\?xml", content_str):
|
||||||
|
if "xhtml" in content_str[:1000]:
|
||||||
|
return "application/xhtml+xml"
|
||||||
|
|
||||||
|
if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
|
||||||
|
return "text/html"
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
|
def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
|
||||||
paths = [Path(p) for p in paths]
|
paths = [Path(p) for p in paths]
|
||||||
|
@ -59,7 +59,10 @@ class TesseractOcrOptions(OcrOptions):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class PipelineOptions(BaseModel): ...
|
class PipelineOptions(BaseModel):
|
||||||
|
create_legacy_output: bool = (
|
||||||
|
True # This default will be set to False in a future version of docling
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class PdfPipelineOptions(PipelineOptions):
|
class PdfPipelineOptions(PipelineOptions):
|
||||||
|
@ -22,6 +22,8 @@ from docling.datamodel.document import ConversionResult
|
|||||||
class GlmModel:
|
class GlmModel:
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
self.config = config
|
self.config = config
|
||||||
|
self.create_legacy_output = config.get("create_legacy_output", True)
|
||||||
|
|
||||||
self.model_names = self.config.get(
|
self.model_names = self.config.get(
|
||||||
"model_names", ""
|
"model_names", ""
|
||||||
) # "language;term;reference"
|
) # "language;term;reference"
|
||||||
@ -42,7 +44,10 @@ class GlmModel:
|
|||||||
)
|
)
|
||||||
|
|
||||||
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
|
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
|
||||||
legacy_doc = DsLegacyDocument.model_validate(ds_doc_dict)
|
legacy_doc: DsLegacyDocument = None
|
||||||
|
|
||||||
|
if self.create_legacy_output:
|
||||||
|
legacy_doc = DsLegacyDocument.model_validate(ds_doc_dict)
|
||||||
|
|
||||||
# DEBUG code:
|
# DEBUG code:
|
||||||
def draw_clusters_and_cells(ds_document, page_no):
|
def draw_clusters_and_cells(ds_document, page_no):
|
||||||
@ -92,4 +97,4 @@ class GlmModel:
|
|||||||
# draw_clusters_and_cells(ds_doc, 0)
|
# draw_clusters_and_cells(ds_doc, 0)
|
||||||
# draw_clusters_and_cells(exported_doc, 0)
|
# draw_clusters_and_cells(exported_doc, 0)
|
||||||
|
|
||||||
return (legacy_doc, docling_doc)
|
return (docling_doc, legacy_doc)
|
||||||
|
@ -41,7 +41,9 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
|
|||||||
artifacts_path = self.download_models_hf()
|
artifacts_path = self.download_models_hf()
|
||||||
|
|
||||||
self.artifacts_path = Path(artifacts_path)
|
self.artifacts_path = Path(artifacts_path)
|
||||||
self.glm_model = GlmModel(config={})
|
self.glm_model = GlmModel(
|
||||||
|
config={"create_legacy_output": pipeline_options.create_legacy_output}
|
||||||
|
)
|
||||||
|
|
||||||
if ocr_model := self.get_ocr_model() is None:
|
if ocr_model := self.get_ocr_model() is None:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
@ -140,7 +142,7 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
|
|||||||
elements=all_elements, headers=all_headers, body=all_body
|
elements=all_elements, headers=all_headers, body=all_body
|
||||||
)
|
)
|
||||||
|
|
||||||
conv_res.legacy_output, conv_res.output = self.glm_model(conv_res)
|
conv_res.output, conv_res.legacy_output = self.glm_model(conv_res)
|
||||||
|
|
||||||
return conv_res
|
return conv_res
|
||||||
|
|
||||||
|
@ -120,7 +120,7 @@ def main():
|
|||||||
]
|
]
|
||||||
|
|
||||||
# buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
|
# buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
|
||||||
# docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
|
# docs = [DocumentStream(name="my_doc.pdf", stream=buf)]
|
||||||
# input = DocumentConversionInput.from_streams(docs)
|
# input = DocumentConversionInput.from_streams(docs)
|
||||||
|
|
||||||
doc_converter = DocumentConverter()
|
doc_converter = DocumentConverter()
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
@ -38,6 +39,7 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal
|
|||||||
InputFormat.PDF,
|
InputFormat.PDF,
|
||||||
# InputFormat.IMAGE,
|
# InputFormat.IMAGE,
|
||||||
InputFormat.DOCX,
|
InputFormat.DOCX,
|
||||||
|
InputFormat.HTML,
|
||||||
], # whitelist formats, other files are ignored.
|
], # whitelist formats, other files are ignored.
|
||||||
format_options={
|
format_options={
|
||||||
InputFormat.PDF: PdfFormatOption(
|
InputFormat.PDF: PdfFormatOption(
|
||||||
@ -53,12 +55,15 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal
|
|||||||
conv_results = doc_converter.convert_batch(input)
|
conv_results = doc_converter.convert_batch(input)
|
||||||
|
|
||||||
for res in conv_results:
|
for res in conv_results:
|
||||||
out_path = Path("./scratch") / f"{res.input.file.name}.experimental.md"
|
out_path = Path("./scratch")
|
||||||
print(
|
print(
|
||||||
f"Document {res.input.file.name} converted with status {res.status}."
|
f"Document {res.input.file.name} converted with status {res.status}."
|
||||||
f"\nSaved markdown output to: {str(out_path)}"
|
f"\nSaved markdown output to: {str(out_path)}"
|
||||||
)
|
)
|
||||||
# print(res.experimental.export_to_markdown())
|
# print(res.experimental.export_to_markdown())
|
||||||
# Export Docling document format to markdown (experimental):
|
# Export Docling document format to markdown (experimental):
|
||||||
with out_path.open("w") as fp:
|
with (out_path / f"{res.input.file.name}.md").open("w") as fp:
|
||||||
fp.write(res.output.export_to_markdown())
|
fp.write(res.output.export_to_markdown())
|
||||||
|
|
||||||
|
with (out_path / f"{res.input.file.name}.json").open("w") as fp:
|
||||||
|
fp.write(json.dumps(res.output.export_to_dict()))
|
||||||
|
Binary file not shown.
Loading…
Reference in New Issue
Block a user