structured-llm-ready-doc-converter

Sleeping

App Files Files Community

pierreguillou committed on Oct 15, 2025

Commit

ad6d0d0

verified ·

1 Parent(s): 00301da

Create app.py

Browse files

Files changed (1) hide show

app.py +213 -0

app.py ADDED Viewed

	@@ -0,0 +1,213 @@

+import gradio as gr
+import os
+import tempfile
+from datetime import datetime
+import pandas as pd
+import json
+# Import DocLing and necessary configuration classes
+from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.datamodel.base_models import InputFormat
# --- START OF OCR CONFIGURATION ---
# Language codes handed to the PDF pipeline options below.
OCR_LANGUAGES = [
    "eng", "fra", "deu", "spa", "ita", "por", "nld", "pol", "tur", "ces", "rus", "ukr", "ell", "ron", "hun",
    "bul", "hrv", "srp", "slk", "slv", "lit", "lav", "est", "cat", "eus", "glg", "isl", "dan", "nor", "swe",
    "fin", "alb", "mlt", "afr", "zul", "swa", "amh", "uzb", "aze", "kaz", "kir", "mon", "tgl", "ind", "msa",
    "tha", "vie", "khm", "lao", "mya", "ben", "hin", "mar", "guj", "pan", "mal", "tam", "tel", "kan", "nep",
    "sin", "urd", "fas", "pus", "kur", "aze_cyrl", "tat", "uig", "heb", "ara", "yid", "grc", "chr", "epo",
    "hye", "kat", "kat_old", "aze_latn", "mkd", "bel", "srp_latn", "srp_cyrillic",
    "chi_sim", "chi_tra", "jpn", "kor",
]

# Build the PDF pipeline options used by the DocLing converter.
# NOTE(review): do_ocr=False switches OCR off even though a Tesseract model
# and a language list are supplied -- confirm whether scanned PDFs are meant
# to be OCR'd before flipping this flag.
# NOTE(review): `ocr_model` / `ocr_languages` are passed exactly as before;
# verify against the installed docling release that PdfPipelineOptions
# recognises them (the engine may need to be configured via `ocr_options`).
pdf_options = PdfPipelineOptions(
    do_ocr=False,
    ocr_model="tesseract",
    ocr_languages=OCR_LANGUAGES,
)

# Attach the pipeline options to the PDF input format only.
format_options = {
    InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options),
}

# Single shared converter instance, created once at import time.
docling_converter = DocumentConverter(format_options=format_options)
# --- END OF OCR CONFIGURATION ---
def _write_outputs(base_filename, json_payload, text, markdown, html_page):
    """Write the four output files into a fresh temporary directory.

    A new directory is created per call so that two requests handled in the
    same second (and therefore sharing ``base_filename``) cannot overwrite
    each other's files.

    Returns:
        A 4-tuple of paths: (JSON, text, Markdown, HTML).
    """
    out_dir = tempfile.mkdtemp(prefix="docconv_")
    stem = os.path.join(out_dir, base_filename)

    json_path = f"{stem}_docling.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(json_payload, f, indent=2, ensure_ascii=False)

    written = [json_path]
    for suffix, content in ((".txt", text), (".md", markdown), (".html", html_page)):
        out_path = f"{stem}{suffix}"
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(content)
        written.append(out_path)
    return tuple(written)


def process_file(file):
    """Convert an uploaded document into JSON, text, Markdown and HTML files.

    Args:
        file: The uploaded file. Either an object exposing a ``name``
            attribute holding the filesystem path, or anything whose ``str()``
            is that path. ``None`` means nothing was uploaded.

    Returns:
        A 5-tuple ``(json_path, txt_path, md_path, html_path, status)``.
        On success the first four items are paths to the generated files and
        ``status`` is a success message. On any failure (no upload,
        unsupported extension, or an exception while processing) the four
        paths are ``None`` and ``status`` describes the error.
    """
    import html  # local import: only needed to escape text for the HTML output

    if file is None:
        return None, None, None, None, "❌ Error: Please upload a file."

    # Normalize to a filesystem path string
    path = file.name if hasattr(file, "name") else str(file)
    ext = os.path.splitext(path)[1].lower()
    docling_direct = {".pdf", ".docx", ".xlsx", ".pptx"}
    to_xlsx_first = {".csv", ".xls"}

    converted_path = None  # intermediate .xlsx, removed in `finally`
    try:
        if ext in docling_direct or ext in to_xlsx_first:
            # Convert CSV/XLS to XLSX first if needed
            if ext in to_xlsx_first:
                df = pd.read_csv(path) if ext == ".csv" else pd.read_excel(path)
                # Close the handle before pandas writes to the path by name.
                fd, converted_path = tempfile.mkstemp(suffix=".xlsx")
                os.close(fd)
                df.to_excel(converted_path, index=False)
                path = converted_path

            # Process with DocLing
            document = docling_converter.convert(path).document
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            outputs = _write_outputs(
                f"document_{timestamp}",
                document.export_to_dict(),
                document.export_to_text(),
                document.export_to_markdown(),
                document.export_to_html(),
            )
            return (*outputs, "✅ Successfully processed file! 4 files generated.")

        if ext == ".txt":
            # For plain text files, create all formats without DocLing
            with open(path, "r", encoding="utf-8") as f:
                text_content = f.read()
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

            # Simple JSON structure standing in for a Docling document
            docling_dict = {
                "type": "text_document",
                "content": text_content,
                "metadata": {
                    "source": os.path.basename(path),
                    "timestamp": timestamp,
                },
            }
            # Escape the uploaded text so markup inside it is shown verbatim
            # instead of being interpreted by the browser.
            html_page = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Document</title>
</head>
<body>
    <pre>{html.escape(text_content)}</pre>
</body>
</html>"""
            outputs = _write_outputs(
                f"document_{timestamp}",
                docling_dict,
                text_content,
                f"# Document\n\n{text_content}",
                html_page,
            )
            return (*outputs, "✅ Successfully processed text file! 4 files generated.")

        return None, None, None, None, f"❌ Unsupported file format: {ext}"
    except Exception as e:
        # UI boundary: report the failure in the status field instead of raising.
        return None, None, None, None, f"❌ Error processing file: {str(e)}"
    finally:
        if converted_path is not None:
            try:
                os.remove(converted_path)
            except OSError:
                pass  # best-effort cleanup of the intermediate file
def reset_form():
    """Return cleared values for the form.

    Returns:
        A 6-tuple: five ``None`` values followed by an empty string.
    """
    cleared = (None,) * 5
    return (*cleared, "")
# Gradio Interface
# NOTE(review): the container nesting below was reconstructed from a rendering
# that had lost its indentation -- confirm the intended layout.
with gr.Blocks(title="LLM-Ready Document Converter") as app:
    gr.Markdown("# 📄 Document Converter to LLM-ready")
    gr.Markdown("**HOWTO** : Upload a document and get 4 output files: Docling JSON, TXT, Markdown, and HTML")
    gr.Markdown("**EXPLANATION** : This app transforms various document formats (like TXT, standard and scanned PDFs, DOCX, PPTX, CSV, XLS, XLSX) into structured, machine-readable outputs optimized for Large Language Models (LLMs). It extracts and converts content into clean formats such as DocLing JSON (for document structure), plain text, Markdown, and HTML making it easier for AI models to process, analyze, or generate responses from complex documents without losing key details like layout or formatting. Essentially, it's a bridge between raw files and AI-ready data.")

    # Upload widget with the two action buttons underneath it
    with gr.Row():
        with gr.Column():
            file_input = gr.File(
                label="Upload Document",
                file_types=[".pdf", ".txt", ".docx", ".xlsx", ".xls", ".csv", ".pptx"],
            )
            with gr.Row():
                submit_btn = gr.Button("Convert Document", variant="primary")
                reset_btn = gr.Button("Reset")

    # Success / error message returned as the last value of process_file
    status_output = gr.Markdown(label="Status")

    # Download slots for the four generated files, two per row
    with gr.Row():
        with gr.Column():
            docling_output = gr.File(label="Docling Document (JSON)")
        with gr.Column():
            txt_output = gr.File(label="Text File")
    with gr.Row():
        with gr.Column():
            md_output = gr.File(label="Markdown File")
        with gr.Column():
            html_output = gr.File(label="HTML File")

    # Events
    submit_btn.click(
        fn=process_file,
        inputs=[file_input],
        outputs=[docling_output, txt_output, md_output, html_output, status_output],
    )
    # reset_form returns one value per component listed here, in this order
    reset_btn.click(
        fn=reset_form,
        outputs=[file_input, docling_output, txt_output, md_output, html_output, status_output],
    )

if __name__ == "__main__":
    app.launch(
        share=True,
    )