pierreguillou committed on
Commit
6bd2c76
·
verified ·
1 Parent(s): bf8492c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -49
app.py CHANGED
@@ -4,37 +4,83 @@ import tempfile
4
  from datetime import datetime
5
  import pandas as pd
6
  import json
 
7
 
8
  # Import DocLing and necessary configuration classes
9
  from docling.document_converter import DocumentConverter, PdfFormatOption
10
  from docling.datamodel.pipeline_options import PdfPipelineOptions
11
  from docling.datamodel.base_models import InputFormat
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  # --- START OF OCR CONFIGURATION ---
14
- # Configure DocLing converter with Tesseract OCR enabled
15
  pdf_options = PdfPipelineOptions(
16
  do_ocr=False,
17
- ocr_model="tesseract",
18
- ocr_languages=[
19
- "eng", "fra", "deu", "spa", "ita", "por", "nld", "pol", "tur", "ces", "rus", "ukr", "ell", "ron", "hun",
20
- "bul", "hrv", "srp", "slk", "slv", "lit", "lav", "est", "cat", "eus", "glg", "isl", "dan", "nor", "swe",
21
- "fin", "alb", "mlt", "afr", "zul", "swa", "amh", "uzb", "aze", "kaz", "kir", "mon", "tgl", "ind", "msa",
22
- "tha", "vie", "khm", "lao", "mya", "ben", "hin", "mar", "guj", "pan", "mal", "tam", "tel", "kan", "nep",
23
- "sin", "urd", "fas", "pus", "kur", "aze_cyrl", "tat", "uig", "heb", "ara", "yid", "grc", "chr", "epo",
24
- "hye", "kat", "kat_old", "aze_latn", "mkd", "bel", "srp_latn", "srp_cyrillic",
25
- "chi_sim", "chi_tra", "jpn", "kor"
26
- ]
27
  )
28
-
29
- # Create the format-specific configuration
30
- format_options = {
31
- InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options)
32
- }
33
-
34
- # Initialize the converter with the OCR configuration
35
  docling_converter = DocumentConverter(format_options=format_options)
36
  # --- END OF OCR CONFIGURATION ---
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  def process_file(file):
40
  """
@@ -64,48 +110,55 @@ def process_file(file):
64
 
65
  with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
66
  df.to_excel(tmp.name, index=False)
67
- path = tmp.name
68
 
69
  # Process with DocLing
70
  if ext in docling_direct or ext in to_xlsx_first:
71
- result = docling_converter.convert(path)
72
-
73
  # Generate timestamp for filenames
74
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
75
  base_filename = f"document_{timestamp}"
76
-
77
  # 1. Docling document (JSON)
78
  docling_json_path = f"{base_filename}_docling.json"
79
  with open(docling_json_path, "w", encoding="utf-8") as f:
80
  json.dump(result.document.export_to_dict(), f, indent=2, ensure_ascii=False)
81
-
 
 
 
 
 
82
  # 2. Text file
83
  txt_path = f"{base_filename}.txt"
84
  with open(txt_path, "w", encoding="utf-8") as f:
85
- f.write(result.document.export_to_text())
86
-
87
  # 3. Markdown file
88
  md_path = f"{base_filename}.md"
89
  with open(md_path, "w", encoding="utf-8") as f:
90
- f.write(result.document.export_to_markdown())
91
-
92
  # 4. HTML file
93
  html_path = f"{base_filename}.html"
94
- html_content = result.document.export_to_html()
95
  with open(html_path, "w", encoding="utf-8") as f:
96
- f.write(html_content)
97
-
98
- success_message = f"✅ Successfully processed file! 4 files generated."
99
  return docling_json_path, txt_path, md_path, html_path, success_message
100
 
101
  elif ext == ".txt":
102
  # For plain text files, create all formats
103
  with open(path, "r", encoding="utf-8") as f:
104
  text_content = f.read()
105
-
 
 
 
106
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
107
  base_filename = f"document_{timestamp}"
108
-
109
  # 1. Docling document (JSON) - simple structure for text
110
  docling_json_path = f"{base_filename}_docling.json"
111
  docling_dict = {
@@ -118,17 +171,17 @@ def process_file(file):
118
  }
119
  with open(docling_json_path, "w", encoding="utf-8") as f:
120
  json.dump(docling_dict, f, indent=2, ensure_ascii=False)
121
-
122
  # 2. Text file
123
  txt_path = f"{base_filename}.txt"
124
  with open(txt_path, "w", encoding="utf-8") as f:
125
  f.write(text_content)
126
-
127
  # 3. Markdown file
128
  md_path = f"{base_filename}.md"
129
  with open(md_path, "w", encoding="utf-8") as f:
130
  f.write(f"# Document\n\n{text_content}")
131
-
132
  # 4. HTML file
133
  html_path = f"{base_filename}.html"
134
  html_content = f"""<!DOCTYPE html>
@@ -144,8 +197,8 @@ def process_file(file):
144
  </html>"""
145
  with open(html_path, "w", encoding="utf-8") as f:
146
  f.write(html_content)
147
-
148
- success_message = f"✅ Successfully processed text file! 4 files generated."
149
  return docling_json_path, txt_path, md_path, html_path, success_message
150
 
151
  else:
@@ -156,15 +209,12 @@ def process_file(file):
156
  error_message = f"❌ Error processing file: {str(e)}"
157
  return None, None, None, None, error_message
158
 
159
-
160
  def reset_form():
161
  """Reset the form"""
162
  return None, None, None, None, None, ""
163
 
164
-
165
  # Gradio Interface
166
  with gr.Blocks(title="LLM-Ready Document Converter") as app:
167
-
168
  gr.Markdown("# 📄 LLM-Ready Document Converter")
169
  gr.Markdown("**HOWTO** : Upload a document and get 4 output files: Docling JSON, TXT, Markdown, and HTML")
170
  gr.Markdown("**EXPLANATION** : This app transforms various document formats (like TXT, standard and scanned PDFs, DOCX, PPT, CSV, XLS, XLSX) into structured, machine-readable outputs optimized for Large Language Models (LLMs). It extracts and converts content into clean formats such as DocLing JSON (for document structure), plain text, Markdown, and HTML making it easier for AI models to process, analyze, or generate responses from complex documents without losing key details like layout or formatting. Essentially, it's a bridge between raw files and AI-ready data.")
@@ -175,10 +225,10 @@ with gr.Blocks(title="LLM-Ready Document Converter") as app:
175
  label="Upload Document",
176
  file_types=[".pdf", ".txt", ".docx", ".xlsx", ".xls", ".csv", ".pptx"]
177
  )
178
-
179
- with gr.Row():
180
- submit_btn = gr.Button("Convert Document", variant="primary")
181
- reset_btn = gr.Button("Reset")
182
 
183
  status_output = gr.Markdown(label="Status")
184
 
@@ -206,8 +256,5 @@ with gr.Blocks(title="LLM-Ready Document Converter") as app:
206
  outputs=[file_input, docling_output, txt_output, md_output, html_output, status_output]
207
  )
208
 
209
-
210
  if __name__ == "__main__":
211
- app.launch(
212
- share=True
213
- )
 
4
  from datetime import datetime
5
  import pandas as pd
6
  import json
7
+ import unicodedata
8
 
9
  # Import DocLing and necessary configuration classes
10
  from docling.document_converter import DocumentConverter, PdfFormatOption
11
  from docling.datamodel.pipeline_options import PdfPipelineOptions
12
  from docling.datamodel.base_models import InputFormat
13
 
14
+ # --- Language detection and normalization helpers ---
15
+ try:
16
+ from ftfy import fix_text
17
+ def _fix_text(s: str) -> str:
18
+ return fix_text(s)
19
+ except ImportError:
20
+ def _fix_text(s: str) -> str:
21
+ return s
22
+
23
+ try:
24
+ from langdetect import detect, DetectorFactory
25
+ DetectorFactory.seed = 0 # deterministic
26
+ def _detect_lang(text: str) -> str | None:
27
+ try:
28
+ return detect(text)
29
+ except Exception:
30
+ return None
31
+ except ImportError:
32
+ def _detect_lang(text: str) -> str | None:
33
+ return None
34
+
35
+ def normalize_text(s: str) -> str:
36
+ s = _fix_text(s)
37
+ return unicodedata.normalize("NFC", s)
38
+
39
+ # Map ISO-ish lang codes to Tesseract codes
40
+ LANG_MAP = {
41
+ "pt": "por", "es": "spa", "en": "eng", "fr": "fra", "de": "deu", "it": "ita",
42
+ "nl": "nld", "pl": "pol", "tr": "tur", "cs": "ces", "ru": "rus", "uk": "ukr",
43
+ "el": "ell", "ro": "ron", "hu": "hun", "sv": "swe", "da": "dan", "fi": "fin",
44
+ "no": "nor", "ca": "cat", "gl": "glg"
45
+ }
46
+
47
+ def guess_lang_code(text: str) -> str | None:
48
+ lang = _detect_lang(text) if text and text.strip() else None
49
+ return LANG_MAP.get(lang) if lang else None
50
+
51
def looks_garbled(text: str) -> bool:
    """Heuristic: True when *text* is too short to trust or shows mojibake.

    Very short extractions (< 100 chars after stripping) are treated as
    garbled so the caller falls back to OCR — a scanned PDF's native text
    layer is typically empty or near-empty.
    """
    if not text or len(text.strip()) < 100:
        return True
    # Mojibake markers: "Ã"/"Â" from UTF-8 bytes decoded as Latin-1, plus the
    # Unicode replacement character. "ª" was removed from this list: it is a
    # legitimate, common character in Portuguese/Spanish text ("3ª") and
    # caused clean documents to be flagged; the "Ãª" mojibake form of "ê" is
    # already counted via "Ã".
    bad_patterns = ("\u00c3", "\u00c2", "\ufffd")
    return sum(text.count(p) for p in bad_patterns) > 5
57
+ # --- End helpers ---
58
+
59
# --- START OF OCR CONFIGURATION ---
# Default converter relies on the PDF's native text layer (do_ocr=False);
# OCR-enabled converters are constructed on demand when extraction fails.
pdf_options = PdfPipelineOptions(do_ocr=False, ocr_model="tesseract")
format_options = {InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options)}
docling_converter = DocumentConverter(format_options=format_options)
# --- END OF OCR CONFIGURATION ---
68
 
69
def convert_with_strategy(path: str):
    """Convert *path* with Docling, adding an OCR pass only when needed.

    Pass 1 converts using the PDF's native text layer (no OCR). If the
    normalized extraction looks garbled or too short, pass 2 re-converts
    with Tesseract OCR in the detected language, defaulting to Portuguese
    when detection fails.
    """
    # Pass 1: native text layer only.
    plain_opts = PdfPipelineOptions(do_ocr=False, ocr_model="tesseract")
    plain_converter = DocumentConverter(
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=plain_opts)}
    )
    first_pass = plain_converter.convert(path)

    sample = normalize_text(first_pass.document.export_to_text())
    if not looks_garbled(sample):
        return first_pass

    # Pass 2: OCR fallback with the detected language ("por" by default).
    lang = guess_lang_code(sample) or "por"
    ocr_opts = PdfPipelineOptions(do_ocr=True, ocr_model="tesseract", ocr_languages=[lang])
    ocr_converter = DocumentConverter(
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=ocr_opts)}
    )
    return ocr_converter.convert(path)
84
 
85
  def process_file(file):
86
  """
 
110
 
111
  with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
112
  df.to_excel(tmp.name, index=False)
113
+ path = tmp.name
114
 
115
  # Process with DocLing
116
  if ext in docling_direct or ext in to_xlsx_first:
117
+ result = convert_with_strategy(path)
118
+
119
  # Generate timestamp for filenames
120
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
121
  base_filename = f"document_{timestamp}"
122
+
123
  # 1. Docling document (JSON)
124
  docling_json_path = f"{base_filename}_docling.json"
125
  with open(docling_json_path, "w", encoding="utf-8") as f:
126
  json.dump(result.document.export_to_dict(), f, indent=2, ensure_ascii=False)
127
+
128
+ # Normalize outputs
129
+ text_out = normalize_text(result.document.export_to_text())
130
+ md_out = normalize_text(result.document.export_to_markdown())
131
+ html_out = normalize_text(result.document.export_to_html())
132
+
133
  # 2. Text file
134
  txt_path = f"{base_filename}.txt"
135
  with open(txt_path, "w", encoding="utf-8") as f:
136
+ f.write(text_out)
137
+
138
  # 3. Markdown file
139
  md_path = f"{base_filename}.md"
140
  with open(md_path, "w", encoding="utf-8") as f:
141
+ f.write(md_out)
142
+
143
  # 4. HTML file
144
  html_path = f"{base_filename}.html"
 
145
  with open(html_path, "w", encoding="utf-8") as f:
146
+ f.write(html_out)
147
+
148
+ success_message = "✅ Successfully processed file! 4 files generated."
149
  return docling_json_path, txt_path, md_path, html_path, success_message
150
 
151
  elif ext == ".txt":
152
  # For plain text files, create all formats
153
  with open(path, "r", encoding="utf-8") as f:
154
  text_content = f.read()
155
+
156
+ # Normalize input text as requested
157
+ text_content = normalize_text(text_content)
158
+
159
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
160
  base_filename = f"document_{timestamp}"
161
+
162
  # 1. Docling document (JSON) - simple structure for text
163
  docling_json_path = f"{base_filename}_docling.json"
164
  docling_dict = {
 
171
  }
172
  with open(docling_json_path, "w", encoding="utf-8") as f:
173
  json.dump(docling_dict, f, indent=2, ensure_ascii=False)
174
+
175
  # 2. Text file
176
  txt_path = f"{base_filename}.txt"
177
  with open(txt_path, "w", encoding="utf-8") as f:
178
  f.write(text_content)
179
+
180
  # 3. Markdown file
181
  md_path = f"{base_filename}.md"
182
  with open(md_path, "w", encoding="utf-8") as f:
183
  f.write(f"# Document\n\n{text_content}")
184
+
185
  # 4. HTML file
186
  html_path = f"{base_filename}.html"
187
  html_content = f"""<!DOCTYPE html>
 
197
  </html>"""
198
  with open(html_path, "w", encoding="utf-8") as f:
199
  f.write(html_content)
200
+
201
+ success_message = "✅ Successfully processed text file! 4 files generated."
202
  return docling_json_path, txt_path, md_path, html_path, success_message
203
 
204
  else:
 
209
  error_message = f"❌ Error processing file: {str(e)}"
210
  return None, None, None, None, error_message
211
 
 
212
def reset_form():
    """Clear every input/output component of the Gradio form."""
    return (None, None, None, None, None, "")
215
 
 
216
  # Gradio Interface
217
  with gr.Blocks(title="LLM-Ready Document Converter") as app:
 
218
  gr.Markdown("# 📄 LLM-Ready Document Converter")
219
  gr.Markdown("**HOWTO** : Upload a document and get 4 output files: Docling JSON, TXT, Markdown, and HTML")
220
  gr.Markdown("**EXPLANATION** : This app transforms various document formats (like TXT, standard and scanned PDFs, DOCX, PPT, CSV, XLS, XLSX) into structured, machine-readable outputs optimized for Large Language Models (LLMs). It extracts and converts content into clean formats such as DocLing JSON (for document structure), plain text, Markdown, and HTML making it easier for AI models to process, analyze, or generate responses from complex documents without losing key details like layout or formatting. Essentially, it's a bridge between raw files and AI-ready data.")
 
225
  label="Upload Document",
226
  file_types=[".pdf", ".txt", ".docx", ".xlsx", ".xls", ".csv", ".pptx"]
227
  )
228
+
229
+ with gr.Row():
230
+ submit_btn = gr.Button("Convert Document", variant="primary")
231
+ reset_btn = gr.Button("Reset")
232
 
233
  status_output = gr.Markdown(label="Status")
234
 
 
256
  outputs=[file_input, docling_output, txt_output, md_output, html_output, status_output]
257
  )
258
 
 
259
  if __name__ == "__main__":
260
+ app.launch(share=True)