structured-llm-ready-doc-converter

Sleeping

pierreguillou committed on Oct 15, 2025

Commit

18f46da

verified ·

1 Parent(s): 89c6707

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -13,7 +13,7 @@ from docling.datamodel.base_models import InputFormat
 # --- START OF OCR CONFIGURATION ---
 # Configure DocLing converter with Tesseract OCR enabled
 pdf_options = PdfPipelineOptions(
-    do_ocr=True,
     ocr_model="tesseract",
     ocr_languages=[
     "eng", "fra", "deu", "spa", "ita", "por", "nld", "pol", "tur", "ces", "rus", "ukr", "ell", "ron", "hun",
@@ -165,7 +165,7 @@ def reset_form():
 # Gradio Interface
 with gr.Blocks(title="LLM-Ready Document Converter") as app:
-    gr.Markdown("# 📄 Document Converter to LLM-ready")
     gr.Markdown("**HOWTO** : Upload a document and get 4 output files: Docling JSON, TXT, Markdown, and HTML")
     gr.Markdown("**EXPLANATION** : This app transforms various document formats (like TXT, standard and scanned PDFs, DOCX, PPT, CSV, XLS, XLSX) into structured, machine-readable outputs optimized for Large Language Models (LLMs). It extracts and converts content into clean formats such as DocLing JSON (for document structure), plain text, Markdown, and HTML making it easier for AI models to process, analyze, or generate responses from complex documents without losing key details like layout or formatting. Essentially, it's a bridge between raw files and AI-ready data.")
@@ -209,7 +209,5 @@ with gr.Blocks(title="LLM-Ready Document Converter") as app:
 if __name__ == "__main__":
     app.launch(
-        # server_name="0.0.0.0",
-        # server_port=7860,
         share=True
     )

 # --- START OF OCR CONFIGURATION ---
 # Configure DocLing converter with Tesseract OCR enabled
 pdf_options = PdfPipelineOptions(
+    do_ocr=False,
     ocr_model="tesseract",
     ocr_languages=[
     "eng", "fra", "deu", "spa", "ita", "por", "nld", "pol", "tur", "ces", "rus", "ukr", "ell", "ron", "hun",
 # Gradio Interface
 with gr.Blocks(title="LLM-Ready Document Converter") as app:
+    gr.Markdown("# 📄 LLM-Ready Document Converter")
     gr.Markdown("**HOWTO** : Upload a document and get 4 output files: Docling JSON, TXT, Markdown, and HTML")
     gr.Markdown("**EXPLANATION** : This app transforms various document formats (like TXT, standard and scanned PDFs, DOCX, PPT, CSV, XLS, XLSX) into structured, machine-readable outputs optimized for Large Language Models (LLMs). It extracts and converts content into clean formats such as DocLing JSON (for document structure), plain text, Markdown, and HTML making it easier for AI models to process, analyze, or generate responses from complex documents without losing key details like layout or formatting. Essentially, it's a bridge between raw files and AI-ready data.")
 if __name__ == "__main__":
     app.launch(
         share=True
     )