pierreguillou committed on
Commit
ad6d0d0
·
verified ·
1 Parent(s): 00301da

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +213 -0
app.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import tempfile
4
+ from datetime import datetime
5
+ import pandas as pd
6
+ import json
7
+
8
+ # Import DocLing and necessary configuration classes
9
+ from docling.document_converter import DocumentConverter, PdfFormatOption
10
+ from docling.datamodel.pipeline_options import PdfPipelineOptions
11
+ from docling.datamodel.base_models import InputFormat
12
+
13
# --- START OF OCR CONFIGURATION ---
# Language codes handed to the PDF pipeline options below.
_ocr_language_codes = [
    "eng", "fra", "deu", "spa", "ita", "por", "nld", "pol", "tur", "ces",
    "rus", "ukr", "ell", "ron", "hun", "bul", "hrv", "srp", "slk", "slv",
    "lit", "lav", "est", "cat", "eus", "glg", "isl", "dan", "nor", "swe",
    "fin", "alb", "mlt", "afr", "zul", "swa", "amh", "uzb", "aze", "kaz",
    "kir", "mon", "tgl", "ind", "msa", "tha", "vie", "khm", "lao", "mya",
    "ben", "hin", "mar", "guj", "pan", "mal", "tam", "tel", "kan", "nep",
    "sin", "urd", "fas", "pus", "kur", "aze_cyrl", "tat", "uig", "heb", "ara",
    "yid", "grc", "chr", "epo", "hye", "kat", "kat_old", "aze_latn", "mkd", "bel",
    "srp_latn", "srp_cyrillic", "chi_sim", "chi_tra", "jpn", "kor",
]

# PDF pipeline settings: a Tesseract model name and the language list are
# supplied, but the OCR switch itself is passed as False.
# NOTE(review): this section was originally described as "Tesseract OCR
# enabled", which does not match do_ocr=False -- confirm which is intended,
# and that `ocr_model` / `ocr_languages` are options PdfPipelineOptions
# actually honours.
pdf_options = PdfPipelineOptions(
    do_ocr=False,
    ocr_model="tesseract",
    ocr_languages=_ocr_language_codes,
)

# Per-input-format configuration: only PDF gets custom pipeline options.
format_options = {InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options)}

# Single module-level converter built from the configuration above.
docling_converter = DocumentConverter(format_options=format_options)
# --- END OF OCR CONFIGURATION ---
37
+
38
+
39
def _write_outputs(base_filename, docling_payload, text, markdown, html_doc):
    """Write the four output files into a fresh private directory.

    A new temporary directory is created on every call so that two requests
    arriving within the same second (and therefore sharing a timestamped
    base name) cannot overwrite each other's files.

    Args:
        base_filename: File name stem shared by the four outputs.
        docling_payload: JSON-serialisable object for the ``_docling.json`` file.
        text: Content of the ``.txt`` file.
        markdown: Content of the ``.md`` file.
        html_doc: Content of the ``.html`` file.

    Returns:
        Tuple ``(json_path, txt_path, md_path, html_path)``.
    """
    out_dir = tempfile.mkdtemp(prefix="docconv_")
    docling_json_path = os.path.join(out_dir, f"{base_filename}_docling.json")
    txt_path = os.path.join(out_dir, f"{base_filename}.txt")
    md_path = os.path.join(out_dir, f"{base_filename}.md")
    html_path = os.path.join(out_dir, f"{base_filename}.html")

    with open(docling_json_path, "w", encoding="utf-8") as f:
        json.dump(docling_payload, f, indent=2, ensure_ascii=False)

    for target, content in ((txt_path, text), (md_path, markdown), (html_path, html_doc)):
        with open(target, "w", encoding="utf-8") as f:
            f.write(content)

    return docling_json_path, txt_path, md_path, html_path


def process_file(file):
    """Convert an uploaded document into four output files.

    Supported inputs:
      * ``.pdf``, ``.docx``, ``.xlsx``, ``.pptx`` -- passed to the module-level
        ``docling_converter``.
      * ``.csv``, ``.xls`` -- loaded with pandas, saved as a temporary
        ``.xlsx`` and then passed to ``docling_converter``.
      * ``.txt`` -- read as UTF-8 and wrapped directly, without Docling.

    Args:
        file: ``None``, an object exposing the path as ``.name``, or anything
            whose ``str()`` is a filesystem path.

    Returns:
        A 5-tuple ``(docling_json_path, txt_path, md_path, html_path, message)``.
        On any failure, missing upload or unsupported extension, the four
        paths are ``None`` and ``message`` describes the problem; exceptions
        are not propagated to the caller.
    """
    # Function-scope import: only the plain-text branch needs it.
    from html import escape

    if file is None:
        return None, None, None, None, "❌ Error: Please upload a file."

    # Normalize to a filesystem path string
    path = file.name if hasattr(file, "name") else str(file)
    ext = os.path.splitext(path)[1].lower()

    docling_direct = {".pdf", ".docx", ".xlsx", ".pptx"}
    to_xlsx_first = {".csv", ".xls"}

    # Path of the intermediate .xlsx (if one is created) so it can be removed
    # again no matter how the conversion ends.
    converted_path = None

    try:
        if ext in docling_direct or ext in to_xlsx_first:
            if ext in to_xlsx_first:
                df = pd.read_csv(path) if ext == ".csv" else pd.read_excel(path)

                # Reserve a name and close the handle before pandas writes to
                # it; writing by name to a still-open temporary file is not
                # portable across platforms.
                fd, converted_path = tempfile.mkstemp(suffix=".xlsx")
                os.close(fd)
                df.to_excel(converted_path, index=False)
                path = converted_path

            result = docling_converter.convert(path)
            document = result.document

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            outputs = _write_outputs(
                f"document_{timestamp}",
                document.export_to_dict(),
                document.export_to_text(),
                document.export_to_markdown(),
                document.export_to_html(),
            )
            return (*outputs, "✅ Successfully processed file! 4 files generated.")

        if ext == ".txt":
            with open(path, "r", encoding="utf-8") as f:
                text_content = f.read()

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

            # Simple stand-in structure for the "Docling" JSON of a text file.
            docling_dict = {
                "type": "text_document",
                "content": text_content,
                "metadata": {
                    "source": os.path.basename(path),
                    "timestamp": timestamp,
                },
            }

            # The upload is untrusted: escape it so that any markup inside the
            # text is displayed literally instead of being interpreted.
            html_content = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Document</title>
</head>
<body>
    <pre>{escape(text_content)}</pre>
</body>
</html>"""

            outputs = _write_outputs(
                f"document_{timestamp}",
                docling_dict,
                text_content,
                f"# Document\n\n{text_content}",
                html_content,
            )
            return (*outputs, "✅ Successfully processed text file! 4 files generated.")

        return None, None, None, None, f"❌ Unsupported file format: {ext}"

    except Exception as e:
        # UI boundary: report the failure in the status field rather than
        # letting the exception escape to the caller.
        return None, None, None, None, f"❌ Error processing file: {e}"

    finally:
        if converted_path is not None:
            try:
                os.remove(converted_path)
            except OSError:
                # Best-effort cleanup; a leftover temp file must not turn a
                # successful conversion into an error.
                pass
158
+
159
+
160
def reset_form():
    """Return the blank values used to clear the form.

    The result is a 6-tuple: five ``None`` entries followed by an empty
    string.
    """
    cleared_slots = (None,) * 5
    return cleared_slots + ("",)
163
+
164
+
165
# Gradio Interface
with gr.Blocks(title="LLM-Ready Document Converter") as app:

    # Page header: title, short usage hint, longer description.
    for banner in (
        "# 📄 Document Converter to LLM-ready",
        "**HOWTO** : Upload a document and get 4 output files: Docling JSON, TXT, Markdown, and HTML",
        "**EXPLANATION** : This app transforms various document formats (like TXT, standard and scanned PDFs, DOCX, PPT, CSV, XLS, XLSX) into structured, machine-readable outputs optimized for Large Language Models (LLMs). It extracts and converts content into clean formats such as DocLing JSON (for document structure), plain text, Markdown, and HTML making it easier for AI models to process, analyze, or generate responses from complex documents without losing key details like layout or formatting. Essentially, it's a bridge between raw files and AI-ready data.",
    ):
        gr.Markdown(banner)

    # Upload widget, restricted to the extensions listed here.
    with gr.Row(), gr.Column():
        file_input = gr.File(
            label="Upload Document",
            file_types=[".pdf", ".txt", ".docx", ".xlsx", ".xls", ".csv", ".pptx"],
        )

    # Action buttons.
    with gr.Row():
        submit_btn = gr.Button("Convert Document", variant="primary")
        reset_btn = gr.Button("Reset")

    # Receives the status message (fifth value returned by process_file).
    status_output = gr.Markdown(label="Status")

    # Download slots, laid out as two rows of two columns.
    with gr.Row():
        with gr.Column():
            docling_output = gr.File(label="Docling Document (JSON)")
        with gr.Column():
            txt_output = gr.File(label="Text File")

    with gr.Row():
        with gr.Column():
            md_output = gr.File(label="Markdown File")
        with gr.Column():
            html_output = gr.File(label="HTML File")

    # Events
    download_slots = [docling_output, txt_output, md_output, html_output]

    submit_btn.click(
        fn=process_file,
        inputs=[file_input],
        outputs=download_slots + [status_output],
    )

    reset_btn.click(
        fn=reset_form,
        outputs=[file_input] + download_slots + [status_output],
    )


if __name__ == "__main__":
    app.launch(share=True)