pierreguillou committed on
Commit
6bd2c76
·
verified ·
1 Parent(s): bf8492c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -49
app.py CHANGED
@@ -4,37 +4,83 @@ import tempfile
4
  from datetime import datetime
5
  import pandas as pd
6
  import json
 
7
 
8
  # Import DocLing and necessary configuration classes
9
  from docling.document_converter import DocumentConverter, PdfFormatOption
10
  from docling.datamodel.pipeline_options import PdfPipelineOptions
11
  from docling.datamodel.base_models import InputFormat
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  # --- START OF OCR CONFIGURATION ---
14
- # Configure DocLing converter with Tesseract OCR enabled
15
  pdf_options = PdfPipelineOptions(
16
  do_ocr=False,
17
- ocr_model="tesseract",
18
- ocr_languages=[
19
- "eng", "fra", "deu", "spa", "ita", "por", "nld", "pol", "tur", "ces", "rus", "ukr", "ell", "ron", "hun",
20
- "bul", "hrv", "srp", "slk", "slv", "lit", "lav", "est", "cat", "eus", "glg", "isl", "dan", "nor", "swe",
21
- "fin", "alb", "mlt", "afr", "zul", "swa", "amh", "uzb", "aze", "kaz", "kir", "mon", "tgl", "ind", "msa",
22
- "tha", "vie", "khm", "lao", "mya", "ben", "hin", "mar", "guj", "pan", "mal", "tam", "tel", "kan", "nep",
23
- "sin", "urd", "fas", "pus", "kur", "aze_cyrl", "tat", "uig", "heb", "ara", "yid", "grc", "chr", "epo",
24
- "hye", "kat", "kat_old", "aze_latn", "mkd", "bel", "srp_latn", "srp_cyrillic",
25
- "chi_sim", "chi_tra", "jpn", "kor"
26
- ]
27
  )
28
-
29
- # Create the format-specific configuration
30
- format_options = {
31
- InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options)
32
- }
33
-
34
- # Initialize the converter with the OCR configuration
35
  docling_converter = DocumentConverter(format_options=format_options)
36
  # --- END OF OCR CONFIGURATION ---
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  def process_file(file):
40
  """
@@ -64,48 +110,55 @@ def process_file(file):
64
 
65
  with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
66
  df.to_excel(tmp.name, index=False)
67
- path = tmp.name
68
 
69
  # Process with DocLing
70
  if ext in docling_direct or ext in to_xlsx_first:
71
- result = docling_converter.convert(path)
72
-
73
  # Generate timestamp for filenames
74
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
75
  base_filename = f"document_{timestamp}"
76
-
77
  # 1. Docling document (JSON)
78
  docling_json_path = f"{base_filename}_docling.json"
79
  with open(docling_json_path, "w", encoding="utf-8") as f:
80
  json.dump(result.document.export_to_dict(), f, indent=2, ensure_ascii=False)
81
-
 
 
 
 
 
82
  # 2. Text file
83
  txt_path = f"{base_filename}.txt"
84
  with open(txt_path, "w", encoding="utf-8") as f:
85
- f.write(result.document.export_to_text())
86
-
87
  # 3. Markdown file
88
  md_path = f"{base_filename}.md"
89
  with open(md_path, "w", encoding="utf-8") as f:
90
- f.write(result.document.export_to_markdown())
91
-
92
  # 4. HTML file
93
  html_path = f"{base_filename}.html"
94
- html_content = result.document.export_to_html()
95
  with open(html_path, "w", encoding="utf-8") as f:
96
- f.write(html_content)
97
-
98
- success_message = f"✅ Successfully processed file! 4 files generated."
99
  return docling_json_path, txt_path, md_path, html_path, success_message
100
 
101
  elif ext == ".txt":
102
  # For plain text files, create all formats
103
  with open(path, "r", encoding="utf-8") as f:
104
  text_content = f.read()
105
-
 
 
 
106
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
107
  base_filename = f"document_{timestamp}"
108
-
109
  # 1. Docling document (JSON) - simple structure for text
110
  docling_json_path = f"{base_filename}_docling.json"
111
  docling_dict = {
@@ -118,17 +171,17 @@ def process_file(file):
118
  }
119
  with open(docling_json_path, "w", encoding="utf-8") as f:
120
  json.dump(docling_dict, f, indent=2, ensure_ascii=False)
121
-
122
  # 2. Text file
123
  txt_path = f"{base_filename}.txt"
124
  with open(txt_path, "w", encoding="utf-8") as f:
125
  f.write(text_content)
126
-
127
  # 3. Markdown file
128
  md_path = f"{base_filename}.md"
129
  with open(md_path, "w", encoding="utf-8") as f:
130
  f.write(f"# Document\n\n{text_content}")
131
-
132
  # 4. HTML file
133
  html_path = f"{base_filename}.html"
134
  html_content = f"""<!DOCTYPE html>
@@ -144,8 +197,8 @@ def process_file(file):
144
  </html>"""
145
  with open(html_path, "w", encoding="utf-8") as f:
146
  f.write(html_content)
147
-
148
- success_message = f"✅ Successfully processed text file! 4 files generated."
149
  return docling_json_path, txt_path, md_path, html_path, success_message
150
 
151
  else:
@@ -156,15 +209,12 @@ def process_file(file):
156
  error_message = f"❌ Error processing file: {str(e)}"
157
  return None, None, None, None, error_message
158
 
159
-
160
  def reset_form():
161
  """Reset the form"""
162
  return None, None, None, None, None, ""
163
 
164
-
165
  # Gradio Interface
166
  with gr.Blocks(title="LLM-Ready Document Converter") as app:
167
-
168
  gr.Markdown("# 📄 LLM-Ready Document Converter")
169
  gr.Markdown("**HOWTO** : Upload a document and get 4 output files: Docling JSON, TXT, Markdown, and HTML")
170
  gr.Markdown("**EXPLANATION** : This app transforms various document formats (like TXT, standard and scanned PDFs, DOCX, PPT, CSV, XLS, XLSX) into structured, machine-readable outputs optimized for Large Language Models (LLMs). It extracts and converts content into clean formats such as DocLing JSON (for document structure), plain text, Markdown, and HTML making it easier for AI models to process, analyze, or generate responses from complex documents without losing key details like layout or formatting. Essentially, it's a bridge between raw files and AI-ready data.")
@@ -175,10 +225,10 @@ with gr.Blocks(title="LLM-Ready Document Converter") as app:
175
  label="Upload Document",
176
  file_types=[".pdf", ".txt", ".docx", ".xlsx", ".xls", ".csv", ".pptx"]
177
  )
178
-
179
- with gr.Row():
180
- submit_btn = gr.Button("Convert Document", variant="primary")
181
- reset_btn = gr.Button("Reset")
182
 
183
  status_output = gr.Markdown(label="Status")
184
 
@@ -206,8 +256,5 @@ with gr.Blocks(title="LLM-Ready Document Converter") as app:
206
  outputs=[file_input, docling_output, txt_output, md_output, html_output, status_output]
207
  )
208
 
209
-
210
  if __name__ == "__main__":
211
- app.launch(
212
- share=True
213
- )
 
4
  from datetime import datetime
5
  import pandas as pd
6
  import json
7
+ import unicodedata
8
 
9
  # Import DocLing and necessary configuration classes
10
  from docling.document_converter import DocumentConverter, PdfFormatOption
11
  from docling.datamodel.pipeline_options import PdfPipelineOptions
12
  from docling.datamodel.base_models import InputFormat
13
 
14
+ # --- Language detection and normalization helpers ---
15
+ try:
16
+ from ftfy import fix_text
17
+ def _fix_text(s: str) -> str:
18
+ return fix_text(s)
19
+ except ImportError:
20
+ def _fix_text(s: str) -> str:
21
+ return s
22
+
23
+ try:
24
+ from langdetect import detect, DetectorFactory
25
+ DetectorFactory.seed = 0 # deterministic
26
+ def _detect_lang(text: str) -> str | None:
27
+ try:
28
+ return detect(text)
29
+ except Exception:
30
+ return None
31
+ except ImportError:
32
+ def _detect_lang(text: str) -> str | None:
33
+ return None
34
+
35
+ def normalize_text(s: str) -> str:
36
+ s = _fix_text(s)
37
+ return unicodedata.normalize("NFC", s)
38
+
39
+ # Map ISO-ish lang codes to Tesseract codes
40
+ LANG_MAP = {
41
+ "pt": "por", "es": "spa", "en": "eng", "fr": "fra", "de": "deu", "it": "ita",
42
+ "nl": "nld", "pl": "pol", "tr": "tur", "cs": "ces", "ru": "rus", "uk": "ukr",
43
+ "el": "ell", "ro": "ron", "hu": "hun", "sv": "swe", "da": "dan", "fi": "fin",
44
+ "no": "nor", "ca": "cat", "gl": "glg"
45
+ }
46
+
47
+ def guess_lang_code(text: str) -> str | None:
48
+ lang = _detect_lang(text) if text and text.strip() else None
49
+ return LANG_MAP.get(lang) if lang else None
50
+
51
def looks_garbled(text: str) -> bool:
    """Heuristic: True when *text* is too short to trust or shows mojibake.

    Very short extractions (< 100 chars after stripping) are treated as
    garbled so the caller falls back to OCR — a scanned PDF's native text
    layer is typically empty or near-empty.
    """
    if not text or len(text.strip()) < 100:
        return True
    # Mojibake markers: "Ã"/"Â" from UTF-8 bytes decoded as Latin-1, plus the
    # Unicode replacement character. "ª" was removed from this list: it is a
    # legitimate, common character in Portuguese/Spanish text ("3ª") and
    # caused clean documents to be flagged; the "Ãª" mojibake form of "ê" is
    # already counted via "Ã".
    bad_patterns = ("\u00c3", "\u00c2", "\ufffd")
    return sum(text.count(p) for p in bad_patterns) > 5
57
+ # --- End helpers ---
58
+
59
# --- START OF OCR CONFIGURATION ---
# Default converter relies on the PDF's native text layer (do_ocr=False);
# OCR-enabled converters are constructed on demand when extraction fails.
pdf_options = PdfPipelineOptions(do_ocr=False, ocr_model="tesseract")
format_options = {InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options)}
docling_converter = DocumentConverter(format_options=format_options)
# --- END OF OCR CONFIGURATION ---
68
 
69
def convert_with_strategy(path: str):
    """Convert *path* with Docling, adding an OCR pass only when needed.

    Pass 1 converts using the PDF's native text layer (no OCR). If the
    normalized extraction looks garbled or too short, pass 2 re-converts
    with Tesseract OCR in the detected language, defaulting to Portuguese
    when detection fails.
    """
    # Pass 1: native text layer only.
    plain_opts = PdfPipelineOptions(do_ocr=False, ocr_model="tesseract")
    plain_converter = DocumentConverter(
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=plain_opts)}
    )
    first_pass = plain_converter.convert(path)

    sample = normalize_text(first_pass.document.export_to_text())
    if not looks_garbled(sample):
        return first_pass

    # Pass 2: OCR fallback with the detected language ("por" by default).
    lang = guess_lang_code(sample) or "por"
    ocr_opts = PdfPipelineOptions(do_ocr=True, ocr_model="tesseract", ocr_languages=[lang])
    ocr_converter = DocumentConverter(
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=ocr_opts)}
    )
    return ocr_converter.convert(path)
84
 
85
  def process_file(file):
86
  """
 
110
 
111
  with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
112
  df.to_excel(tmp.name, index=False)
113
+ path = tmp.name
114
 
115
  # Process with DocLing
116
  if ext in docling_direct or ext in to_xlsx_first:
117
+ result = convert_with_strategy(path)
118
+
119
  # Generate timestamp for filenames
120
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
121
  base_filename = f"document_{timestamp}"
122
+
123
  # 1. Docling document (JSON)
124
  docling_json_path = f"{base_filename}_docling.json"
125
  with open(docling_json_path, "w", encoding="utf-8") as f:
126
  json.dump(result.document.export_to_dict(), f, indent=2, ensure_ascii=False)
127
+
128
+ # Normalize outputs
129
+ text_out = normalize_text(result.document.export_to_text())
130
+ md_out = normalize_text(result.document.export_to_markdown())
131
+ html_out = normalize_text(result.document.export_to_html())
132
+
133
  # 2. Text file
134
  txt_path = f"{base_filename}.txt"
135
  with open(txt_path, "w", encoding="utf-8") as f:
136
+ f.write(text_out)
137
+
138
  # 3. Markdown file
139
  md_path = f"{base_filename}.md"
140
  with open(md_path, "w", encoding="utf-8") as f:
141
+ f.write(md_out)
142
+
143
  # 4. HTML file
144
  html_path = f"{base_filename}.html"
 
145
  with open(html_path, "w", encoding="utf-8") as f:
146
+ f.write(html_out)
147
+
148
+ success_message = "✅ Successfully processed file! 4 files generated."
149
  return docling_json_path, txt_path, md_path, html_path, success_message
150
 
151
  elif ext == ".txt":
152
  # For plain text files, create all formats
153
  with open(path, "r", encoding="utf-8") as f:
154
  text_content = f.read()
155
+
156
+ # Normalize input text as requested
157
+ text_content = normalize_text(text_content)
158
+
159
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
160
  base_filename = f"document_{timestamp}"
161
+
162
  # 1. Docling document (JSON) - simple structure for text
163
  docling_json_path = f"{base_filename}_docling.json"
164
  docling_dict = {
 
171
  }
172
  with open(docling_json_path, "w", encoding="utf-8") as f:
173
  json.dump(docling_dict, f, indent=2, ensure_ascii=False)
174
+
175
  # 2. Text file
176
  txt_path = f"{base_filename}.txt"
177
  with open(txt_path, "w", encoding="utf-8") as f:
178
  f.write(text_content)
179
+
180
  # 3. Markdown file
181
  md_path = f"{base_filename}.md"
182
  with open(md_path, "w", encoding="utf-8") as f:
183
  f.write(f"# Document\n\n{text_content}")
184
+
185
  # 4. HTML file
186
  html_path = f"{base_filename}.html"
187
  html_content = f"""<!DOCTYPE html>
 
197
  </html>"""
198
  with open(html_path, "w", encoding="utf-8") as f:
199
  f.write(html_content)
200
+
201
+ success_message = "✅ Successfully processed text file! 4 files generated."
202
  return docling_json_path, txt_path, md_path, html_path, success_message
203
 
204
  else:
 
209
  error_message = f"❌ Error processing file: {str(e)}"
210
  return None, None, None, None, error_message
211
 
 
212
def reset_form():
    """Clear every input/output component of the Gradio form."""
    return (None, None, None, None, None, "")
215
 
 
216
  # Gradio Interface
217
  with gr.Blocks(title="LLM-Ready Document Converter") as app:
 
218
  gr.Markdown("# 📄 LLM-Ready Document Converter")
219
  gr.Markdown("**HOWTO** : Upload a document and get 4 output files: Docling JSON, TXT, Markdown, and HTML")
220
  gr.Markdown("**EXPLANATION** : This app transforms various document formats (like TXT, standard and scanned PDFs, DOCX, PPT, CSV, XLS, XLSX) into structured, machine-readable outputs optimized for Large Language Models (LLMs). It extracts and converts content into clean formats such as DocLing JSON (for document structure), plain text, Markdown, and HTML making it easier for AI models to process, analyze, or generate responses from complex documents without losing key details like layout or formatting. Essentially, it's a bridge between raw files and AI-ready data.")
 
225
  label="Upload Document",
226
  file_types=[".pdf", ".txt", ".docx", ".xlsx", ".xls", ".csv", ".pptx"]
227
  )
228
+
229
+ with gr.Row():
230
+ submit_btn = gr.Button("Convert Document", variant="primary")
231
+ reset_btn = gr.Button("Reset")
232
 
233
  status_output = gr.Markdown(label="Status")
234
 
 
256
  outputs=[file_input, docling_output, txt_output, md_output, html_output, status_output]
257
  )
258
 
 
259
  if __name__ == "__main__":
260
+ app.launch(share=True)