Update app.py
Browse files
app.py
CHANGED
|
@@ -4,37 +4,83 @@ import tempfile
|
|
| 4 |
from datetime import datetime
|
| 5 |
import pandas as pd
|
| 6 |
import json
|
|
|
|
| 7 |
|
| 8 |
# Import DocLing and necessary configuration classes
|
| 9 |
from docling.document_converter import DocumentConverter, PdfFormatOption
|
| 10 |
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
| 11 |
from docling.datamodel.base_models import InputFormat
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
# --- START OF OCR CONFIGURATION ---
|
| 14 |
-
#
|
| 15 |
pdf_options = PdfPipelineOptions(
|
| 16 |
do_ocr=False,
|
| 17 |
-
ocr_model="tesseract"
|
| 18 |
-
ocr_languages=[
|
| 19 |
-
"eng", "fra", "deu", "spa", "ita", "por", "nld", "pol", "tur", "ces", "rus", "ukr", "ell", "ron", "hun",
|
| 20 |
-
"bul", "hrv", "srp", "slk", "slv", "lit", "lav", "est", "cat", "eus", "glg", "isl", "dan", "nor", "swe",
|
| 21 |
-
"fin", "alb", "mlt", "afr", "zul", "swa", "amh", "uzb", "aze", "kaz", "kir", "mon", "tgl", "ind", "msa",
|
| 22 |
-
"tha", "vie", "khm", "lao", "mya", "ben", "hin", "mar", "guj", "pan", "mal", "tam", "tel", "kan", "nep",
|
| 23 |
-
"sin", "urd", "fas", "pus", "kur", "aze_cyrl", "tat", "uig", "heb", "ara", "yid", "grc", "chr", "epo",
|
| 24 |
-
"hye", "kat", "kat_old", "aze_latn", "mkd", "bel", "srp_latn", "srp_cyrillic",
|
| 25 |
-
"chi_sim", "chi_tra", "jpn", "kor"
|
| 26 |
-
]
|
| 27 |
)
|
| 28 |
-
|
| 29 |
-
# Create the format-specific configuration
|
| 30 |
-
format_options = {
|
| 31 |
-
InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options)
|
| 32 |
-
}
|
| 33 |
-
|
| 34 |
-
# Initialize the converter with the OCR configuration
|
| 35 |
docling_converter = DocumentConverter(format_options=format_options)
|
| 36 |
# --- END OF OCR CONFIGURATION ---
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
def process_file(file):
|
| 40 |
"""
|
|
@@ -64,48 +110,55 @@ def process_file(file):
|
|
| 64 |
|
| 65 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
|
| 66 |
df.to_excel(tmp.name, index=False)
|
| 67 |
-
|
| 68 |
|
| 69 |
# Process with DocLing
|
| 70 |
if ext in docling_direct or ext in to_xlsx_first:
|
| 71 |
-
result =
|
| 72 |
-
|
| 73 |
# Generate timestamp for filenames
|
| 74 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 75 |
base_filename = f"document_{timestamp}"
|
| 76 |
-
|
| 77 |
# 1. Docling document (JSON)
|
| 78 |
docling_json_path = f"{base_filename}_docling.json"
|
| 79 |
with open(docling_json_path, "w", encoding="utf-8") as f:
|
| 80 |
json.dump(result.document.export_to_dict(), f, indent=2, ensure_ascii=False)
|
| 81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
# 2. Text file
|
| 83 |
txt_path = f"{base_filename}.txt"
|
| 84 |
with open(txt_path, "w", encoding="utf-8") as f:
|
| 85 |
-
f.write(
|
| 86 |
-
|
| 87 |
# 3. Markdown file
|
| 88 |
md_path = f"{base_filename}.md"
|
| 89 |
with open(md_path, "w", encoding="utf-8") as f:
|
| 90 |
-
f.write(
|
| 91 |
-
|
| 92 |
# 4. HTML file
|
| 93 |
html_path = f"{base_filename}.html"
|
| 94 |
-
html_content = result.document.export_to_html()
|
| 95 |
with open(html_path, "w", encoding="utf-8") as f:
|
| 96 |
-
f.write(
|
| 97 |
-
|
| 98 |
-
success_message =
|
| 99 |
return docling_json_path, txt_path, md_path, html_path, success_message
|
| 100 |
|
| 101 |
elif ext == ".txt":
|
| 102 |
# For plain text files, create all formats
|
| 103 |
with open(path, "r", encoding="utf-8") as f:
|
| 104 |
text_content = f.read()
|
| 105 |
-
|
|
|
|
|
|
|
|
|
|
| 106 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 107 |
base_filename = f"document_{timestamp}"
|
| 108 |
-
|
| 109 |
# 1. Docling document (JSON) - simple structure for text
|
| 110 |
docling_json_path = f"{base_filename}_docling.json"
|
| 111 |
docling_dict = {
|
|
@@ -118,17 +171,17 @@ def process_file(file):
|
|
| 118 |
}
|
| 119 |
with open(docling_json_path, "w", encoding="utf-8") as f:
|
| 120 |
json.dump(docling_dict, f, indent=2, ensure_ascii=False)
|
| 121 |
-
|
| 122 |
# 2. Text file
|
| 123 |
txt_path = f"{base_filename}.txt"
|
| 124 |
with open(txt_path, "w", encoding="utf-8") as f:
|
| 125 |
f.write(text_content)
|
| 126 |
-
|
| 127 |
# 3. Markdown file
|
| 128 |
md_path = f"{base_filename}.md"
|
| 129 |
with open(md_path, "w", encoding="utf-8") as f:
|
| 130 |
f.write(f"# Document\n\n{text_content}")
|
| 131 |
-
|
| 132 |
# 4. HTML file
|
| 133 |
html_path = f"{base_filename}.html"
|
| 134 |
html_content = f"""<!DOCTYPE html>
|
|
@@ -144,8 +197,8 @@ def process_file(file):
|
|
| 144 |
</html>"""
|
| 145 |
with open(html_path, "w", encoding="utf-8") as f:
|
| 146 |
f.write(html_content)
|
| 147 |
-
|
| 148 |
-
success_message =
|
| 149 |
return docling_json_path, txt_path, md_path, html_path, success_message
|
| 150 |
|
| 151 |
else:
|
|
@@ -156,15 +209,12 @@ def process_file(file):
|
|
| 156 |
error_message = f"❌ Error processing file: {str(e)}"
|
| 157 |
return None, None, None, None, error_message
|
| 158 |
|
| 159 |
-
|
| 160 |
def reset_form():
    """Return the cleared state for the form.

    Yields five ``None`` values followed by an empty string -- presumably one
    value per output component of the reset handler (verify against the
    button wiring).
    """
    cleared_components = (None,) * 5
    return (*cleared_components, "")
|
| 163 |
|
| 164 |
-
|
| 165 |
# Gradio Interface
|
| 166 |
with gr.Blocks(title="LLM-Ready Document Converter") as app:
|
| 167 |
-
|
| 168 |
gr.Markdown("# 📄 LLM-Ready Document Converter")
|
| 169 |
gr.Markdown("**HOWTO** : Upload a document and get 4 output files: Docling JSON, TXT, Markdown, and HTML")
|
| 170 |
gr.Markdown("**EXPLANATION** : This app transforms various document formats (like TXT, standard and scanned PDFs, DOCX, PPT, CSV, XLS, XLSX) into structured, machine-readable outputs optimized for Large Language Models (LLMs). It extracts and converts content into clean formats such as DocLing JSON (for document structure), plain text, Markdown, and HTML making it easier for AI models to process, analyze, or generate responses from complex documents without losing key details like layout or formatting. Essentially, it's a bridge between raw files and AI-ready data.")
|
|
@@ -175,10 +225,10 @@ with gr.Blocks(title="LLM-Ready Document Converter") as app:
|
|
| 175 |
label="Upload Document",
|
| 176 |
file_types=[".pdf", ".txt", ".docx", ".xlsx", ".xls", ".csv", ".pptx"]
|
| 177 |
)
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
|
| 183 |
status_output = gr.Markdown(label="Status")
|
| 184 |
|
|
@@ -206,8 +256,5 @@ with gr.Blocks(title="LLM-Ready Document Converter") as app:
|
|
| 206 |
outputs=[file_input, docling_output, txt_output, md_output, html_output, status_output]
|
| 207 |
)
|
| 208 |
|
| 209 |
-
|
| 210 |
if __name__ == "__main__":
|
| 211 |
-
app.launch(
|
| 212 |
-
share=True
|
| 213 |
-
)
|
|
|
|
| 4 |
from datetime import datetime
|
| 5 |
import pandas as pd
|
| 6 |
import json
|
| 7 |
+
import unicodedata
|
| 8 |
|
| 9 |
# Import DocLing and necessary configuration classes
|
| 10 |
from docling.document_converter import DocumentConverter, PdfFormatOption
|
| 11 |
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
| 12 |
from docling.datamodel.base_models import InputFormat
|
| 13 |
|
| 14 |
+
# --- Language detection and normalization helpers ---
|
| 15 |
+
try:
|
| 16 |
+
from ftfy import fix_text
|
| 17 |
+
def _fix_text(s: str) -> str:
|
| 18 |
+
return fix_text(s)
|
| 19 |
+
except ImportError:
|
| 20 |
+
def _fix_text(s: str) -> str:
|
| 21 |
+
return s
|
| 22 |
+
|
| 23 |
+
try:
|
| 24 |
+
from langdetect import detect, DetectorFactory
|
| 25 |
+
DetectorFactory.seed = 0 # deterministic
|
| 26 |
+
def _detect_lang(text: str) -> str | None:
|
| 27 |
+
try:
|
| 28 |
+
return detect(text)
|
| 29 |
+
except Exception:
|
| 30 |
+
return None
|
| 31 |
+
except ImportError:
|
| 32 |
+
def _detect_lang(text: str) -> str | None:
|
| 33 |
+
return None
|
| 34 |
+
|
| 35 |
+
def normalize_text(s: str) -> str:
    """Run *s* through ``_fix_text`` and return the result in Unicode NFC form."""
    repaired = _fix_text(s)
    composed = unicodedata.normalize("NFC", repaired)
    return composed
|
| 38 |
+
|
| 39 |
+
# Map ISO-ish lang codes to Tesseract codes
|
| 40 |
+
# Two-letter language code -> three-letter Tesseract language code.
LANG_MAP = {
    "pt": "por",
    "es": "spa",
    "en": "eng",
    "fr": "fra",
    "de": "deu",
    "it": "ita",
    "nl": "nld",
    "pl": "pol",
    "tr": "tur",
    "cs": "ces",
    "ru": "rus",
    "uk": "ukr",
    "el": "ell",
    "ro": "ron",
    "hu": "hun",
    "sv": "swe",
    "da": "dan",
    "fi": "fin",
    "no": "nor",
    "ca": "cat",
    "gl": "glg",
}
|
| 46 |
+
|
| 47 |
+
def guess_lang_code(text: str) -> str | None:
    """Return the ``LANG_MAP`` entry for the language detected in *text*.

    Returns ``None`` when *text* is empty or whitespace-only, when
    ``_detect_lang`` yields nothing, or when the detected code has no entry
    in ``LANG_MAP``.
    """
    if not text or not text.strip():
        return None
    iso_code = _detect_lang(text)
    if not iso_code:
        return None
    return LANG_MAP.get(iso_code)
|
| 50 |
+
|
| 51 |
+
# What the UTF-8 lead bytes 0xC3 / 0xC2 become when decoded as Latin-1 or
# Windows-1252: "Ã" (U+00C3) and "Â" (U+00C2).
_MOJIBAKE_LEADS = "\u00c3\u00c2"

# What a UTF-8 continuation byte (0x80-0xBF) becomes under the same two
# wrong decodings. cp1252 leaves a few of those bytes undefined, hence "ignore".
_MOJIBAKE_TAILS = frozenset(
    bytes(range(0x80, 0xC0)).decode("latin-1")
    + bytes(range(0x80, 0xC0)).decode("cp1252", errors="ignore")
)


def looks_garbled(text: str) -> bool:
    """Heuristically decide whether extracted *text* is unusable.

    Text is treated as garbled when it is empty or has fewer than 100
    non-padding characters, or when it contains more than five mojibake
    markers. A marker is either U+FFFD (the replacement character) or a
    two-character sequence made of "Ã"/"Â" followed by a character that a
    UTF-8 continuation byte turns into when decoded as Latin-1/cp1252
    (for example "Ã§", "Ã£", "Ãª").

    Only such pairs are counted, never a bare "Ã", "Â" or "ª": those are
    ordinary letters in Portuguese and other languages ("NÃO", "1ª"), and
    counting them on their own flags perfectly clean text as garbled.

    Args:
        text: The extracted text to inspect.

    Returns:
        ``True`` if the text looks too short or too corrupted to trust.
    """
    if not text or len(text.strip()) < 100:
        return True

    marker_count = text.count("\ufffd")
    marker_count += sum(
        1
        for lead, tail in zip(text, text[1:])
        if lead in _MOJIBAKE_LEADS and tail in _MOJIBAKE_TAILS
    )
    return marker_count > 5
|
| 57 |
+
# --- End helpers ---
|
| 58 |
+
|
| 59 |
# --- START OF OCR CONFIGURATION ---
|
| 60 |
+
# Default: do_ocr=False (use native text layer). When OCR is needed, we'll build options dynamically.
|
| 61 |
# Module-level converter configured for PDFs with OCR switched off.
# NOTE(review): confirm that PdfPipelineOptions honours the `ocr_model`
# keyword; Docling's documented way to select an OCR engine is `ocr_options`.
pdf_options = PdfPipelineOptions(do_ocr=False, ocr_model="tesseract")
format_options = {
    InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options),
}
docling_converter = DocumentConverter(format_options=format_options)
|
| 67 |
# --- END OF OCR CONFIGURATION ---
|
| 68 |
|
| 69 |
+
def convert_with_strategy(path: str):
    """Convert *path* with Docling, falling back to OCR only for PDFs that need it.

    The document is first converted with OCR disabled. If the input is a PDF
    and the extracted text looks garbled (see ``looks_garbled``), it is
    converted a second time with OCR enabled, using the language guessed from
    the first pass (Portuguese when no language can be guessed).

    Args:
        path: Filesystem path of the document to convert.

    Returns:
        The Docling conversion result of whichever pass was kept.
    """
    # 1) No-OCR pass
    no_ocr_opts = PdfPipelineOptions(do_ocr=False, ocr_model="tesseract")
    converter = DocumentConverter(
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=no_ocr_opts)}
    )
    res = converter.convert(path)

    # The OCR options below are registered for InputFormat.PDF only, so a
    # second pass over any other format would merely repeat this conversion.
    # When the format cannot be read from the result, assume PDF so the
    # fallback still runs.
    source_format = getattr(getattr(res, "input", None), "format", InputFormat.PDF)
    if source_format != InputFormat.PDF:
        return res

    text_sample = normalize_text(res.document.export_to_text())
    if not looks_garbled(text_sample):
        return res

    # 2) OCR fallback with detected language (default to Portuguese)
    # NOTE(review): verify that PdfPipelineOptions honours `ocr_model` and
    # `ocr_languages`; Docling documents `ocr_options=<Engine>OcrOptions(lang=[...])`
    # for this. If these keywords are ignored, the detected language never
    # reaches the OCR engine.
    detected = guess_lang_code(text_sample) or "por"
    ocr_opts = PdfPipelineOptions(do_ocr=True, ocr_model="tesseract", ocr_languages=[detected])
    ocr_converter = DocumentConverter(
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=ocr_opts)}
    )
    return ocr_converter.convert(path)
|
| 84 |
|
| 85 |
def process_file(file):
|
| 86 |
"""
|
|
|
|
| 110 |
|
| 111 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
|
| 112 |
df.to_excel(tmp.name, index=False)
|
| 113 |
+
path = tmp.name
|
| 114 |
|
| 115 |
# Process with DocLing
|
| 116 |
if ext in docling_direct or ext in to_xlsx_first:
|
| 117 |
+
result = convert_with_strategy(path)
|
| 118 |
+
|
| 119 |
# Generate timestamp for filenames
|
| 120 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 121 |
base_filename = f"document_{timestamp}"
|
| 122 |
+
|
| 123 |
# 1. Docling document (JSON)
|
| 124 |
docling_json_path = f"{base_filename}_docling.json"
|
| 125 |
with open(docling_json_path, "w", encoding="utf-8") as f:
|
| 126 |
json.dump(result.document.export_to_dict(), f, indent=2, ensure_ascii=False)
|
| 127 |
+
|
| 128 |
+
# Normalize outputs
|
| 129 |
+
text_out = normalize_text(result.document.export_to_text())
|
| 130 |
+
md_out = normalize_text(result.document.export_to_markdown())
|
| 131 |
+
html_out = normalize_text(result.document.export_to_html())
|
| 132 |
+
|
| 133 |
# 2. Text file
|
| 134 |
txt_path = f"{base_filename}.txt"
|
| 135 |
with open(txt_path, "w", encoding="utf-8") as f:
|
| 136 |
+
f.write(text_out)
|
| 137 |
+
|
| 138 |
# 3. Markdown file
|
| 139 |
md_path = f"{base_filename}.md"
|
| 140 |
with open(md_path, "w", encoding="utf-8") as f:
|
| 141 |
+
f.write(md_out)
|
| 142 |
+
|
| 143 |
# 4. HTML file
|
| 144 |
html_path = f"{base_filename}.html"
|
|
|
|
| 145 |
with open(html_path, "w", encoding="utf-8") as f:
|
| 146 |
+
f.write(html_out)
|
| 147 |
+
|
| 148 |
+
success_message = "✅ Successfully processed file! 4 files generated."
|
| 149 |
return docling_json_path, txt_path, md_path, html_path, success_message
|
| 150 |
|
| 151 |
elif ext == ".txt":
|
| 152 |
# For plain text files, create all formats
|
| 153 |
with open(path, "r", encoding="utf-8") as f:
|
| 154 |
text_content = f.read()
|
| 155 |
+
|
| 156 |
+
# Normalize input text as requested
|
| 157 |
+
text_content = normalize_text(text_content)
|
| 158 |
+
|
| 159 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 160 |
base_filename = f"document_{timestamp}"
|
| 161 |
+
|
| 162 |
# 1. Docling document (JSON) - simple structure for text
|
| 163 |
docling_json_path = f"{base_filename}_docling.json"
|
| 164 |
docling_dict = {
|
|
|
|
| 171 |
}
|
| 172 |
with open(docling_json_path, "w", encoding="utf-8") as f:
|
| 173 |
json.dump(docling_dict, f, indent=2, ensure_ascii=False)
|
| 174 |
+
|
| 175 |
# 2. Text file
|
| 176 |
txt_path = f"{base_filename}.txt"
|
| 177 |
with open(txt_path, "w", encoding="utf-8") as f:
|
| 178 |
f.write(text_content)
|
| 179 |
+
|
| 180 |
# 3. Markdown file
|
| 181 |
md_path = f"{base_filename}.md"
|
| 182 |
with open(md_path, "w", encoding="utf-8") as f:
|
| 183 |
f.write(f"# Document\n\n{text_content}")
|
| 184 |
+
|
| 185 |
# 4. HTML file
|
| 186 |
html_path = f"{base_filename}.html"
|
| 187 |
html_content = f"""<!DOCTYPE html>
|
|
|
|
| 197 |
</html>"""
|
| 198 |
with open(html_path, "w", encoding="utf-8") as f:
|
| 199 |
f.write(html_content)
|
| 200 |
+
|
| 201 |
+
success_message = "✅ Successfully processed text file! 4 files generated."
|
| 202 |
return docling_json_path, txt_path, md_path, html_path, success_message
|
| 203 |
|
| 204 |
else:
|
|
|
|
| 209 |
error_message = f"❌ Error processing file: {str(e)}"
|
| 210 |
return None, None, None, None, error_message
|
| 211 |
|
|
|
|
| 212 |
def reset_form():
    """Return the cleared state for the form.

    Yields five ``None`` values followed by an empty string -- presumably one
    value per output component of the reset handler (verify against the
    button wiring).
    """
    cleared_components = (None,) * 5
    return (*cleared_components, "")
|
| 215 |
|
|
|
|
| 216 |
# Gradio Interface
|
| 217 |
with gr.Blocks(title="LLM-Ready Document Converter") as app:
|
|
|
|
| 218 |
gr.Markdown("# 📄 LLM-Ready Document Converter")
|
| 219 |
gr.Markdown("**HOWTO** : Upload a document and get 4 output files: Docling JSON, TXT, Markdown, and HTML")
|
| 220 |
gr.Markdown("**EXPLANATION** : This app transforms various document formats (like TXT, standard and scanned PDFs, DOCX, PPT, CSV, XLS, XLSX) into structured, machine-readable outputs optimized for Large Language Models (LLMs). It extracts and converts content into clean formats such as DocLing JSON (for document structure), plain text, Markdown, and HTML making it easier for AI models to process, analyze, or generate responses from complex documents without losing key details like layout or formatting. Essentially, it's a bridge between raw files and AI-ready data.")
|
|
|
|
| 225 |
label="Upload Document",
|
| 226 |
file_types=[".pdf", ".txt", ".docx", ".xlsx", ".xls", ".csv", ".pptx"]
|
| 227 |
)
|
| 228 |
+
|
| 229 |
+
with gr.Row():
|
| 230 |
+
submit_btn = gr.Button("Convert Document", variant="primary")
|
| 231 |
+
reset_btn = gr.Button("Reset")
|
| 232 |
|
| 233 |
status_output = gr.Markdown(label="Status")
|
| 234 |
|
|
|
|
| 256 |
outputs=[file_input, docling_output, txt_output, md_output, html_output, status_output]
|
| 257 |
)
|
| 258 |
|
|
|
|
| 259 |
if __name__ == "__main__":
|
| 260 |
+
app.launch(share=True)
|
|
|
|
|
|