broadfield-dev committed
Commit 22406c4 · verified · 1 Parent(s): de1d3bb

Update app.py

Files changed (1)
  1. app.py +19 -44
app.py CHANGED
@@ -14,7 +14,6 @@ from optimum.onnxruntime import ORTQuantizer
 from optimum.onnxruntime.configuration import AutoQuantizationConfig
 import torch.nn.utils.prune as prune
 
-# --- SETUP ---
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
 HF_TOKEN = os.getenv("HF_TOKEN")
@@ -25,23 +24,27 @@ api = HfApi()
 OUTPUT_DIR = "optimized_models"
 os.makedirs(OUTPUT_DIR, exist_ok=True)
 
-# --- LLAMA.CPP SETUP ---
 LLAMA_CPP_DIR = Path("llama.cpp")
-## FINAL FIX: The correct, stable script is in the 'tools' subdirectory.
-LLAMA_CPP_CONVERT_SCRIPT = LLAMA_CPP_DIR / "tools" / "convert-hf-to-gguf.py"
+LLAMA_CPP_CONVERT_SCRIPT = LLAMA_CPP_DIR / "convert.py"
 LLAMA_CPP_QUANTIZE_SCRIPT = LLAMA_CPP_DIR / "quantize"
 
 def setup_llama_cpp():
-    """Clones and builds llama.cpp if not already present."""
     if not LLAMA_CPP_DIR.exists():
         logging.info("Cloning llama.cpp repository...")
         try:
             subprocess.run(["git", "clone", "https://github.com/ggerganov/llama.cpp.git"], check=True, capture_output=True, text=True)
             logging.info("llama.cpp cloned successfully.")
         except subprocess.CalledProcessError as e:
-            error_msg = f"Failed to clone llama.cpp. Error: {e.stderr}"
-            logging.error(error_msg, exc_info=True)
-            raise RuntimeError(error_msg)
+            raise RuntimeError(f"Failed to clone llama.cpp. Error: {e.stderr}")
+
+    requirements_path = LLAMA_CPP_DIR / "requirements.txt"
+    if requirements_path.exists():
+        logging.info("Installing llama.cpp Python dependencies...")
+        try:
+            subprocess.run(["pip", "install", "-r", str(requirements_path)], check=True, capture_output=True, text=True)
+            logging.info("llama.cpp Python dependencies installed successfully.")
+        except subprocess.CalledProcessError as e:
+            raise RuntimeError(f"Failed to install llama.cpp requirements. Error: {e.stderr}")
 
     if not LLAMA_CPP_QUANTIZE_SCRIPT.exists():
         logging.info("llama.cpp 'quantize' binary not found. Building with CMake...")
@@ -50,17 +53,13 @@ def setup_llama_cpp():
             subprocess.run(["cmake", "--build", ".", "--target", "quantize"], cwd=str(LLAMA_CPP_DIR), check=True, capture_output=True, text=True)
             logging.info("'quantize' binary built successfully with CMake.")
         except subprocess.CalledProcessError as e:
-            error_msg = f"Failed to build llama.cpp with CMake. Error: {e.stderr}"
-            logging.error(error_msg, exc_info=True)
-            raise RuntimeError(error_msg)
+            raise RuntimeError(f"Failed to build llama.cpp with CMake. Error: {e.stderr}")
 
-# Run setup on script start
 try:
     setup_llama_cpp()
 except Exception as e:
     logging.error(f"FATAL ERROR during llama.cpp setup: {e}", exc_info=True)
 
-
 def stage_1_analyze_model(model_id: str):
     log_stream = "[STAGE 1] Analyzing model...\n"
     try:
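The simplified handlers in this and the previous hunk drop the logging.error(..., exc_info=True) calls, so the captured traceback now travels only with the raised exception. Chaining with raise ... from e keeps the original CalledProcessError attached; a hypothetical helper that factors out the repeated run-and-wrap pattern:

    import subprocess

    def run_or_raise(cmd: list[str], what: str) -> subprocess.CompletedProcess:
        # Run a command and convert failures into RuntimeError, chaining
        # the original exception so its traceback and captured stderr
        # remain inspectable.
        try:
            return subprocess.run(cmd, check=True, capture_output=True, text=True)
        except subprocess.CalledProcessError as e:
            raise RuntimeError(f"{what} failed. Error: {e.stderr}") from e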
@@ -95,17 +94,13 @@ def stage_3_4_onnx_quantize(model_path_or_id: str, onnx_quant_type: str, calibra
     run_id = datetime.now().strftime("%Y%m%d-%H%M%S")
     model_name = model_path_or_id.split('/')[-1]
     onnx_path = os.path.join(OUTPUT_DIR, f"{model_name}-{run_id}-onnx")
-
     try:
-        log_stream += f"Executing `optimum-cli export onnx` for '{model_path_or_id}'...\n"
         export_command = ["optimum-cli", "export", "onnx", "--model", model_path_or_id, "--trust-remote-code", onnx_path]
         process = subprocess.run(export_command, check=True, capture_output=True, text=True)
-        log_stream += process.stdout
+        log_stream += f"Executing `optimum-cli export onnx` for '{model_path_or_id}'...\n{process.stdout}\n"
         if process.stderr: log_stream += f"[STDERR]\n{process.stderr}\n"
-        log_stream += f"Successfully exported to ONNX at: {onnx_path}\n"
     except subprocess.CalledProcessError as e:
         raise RuntimeError(f"Failed during `optimum-cli export onnx`. Error:\n{e.stderr}")
-
     try:
         quantizer = ORTQuantizer.from_pretrained(onnx_path)
         log_stream += "Performing DYNAMIC quantization...\n"
@@ -113,11 +108,9 @@ def stage_3_4_onnx_quantize(model_path_or_id: str, onnx_quant_type: str, calibra
         quantized_path = os.path.join(onnx_path, "quantized-dynamic")
         quantizer.quantize(save_dir=quantized_path, quantization_config=dqconfig)
         log_stream += f"Successfully quantized model to: {quantized_path}\n"
-
         if not os.path.exists(os.path.join(quantized_path, 'tokenizer_config.json')):
             AutoTokenizer.from_pretrained(model_path_or_id, trust_remote_code=True).save_pretrained(quantized_path)
             log_stream += "Saved new tokenizer files.\n"
-
         return quantized_path, log_stream
     except Exception as e:
         raise RuntimeError(f"Failed during ONNX quantization step. Error: {e}")
@@ -128,20 +121,15 @@ def stage_3_4_gguf_quantize(model_path_or_id: str, original_model_id: str, quant
     model_name = original_model_id.replace('/', '_')
     gguf_path = os.path.join(OUTPUT_DIR, f"{model_name}-{run_id}-gguf")
     os.makedirs(gguf_path, exist_ok=True)
-
     f16_gguf_path = os.path.join(gguf_path, "model-f16.gguf")
     quantized_gguf_path = os.path.join(gguf_path, "model.gguf")
-
     try:
-        log_stream += f"Executing llama.cpp conversion script on '{model_path_or_id}'...\n"
         convert_command = ["python3", str(LLAMA_CPP_CONVERT_SCRIPT), model_path_or_id, "--outfile", f16_gguf_path, "--outtype", "f16"]
         process = subprocess.run(convert_command, check=True, capture_output=True, text=True)
-        log_stream += process.stdout
+        log_stream += f"Executing llama.cpp conversion script on '{model_path_or_id}'...\n{process.stdout}\n"
         if process.stderr: log_stream += f"[STDERR]\n{process.stderr}\n"
-
         quantize_map = {"q4_k_m": "Q4_K_M", "q5_k_m": "Q5_K_M", "q8_0": "Q8_0", "f16": "F16"}
         target_quant_name = quantize_map.get(quantization_strategy.lower(), "Q4_K_M")
-
         if target_quant_name == "F16":
             log_stream += "Target is F16, renaming file...\n"
             os.rename(f16_gguf_path, quantized_gguf_path)
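A portability note on convert_command: the literal "python3" resolves from PATH, which in a virtualenv or Space container is not guaranteed to be the interpreter that just installed llama.cpp's requirements. sys.executable pins it; a hedged variant with a hypothetical build_convert_command helper:

    import sys
    from pathlib import Path

    def build_convert_command(script: Path, model: str, outfile: str) -> list[str]:
        # Same flags as stage_3_4_gguf_quantize, but run with the exact
        # interpreter executing app.py rather than whatever "python3"
        # resolves to on PATH.
        return [sys.executable, str(script), model,
                "--outfile", outfile, "--outtype", "f16"]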
@@ -149,7 +137,7 @@ def stage_3_4_gguf_quantize(model_path_or_id: str, original_model_id: str, quant
             log_stream += f"Quantizing FP16 GGUF to {target_quant_name}...\n"
             quantize_command = [str(LLAMA_CPP_QUANTIZE_SCRIPT), f16_gguf_path, quantized_gguf_path, target_quant_name]
             process = subprocess.run(quantize_command, check=True, capture_output=True, text=True)
-            log_stream += process.stdout
+            log_stream += f"{process.stdout}\n"
             if process.stderr: log_stream += f"[STDERR]\n{process.stderr}\n"
             os.remove(f16_gguf_path)
         return gguf_path, log_stream
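One robustness note on the F16 pass-through above: os.rename() raises OSError when source and destination live on different filesystems, which is easy to hit with mounted volumes in containers. shutil.move() falls back to copy-and-delete; a hypothetical drop-in:

    import shutil

    def finalize_f16(f16_gguf_path: str, quantized_gguf_path: str) -> None:
        # Unlike os.rename(), shutil.move() also works across filesystem
        # boundaries by degrading to copy + delete.
        shutil.move(f16_gguf_path, quantized_gguf_path)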
@@ -179,30 +167,24 @@ def run_amop_pipeline(model_id: str, pipeline_type: str, do_prune: bool, prune_p
     if not model_id:
         yield {log_output: "Please enter a Model ID.", final_output: "Idle"}
         return
-
     initial_log = f"[START] AMOP {pipeline_type} Pipeline Initiated for '{model_id}'.\n"
     yield {run_button: gr.Button(interactive=False, value="🚀 Running..."), analyze_button: gr.Button(interactive=False), final_output: f"RUNNING ({pipeline_type})", log_output: initial_log}
-
     full_log = initial_log
     temp_model_dir = None
     model_path_or_id = model_id
-
     try:
         whoami = api.whoami(token=HF_TOKEN)
         if not whoami: raise RuntimeError("Could not authenticate with Hugging Face Hub. Check your HF_TOKEN.")
         repo_id_for_link = f"{whoami['name']}/{model_id.split('/')[-1]}-amop-cpu-{pipeline_type.lower()}"
-
         if do_prune and prune_percent > 0:
             full_log += f"\n[WARNING] Pruning is memory-intensive and may fail for large models.\n"
-            full_log += "Loading base model for pruning...\n"; yield {final_output: "Loading model (1/5)", log_output: full_log}
+            yield {final_output: "Loading model (1/5)", log_output: full_log}
             model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
             tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
             full_log += f"Successfully loaded '{model_id}'.\n"
-
             yield {final_output: "Pruning model (2/5)", log_output: full_log}
             model, log = stage_2_prune_model(model, prune_percent)
             full_log += log
-
             temp_model_dir = tempfile.mkdtemp()
             model.save_pretrained(temp_model_dir)
             tokenizer.save_pretrained(temp_model_dir)
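stage_2_prune_model itself sits outside this diff, but given the torch.nn.utils.prune import at the top of app.py, the memory-intensive step warned about here is most likely magnitude pruning over the loaded weights. A sketch of what such a stage could look like, not the app's exact code:

    import torch.nn as nn
    import torch.nn.utils.prune as prune

    def prune_linear_layers(model: nn.Module, amount: float) -> nn.Module:
        # Zero the smallest-magnitude fraction of every Linear weight,
        # then strip the pruning reparametrization so save_pretrained()
        # writes plain tensors.
        for module in model.modules():
            if isinstance(module, nn.Linear):
                prune.l1_unstructured(module, name="weight", amount=amount)
                prune.remove(module, "weight")
        return model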
@@ -210,23 +192,20 @@ def run_amop_pipeline(model_id: str, pipeline_type: str, do_prune: bool, prune_p
             full_log += f"Saved intermediate pruned model to {temp_model_dir}\n"
         else:
             full_log += "Pruning skipped.\n"
-
         if pipeline_type == "ONNX":
-            full_log += "Converting to ONNX...\n"; yield {final_output: "Converting to ONNX (3/5)", log_output: full_log}
+            yield {final_output: "Converting to ONNX (3/5)", log_output: full_log}
             optimized_path, log = stage_3_4_onnx_quantize(model_path_or_id, onnx_quant_type, calibration_file.name if onnx_quant_type == "Static" and calibration_file else None)
             options = {'pipeline_type': 'ONNX', 'prune': do_prune, 'prune_percent': prune_percent, 'quant_type': onnx_quant_type}
         elif pipeline_type == "GGUF":
-            full_log += "Converting to GGUF...\n"; yield {final_output: "Converting to GGUF (3/5)", log_output: full_log}
+            yield {final_output: "Converting to GGUF (3/5)", log_output: full_log}
             optimized_path, log = stage_3_4_gguf_quantize(model_path_or_id, model_id, gguf_quant_type)
             options = {'pipeline_type': 'GGUF', 'prune': do_prune, 'prune_percent': prune_percent, 'quant_type': gguf_quant_type}
         else:
             raise ValueError("Invalid pipeline type selected.")
         full_log += log
-
-        full_log += "Packaging & Uploading...\n"; yield {final_output: "Packaging & Uploading (4/5)", log_output: full_log}
+        yield {final_output: "Packaging & Uploading (4/5)", log_output: full_log}
         final_message, log = stage_5_package_and_upload(model_id, optimized_path, full_log, options)
         full_log += log
-
         yield {final_output: gr.update(value="SUCCESS", label="Status"), log_output: full_log, success_box: gr.Markdown(f"✅ **Success!** Model available: [{repo_id_for_link}](https://huggingface.co/{repo_id_for_link})", visible=True), run_button: gr.Button(interactive=True, value="Run Optimization Pipeline", variant="primary"), analyze_button: gr.Button(interactive=True, value="Analyze Model")}
     except Exception as e:
         logging.error(f"AMOP Pipeline failed. Error: {e}", exc_info=True)
@@ -236,7 +215,6 @@ def run_amop_pipeline(model_id: str, pipeline_type: str, do_prune: bool, prune_p
         if temp_model_dir and os.path.exists(temp_model_dir):
             shutil.rmtree(temp_model_dir)
 
-# --- GRADIO UI ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 🚀 AMOP: Adaptive Model Optimization Pipeline")
     if not HF_TOKEN: gr.Warning("HF_TOKEN not set! The final 'upload' step will be skipped.")
@@ -264,13 +242,10 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     final_output = gr.Label(value="Idle", label="Status")
     success_box = gr.Markdown(visible=False)
     log_output = gr.Textbox(label="Live Logs", lines=20, interactive=False)
-
     def update_ui_for_pipeline(pipeline_type):
         return {onnx_options: gr.Group(visible=pipeline_type=="ONNX"), gguf_options: gr.Group(visible=pipeline_type=="GGUF")}
-
     pipeline_type_radio.change(fn=update_ui_for_pipeline, inputs=pipeline_type_radio, outputs=[onnx_options, gguf_options])
     analyze_button.click(fn=stage_1_analyze_model, inputs=[model_id_input], outputs=[log_output, analysis_report_output, optimization_accordion])
-
     run_button.click(fn=run_amop_pipeline,
                      inputs=[model_id_input, pipeline_type_radio, prune_checkbox, prune_slider, onnx_quant_radio, calibration_file_upload, gguf_quant_dropdown],
                      outputs=[run_button, analyze_button, final_output, log_output, success_box])
 