broadfield-dev committed on
Commit 201ed88 · verified · 1 Parent(s): 5a1196d

Update app.py

Files changed (1)
  1. app.py +21 -24
app.py CHANGED
@@ -26,11 +26,11 @@ OUTPUT_DIR = "optimized_models"
 os.makedirs(OUTPUT_DIR, exist_ok=True)
 
 # --- LLAMA.CPP SETUP ---
-## FIX: Define paths at the global scope so all functions can access them.
 LLAMA_CPP_DIR = Path("llama.cpp")
 LLAMA_CPP_CONVERT_SCRIPT = LLAMA_CPP_DIR / "convert.py"
 LLAMA_CPP_QUANTIZE_SCRIPT = LLAMA_CPP_DIR / "quantize" # This is a compiled binary
 
+## FIXED FUNCTION: Replaced 'make' with 'cmake' for the build process.
 def setup_llama_cpp():
     """Clones and builds llama.cpp if not already present."""
     if not LLAMA_CPP_DIR.exists():
@@ -43,14 +43,25 @@ def setup_llama_cpp():
             logging.error(error_msg, exc_info=True)
             raise RuntimeError(error_msg)
 
+    # If the binary doesn't exist, try to build it with CMake.
     if not LLAMA_CPP_QUANTIZE_SCRIPT.exists():
-        logging.info("llama.cpp 'quantize' binary not found. Attempting to build...")
+        logging.info("llama.cpp 'quantize' binary not found. Building with CMake...")
         try:
-            # Use 'make' to build the necessary tools
-            subprocess.run(["make", "-C", str(LLAMA_CPP_DIR), "quantize"], check=True, capture_output=True, text=True)
-            logging.info("'quantize' binary built successfully.")
+            # Step 1: Configure the build directory
+            subprocess.run(
+                ["cmake", "."],
+                cwd=str(LLAMA_CPP_DIR), # Run command inside the llama.cpp directory
+                check=True, capture_output=True, text=True
+            )
+            # Step 2: Build the 'quantize' target
+            subprocess.run(
+                ["cmake", "--build", ".", "--target", "quantize"],
+                cwd=str(LLAMA_CPP_DIR),
+                check=True, capture_output=True, text=True
+            )
+            logging.info("'quantize' binary built successfully with CMake.")
         except subprocess.CalledProcessError as e:
-            error_msg = f"Failed to build llama.cpp 'quantize' binary. Error: {e.stderr}"
+            error_msg = f"Failed to build llama.cpp with CMake. Error: {e.stderr}"
             logging.error(error_msg, exc_info=True)
             raise RuntimeError(error_msg)
 
@@ -79,7 +90,6 @@ def stage_1_analyze_model(model_id: str):
         logging.error(error_msg)
         return log_stream + error_msg, "Could not analyze model.", gr.Accordion(open=False)
 
-## RE-INTEGRATED: This function is brought back from your original code.
 def stage_2_prune_model(model, prune_percentage: float):
     if prune_percentage == 0:
         return model, "Skipped pruning as percentage was 0."
@@ -108,7 +118,6 @@ def stage_3_4_onnx_quantize(model_path_or_id: str, onnx_quant_type: str, calibra
         raise RuntimeError(f"Failed during `optimum-cli export onnx`. Error:\n{e.stderr}")
 
     try:
-        # For simplicity and stability on HF Spaces, we will only use Dynamic Quantization via CLI.
         quantizer = ORTQuantizer.from_pretrained(onnx_path)
         log_stream += "Performing DYNAMIC quantization...\n"
         dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)
@@ -116,8 +125,6 @@ def stage_3_4_onnx_quantize(model_path_or_id: str, onnx_quant_type: str, calibra
         quantizer.quantize(save_dir=quantized_path, quantization_config=dqconfig)
         log_stream += f"Successfully quantized model to: {quantized_path}\n"
 
-        # If the original input was a model_id, we need to save a new tokenizer.
-        # If it was a local path (from pruning), the tokenizer is already there.
         if not os.path.exists(os.path.join(quantized_path, 'tokenizer_config.json')):
            AutoTokenizer.from_pretrained(model_path_or_id, trust_remote_code=True).save_pretrained(quantized_path)
            log_stream += "Saved new tokenizer files.\n"
@@ -129,7 +136,7 @@ def stage_3_4_onnx_quantize(model_path_or_id: str, onnx_quant_type: str, calibra
 def stage_3_4_gguf_quantize(model_path_or_id: str, original_model_id: str, quantization_strategy: str):
     log_stream = "[STAGE 3 & 4] Converting to GGUF using llama.cpp...\n"
     run_id = datetime.now().strftime("%Y%m%d-%H%M%S")
-    model_name = original_model_id.replace('/', '_') # Use original ID for consistent naming
+    model_name = original_model_id.replace('/', '_')
     gguf_path = os.path.join(OUTPUT_DIR, f"{model_name}-{run_id}-gguf")
     os.makedirs(gguf_path, exist_ok=True)
 
@@ -163,7 +170,6 @@ def stage_3_4_gguf_quantize(model_path_or_id: str, original_model_id: str, quant
         raise RuntimeError(f"An unexpected error occurred during GGUF conversion. Error: {e}")
 
 def stage_5_package_and_upload(model_id: str, optimized_model_path: str, pipeline_log: str, options: dict):
-    # This function is correct from your original version
     log_stream = "[STAGE 5] Packaging and Uploading...\n"
     if not HF_TOKEN:
         return "Skipping upload: HF_TOKEN not found.", log_stream + "Skipping upload: HF_TOKEN not found."
@@ -180,7 +186,6 @@ def stage_5_package_and_upload(model_id: str, optimized_model_path: str, pipelin
     except Exception as e:
         raise RuntimeError(f"Failed to upload to the Hub. Error: {e}")
 
-## RE-INTEGRATED: The main pipeline function now handles both pruning and no-pruning paths.
 def run_amop_pipeline(model_id: str, pipeline_type: str, do_prune: bool, prune_percent: float, onnx_quant_type: str, calibration_file, gguf_quant_type: str):
     if not model_id:
         yield {log_output: "Please enter a Model ID.", final_output: "Idle"}
@@ -191,14 +196,13 @@ def run_amop_pipeline(model_id: str, pipeline_type: str, do_prune: bool, prune_p
 
     full_log = initial_log
     temp_model_dir = None
-    model_path_or_id = model_id # Default to memory-efficient path
+    model_path_or_id = model_id
 
     try:
         whoami = api.whoami(token=HF_TOKEN)
         if not whoami: raise RuntimeError("Could not authenticate with Hugging Face Hub. Check your HF_TOKEN.")
         repo_id_for_link = f"{whoami['name']}/{model_id.split('/')[-1]}-amop-cpu-{pipeline_type.lower()}"
 
-        # --- STAGE 2: OPTIONAL PRUNING (Memory-intensive) ---
         if do_prune and prune_percent > 0:
             full_log += f"\n[WARNING] Pruning is memory-intensive and may fail for large models.\n"
             full_log += "Loading base model for pruning...\n"; yield {final_output: "Loading model (1/5)", log_output: full_log}
@@ -210,16 +214,14 @@ def run_amop_pipeline(model_id: str, pipeline_type: str, do_prune: bool, prune_p
             model, log = stage_2_prune_model(model, prune_percent)
             full_log += log
 
-            # Save pruned model to a temporary directory for the next stage
             temp_model_dir = tempfile.mkdtemp()
             model.save_pretrained(temp_model_dir)
             tokenizer.save_pretrained(temp_model_dir)
-            model_path_or_id = temp_model_dir # Next stages will use this local path
+            model_path_or_id = temp_model_dir
             full_log += f"Saved intermediate pruned model to {temp_model_dir}\n"
         else:
             full_log += "Pruning skipped.\n"
 
-        # --- STAGE 3 & 4: CONVERSION & QUANTIZATION ---
         if pipeline_type == "ONNX":
             full_log += "Converting to ONNX...\n"; yield {final_output: "Converting to ONNX (3/5)", log_output: full_log}
             optimized_path, log = stage_3_4_onnx_quantize(model_path_or_id, onnx_quant_type, calibration_file.name if onnx_quant_type == "Static" and calibration_file else None)
@@ -232,7 +234,6 @@ def run_amop_pipeline(model_id: str, pipeline_type: str, do_prune: bool, prune_p
             raise ValueError("Invalid pipeline type selected.")
         full_log += log
 
-        # --- STAGE 5: UPLOAD ---
         full_log += "Packaging & Uploading...\n"; yield {final_output: "Packaging & Uploading (4/5)", log_output: full_log}
         final_message, log = stage_5_package_and_upload(model_id, optimized_path, full_log, options)
         full_log += log
@@ -243,7 +244,6 @@ def run_amop_pipeline(model_id: str, pipeline_type: str, do_prune: bool, prune_p
         full_log += f"\n[ERROR] Pipeline failed: {e}"
         yield {final_output: gr.update(value="ERROR", label="Status"), log_output: full_log, success_box: gr.Markdown(f"❌ **An error occurred.** Check logs for details.", visible=True), run_button: gr.Button(interactive=True, value="Run Optimization Pipeline", variant="primary"), analyze_button: gr.Button(interactive=True, value="Analyze Model")}
     finally:
-        # Clean up the temporary directory if it was created
         if temp_model_dir and os.path.exists(temp_model_dir):
             shutil.rmtree(temp_model_dir)
 
@@ -259,14 +259,13 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     with gr.Accordion("⚙️ 2. Configure Optimization", open=False) as optimization_accordion:
         analysis_report_output = gr.Markdown()
         pipeline_type_radio = gr.Radio(["ONNX", "GGUF"], label="Select Optimization Pipeline")
-        ## RE-INTEGRATED: Pruning UI elements are back.
         gr.Warning("Pruning requires high RAM and may fail for models >2B parameters on free Spaces.")
         prune_checkbox = gr.Checkbox(label="Enable Pruning (Optional)", value=False, info="Removes redundant weights before quantization.")
         prune_slider = gr.Slider(minimum=0, maximum=90, value=20, step=5, label="Pruning Percentage (%)", visible=True)
         with gr.Group(visible=False) as onnx_options:
             gr.Markdown("#### ONNX Options")
             onnx_quant_radio = gr.Radio(["Dynamic"], label="Quantization Type", value="Dynamic", info="Static quantization via UI is not supported.")
-            calibration_file_upload = gr.File(visible=False) # Keep element for function signature, but hide
+            calibration_file_upload = gr.File(visible=False)
         with gr.Group(visible=False) as gguf_options:
             gr.Markdown("#### GGUF Options")
             gguf_quant_dropdown = gr.Dropdown(["q4_k_m", "q5_k_m", "q8_0", "f16"], label="Quantization Strategy", value="q4_k_m")
@@ -283,11 +282,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     pipeline_type_radio.change(fn=update_ui_for_pipeline, inputs=pipeline_type_radio, outputs=[onnx_options, gguf_options])
     analyze_button.click(fn=stage_1_analyze_model, inputs=[model_id_input], outputs=[log_output, analysis_report_output, optimization_accordion])
 
-    ## RE-INTEGRATED: Pruning inputs are now passed to the pipeline function.
     run_button.click(fn=run_amop_pipeline,
                      inputs=[model_id_input, pipeline_type_radio, prune_checkbox, prune_slider, onnx_quant_radio, calibration_file_upload, gguf_quant_dropdown],
                      outputs=[run_button, analyze_button, final_output, log_output, success_box])
 
 if __name__ == "__main__":
-    # Use .queue() to handle long-running tasks and prevent timeouts
     demo.queue().launch(debug=True)
 