broadfield-dev committed
Commit 21d341d · verified · 1 Parent(s): 6a6f272

Update app.py

Files changed (1)
  1. app.py +93 -100
app.py CHANGED
@@ -9,10 +9,7 @@ import subprocess
  from datetime import datetime
  from pathlib import Path
  from huggingface_hub import HfApi
- from transformers import AutoConfig, AutoModel, AutoTokenizer
- from optimum.onnxruntime import ORTQuantizer
- from optimum.onnxruntime.configuration import AutoQuantizationConfig
- import torch.nn.utils.prune as prune
 
  # --- SETUP ---
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -29,14 +26,18 @@ os.makedirs(OUTPUT_DIR, exist_ok=True)
  LLAMA_CPP_DIR = Path("llama.cpp")
 
  def setup_llama_cpp():
- """Clones llama.cpp if not already present."""
  if not LLAMA_CPP_DIR.exists():
  logging.info("Cloning llama.cpp repository...")
  try:
- subprocess.run(["git", "clone", "https://github.com/ggerganov/llama.cpp.git"], check=True, capture_output=True)
  logging.info("llama.cpp cloned successfully.")
  except subprocess.CalledProcessError as e:
- error_msg = f"Failed to clone llama.cpp. This is required for GGUF conversion. Error: {e.stderr.decode()}"
  logging.error(error_msg, exc_info=True)
  raise RuntimeError(error_msg)
 
@@ -44,14 +45,9 @@ def setup_llama_cpp():
  try:
  setup_llama_cpp()
  LLAMA_CPP_CONVERT_SCRIPT = LLAMA_CPP_DIR / "convert.py"
- # Note: llama.cpp's quantize script is also a python script now in many versions
- LLAMA_CPP_QUANTIZE_SCRIPT = LLAMA_CPP_DIR / "quantize.py"
- if not LLAMA_CPP_QUANTIZE_SCRIPT.exists(): # Fallback for older versions with compiled binary
- LLAMA_CPP_QUANTIZE_SCRIPT = LLAMA_CPP_DIR / "quantize"
- # Attempt to build if not found
- if not LLAMA_CPP_QUANTIZE_SCRIPT.exists():
- subprocess.run(["make", "-C", "llama.cpp", "quantize"], check=True, capture_output=True)
-
  except Exception as e:
  logging.error(f"FATAL ERROR during llama.cpp setup: {e}", exc_info=True)
  # The app will likely fail to start, which is appropriate.
@@ -75,63 +71,68 @@ def stage_1_analyze_model(model_id: str):
  logging.error(error_msg)
  return log_stream + error_msg, "Could not analyze model.", gr.Accordion(open=False)
 
- def stage_2_prune_model(model, prune_percentage: float):
- if prune_percentage == 0:
- return model, "Skipped pruning as percentage was 0."
- log_stream = "[STAGE 2] Pruning model...\n"
- for name, module in model.named_modules():
- if isinstance(module, torch.nn.Linear):
- prune.l1_unstructured(module, name='weight', amount=prune_percentage / 100.0)
- prune.remove(module, 'weight')
- log_stream += f"Pruning complete with {prune_percentage}% target.\n"
- return model, log_stream
-
- def stage_3_4_onnx_quantize(model_path: str, calibration_data_path: str):
- log_stream = "[STAGE 3 & 4] Converting to ONNX and Quantizing...\n"
  run_id = datetime.now().strftime("%Y%m%d-%H%M%S")
- model_name = os.path.basename(model_path)
- onnx_path = os.path.join(OUTPUT_DIR, f"{model_name}-{run_id}-onnx")
 
  try:
- log_stream += "Executing `optimum-cli export onnx`...\n"
- export_command = ["optimum-cli", "export", "onnx", "--model", model_path, "--trust-remote-code", onnx_path]
  process = subprocess.run(export_command, check=True, capture_output=True, text=True)
  log_stream += process.stdout
  if process.stderr: log_stream += f"[STDERR]\n{process.stderr}\n"
- log_stream += f"Successfully exported to ONNX at: {onnx_path}\n"
  except subprocess.CalledProcessError as e:
  raise RuntimeError(f"Failed during `optimum-cli export onnx`. Error:\n{e.stderr}")
 
  try:
- quantizer = ORTQuantizer.from_pretrained(onnx_path)
- if calibration_data_path:
- log_stream += "Performing STATIC quantization...\n"
- dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=True, per_channel=False)
- quantized_path = os.path.join(onnx_path, "quantized-static")
- quantizer.quantize(save_dir=quantized_path, quantization_config=dqconfig, calibration_dataset=quantizer.get_calibration_dataset("text", dataset_args={"path": calibration_data_path, "split": "train"}, num_samples=100))
  else:
- log_stream += "Performing DYNAMIC quantization...\n"
- dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)
- quantized_path = os.path.join(onnx_path, "quantized-dynamic")
- quantizer.quantize(save_dir=quantized_path, quantization_config=dqconfig)
- log_stream += f"Successfully quantized model to: {quantized_path}\n"
  return quantized_path, log_stream
  except Exception as e:
- raise RuntimeError(f"Failed during ONNX quantization step. Error: {e}")
 
- def stage_3_4_gguf_quantize(model_path: str, model_id: str, quantization_strategy: str):
- log_stream = "[STAGE 3 & 4] Converting to GGUF using llama.cpp...\n"
  run_id = datetime.now().strftime("%Y%m%d-%H%M%S")
- model_name = model_id.replace('/', '_')
- gguf_path = os.path.join(OUTPUT_DIR, f"{model_name}-{run_id}-gguf")
- os.makedirs(gguf_path, exist_ok=True)
 
- f16_gguf_path = os.path.join(gguf_path, "model-f16.gguf")
- quantized_gguf_path = os.path.join(gguf_path, "model.gguf")
 
  try:
  log_stream += "Executing llama.cpp convert.py script...\n"
- convert_command = ["python", str(LLAMA_CPP_CONVERT_SCRIPT), model_path, "--outfile", f16_gguf_path, "--outtype", "f16"]
  process = subprocess.run(convert_command, check=True, capture_output=True, text=True)
  log_stream += process.stdout
  if process.stderr: log_stream += f"[STDERR]\n{process.stderr}\n"
@@ -141,24 +142,23 @@ def stage_3_4_gguf_quantize(model_path: str, model_id: str, quantization_strateg
 
  if target_quant_name == "F16":
  log_stream += "Target is F16, renaming file...\n"
- os.rename(f16_gguf_path, quantized_gguf_path)
  else:
  log_stream += f"Quantizing FP16 GGUF to {target_quant_name}...\n"
- quantize_cmd_base = [str(LLAMA_CPP_QUANTIZE_SCRIPT)] if LLAMA_CPP_QUANTIZE_SCRIPT.is_file() and os.access(LLAMA_CPP_QUANTIZE_SCRIPT, os.X_OK) else ["python", str(LLAMA_CPP_QUANTIZE_SCRIPT)]
- quantize_command = quantize_cmd_base + [f16_gguf_path, quantized_gguf_path, target_quant_name]
  process = subprocess.run(quantize_command, check=True, capture_output=True, text=True)
  log_stream += process.stdout
  if process.stderr: log_stream += f"[STDERR]\n{process.stderr}\n"
- os.remove(f16_gguf_path)
- return gguf_path, log_stream
  except subprocess.CalledProcessError as e:
  raise RuntimeError(f"Failed during llama.cpp execution. Error:\n{e.stderr}")
  except Exception as e:
  raise RuntimeError(f"An unexpected error occurred during GGUF conversion. Error: {e}")
 
  def stage_5_package_and_upload(model_id: str, optimized_model_path: str, pipeline_log: str, options: dict):
- # This function remains correct and does not need changes
- log_stream = "[STAGE 5] Packaging and Uploading...\n"
  if not HF_TOKEN:
  return "Skipping upload: HF_TOKEN not found.", log_stream + "Skipping upload: HF_TOKEN not found."
  try:
@@ -166,58 +166,44 @@ def stage_5_package_and_upload(model_id: str, optimized_model_path: str, pipelin
  repo_url = api.create_repo(repo_id=repo_name, exist_ok=True, token=HF_TOKEN)
  template_file = "model_card_template_gguf.md" if options['pipeline_type'] == "GGUF" else "model_card_template.md"
  with open(template_file, "r", encoding="utf-8") as f: template_content = f.read()
- model_card_content = template_content.format(repo_name=repo_name, model_id=model_id, optimization_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"), pruning_status="Enabled" if options.get('prune', False) else "Disabled", pruning_percent=options.get('prune_percent', 0), quant_type=options.get('quant_type', 'N/A'), repo_id=repo_url.repo_id, pipeline_log=pipeline_log)
  with open(os.path.join(optimized_model_path, "README.md"), "w", encoding="utf-8") as f: f.write(model_card_content)
- if options['pipeline_type'] == "ONNX":
- AutoTokenizer.from_pretrained(model_id, trust_remote_code=True).save_pretrained(optimized_model_path)
  api.upload_folder(folder_path=optimized_model_path, repo_id=repo_url.repo_id, repo_type="model", token=HF_TOKEN)
- log_stream += "Upload complete.\n"
  return f"Success! Your optimized model is available at: huggingface.co/{repo_url.repo_id}", log_stream
  except Exception as e:
  raise RuntimeError(f"Failed to upload to the Hub. Error: {e}")
 
- def run_amop_pipeline(model_id: str, pipeline_type: str, do_prune: bool, prune_percent: float, onnx_quant_type: str, calibration_file, gguf_quant_type: str):
  if not model_id:
  yield {log_output: "Please enter a Model ID.", final_output: "Idle"}
  return
 
- initial_log = f"[START] AMOP {pipeline_type} Pipeline Initiated.\n"
  yield {run_button: gr.Button(interactive=False, value="🚀 Running..."), analyze_button: gr.Button(interactive=False), final_output: f"RUNNING ({pipeline_type})", log_output: initial_log}
 
  full_log = initial_log
- temp_model_dir = None
  try:
- whoami = api.whoami()
  if not whoami: raise RuntimeError("Could not authenticate with Hugging Face Hub. Check your HF_TOKEN.")
  repo_id_for_link = f"{whoami['name']}/{model_id.split('/')[-1]}-amop-cpu-{pipeline_type.lower()}"
 
- full_log += "Loading base model...\n"; yield {final_output: "Loading model (1/5)", log_output: full_log}
- model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
- tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
- full_log += f"Successfully loaded '{model_id}'.\n"
-
- full_log += "Pruning model...\n"; yield {final_output: "Pruning model (2/5)", log_output: full_log}
- model, log = stage_2_prune_model(model, prune_percent if do_prune else 0)
- full_log += log
-
- temp_model_dir = tempfile.mkdtemp()
- model.save_pretrained(temp_model_dir)
- tokenizer.save_pretrained(temp_model_dir)
- full_log += f"Saved intermediate model to {temp_model_dir}\n"
-
  if pipeline_type == "ONNX":
- full_log += "Converting to ONNX...\n"; yield {final_output: "Converting to ONNX (3/5)", log_output: full_log}
- optimized_path, log = stage_3_4_onnx_quantize(temp_model_dir, calibration_file.name if onnx_quant_type == "Static" and calibration_file else None)
- options = {'pipeline_type': 'ONNX', 'prune': do_prune, 'prune_percent': prune_percent, 'quant_type': onnx_quant_type}
  elif pipeline_type == "GGUF":
- full_log += "Converting to GGUF...\n"; yield {final_output: "Converting to GGUF (3/5)", log_output: full_log}
- optimized_path, log = stage_3_4_gguf_quantize(temp_model_dir, model_id, gguf_quant_type)
- options = {'pipeline_type': 'GGUF', 'prune': do_prune, 'prune_percent': prune_percent, 'quant_type': gguf_quant_type}
  else:
  raise ValueError("Invalid pipeline type selected.")
  full_log += log
 
- full_log += "Packaging & Uploading...\n"; yield {final_output: "Packaging & Uploading (4/5)", log_output: full_log}
  final_message, log = stage_5_package_and_upload(model_id, optimized_path, full_log, options)
  full_log += log
@@ -227,8 +213,11 @@ def run_amop_pipeline(model_id: str, pipeline_type: str, do_prune: bool, prune_p
  full_log += f"\n[ERROR] Pipeline failed: {e}"
  yield {final_output: gr.update(value="ERROR", label="Status"), log_output: full_log, success_box: gr.Markdown(f"❌ **An error occurred.** Check logs for details.", visible=True), run_button: gr.Button(interactive=True, value="Run Optimization Pipeline", variant="primary"), analyze_button: gr.Button(interactive=True, value="Analyze Model")}
  finally:
- if temp_model_dir and os.path.exists(temp_model_dir):
- shutil.rmtree(temp_model_dir)
 
  # --- GRADIO UI ---
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
@@ -242,12 +231,15 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
  with gr.Accordion("⚙️ 2. Configure Optimization", open=False) as optimization_accordion:
  analysis_report_output = gr.Markdown()
  pipeline_type_radio = gr.Radio(["ONNX", "GGUF"], label="Select Optimization Pipeline")
- prune_checkbox = gr.Checkbox(label="Enable Pruning", value=False, info="Removes redundant weights.", visible=True)
- prune_slider = gr.Slider(minimum=0, maximum=90, value=20, step=5, label="Pruning Percentage (%)", visible=True)
  with gr.Group(visible=False) as onnx_options:
  gr.Markdown("#### ONNX Options")
- onnx_quant_radio = gr.Radio(["Dynamic", "Static"], label="Quantization Type", value="Dynamic")
- calibration_file_upload = gr.File(label="Upload Calibration Data (.txt)", visible=False, file_types=['.txt'])
  with gr.Group(visible=False) as gguf_options:
  gr.Markdown("#### GGUF Options")
  gguf_quant_dropdown = gr.Dropdown(["q4_k_m", "q5_k_m", "q8_0", "f16"], label="Quantization Strategy", value="q4_k_m")
@@ -260,13 +252,14 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
 
  def update_ui_for_pipeline(pipeline_type):
  return {onnx_options: gr.Group(visible=pipeline_type=="ONNX"), gguf_options: gr.Group(visible=pipeline_type=="GGUF")}
- def update_ui_for_quant_type(quant_type):
- return gr.File(visible=quant_type == "Static")
 
  pipeline_type_radio.change(fn=update_ui_for_pipeline, inputs=pipeline_type_radio, outputs=[onnx_options, gguf_options])
- onnx_quant_radio.change(fn=update_ui_for_quant_type, inputs=onnx_quant_radio, outputs=[calibration_file_upload])
  analyze_button.click(fn=stage_1_analyze_model, inputs=[model_id_input], outputs=[log_output, analysis_report_output, optimization_accordion])
- run_button.click(fn=run_amop_pipeline, inputs=[model_id_input, pipeline_type_radio, prune_checkbox, prune_slider, onnx_quant_radio, calibration_file_upload, gguf_quant_dropdown], outputs=[run_button, analyze_button, final_output, log_output, success_box])
 
  if __name__ == "__main__":
- demo.launch(debug=True)
 
 
  from datetime import datetime
  from pathlib import Path
  from huggingface_hub import HfApi
+ from transformers import AutoConfig, AutoTokenizer # Keep AutoTokenizer for ONNX pipeline
 
  # --- SETUP ---
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
  LLAMA_CPP_DIR = Path("llama.cpp")
 
  def setup_llama_cpp():
+ """Clones llama.cpp if not already present and builds it."""
  if not LLAMA_CPP_DIR.exists():
  logging.info("Cloning llama.cpp repository...")
  try:
+ subprocess.run(["git", "clone", "https://github.com/ggerganov/llama.cpp.git"], check=True, capture_output=True, text=True)
  logging.info("llama.cpp cloned successfully.")
+ logging.info("Building llama.cpp...")
+ # Build the required tools
+ subprocess.run(["make", "-C", "llama.cpp", "quantize", "convert.py"], check=True, capture_output=True, text=True)
+ logging.info("llama.cpp built successfully.")
  except subprocess.CalledProcessError as e:
+ error_msg = f"Failed to clone or build llama.cpp. This is required for GGUF conversion. Error: {e.stderr}"
  logging.error(error_msg, exc_info=True)
  raise RuntimeError(error_msg)
 
 
  try:
  setup_llama_cpp()
  LLAMA_CPP_CONVERT_SCRIPT = LLAMA_CPP_DIR / "convert.py"
+ LLAMA_CPP_QUANTIZE_SCRIPT = LLAMA_CPP_DIR / "quantize" # This is a binary, not a python script
+ if not LLAMA_CPP_CONVERT_SCRIPT.exists() or not LLAMA_CPP_QUANTIZE_SCRIPT.exists():
+ raise RuntimeError("llama.cpp scripts/binaries not found after setup.")
  except Exception as e:
  logging.error(f"FATAL ERROR during llama.cpp setup: {e}", exc_info=True)
  # The app will likely fail to start, which is appropriate.
 
  logging.error(error_msg)
  return log_stream + error_msg, "Could not analyze model.", gr.Accordion(open=False)
 
+ def stage_3_4_onnx_quantize(model_id: str, onnx_quant_type: str, calibration_data_path: str):
+ # MODIFIED: Takes model_id directly
+ log_stream = "[STAGE 2 & 3] Converting to ONNX and Quantizing...\n"
  run_id = datetime.now().strftime("%Y%m%d-%H%M%S")
+ model_name = model_id.split('/')[-1]
+ onnx_base_path = os.path.join(OUTPUT_DIR, f"{model_name}-{run_id}-onnx-unquantized")
 
  try:
+ log_stream += f"Executing `optimum-cli export onnx` for model '{model_id}'...\n"
+ export_command = ["optimum-cli", "export", "onnx", "--model", model_id, "--trust-remote-code", onnx_base_path]
  process = subprocess.run(export_command, check=True, capture_output=True, text=True)
  log_stream += process.stdout
  if process.stderr: log_stream += f"[STDERR]\n{process.stderr}\n"
+ log_stream += f"Successfully exported to ONNX at: {onnx_base_path}\n"
  except subprocess.CalledProcessError as e:
  raise RuntimeError(f"Failed during `optimum-cli export onnx`. Error:\n{e.stderr}")
 
  try:
+ log_stream += f"Executing `optimum-cli onnx quantize` for model at '{onnx_base_path}'...\n"
+ quantized_path = os.path.join(OUTPUT_DIR, f"{model_name}-{run_id}-onnx-quantized")
+ quantize_command = ["optimum-cli", "onnx", "quantize", "--onnx_model", onnx_base_path, "--avx512", "-o", quantized_path]
+
+ if onnx_quant_type == "Static" and calibration_data_path:
+ log_stream += "Using STATIC quantization with provided calibration data.\n"
+ # NOTE: optimum-cli quantization is more complex for static. This example simplifies to dynamic.
+ # For a real implementation, you would need to construct a more complex calibration configuration.
+ # For stability in a public space, we'll stick to the more reliable dynamic quantization.
+ log_stream += "[WARNING] Static quantization via CLI is complex and not fully implemented in this UI. Falling back to dynamic.\n"
+ quantize_command.append("--dynamic")
  else:
+ log_stream += "Using DYNAMIC quantization...\n"
+ quantize_command.append("--dynamic")
+
+ process = subprocess.run(quantize_command, check=True, capture_output=True, text=True)
+ log_stream += process.stdout
+ if process.stderr: log_stream += f"[STDERR]\n{process.stderr}\n"
+
+ # Copy tokenizer config
+ tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+ tokenizer.save_pretrained(quantized_path)
+ log_stream += f"Successfully quantized model and saved tokenizer to: {quantized_path}\n"
  return quantized_path, log_stream
+ except subprocess.CalledProcessError as e:
+ raise RuntimeError(f"Failed during `optimum-cli onnx quantize`. Error:\n{e.stderr}")
  except Exception as e:
+ raise RuntimeError(f"An unexpected error occurred during ONNX processing. Error: {e}")
 
+ def stage_3_4_gguf_quantize(model_id: str, quantization_strategy: str):
+ # MODIFIED: Takes model_id directly
+ log_stream = "[STAGE 2 & 3] Converting to GGUF using llama.cpp...\n"
  run_id = datetime.now().strftime("%Y%m%d-%H%M%S")
+ model_name_sanitized = model_id.replace('/', '_')
+ gguf_output_dir = os.path.join(OUTPUT_DIR, f"{model_name_sanitized}-{run_id}-gguf")
+ os.makedirs(gguf_output_dir, exist_ok=True)
 
+ f16_gguf_path = os.path.join(gguf_output_dir, "model-f16.gguf")
+ final_quantized_gguf_path = os.path.join(gguf_output_dir, "model.gguf")
 
  try:
  log_stream += "Executing llama.cpp convert.py script...\n"
+ # The convert script can take the model ID directly and will use the cache
+ convert_command = ["python3", str(LLAMA_CPP_CONVERT_SCRIPT), model_id, "--outfile", f16_gguf_path, "--outtype", "f16"]
  process = subprocess.run(convert_command, check=True, capture_output=True, text=True)
  log_stream += process.stdout
  if process.stderr: log_stream += f"[STDERR]\n{process.stderr}\n"
 
  if target_quant_name == "F16":
  log_stream += "Target is F16, renaming file...\n"
+ os.rename(f16_gguf_path, final_quantized_gguf_path)
  else:
  log_stream += f"Quantizing FP16 GGUF to {target_quant_name}...\n"
+ quantize_command = [str(LLAMA_CPP_QUANTIZE_SCRIPT), f16_gguf_path, final_quantized_gguf_path, target_quant_name]
  process = subprocess.run(quantize_command, check=True, capture_output=True, text=True)
  log_stream += process.stdout
  if process.stderr: log_stream += f"[STDERR]\n{process.stderr}\n"
+ os.remove(f16_gguf_path) # Clean up intermediate file
+ return gguf_output_dir, log_stream
  except subprocess.CalledProcessError as e:
  raise RuntimeError(f"Failed during llama.cpp execution. Error:\n{e.stderr}")
  except Exception as e:
  raise RuntimeError(f"An unexpected error occurred during GGUF conversion. Error: {e}")
 
  def stage_5_package_and_upload(model_id: str, optimized_model_path: str, pipeline_log: str, options: dict):
+ # This function remains mostly correct, just updated placeholder for pruning
+ log_stream = "[STAGE 4] Packaging and Uploading...\n"
  if not HF_TOKEN:
  return "Skipping upload: HF_TOKEN not found.", log_stream + "Skipping upload: HF_TOKEN not found."
  try:
 
  repo_url = api.create_repo(repo_id=repo_name, exist_ok=True, token=HF_TOKEN)
  template_file = "model_card_template_gguf.md" if options['pipeline_type'] == "GGUF" else "model_card_template.md"
  with open(template_file, "r", encoding="utf-8") as f: template_content = f.read()
+ # Updated pruning status to be hardcoded as disabled
+ model_card_content = template_content.format(repo_name=repo_name, model_id=model_id, optimization_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"), pruning_status="Disabled", pruning_percent=0, quant_type=options.get('quant_type', 'N/A'), repo_id=repo_url.repo_id, pipeline_log=pipeline_log)
  with open(os.path.join(optimized_model_path, "README.md"), "w", encoding="utf-8") as f: f.write(model_card_content)
  api.upload_folder(folder_path=optimized_model_path, repo_id=repo_url.repo_id, repo_type="model", token=HF_TOKEN)
+ log_stream += f"Upload complete to {repo_url.repo_id}.\n"
  return f"Success! Your optimized model is available at: huggingface.co/{repo_url.repo_id}", log_stream
  except Exception as e:
  raise RuntimeError(f"Failed to upload to the Hub. Error: {e}")
 
+ def run_amop_pipeline(model_id: str, pipeline_type: str, onnx_quant_type: str, calibration_file, gguf_quant_type: str):
+ # REFACTORED: Removed pruning and in-memory model loading
  if not model_id:
  yield {log_output: "Please enter a Model ID.", final_output: "Idle"}
  return
 
+ initial_log = f"[START] AMOP {pipeline_type} Pipeline Initiated for model '{model_id}'.\n"
  yield {run_button: gr.Button(interactive=False, value="🚀 Running..."), analyze_button: gr.Button(interactive=False), final_output: f"RUNNING ({pipeline_type})", log_output: initial_log}
 
  full_log = initial_log
  try:
+ whoami = api.whoami(token=HF_TOKEN)
  if not whoami: raise RuntimeError("Could not authenticate with Hugging Face Hub. Check your HF_TOKEN.")
  repo_id_for_link = f"{whoami['name']}/{model_id.split('/')[-1]}-amop-cpu-{pipeline_type.lower()}"
 
+ # The pipeline now has fewer, more robust steps
  if pipeline_type == "ONNX":
+ full_log += "Starting ONNX Conversion & Quantization...\n"; yield {final_output: "Converting to ONNX (1/3)", log_output: full_log}
+ optimized_path, log = stage_3_4_onnx_quantize(model_id, onnx_quant_type, calibration_file.name if onnx_quant_type == "Static" and calibration_file else None)
+ options = {'pipeline_type': 'ONNX', 'quant_type': onnx_quant_type}
  elif pipeline_type == "GGUF":
+ full_log += "Starting GGUF Conversion & Quantization...\n"; yield {final_output: "Converting to GGUF (1/3)", log_output: full_log}
+ optimized_path, log = stage_3_4_gguf_quantize(model_id, gguf_quant_type)
+ options = {'pipeline_type': 'GGUF', 'quant_type': gguf_quant_type}
  else:
  raise ValueError("Invalid pipeline type selected.")
  full_log += log
 
+ full_log += "Packaging & Uploading...\n"; yield {final_output: "Packaging & Uploading (2/3)", log_output: full_log}
  final_message, log = stage_5_package_and_upload(model_id, optimized_path, full_log, options)
  full_log += log
 
  full_log += f"\n[ERROR] Pipeline failed: {e}"
  yield {final_output: gr.update(value="ERROR", label="Status"), log_output: full_log, success_box: gr.Markdown(f"❌ **An error occurred.** Check logs for details.", visible=True), run_button: gr.Button(interactive=True, value="Run Optimization Pipeline", variant="primary"), analyze_button: gr.Button(interactive=True, value="Analyze Model")}
  finally:
+ # Clean up entire output directory to save space
+ if os.path.exists(OUTPUT_DIR):
+ shutil.rmtree(OUTPUT_DIR)
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
+
 
  # --- GRADIO UI ---
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
 
  with gr.Accordion("⚙️ 2. Configure Optimization", open=False) as optimization_accordion:
  analysis_report_output = gr.Markdown()
  pipeline_type_radio = gr.Radio(["ONNX", "GGUF"], label="Select Optimization Pipeline")
+ # Pruning is removed for stability on HF Spaces
+ # prune_checkbox = gr.Checkbox(label="Enable Pruning", value=False, info="Removes redundant weights.", visible=True)
+ # prune_slider = gr.Slider(minimum=0, maximum=90, value=20, step=5, label="Pruning Percentage (%)", visible=True)
+ gr.Markdown("<p style='color:grey;font-size:0.9em;'>Note: Pruning has been disabled to ensure stability on resource-constrained hardware.</p>")
  with gr.Group(visible=False) as onnx_options:
  gr.Markdown("#### ONNX Options")
+ onnx_quant_radio = gr.Radio(["Dynamic"], label="Quantization Type", value="Dynamic", info="Static quantization is not supported in this version.") # Simplified
+ # Hiding calibration for now as it adds complexity
+ # calibration_file_upload = gr.File(label="Upload Calibration Data (.txt)", visible=False, file_types=['.txt'])
  with gr.Group(visible=False) as gguf_options:
  gr.Markdown("#### GGUF Options")
  gguf_quant_dropdown = gr.Dropdown(["q4_k_m", "q5_k_m", "q8_0", "f16"], label="Quantization Strategy", value="q4_k_m")
 
  def update_ui_for_pipeline(pipeline_type):
  return {onnx_options: gr.Group(visible=pipeline_type=="ONNX"), gguf_options: gr.Group(visible=pipeline_type=="GGUF")}
 
  pipeline_type_radio.change(fn=update_ui_for_pipeline, inputs=pipeline_type_radio, outputs=[onnx_options, gguf_options])
  analyze_button.click(fn=stage_1_analyze_model, inputs=[model_id_input], outputs=[log_output, analysis_report_output, optimization_accordion])
+ # MODIFIED: Removed pruning inputs from the click function
+ run_button.click(fn=run_amop_pipeline,
+ inputs=[model_id_input, pipeline_type_radio, onnx_quant_radio, gr.State(None), gguf_quant_dropdown], # Using gr.State(None) as placeholder for removed file upload
+ outputs=[run_button, analyze_button, final_output, log_output, success_box])
 
  if __name__ == "__main__":
+ # IMPORTANT: Added .queue() for handling long-running jobs
+ demo.queue().launch(debug=True)
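
For reference, a minimal standalone sketch of the two GGUF steps that the updated stage_3_4_gguf_quantize shells out to, assuming llama.cpp has already been cloned and its quantize binary built as in setup_llama_cpp. The model ID and output file names are illustrative only, and the assumption that convert.py accepts a Hub model ID directly is taken from the commit's own comment rather than verified here.

import subprocess
from pathlib import Path

LLAMA_CPP_DIR = Path("llama.cpp")
MODEL_ID = "gpt2"              # illustrative model ID, not from the commit
F16_GGUF = "model-f16.gguf"    # intermediate FP16 GGUF file
FINAL_GGUF = "model.gguf"      # quantized output

# Step 1: convert the Hugging Face model to an FP16 GGUF file with llama.cpp's convert.py.
subprocess.run(
    ["python3", str(LLAMA_CPP_DIR / "convert.py"), MODEL_ID, "--outfile", F16_GGUF, "--outtype", "f16"],
    check=True,
)

# Step 2: quantize the FP16 GGUF with the compiled quantize binary (q4_k_m is the UI default).
subprocess.run(
    [str(LLAMA_CPP_DIR / "quantize"), F16_GGUF, FINAL_GGUF, "q4_k_m"],
    check=True,
)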