Update app.py
app.py
CHANGED
@@ -14,7 +14,6 @@ from optimum.onnxruntime import ORTQuantizer
 from optimum.onnxruntime.configuration import AutoQuantizationConfig
 import torch.nn.utils.prune as prune
 
-# --- SETUP ---
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
 HF_TOKEN = os.getenv("HF_TOKEN")
@@ -25,23 +24,27 @@ api = HfApi()
 OUTPUT_DIR = "optimized_models"
 os.makedirs(OUTPUT_DIR, exist_ok=True)
 
-# --- LLAMA.CPP SETUP ---
 LLAMA_CPP_DIR = Path("llama.cpp")
-
-LLAMA_CPP_CONVERT_SCRIPT = LLAMA_CPP_DIR / "tools" / "convert-hf-to-gguf.py"
+LLAMA_CPP_CONVERT_SCRIPT = LLAMA_CPP_DIR / "convert.py"
 LLAMA_CPP_QUANTIZE_SCRIPT = LLAMA_CPP_DIR / "quantize"
 
 def setup_llama_cpp():
-    """Clones and builds llama.cpp if not already present."""
     if not LLAMA_CPP_DIR.exists():
         logging.info("Cloning llama.cpp repository...")
         try:
             subprocess.run(["git", "clone", "https://github.com/ggerganov/llama.cpp.git"], check=True, capture_output=True, text=True)
             logging.info("llama.cpp cloned successfully.")
         except subprocess.CalledProcessError as e:
-
-
-
+            raise RuntimeError(f"Failed to clone llama.cpp. Error: {e.stderr}")
+
+    requirements_path = LLAMA_CPP_DIR / "requirements.txt"
+    if requirements_path.exists():
+        logging.info("Installing llama.cpp Python dependencies...")
+        try:
+            subprocess.run(["pip", "install", "-r", str(requirements_path)], check=True, capture_output=True, text=True)
+            logging.info("llama.cpp Python dependencies installed successfully.")
+        except subprocess.CalledProcessError as e:
+            raise RuntimeError(f"Failed to install llama.cpp requirements. Error: {e.stderr}")
 
     if not LLAMA_CPP_QUANTIZE_SCRIPT.exists():
         logging.info("llama.cpp 'quantize' binary not found. Building with CMake...")
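This hunk swaps the converter path: llama.cpp has shipped its HF-to-GGUF script under several names over time (convert.py, convert-hf-to-gguf.py, more recently convert_hf_to_gguf.py), so hard-coding any one of them is fragile. A minimal sketch of a tolerant lookup, assuming those candidate names; pinning a specific llama.cpp tag would be the more reproducible fix:

```python
from pathlib import Path

LLAMA_CPP_DIR = Path("llama.cpp")

def find_convert_script(repo_dir: Path = LLAMA_CPP_DIR) -> Path:
    """Return the first HF->GGUF converter found in a llama.cpp checkout.

    The candidate names are assumptions based on how llama.cpp has renamed
    the script across revisions, not something this commit contains.
    """
    candidates = [
        repo_dir / "convert.py",
        repo_dir / "convert-hf-to-gguf.py",
        repo_dir / "convert_hf_to_gguf.py",
    ]
    for path in candidates:
        if path.exists():
            return path
    raise FileNotFoundError(f"No known converter script found in {repo_dir}")
```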
@@ -50,17 +53,13 @@ def setup_llama_cpp():
             subprocess.run(["cmake", "--build", ".", "--target", "quantize"], cwd=str(LLAMA_CPP_DIR), check=True, capture_output=True, text=True)
             logging.info("'quantize' binary built successfully with CMake.")
         except subprocess.CalledProcessError as e:
-
-            logging.error(error_msg, exc_info=True)
-            raise RuntimeError(error_msg)
+            raise RuntimeError(f"Failed to build llama.cpp with CMake. Error: {e.stderr}")
 
-# Run setup on script start
 try:
     setup_llama_cpp()
 except Exception as e:
     logging.error(f"FATAL ERROR during llama.cpp setup: {e}", exc_info=True)
 
-
 def stage_1_analyze_model(model_id: str):
     log_stream = "[STAGE 1] Analyzing model...\n"
     try:
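Both except blocks in setup_llama_cpp() now collapse into a single raise that surfaces e.stderr. The same pattern as a reusable helper, a sketch only; the name run_checked is mine and not part of this commit:

```python
import subprocess

def run_checked(cmd: list[str], **kwargs) -> subprocess.CompletedProcess:
    """Run a command, capturing output; re-raise failures with stderr attached."""
    try:
        return subprocess.run(cmd, check=True, capture_output=True, text=True, **kwargs)
    except subprocess.CalledProcessError as e:
        # e.stderr is populated because capture_output=True and text=True.
        raise RuntimeError(f"Command {' '.join(cmd)} failed:\n{e.stderr}") from e

# Example: the CMake build step from this hunk, expressed with the helper.
# run_checked(["cmake", "--build", ".", "--target", "quantize"], cwd="llama.cpp")
```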
@@ -95,17 +94,13 @@ def stage_3_4_onnx_quantize(model_path_or_id: str, onnx_quant_type: str, calibra
     run_id = datetime.now().strftime("%Y%m%d-%H%M%S")
     model_name = model_path_or_id.split('/')[-1]
     onnx_path = os.path.join(OUTPUT_DIR, f"{model_name}-{run_id}-onnx")
-
     try:
-        log_stream += f"Executing `optimum-cli export onnx` for '{model_path_or_id}'...\n"
         export_command = ["optimum-cli", "export", "onnx", "--model", model_path_or_id, "--trust-remote-code", onnx_path]
         process = subprocess.run(export_command, check=True, capture_output=True, text=True)
-        log_stream += process.stdout
+        log_stream += f"Executing `optimum-cli export onnx` for '{model_path_or_id}'...\n{process.stdout}\n"
         if process.stderr: log_stream += f"[STDERR]\n{process.stderr}\n"
-        log_stream += f"Successfully exported to ONNX at: {onnx_path}\n"
     except subprocess.CalledProcessError as e:
         raise RuntimeError(f"Failed during `optimum-cli export onnx`. Error:\n{e.stderr}")
-
     try:
         quantizer = ORTQuantizer.from_pretrained(onnx_path)
         log_stream += "Performing DYNAMIC quantization...\n"
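Only the logging changes here; the export itself still shells out to optimum-cli. For comparison, a sketch of the equivalent in-process export with optimum's Python API, assuming a feature-extraction model and a placeholder model id:

```python
# A minimal sketch, assuming the model suits feature extraction;
# the Space itself runs `optimum-cli export onnx` in a subprocess instead.
from optimum.onnxruntime import ORTModelForFeatureExtraction
from transformers import AutoTokenizer

model_id = "sentence-transformers/all-MiniLM-L6-v2"  # placeholder model id
onnx_path = "optimized_models/example-onnx"

ort_model = ORTModelForFeatureExtraction.from_pretrained(model_id, export=True)
ort_model.save_pretrained(onnx_path)                                 # writes model.onnx + config
AutoTokenizer.from_pretrained(model_id).save_pretrained(onnx_path)   # keep the tokenizer alongside
```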
@@ -113,11 +108,9 @@ def stage_3_4_onnx_quantize(model_path_or_id: str, onnx_quant_type: str, calibra
         quantized_path = os.path.join(onnx_path, "quantized-dynamic")
         quantizer.quantize(save_dir=quantized_path, quantization_config=dqconfig)
         log_stream += f"Successfully quantized model to: {quantized_path}\n"
-
         if not os.path.exists(os.path.join(quantized_path, 'tokenizer_config.json')):
             AutoTokenizer.from_pretrained(model_path_or_id, trust_remote_code=True).save_pretrained(quantized_path)
             log_stream += "Saved new tokenizer files.\n"
-
         return quantized_path, log_stream
     except Exception as e:
         raise RuntimeError(f"Failed during ONNX quantization step. Error: {e}")
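dqconfig is defined just above this hunk and is not shown in the diff. For context, a typical dynamic int8 configuration built with the AutoQuantizationConfig imported at the top of the file looks roughly like this; the avx512_vnni preset is an assumption about the target CPU (avx2 and arm64 presets also exist):

```python
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

onnx_path = "optimized_models/example-onnx"  # output of the export step above

# Dynamic quantization: weights are quantized ahead of time, activations at
# runtime, so no calibration dataset is needed.
dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)

quantizer = ORTQuantizer.from_pretrained(onnx_path)
quantizer.quantize(save_dir=f"{onnx_path}/quantized-dynamic", quantization_config=dqconfig)
```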
@@ -128,20 +121,15 @@ def stage_3_4_gguf_quantize(model_path_or_id: str, original_model_id: str, quant
     model_name = original_model_id.replace('/', '_')
     gguf_path = os.path.join(OUTPUT_DIR, f"{model_name}-{run_id}-gguf")
     os.makedirs(gguf_path, exist_ok=True)
-
     f16_gguf_path = os.path.join(gguf_path, "model-f16.gguf")
     quantized_gguf_path = os.path.join(gguf_path, "model.gguf")
-
     try:
-        log_stream += f"Executing llama.cpp conversion script on '{model_path_or_id}'...\n"
         convert_command = ["python3", str(LLAMA_CPP_CONVERT_SCRIPT), model_path_or_id, "--outfile", f16_gguf_path, "--outtype", "f16"]
         process = subprocess.run(convert_command, check=True, capture_output=True, text=True)
-        log_stream += process.stdout
+        log_stream += f"Executing llama.cpp conversion script on '{model_path_or_id}'...\n{process.stdout}\n"
        if process.stderr: log_stream += f"[STDERR]\n{process.stderr}\n"
-
         quantize_map = {"q4_k_m": "Q4_K_M", "q5_k_m": "Q5_K_M", "q8_0": "Q8_0", "f16": "F16"}
         target_quant_name = quantize_map.get(quantization_strategy.lower(), "Q4_K_M")
-
         if target_quant_name == "F16":
             log_stream += "Target is F16, renaming file...\n"
             os.rename(f16_gguf_path, quantized_gguf_path)
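After the convert-then-quantize pair it is worth smoke-testing the artifact before upload. A sketch using llama-cpp-python, which app.py does not import; treat it as an optional check:

```python
# Optional sanity check with llama-cpp-python (`pip install llama-cpp-python`);
# this package is an assumption, not a dependency of the Space.
from llama_cpp import Llama

llm = Llama(model_path="optimized_models/example-gguf/model.gguf",  # placeholder path
            n_ctx=512, verbose=False)
out = llm("Hello", max_tokens=8)   # tiny completion just to prove the file loads
print(out["choices"][0]["text"])
```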
@@ -149,7 +137,7 @@ def stage_3_4_gguf_quantize(model_path_or_id: str, original_model_id: str, quant
             log_stream += f"Quantizing FP16 GGUF to {target_quant_name}...\n"
             quantize_command = [str(LLAMA_CPP_QUANTIZE_SCRIPT), f16_gguf_path, quantized_gguf_path, target_quant_name]
             process = subprocess.run(quantize_command, check=True, capture_output=True, text=True)
-            log_stream += process.stdout
+            log_stream += f"{process.stdout}\n"
             if process.stderr: log_stream += f"[STDERR]\n{process.stderr}\n"
             os.remove(f16_gguf_path)
         return gguf_path, log_stream
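One caveat this hunk leaves untouched: LLAMA_CPP_QUANTIZE_SCRIPT still points at a binary called quantize, while newer llama.cpp builds name it llama-quantize and place it under build/bin. A small probe, with the candidate locations being assumptions about the build layout:

```python
from pathlib import Path

LLAMA_CPP_DIR = Path("llama.cpp")

def find_quantize_binary(repo_dir: Path = LLAMA_CPP_DIR) -> Path:
    """Locate the quantize executable across old and new llama.cpp layouts."""
    candidates = [
        repo_dir / "quantize",                       # legacy in-tree build
        repo_dir / "llama-quantize",                 # newer binary name
        repo_dir / "build" / "bin" / "quantize",
        repo_dir / "build" / "bin" / "llama-quantize",
    ]
    for path in candidates:
        if path.exists():
            return path
    raise FileNotFoundError(f"No quantize binary found under {repo_dir}")
```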
@@ -179,30 +167,24 @@ def run_amop_pipeline(model_id: str, pipeline_type: str, do_prune: bool, prune_p
     if not model_id:
         yield {log_output: "Please enter a Model ID.", final_output: "Idle"}
         return
-
     initial_log = f"[START] AMOP {pipeline_type} Pipeline Initiated for '{model_id}'.\n"
     yield {run_button: gr.Button(interactive=False, value="π Running..."), analyze_button: gr.Button(interactive=False), final_output: f"RUNNING ({pipeline_type})", log_output: initial_log}
-
     full_log = initial_log
     temp_model_dir = None
     model_path_or_id = model_id
-
     try:
         whoami = api.whoami(token=HF_TOKEN)
         if not whoami: raise RuntimeError("Could not authenticate with Hugging Face Hub. Check your HF_TOKEN.")
         repo_id_for_link = f"{whoami['name']}/{model_id.split('/')[-1]}-amop-cpu-{pipeline_type.lower()}"
-
         if do_prune and prune_percent > 0:
             full_log += f"\n[WARNING] Pruning is memory-intensive and may fail for large models.\n"
-
+            yield {final_output: "Loading model (1/5)", log_output: full_log}
             model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
             tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
             full_log += f"Successfully loaded '{model_id}'.\n"
-
             yield {final_output: "Pruning model (2/5)", log_output: full_log}
             model, log = stage_2_prune_model(model, prune_percent)
             full_log += log
-
             temp_model_dir = tempfile.mkdtemp()
             model.save_pretrained(temp_model_dir)
             tokenizer.save_pretrained(temp_model_dir)
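stage_2_prune_model is called here but its body is outside this diff. Given the import torch.nn.utils.prune as prune at the top of the file, a sketch of what such a stage typically looks like (L1 magnitude pruning on Linear layers, then baking the mask in); this is a reconstruction, not the Space's actual implementation:

```python
import torch.nn as nn
import torch.nn.utils.prune as prune

def prune_linear_layers(model: nn.Module, prune_percent: float) -> tuple[nn.Module, str]:
    """L1-unstructured pruning on every Linear layer; returns the model and a log string."""
    log = f"[STAGE 2] Pruning {prune_percent}% of Linear weights...\n"
    amount = prune_percent / 100.0
    for module in model.modules():
        if isinstance(module, nn.Linear):
            prune.l1_unstructured(module, name="weight", amount=amount)
            prune.remove(module, "weight")  # bake the mask in so save_pretrained works normally
    log += "Pruning complete.\n"
    return model, log
```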
@@ -210,23 +192,20 @@ def run_amop_pipeline(model_id: str, pipeline_type: str, do_prune: bool, prune_p
             full_log += f"Saved intermediate pruned model to {temp_model_dir}\n"
         else:
             full_log += "Pruning skipped.\n"
-
         if pipeline_type == "ONNX":
-
+            yield {final_output: "Converting to ONNX (3/5)", log_output: full_log}
             optimized_path, log = stage_3_4_onnx_quantize(model_path_or_id, onnx_quant_type, calibration_file.name if onnx_quant_type == "Static" and calibration_file else None)
             options = {'pipeline_type': 'ONNX', 'prune': do_prune, 'prune_percent': prune_percent, 'quant_type': onnx_quant_type}
         elif pipeline_type == "GGUF":
-
+            yield {final_output: "Converting to GGUF (3/5)", log_output: full_log}
             optimized_path, log = stage_3_4_gguf_quantize(model_path_or_id, model_id, gguf_quant_type)
             options = {'pipeline_type': 'GGUF', 'prune': do_prune, 'prune_percent': prune_percent, 'quant_type': gguf_quant_type}
         else:
             raise ValueError("Invalid pipeline type selected.")
         full_log += log
-
-        full_log += "Packaging & Uploading...\n"; yield {final_output: "Packaging & Uploading (4/5)", log_output: full_log}
+        yield {final_output: "Packaging & Uploading (4/5)", log_output: full_log}
         final_message, log = stage_5_package_and_upload(model_id, optimized_path, full_log, options)
         full_log += log
-
         yield {final_output: gr.update(value="SUCCESS", label="Status"), log_output: full_log, success_box: gr.Markdown(f"β **Success!** Model available: [{repo_id_for_link}](https://huggingface.co/{repo_id_for_link})", visible=True), run_button: gr.Button(interactive=True, value="Run Optimization Pipeline", variant="primary"), analyze_button: gr.Button(interactive=True, value="Analyze Model")}
     except Exception as e:
         logging.error(f"AMOP Pipeline failed. Error: {e}", exc_info=True)
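stage_5_package_and_upload is likewise outside the diff. A sketch of the Hub side of such a stage with huggingface_hub, using create_repo and upload_folder; the function is illustrative and only the repo naming mirrors repo_id_for_link above:

```python
from huggingface_hub import HfApi

def upload_optimized_model(api: HfApi, token: str, repo_id: str, folder: str) -> str:
    """Create (or reuse) a model repo and push the optimized artifacts."""
    api.create_repo(repo_id=repo_id, token=token, exist_ok=True, repo_type="model")
    api.upload_folder(folder_path=folder, repo_id=repo_id, token=token, repo_type="model")
    return f"https://huggingface.co/{repo_id}"
```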
@@ -236,7 +215,6 @@ def run_amop_pipeline(model_id: str, pipeline_type: str, do_prune: bool, prune_p
         if temp_model_dir and os.path.exists(temp_model_dir):
             shutil.rmtree(temp_model_dir)
 
-# --- GRADIO UI ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# π AMOP: Adaptive Model Optimization Pipeline")
     if not HF_TOKEN: gr.Warning("HF_TOKEN not set! The final 'upload' step will be skipped.")
@@ -264,13 +242,10 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     final_output = gr.Label(value="Idle", label="Status")
     success_box = gr.Markdown(visible=False)
     log_output = gr.Textbox(label="Live Logs", lines=20, interactive=False)
-
     def update_ui_for_pipeline(pipeline_type):
         return {onnx_options: gr.Group(visible=pipeline_type=="ONNX"), gguf_options: gr.Group(visible=pipeline_type=="GGUF")}
-
     pipeline_type_radio.change(fn=update_ui_for_pipeline, inputs=pipeline_type_radio, outputs=[onnx_options, gguf_options])
     analyze_button.click(fn=stage_1_analyze_model, inputs=[model_id_input], outputs=[log_output, analysis_report_output, optimization_accordion])
-
     run_button.click(fn=run_amop_pipeline,
                      inputs=[model_id_input, pipeline_type_radio, prune_checkbox, prune_slider, onnx_quant_radio, calibration_file_upload, gguf_quant_dropdown],
                      outputs=[run_button, analyze_button, final_output, log_output, success_box])
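The new progress yields rely on a Gradio pattern worth spelling out: an event handler written as a generator can yield dicts that map components to updated values, and Blocks pushes each yield to the UI as it arrives. A stripped-down sketch of the same idea; the components and step names are illustrative, not taken from app.py:

```python
import time
import gradio as gr

with gr.Blocks() as demo:
    status = gr.Label(value="Idle", label="Status")
    logs = gr.Textbox(label="Live Logs", lines=6)
    run = gr.Button("Run")

    def fake_pipeline():
        log = ""
        for i, step in enumerate(["Loading", "Converting", "Uploading"], start=1):
            log += f"{step}...\n"
            # Each yield pushes an intermediate update to the mapped components.
            yield {status: f"{step} ({i}/3)", logs: log}
            time.sleep(1)
        yield {status: "SUCCESS", logs: log + "Done.\n"}

    run.click(fn=fake_pipeline, inputs=None, outputs=[status, logs])

if __name__ == "__main__":
    demo.launch()
```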