Update app.py

app.py CHANGED

@@ -9,10 +9,7 @@ import subprocess
 from datetime import datetime
 from pathlib import Path
 from huggingface_hub import HfApi
-from transformers import AutoConfig, AutoModel, AutoTokenizer
-from optimum.onnxruntime import ORTQuantizer
-from optimum.onnxruntime.configuration import AutoQuantizationConfig
-import torch.nn.utils.prune as prune
+from transformers import AutoConfig, AutoTokenizer  # Keep AutoTokenizer for ONNX pipeline
 
 # --- SETUP ---
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -29,14 +26,18 @@ os.makedirs(OUTPUT_DIR, exist_ok=True)
 LLAMA_CPP_DIR = Path("llama.cpp")
 
 def setup_llama_cpp():
-    """Clones llama.cpp if not already present."""
+    """Clones llama.cpp if not already present and builds it."""
     if not LLAMA_CPP_DIR.exists():
         logging.info("Cloning llama.cpp repository...")
         try:
-            subprocess.run(["git", "clone", "https://github.com/ggerganov/llama.cpp.git"], check=True, capture_output=True)
+            subprocess.run(["git", "clone", "https://github.com/ggerganov/llama.cpp.git"], check=True, capture_output=True, text=True)
             logging.info("llama.cpp cloned successfully.")
+            logging.info("Building llama.cpp...")
+            # Build the required tools
+            subprocess.run(["make", "-C", "llama.cpp", "quantize", "convert.py"], check=True, capture_output=True, text=True)
+            logging.info("llama.cpp built successfully.")
         except subprocess.CalledProcessError as e:
-            error_msg = f"Failed to clone llama.cpp. This is required for GGUF conversion. Error: {e.stderr}"
+            error_msg = f"Failed to clone or build llama.cpp. This is required for GGUF conversion. Error: {e.stderr}"
             logging.error(error_msg, exc_info=True)
             raise RuntimeError(error_msg)
 
@@ -44,14 +45,9 @@ def setup_llama_cpp():
 try:
     setup_llama_cpp()
     LLAMA_CPP_CONVERT_SCRIPT = LLAMA_CPP_DIR / "convert.py"
-
-
-
-    LLAMA_CPP_QUANTIZE_SCRIPT = LLAMA_CPP_DIR / "quantize"
-    # Attempt to build if not found
-    if not LLAMA_CPP_QUANTIZE_SCRIPT.exists():
-        subprocess.run(["make", "-C", "llama.cpp", "quantize"], check=True, capture_output=True)
-
+    LLAMA_CPP_QUANTIZE_SCRIPT = LLAMA_CPP_DIR / "quantize"  # This is a binary, not a python script
+    if not LLAMA_CPP_CONVERT_SCRIPT.exists() or not LLAMA_CPP_QUANTIZE_SCRIPT.exists():
+        raise RuntimeError("llama.cpp scripts/binaries not found after setup.")
 except Exception as e:
     logging.error(f"FATAL ERROR during llama.cpp setup: {e}", exc_info=True)
     # The app will likely fail to start, which is appropriate.
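
For anyone who wants to sanity-check this setup outside the Space, the clone, build, and existence checks from the two hunks above can be run standalone. The sketch below only reuses the commands shown in the diff; make target names vary between llama.cpp revisions, so treat it as illustrative rather than authoritative.

    import subprocess
    from pathlib import Path

    LLAMA_CPP_DIR = Path("llama.cpp")

    # Clone once, then build the quantize tool (same commands as in the hunks above).
    if not LLAMA_CPP_DIR.exists():
        subprocess.run(
            ["git", "clone", "https://github.com/ggerganov/llama.cpp.git"],
            check=True, capture_output=True, text=True,
        )
    subprocess.run(["make", "-C", "llama.cpp", "quantize"], check=True, capture_output=True, text=True)

    # Verify the artifacts the app expects before launching it.
    print("convert.py present:", (LLAMA_CPP_DIR / "convert.py").exists())
    print("quantize binary present:", (LLAMA_CPP_DIR / "quantize").exists())
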
@@ -75,63 +71,68 @@ def stage_1_analyze_model(model_id: str):
         logging.error(error_msg)
         return log_stream + error_msg, "Could not analyze model.", gr.Accordion(open=False)
 
-def stage_2_prune_model(model, prune_percentage):
-
-
-    log_stream = "[STAGE 2] Pruning model...\n"
-    for name, module in model.named_modules():
-        if isinstance(module, torch.nn.Linear):
-            prune.l1_unstructured(module, name='weight', amount=prune_percentage / 100.0)
-            prune.remove(module, 'weight')
-    log_stream += f"Pruning complete with {prune_percentage}% target.\n"
-    return model, log_stream
-
-def stage_3_4_onnx_quantize(model_path: str, calibration_data_path: str):
-    log_stream = "[STAGE 3 & 4] Converting to ONNX and Quantizing...\n"
+def stage_3_4_onnx_quantize(model_id: str, onnx_quant_type: str, calibration_data_path: str):
+    # MODIFIED: Takes model_id directly
+    log_stream = "[STAGE 2 & 3] Converting to ONNX and Quantizing...\n"
     run_id = datetime.now().strftime("%Y%m%d-%H%M%S")
-    model_name =
-
+    model_name = model_id.split('/')[-1]
+    onnx_base_path = os.path.join(OUTPUT_DIR, f"{model_name}-{run_id}-onnx-unquantized")
 
     try:
-        log_stream += "Executing `optimum-cli export onnx
-        export_command = ["optimum-cli", "export", "onnx", "--model",
+        log_stream += f"Executing `optimum-cli export onnx` for model '{model_id}'...\n"
+        export_command = ["optimum-cli", "export", "onnx", "--model", model_id, "--trust-remote-code", onnx_base_path]
         process = subprocess.run(export_command, check=True, capture_output=True, text=True)
         log_stream += process.stdout
         if process.stderr: log_stream += f"[STDERR]\n{process.stderr}\n"
-        log_stream += f"Successfully exported to ONNX at: {
+        log_stream += f"Successfully exported to ONNX at: {onnx_base_path}\n"
     except subprocess.CalledProcessError as e:
         raise RuntimeError(f"Failed during `optimum-cli export onnx`. Error:\n{e.stderr}")
 
     try:
-
-
-
-
-
-
+        log_stream += f"Executing `optimum-cli onnx quantize` for model at '{onnx_base_path}'...\n"
+        quantized_path = os.path.join(OUTPUT_DIR, f"{model_name}-{run_id}-onnx-quantized")
+        quantize_command = ["optimum-cli", "onnx", "quantize", "--onnx_model", onnx_base_path, "--avx512", "-o", quantized_path]
+
+        if onnx_quant_type == "Static" and calibration_data_path:
+            log_stream += "Using STATIC quantization with provided calibration data.\n"
+            # NOTE: optimum-cli quantization is more complex for static. This example simplifies to dynamic.
+            # For a real implementation, you would need to construct a more complex calibration configuration.
+            # For stability in a public space, we'll stick to the more reliable dynamic quantization.
+            log_stream += "[WARNING] Static quantization via CLI is complex and not fully implemented in this UI. Falling back to dynamic.\n"
+            quantize_command.append("--dynamic")
         else:
-            log_stream += "
-
-
-
-            log_stream +=
+            log_stream += "Using DYNAMIC quantization...\n"
+            quantize_command.append("--dynamic")
+
+        process = subprocess.run(quantize_command, check=True, capture_output=True, text=True)
+        log_stream += process.stdout
+        if process.stderr: log_stream += f"[STDERR]\n{process.stderr}\n"
+
+        # Copy tokenizer config
+        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+        tokenizer.save_pretrained(quantized_path)
+        log_stream += f"Successfully quantized model and saved tokenizer to: {quantized_path}\n"
         return quantized_path, log_stream
+    except subprocess.CalledProcessError as e:
+        raise RuntimeError(f"Failed during `optimum-cli onnx quantize`. Error:\n{e.stderr}")
     except Exception as e:
-        raise RuntimeError(f"
+        raise RuntimeError(f"An unexpected error occurred during ONNX processing. Error: {e}")
 
-def stage_3_4_gguf_quantize(model_path: str, model_id: str, quantization_strategy: str):
-
+def stage_3_4_gguf_quantize(model_id: str, quantization_strategy: str):
+    # MODIFIED: Takes model_id directly
+    log_stream = "[STAGE 2 & 3] Converting to GGUF using llama.cpp...\n"
     run_id = datetime.now().strftime("%Y%m%d-%H%M%S")
-
-
-    os.makedirs(
+    model_name_sanitized = model_id.replace('/', '_')
+    gguf_output_dir = os.path.join(OUTPUT_DIR, f"{model_name_sanitized}-{run_id}-gguf")
+    os.makedirs(gguf_output_dir, exist_ok=True)
 
-    f16_gguf_path = os.path.join(
-
+    f16_gguf_path = os.path.join(gguf_output_dir, "model-f16.gguf")
+    final_quantized_gguf_path = os.path.join(gguf_output_dir, "model.gguf")
 
     try:
         log_stream += "Executing llama.cpp convert.py script...\n"
-
+        # The convert script can take the model ID directly and will use the cache
+        convert_command = ["python3", str(LLAMA_CPP_CONVERT_SCRIPT), model_id, "--outfile", f16_gguf_path, "--outtype", "f16"]
         process = subprocess.run(convert_command, check=True, capture_output=True, text=True)
         log_stream += process.stdout
         if process.stderr: log_stream += f"[STDERR]\n{process.stderr}\n"
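
The ONNX path above now shells out to optimum-cli for both export and quantization; the Python-API imports that the first hunk removes (ORTQuantizer, AutoQuantizationConfig) did the same job in-process. For reference, a roughly equivalent dynamic-quantization step with that Python API is sketched below. It is illustrative only, not part of this commit, and the paths are hypothetical.

    from optimum.onnxruntime import ORTQuantizer
    from optimum.onnxruntime.configuration import AutoQuantizationConfig

    # Hypothetical paths: the directory written by `optimum-cli export onnx` and the output directory.
    onnx_base_path = "outputs/my-model-onnx-unquantized"
    quantized_path = "outputs/my-model-onnx-quantized"

    # Dynamic INT8 quantization tuned for AVX-512 CPUs, mirroring the `--avx512 --dynamic` flags above.
    quantizer = ORTQuantizer.from_pretrained(onnx_base_path)  # pass file_name=... if the directory holds several .onnx files
    dqconfig = AutoQuantizationConfig.avx512(is_static=False, per_channel=False)
    quantizer.quantize(save_dir=quantized_path, quantization_config=dqconfig)
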
@@ -141,24 +142,23 @@ def stage_3_4_gguf_quantize(model_path: str, model_id: str, quantization_strateg
 
         if target_quant_name == "F16":
             log_stream += "Target is F16, renaming file...\n"
-            os.rename(f16_gguf_path,
+            os.rename(f16_gguf_path, final_quantized_gguf_path)
         else:
             log_stream += f"Quantizing FP16 GGUF to {target_quant_name}...\n"
-
-            quantize_command = quantize_cmd_base + [f16_gguf_path, quantized_gguf_path, target_quant_name]
+            quantize_command = [str(LLAMA_CPP_QUANTIZE_SCRIPT), f16_gguf_path, final_quantized_gguf_path, target_quant_name]
             process = subprocess.run(quantize_command, check=True, capture_output=True, text=True)
             log_stream += process.stdout
             if process.stderr: log_stream += f"[STDERR]\n{process.stderr}\n"
-            os.remove(f16_gguf_path)
-        return
+            os.remove(f16_gguf_path)  # Clean up intermediate file
+        return gguf_output_dir, log_stream
     except subprocess.CalledProcessError as e:
         raise RuntimeError(f"Failed during llama.cpp execution. Error:\n{e.stderr}")
     except Exception as e:
         raise RuntimeError(f"An unexpected error occurred during GGUF conversion. Error: {e}")
 
 def stage_5_package_and_upload(model_id: str, optimized_model_path: str, pipeline_log: str, options: dict):
-    # This function remains correct
-    log_stream = "[STAGE
+    # This function remains mostly correct, just updated placeholder for pruning
+    log_stream = "[STAGE 4] Packaging and Uploading...\n"
     if not HF_TOKEN:
         return "Skipping upload: HF_TOKEN not found.", log_stream + "Skipping upload: HF_TOKEN not found."
     try:
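
As a quick local check of the model.gguf file produced by the GGUF path above, the artifact can be loaded with the llama-cpp-python bindings (an assumption on the reader's part; any GGUF-capable runtime works). A minimal smoke test, with a hypothetical output path:

    from llama_cpp import Llama  # pip install llama-cpp-python

    # Hypothetical path matching the gguf_output_dir layout used above.
    llm = Llama(model_path="outputs/my-model-gguf/model.gguf", n_ctx=512)
    result = llm("Hello from the optimized model:", max_tokens=16)
    print(result["choices"][0]["text"])
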
@@ -166,58 +166,44 @@ def stage_5_package_and_upload(model_id: str, optimized_model_path: str, pipelin
         repo_url = api.create_repo(repo_id=repo_name, exist_ok=True, token=HF_TOKEN)
         template_file = "model_card_template_gguf.md" if options['pipeline_type'] == "GGUF" else "model_card_template.md"
         with open(template_file, "r", encoding="utf-8") as f: template_content = f.read()
-
+        # Updated pruning status to be hardcoded as disabled
+        model_card_content = template_content.format(repo_name=repo_name, model_id=model_id, optimization_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"), pruning_status="Disabled", pruning_percent=0, quant_type=options.get('quant_type', 'N/A'), repo_id=repo_url.repo_id, pipeline_log=pipeline_log)
         with open(os.path.join(optimized_model_path, "README.md"), "w", encoding="utf-8") as f: f.write(model_card_content)
-        if options['pipeline_type'] == "ONNX":
-            AutoTokenizer.from_pretrained(model_id, trust_remote_code=True).save_pretrained(optimized_model_path)
         api.upload_folder(folder_path=optimized_model_path, repo_id=repo_url.repo_id, repo_type="model", token=HF_TOKEN)
-        log_stream += "Upload complete.\n"
+        log_stream += f"Upload complete to {repo_url.repo_id}.\n"
        return f"Success! Your optimized model is available at: huggingface.co/{repo_url.repo_id}", log_stream
     except Exception as e:
         raise RuntimeError(f"Failed to upload to the Hub. Error: {e}")
 
-def run_amop_pipeline(model_id: str, pipeline_type: str,
+def run_amop_pipeline(model_id: str, pipeline_type: str, onnx_quant_type: str, calibration_file, gguf_quant_type: str):
+    # REFACTORED: Removed pruning and in-memory model loading
     if not model_id:
         yield {log_output: "Please enter a Model ID.", final_output: "Idle"}
         return
 
-    initial_log = f"[START] AMOP {pipeline_type} Pipeline Initiated.\n"
+    initial_log = f"[START] AMOP {pipeline_type} Pipeline Initiated for model '{model_id}'.\n"
     yield {run_button: gr.Button(interactive=False, value="🚀 Running..."), analyze_button: gr.Button(interactive=False), final_output: f"RUNNING ({pipeline_type})", log_output: initial_log}
 
     full_log = initial_log
-    temp_model_dir = None
     try:
-        whoami = api.whoami()
+        whoami = api.whoami(token=HF_TOKEN)
         if not whoami: raise RuntimeError("Could not authenticate with Hugging Face Hub. Check your HF_TOKEN.")
         repo_id_for_link = f"{whoami['name']}/{model_id.split('/')[-1]}-amop-cpu-{pipeline_type.lower()}"
 
-
-        model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
-        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-        full_log += f"Successfully loaded '{model_id}'.\n"
-
-        full_log += "Pruning model...\n"; yield {final_output: "Pruning model (2/5)", log_output: full_log}
-        model, log = stage_2_prune_model(model, prune_percent if do_prune else 0)
-        full_log += log
-
-        temp_model_dir = tempfile.mkdtemp()
-        model.save_pretrained(temp_model_dir)
-        tokenizer.save_pretrained(temp_model_dir)
-        full_log += f"Saved intermediate model to {temp_model_dir}\n"
-
+        # The pipeline now has fewer, more robust steps
         if pipeline_type == "ONNX":
-            full_log += "
-            optimized_path, log = stage_3_4_onnx_quantize(
-            options = {'pipeline_type': 'ONNX', '
+            full_log += "Starting ONNX Conversion & Quantization...\n"; yield {final_output: "Converting to ONNX (1/3)", log_output: full_log}
+            optimized_path, log = stage_3_4_onnx_quantize(model_id, onnx_quant_type, calibration_file.name if onnx_quant_type == "Static" and calibration_file else None)
+            options = {'pipeline_type': 'ONNX', 'quant_type': onnx_quant_type}
         elif pipeline_type == "GGUF":
-            full_log += "
-            optimized_path, log = stage_3_4_gguf_quantize(
-            options = {'pipeline_type': 'GGUF', '
+            full_log += "Starting GGUF Conversion & Quantization...\n"; yield {final_output: "Converting to GGUF (1/3)", log_output: full_log}
+            optimized_path, log = stage_3_4_gguf_quantize(model_id, gguf_quant_type)
+            options = {'pipeline_type': 'GGUF', 'quant_type': gguf_quant_type}
         else:
             raise ValueError("Invalid pipeline type selected.")
         full_log += log
 
-        full_log += "Packaging & Uploading...\n"; yield {final_output: "Packaging & Uploading (
+        full_log += "Packaging & Uploading...\n"; yield {final_output: "Packaging & Uploading (2/3)", log_output: full_log}
         final_message, log = stage_5_package_and_upload(model_id, optimized_path, full_log, options)
         full_log += log
 
@@ -227,8 +213,11 @@ def run_amop_pipeline(model_id: str, pipeline_type: str, do_prune: bool, prune_p
         full_log += f"\n[ERROR] Pipeline failed: {e}"
         yield {final_output: gr.update(value="ERROR", label="Status"), log_output: full_log, success_box: gr.Markdown(f"❌ **An error occurred.** Check logs for details.", visible=True), run_button: gr.Button(interactive=True, value="Run Optimization Pipeline", variant="primary"), analyze_button: gr.Button(interactive=True, value="Analyze Model")}
     finally:
-
-
+        # Clean up entire output directory to save space
+        if os.path.exists(OUTPUT_DIR):
+            shutil.rmtree(OUTPUT_DIR)
+            os.makedirs(OUTPUT_DIR, exist_ok=True)
+
 
 # --- GRADIO UI ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
@@ -242,12 +231,15 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     with gr.Accordion("⚙️ 2. Configure Optimization", open=False) as optimization_accordion:
         analysis_report_output = gr.Markdown()
         pipeline_type_radio = gr.Radio(["ONNX", "GGUF"], label="Select Optimization Pipeline")
-
-
+        # Pruning is removed for stability on HF Spaces
+        # prune_checkbox = gr.Checkbox(label="Enable Pruning", value=False, info="Removes redundant weights.", visible=True)
+        # prune_slider = gr.Slider(minimum=0, maximum=90, value=20, step=5, label="Pruning Percentage (%)", visible=True)
+        gr.Markdown("<p style='color:grey;font-size:0.9em;'>Note: Pruning has been disabled to ensure stability on resource-constrained hardware.</p>")
         with gr.Group(visible=False) as onnx_options:
             gr.Markdown("#### ONNX Options")
-            onnx_quant_radio = gr.Radio(["Dynamic"
-
+            onnx_quant_radio = gr.Radio(["Dynamic"], label="Quantization Type", value="Dynamic", info="Static quantization is not supported in this version.")  # Simplified
+            # Hiding calibration for now as it adds complexity
+            # calibration_file_upload = gr.File(label="Upload Calibration Data (.txt)", visible=False, file_types=['.txt'])
         with gr.Group(visible=False) as gguf_options:
             gr.Markdown("#### GGUF Options")
             gguf_quant_dropdown = gr.Dropdown(["q4_k_m", "q5_k_m", "q8_0", "f16"], label="Quantization Strategy", value="q4_k_m")
@@ -260,13 +252,14 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
 
     def update_ui_for_pipeline(pipeline_type):
         return {onnx_options: gr.Group(visible=pipeline_type=="ONNX"), gguf_options: gr.Group(visible=pipeline_type=="GGUF")}
-    def update_ui_for_quant_type(quant_type):
-        return gr.File(visible=quant_type == "Static")
 
     pipeline_type_radio.change(fn=update_ui_for_pipeline, inputs=pipeline_type_radio, outputs=[onnx_options, gguf_options])
-    onnx_quant_radio.change(fn=update_ui_for_quant_type, inputs=onnx_quant_radio, outputs=[calibration_file_upload])
     analyze_button.click(fn=stage_1_analyze_model, inputs=[model_id_input], outputs=[log_output, analysis_report_output, optimization_accordion])
-
+    # MODIFIED: Removed pruning inputs from the click function
+    run_button.click(fn=run_amop_pipeline,
+                     inputs=[model_id_input, pipeline_type_radio, onnx_quant_radio, gr.State(None), gguf_quant_dropdown],  # Using gr.State(None) as placeholder for removed file upload
+                     outputs=[run_button, analyze_button, final_output, log_output, success_box])
 
 if __name__ == "__main__":
-
+    # IMPORTANT: Added .queue() for handling long-running jobs
+    demo.queue().launch(debug=True)
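
The final hunk wires the refactored run_amop_pipeline generator to run_button and launches the app with demo.queue(), which Gradio needs in order to stream the intermediate yield updates from long-running jobs. A minimal standalone illustration of that generator-plus-queue pattern (not the app itself, just the mechanism it relies on):

    import time
    import gradio as gr

    def long_job(name):
        # Each yield streams an intermediate status update to the output component.
        for step in range(1, 4):
            time.sleep(1)
            yield f"Step {step}/3 for {name}"

    with gr.Blocks() as demo:
        name_box = gr.Textbox(label="Name")
        status_box = gr.Textbox(label="Status")
        gr.Button("Run").click(fn=long_job, inputs=name_box, outputs=status_box)

    if __name__ == "__main__":
        demo.queue().launch()
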