Spaces:

broadfield-dev
/

AMOP

Paused

App Files Files Community

broadfield-dev committed on Sep 1

Commit

32de6da

verified ·

1 Parent(s): dc10655

Create app.py

Browse files

Files changed (1) hide show

app.py +237 -0

app.py ADDED Viewed

	@@ -0,0 +1,237 @@

+import gradio as gr
+import torch
+import os
+import logging
+from datetime import datetime
+from huggingface_hub import HfApi, HfFolder
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AutoModel
+from optimum.onnxruntime import ORTQuantizer, ORTModelForCausalLM
+from optimum.onnxruntime.configuration import AutoQuantizationConfig
+from optimum.onnx import export
+from optimum.onnx.utils import get_preprocessor
+from datasets import load_dataset
+import torch.nn.utils.prune as prune
+import numpy as np
+import time
+# --- 1. SETUP AND CONFIGURATION ---
+# Setup basic logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+# Ensure the user has set their Hugging Face token in the Space secrets
+HF_TOKEN = os.getenv("HF_TOKEN")
+if not HF_TOKEN:
+    logging.warning("HF_TOKEN environment variable not set. Packaging and uploading will fail.")
+    # For testing locally, you can uncomment the next line and set your token
+    # HfFolder.save_token('YOUR_HF_WRITE_TOKEN')
+api = HfApi()
+OUTPUT_DIR = "optimized_models"
+os.makedirs(OUTPUT_DIR, exist_ok=True)
+# --- 2. AMOP CORE PIPELINE FUNCTIONS ---
+def stage_1_analyze_model(model_id: str):
+    """
+    Performs Stage 1: Adaptive Model Analysis.
+    Loads the model's configuration and recommends an optimization strategy.
+    """
+    log_stream = "[STAGE 1] Analyzing model...\n"
+    try:
+        config = AutoConfig.from_pretrained(model_id)
+        model_type = config.model_type
+        num_params = getattr(config, "num_hidden_layers", "N/A") * getattr(config, "hidden_size", 0) / 1e6 # A rough estimate
+        analysis_report = f"""
+        ### Model Analysis Report
+        - **Model ID:** `{model_id}`
+        - **Architecture:** `{model_type}`
+        - **Estimated Parameters:** ~{num_params:.2f}M
+        """
+        # Recommendation Logic
+        recommendation = ""
+        if 'llama' in model_type or 'gpt' in model_type or 'mistral' in model_type:
+            recommendation = "**Recommendation:** This is a large language model (LLM). For best CPU performance, a GGUF-based quantization strategy is typically state-of-the-art. This initial version of AMOP focuses on the ONNX pipeline. The recommended path is **Quantization -> ONNX Conversion**."
+        elif 'bert' in model_type or 'roberta' in model_type:
+            recommendation = "**Recommendation:** This is an encoder model. The full AMOP pipeline is recommended for a balance of size and performance: **Pruning -> Quantization -> ONNX Conversion**."
+        elif 'vit' in model_type:
+             recommendation = "**Recommendation:** This is a Vision Transformer. The recommended path is **Quantization -> ONNX Conversion**. Pruning may be less effective."
+        else:
+            recommendation = "**Recommendation:** Unrecognized architecture. The standard path of **Quantization -> ONNX Conversion** is a safe starting point."
+        log_stream += f"Analysis complete. Architecture: {model_type}.\n"
+        return log_stream, analysis_report + "\n" + recommendation, gr.update(visible=True)
+    except Exception as e:
+        error_msg = f"Failed to analyze model '{model_id}'. Error: {e}"
+        logging.error(error_msg)
+        return log_stream + error_msg, "Could not analyze model. Please check the model ID and try again.", gr.update(visible=False)
+def stage_2_prune_model(model, prune_percentage: float, progress):
+    """
+    Performs Stage 2: Structural Reduction via one-shot unstructured pruning.
+    """
+    if prune_percentage == 0:
+        return model, "Skipped pruning as percentage was 0."
+    log_stream = "[STAGE 2] Pruning model...\n"
+    progress(0.25, desc="Applying Unstructured Pruning")
+    total_params = sum(p.numel() for p in model.parameters())
+    for name, module in model.named_modules():
+        if isinstance(module, torch.nn.Linear):
+            prune.l1_unstructured(module, name='weight', amount=prune_percentage / 100.0)
+            prune.remove(module, 'weight') # Makes the pruning permanent
+    pruned_params = sum(p.numel() for p in model.parameters())
+    reduction = (total_params - pruned_params) / total_params * 100
+    log_stream += f"Pruning complete. Parameter reduction: ~{reduction:.2f}%\n"
+    return model, log_stream
+def stage_3_and_4_quantize_and_onnx(model_id: str, model, progress):
+    """
+    Performs Stage 3 (Quantization) and Stage 4 (ONNX Conversion).
+    This version uses post-training dynamic quantization.
+    """
+    log_stream = "[STAGE 3 & 4] Converting to ONNX and Quantizing...\n"
+    progress(0.5, desc="Exporting to ONNX")
+    try:
+        # Define a unique path for this run
+        run_id = datetime.now().strftime("%Y%m%d-%H%M%S")
+        onnx_path = os.path.join(OUTPUT_DIR, f"{model_id.replace('/', '_')}-{run_id}-onnx")
+        os.makedirs(onnx_path, exist_ok=True)
+        onnx_model_path = os.path.join(onnx_path, "model.onnx")
+        # Export the base model to ONNX
+        # Using a trick to get the task for optimum
+        config = AutoConfig.from_pretrained(model_id)
+        task = getattr(config, "task_specific_params", None)
+        task = "default" if task is None else list(task.keys())[0] if isinstance(task, dict) else "default"
+        # Load preprocessor for ONNX export
+        preprocessor = get_preprocessor(model_id)
+        # This is a key step where we need to find the correct OnnxConfig
+        # Optimum has utilities, but for a general case, we try our best
+        from optimum.exporters.onnx import main_export
+        main_export(model_id, output=onnx_path, task="auto", trust_remote_code=True)
+        log_stream += f"Successfully exported base model to ONNX at: {onnx_path}\n"
+        # Quantize the ONNX model
+        progress(0.7, desc="Applying Dynamic Quantization")
+        quantizer = ORTQuantizer.from_pretrained(onnx_path)
+        dqconfig = AutoQuantizationConfig.arm64(is_static=False, per_channel=False) # Dynamic quantization
+        quantized_path = os.path.join(onnx_path, "quantized")
+        quantizer.quantize(save_dir=quantized_path, quantization_config=dqconfig)
+        log_stream += f"Successfully quantized model to: {quantized_path}\n"
+        return quantized_path, log_stream
+    except Exception as e:
+        error_msg = f"Failed during ONNX conversion/quantization. Error: {e}"
+        logging.error(error_msg, exc_info=True)
+        raise RuntimeError(error_msg)
+def stage_5_evaluate_and_package(
+    model_id: str,
+    optimized_model_path: str,
+    pipeline_log: str,
+    options: dict,
+    progress
+):
+    """
+    Performs Stage 5: Evaluation, Packaging, and Uploading.
+    """
+    log_stream = "[STAGE 5] Evaluating and Packaging...\n"
+    progress(0.9, desc="Evaluating performance")
+    # Simple evaluation: Load the model and measure latency
+    try:
+        ort_model = ORTModelForCausalLM.from_pretrained(optimized_model_path)
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        prompt = "My name is Philipp and I"
+        inputs = tokenizer(prompt, return_tensors="pt")
+        start_time = time.time()
+        gen_tokens = ort_model.generate(**inputs, max_new_tokens=20)
+        end_time = time.time()
+        latency = (end_time - start_time) * 1000 # in ms
+        num_tokens = len(gen_tokens[0])
+        ms_per_token = latency / num_tokens
+        eval_report = f"- **Inference Latency:** {latency:.2f} ms\n"
+        eval_report += f"- **Speed:** {ms_per_token:.2f} ms/token\n"
+        log_stream += "Evaluation complete.\n"
+    except Exception as e:
+        eval_report = f"- **Evaluation Failed:** Could not load and test the ONNX model. This often happens if the base model is not a text-generation model. Error: {e}\n"
+        log_stream += f"Warning: Evaluation failed. {e}\n"
+    # Package and upload
+    progress(0.95, desc="Uploading to Hugging Face Hub")
+    if not HF_TOKEN:
+        return "Skipping upload: HF_TOKEN not found.", log_stream + "Skipping upload: HF_TOKEN not found."
+    try:
+        # Create a new repo
+        repo_name = f"{model_id.split('/')[-1]}-amop-cpu"
+        repo_url = api.create_repo(
+            repo_id=repo_name,
+            exist_ok=True,
+            token=HF_TOKEN
+        )
+        # Generate the Model Card (README.md)
+        model_card_content = f"""
+---
+license: mit
+tags:
+- amop-optimized
+- onnx
+---
+# AMOP-Optimized CPU Model: {repo_name}
+This model was automatically optimized for CPU inference using the **Adaptive Model Optimization Pipeline (AMOP)**.
+- **Base Model:** [{model_id}](https://huggingface.co/{model_id})
+- **Optimization Date:** {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
+## Optimization Details
+The following AMOP stages were applied:
+- **Stage 2: Pruning:** {"Enabled" if options['prune'] else "Disabled"} (Percentage: {options['prune_percent']}%)
+- **Stage 3 & 4: Quantization & ONNX Conversion:** Enabled (Dynamic Quantization)
+## Performance Metrics
+{eval_report}
+## How to Use
+This model is in ONNX format and can be run with `optimum-onnxruntime`.
+```python
+from optimum.onnxruntime import ORTModelForCausalLM
+from transformers import AutoTokenizer
+model_id = "{repo_url.repo_id}"
+model = ORTModelForCausalLM.from_pretrained(model_id)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+prompt = "The future of AI is"
+inputs = tokenizer(prompt, return_tensors="pt")
+gen_tokens = model.generate(**inputs)
+print(tokenizer.batch_decode(gen_tokens))