broadfield-dev committed
Commit 32de6da · verified · 1 Parent(s): dc10655

Create app.py

Files changed (1)
  1. app.py +237 -0
app.py ADDED
@@ -0,0 +1,237 @@
import gradio as gr
import torch
import os
import logging
from datetime import datetime
from huggingface_hub import HfApi, HfFolder
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AutoModel
from optimum.onnxruntime import ORTQuantizer, ORTModelForCausalLM
from optimum.onnxruntime.configuration import AutoQuantizationConfig
from optimum.onnx import export
from optimum.onnx.utils import get_preprocessor
from datasets import load_dataset
import torch.nn.utils.prune as prune
import numpy as np
import time
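
# Dependency note (sketch, not pinned): the imports above assume gradio, torch, transformers,
# optimum[onnxruntime], huggingface_hub, datasets and numpy are installed, e.g. via requirements.txt.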

# --- 1. SETUP AND CONFIGURATION ---

# Setup basic logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Ensure the user has set their Hugging Face token in the Space secrets
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    logging.warning("HF_TOKEN environment variable not set. Packaging and uploading will fail.")
    # For testing locally, you can uncomment the next line and set your token
    # HfFolder.save_token('YOUR_HF_WRITE_TOKEN')

api = HfApi()
OUTPUT_DIR = "optimized_models"
os.makedirs(OUTPUT_DIR, exist_ok=True)


# --- 2. AMOP CORE PIPELINE FUNCTIONS ---

def stage_1_analyze_model(model_id: str):
    """
    Performs Stage 1: Adaptive Model Analysis.
    Loads the model's configuration and recommends an optimization strategy.
    """
    log_stream = "[STAGE 1] Analyzing model...\n"
    try:
        config = AutoConfig.from_pretrained(model_id)
        model_type = config.model_type
        # Rough transformer-block estimate (~12 * layers * hidden_size^2), ignoring embeddings
        num_params = 12 * getattr(config, "num_hidden_layers", 0) * getattr(config, "hidden_size", 0) ** 2 / 1e6

        analysis_report = f"""
### Model Analysis Report
- **Model ID:** `{model_id}`
- **Architecture:** `{model_type}`
- **Estimated Parameters:** ~{num_params:.2f}M
"""

        # Recommendation Logic
        recommendation = ""
        if 'llama' in model_type or 'gpt' in model_type or 'mistral' in model_type:
            recommendation = "**Recommendation:** This is a large language model (LLM). For best CPU performance, a GGUF-based quantization strategy is typically state-of-the-art. This initial version of AMOP focuses on the ONNX pipeline. The recommended path is **Quantization -> ONNX Conversion**."
        elif 'bert' in model_type or 'roberta' in model_type:
            recommendation = "**Recommendation:** This is an encoder model. The full AMOP pipeline is recommended for a balance of size and performance: **Pruning -> Quantization -> ONNX Conversion**."
        elif 'vit' in model_type:
            recommendation = "**Recommendation:** This is a Vision Transformer. The recommended path is **Quantization -> ONNX Conversion**. Pruning may be less effective."
        else:
            recommendation = "**Recommendation:** Unrecognized architecture. The standard path of **Quantization -> ONNX Conversion** is a safe starting point."

        log_stream += f"Analysis complete. Architecture: {model_type}.\n"
        return log_stream, analysis_report + "\n" + recommendation, gr.update(visible=True)
    except Exception as e:
        error_msg = f"Failed to analyze model '{model_id}'. Error: {e}"
        logging.error(error_msg)
        return log_stream + error_msg, "Could not analyze model. Please check the model ID and try again.", gr.update(visible=False)
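
# Example of how the branches in stage_1_analyze_model resolve (sketch; model_type values
# come from each model's transformers config):
#     "gpt2"              -> model_type "gpt2" -> LLM path (Quantization -> ONNX)
#     "bert-base-uncased" -> model_type "bert" -> encoder path (Pruning -> Quantization -> ONNX)
#     anything unmatched  -> default path (Quantization -> ONNX)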


def stage_2_prune_model(model, prune_percentage: float, progress):
    """
    Performs Stage 2: Structural Reduction via one-shot unstructured pruning.
    """
    if prune_percentage == 0:
        return model, "Skipped pruning as percentage was 0."

    log_stream = "[STAGE 2] Pruning model...\n"
    progress(0.25, desc="Applying Unstructured Pruning")

    total_params = sum(p.numel() for p in model.parameters())

    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            prune.l1_unstructured(module, name='weight', amount=prune_percentage / 100.0)
            prune.remove(module, 'weight')  # Makes the pruning permanent

    # Unstructured pruning zeroes weights rather than removing them, so report sparsity
    # by counting the weights that are now zero instead of comparing parameter counts.
    nonzero_params = sum(torch.count_nonzero(p).item() for p in model.parameters())
    reduction = (total_params - nonzero_params) / total_params * 100

    log_stream += f"Pruning complete. Approximate weight sparsity: ~{reduction:.2f}%\n"
    return model, log_stream
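
# Illustrative sketch (not executed anywhere): what l1_unstructured does to one layer.
# Pruning 30% of a throwaway nn.Linear zeroes its smallest-magnitude weights in place:
#
#     layer = torch.nn.Linear(128, 128)
#     prune.l1_unstructured(layer, name="weight", amount=0.3)
#     prune.remove(layer, "weight")          # bake the zeros into layer.weight
#     (layer.weight == 0).float().mean()     # ~0.30 sparsity
#
# The tensors keep their original shapes, so file size only shrinks after a sparse-aware
# export or compression step.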


def stage_3_and_4_quantize_and_onnx(model_id: str, model, progress):
    """
    Performs Stage 3 (Quantization) and Stage 4 (ONNX Conversion).
    This version uses post-training dynamic quantization.
    """
    log_stream = "[STAGE 3 & 4] Converting to ONNX and Quantizing...\n"
    progress(0.5, desc="Exporting to ONNX")

    try:
        # Define a unique path for this run
        run_id = datetime.now().strftime("%Y%m%d-%H%M%S")
        onnx_path = os.path.join(OUTPUT_DIR, f"{model_id.replace('/', '_')}-{run_id}-onnx")
        os.makedirs(onnx_path, exist_ok=True)
        onnx_model_path = os.path.join(onnx_path, "model.onnx")

        # Export the base model to ONNX
        # Using a trick to get the task for optimum
        config = AutoConfig.from_pretrained(model_id)
        task = getattr(config, "task_specific_params", None)
        task = "default" if task is None else list(task.keys())[0] if isinstance(task, dict) else "default"

        # Load preprocessor for ONNX export
        preprocessor = get_preprocessor(model_id)

        # This is a key step where we need to find the correct OnnxConfig
        # Optimum has utilities, but for a general case, we try our best
        from optimum.exporters.onnx import main_export
        main_export(model_id, output=onnx_path, task="auto", trust_remote_code=True)

        log_stream += f"Successfully exported base model to ONNX at: {onnx_path}\n"

        # Quantize the ONNX model
        progress(0.7, desc="Applying Dynamic Quantization")
        quantizer = ORTQuantizer.from_pretrained(onnx_path)
        dqconfig = AutoQuantizationConfig.arm64(is_static=False, per_channel=False)  # Dynamic quantization

        quantized_path = os.path.join(onnx_path, "quantized")
        quantizer.quantize(save_dir=quantized_path, quantization_config=dqconfig)

        log_stream += f"Successfully quantized model to: {quantized_path}\n"
        return quantized_path, log_stream

    except Exception as e:
        error_msg = f"Failed during ONNX conversion/quantization. Error: {e}"
        logging.error(error_msg, exc_info=True)
        raise RuntimeError(error_msg)
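
# Reload sketch (assumes a causal-LM export; other architectures need the matching
# ORTModelFor* class):
#
#     model = ORTModelForCausalLM.from_pretrained(quantized_path)
#
# AutoQuantizationConfig also offers avx2 / avx512_vnni presets, which typically match
# x86 CPU hosts (such as Hugging Face Spaces) better than the arm64 preset used above.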


def stage_5_evaluate_and_package(
    model_id: str,
    optimized_model_path: str,
    pipeline_log: str,
    options: dict,
    progress
):
    """
    Performs Stage 5: Evaluation, Packaging, and Uploading.
    """
    log_stream = "[STAGE 5] Evaluating and Packaging...\n"
    progress(0.9, desc="Evaluating performance")

    # Simple evaluation: load the model and measure latency
    try:
        ort_model = ORTModelForCausalLM.from_pretrained(optimized_model_path)
        tokenizer = AutoTokenizer.from_pretrained(model_id)

        prompt = "My name is Philipp and I"
        inputs = tokenizer(prompt, return_tensors="pt")

        start_time = time.time()
        gen_tokens = ort_model.generate(**inputs, max_new_tokens=20)
        end_time = time.time()

        latency = (end_time - start_time) * 1000  # in ms
        # Count only newly generated tokens so the per-token figure reflects generation speed
        num_new_tokens = gen_tokens.shape[1] - inputs["input_ids"].shape[1]
        ms_per_token = latency / max(num_new_tokens, 1)

        eval_report = f"- **Inference Latency:** {latency:.2f} ms\n"
        eval_report += f"- **Speed:** {ms_per_token:.2f} ms/token\n"
        log_stream += "Evaluation complete.\n"
    except Exception as e:
        eval_report = f"- **Evaluation Failed:** Could not load and test the ONNX model. This often happens if the base model is not a text-generation model. Error: {e}\n"
        log_stream += f"Warning: Evaluation failed. {e}\n"
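
    # Worked example for the metric above (sketch): 20 new tokens generated in 800 ms
    # would report latency = 800.00 ms and speed = 40.00 ms/token.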

    # Package and upload
    progress(0.95, desc="Uploading to Hugging Face Hub")

    if not HF_TOKEN:
        return "Skipping upload: HF_TOKEN not found.", log_stream + "Skipping upload: HF_TOKEN not found."

    try:
        # Create a new repo
        repo_name = f"{model_id.split('/')[-1]}-amop-cpu"
        repo_url = api.create_repo(
            repo_id=repo_name,
            exist_ok=True,
            token=HF_TOKEN
        )

        # Generate the Model Card (README.md)
        model_card_content = f"""
---
license: mit
tags:
- amop-optimized
- onnx
---

# AMOP-Optimized CPU Model: {repo_name}

This model was automatically optimized for CPU inference using the **Adaptive Model Optimization Pipeline (AMOP)**.

- **Base Model:** [{model_id}](https://huggingface.co/{model_id})
- **Optimization Date:** {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}

## Optimization Details

The following AMOP stages were applied:
- **Stage 2: Pruning:** {"Enabled" if options['prune'] else "Disabled"} (Percentage: {options['prune_percent']}%)
- **Stage 3 & 4: Quantization & ONNX Conversion:** Enabled (Dynamic Quantization)

## Performance Metrics

{eval_report}

## How to Use

This model is in ONNX format and can be run with `optimum-onnxruntime`.

```python
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoTokenizer

model_id = "{repo_url.repo_id}"
model = ORTModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

prompt = "The future of AI is"
inputs = tokenizer(prompt, return_tensors="pt")
gen_tokens = model.generate(**inputs)
print(tokenizer.batch_decode(gen_tokens))