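"""AMOP: Adaptive Model Optimization Pipeline.

A Gradio Space that analyzes a Hugging Face model, optionally prunes it,
converts and quantizes it through either an ONNX (optimum/onnxruntime) or a
GGUF (llama.cpp) pipeline, and uploads the packaged result back to the Hub.
"""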
import gradio as gr
import torch
import os
import logging
import time
import tempfile
import shutil
import subprocess
from datetime import datetime
from pathlib import Path
from huggingface_hub import HfApi
from transformers import AutoConfig, AutoModel, AutoTokenizer
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig
import torch.nn.utils.prune as prune

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    logging.warning("HF_TOKEN environment variable not set. Packaging and uploading will fail.")

api = HfApi()
OUTPUT_DIR = "optimized_models"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# The Dockerfile guarantees these files exist, so we just define the paths.
LLAMA_CPP_DIR = Path("llama.cpp")
LLAMA_CPP_CONVERT_SCRIPT = LLAMA_CPP_DIR / "convert.py"
LLAMA_CPP_QUANTIZE_SCRIPT = LLAMA_CPP_DIR / "quantize"

# Verify that the build was successful during startup
if not LLAMA_CPP_QUANTIZE_SCRIPT.exists():
    error_msg = "FATAL ERROR: llama.cpp binaries not found. The Docker build may have failed."
    logging.error(error_msg)
    raise RuntimeError(error_msg)

def stage_1_analyze_model(model_id: str):
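    """Stage 1: Fetch the model's config from the Hub and suggest a pipeline.

    Decoder-style architectures (llama, gpt, mistral, gemma) are steered
    towards the GGUF pipeline; other architectures towards ONNX. Returns a
    (log, markdown report, accordion update) tuple for the Gradio UI.
    """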
    log_stream = "[STAGE 1] Analyzing model...\n"
    try:
        config = AutoConfig.from_pretrained(model_id, trust_remote_code=True, token=HF_TOKEN)
        model_type = config.model_type
        analysis_report = f"""### Model Analysis Report\n- **Model ID:** `{model_id}`\n- **Architecture:** `{model_type}`"""
        if any(arch in model_type for arch in ('llama', 'gpt', 'mistral', 'gemma')):
            recommendation = "**Recommendation:** This is a Large Language Model (LLM). For the best CPU performance, the **GGUF Pipeline** (using llama.cpp) is highly recommended."
        else:
            recommendation = "**Recommendation:** This is likely an encoder model. The **ONNX Pipeline** is recommended."
        log_stream += f"Analysis complete. Architecture: {model_type}.\n"
        return log_stream, analysis_report + "\n" + recommendation, gr.Accordion(open=True)
    except Exception as e:
        error_msg = f"Failed to analyze model '{model_id}'. Error: {e}"
        logging.error(error_msg)
        return log_stream + error_msg, "Could not analyze model.", gr.Accordion(open=False)

def stage_2_prune_model(model, prune_percentage: float):
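    """Stage 2: Apply L1 unstructured pruning to every nn.Linear layer.

    prune.remove() makes the pruning permanent so the sparsified weights are
    what gets saved. Returns the (possibly untouched) model and a log string.
    """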
    if prune_percentage == 0:
        return model, "Skipped pruning as percentage was 0."
    log_stream = "[STAGE 2] Pruning model...\n"
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            prune.l1_unstructured(module, name='weight', amount=prune_percentage / 100.0)
            prune.remove(module, 'weight')
    log_stream += f"Pruning complete with {prune_percentage}% target.\n"
    return model, log_stream

def stage_3_4_onnx_quantize(model_path_or_id: str, onnx_quant_type: str, calibration_data_path: str):
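    """Stages 3 & 4 (ONNX path): export via `optimum-cli export onnx`, then
    apply dynamic quantization with ORTQuantizer using the AVX512-VNNI config.

    Also copies the tokenizer into the output folder if it is missing.
    Returns the quantized model directory and a log string.
    """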
    log_stream = "[STAGE 3 & 4] Converting to ONNX and Quantizing...\n"
    run_id = datetime.now().strftime("%Y%m%d-%H%M%S")
    model_name = model_path_or_id.split('/')[-1]
    onnx_path = os.path.join(OUTPUT_DIR, f"{model_name}-{run_id}-onnx")
    try:
        export_command = ["optimum-cli", "export", "onnx", "--model", model_path_or_id, "--trust-remote-code", onnx_path]
        process = subprocess.run(export_command, check=True, capture_output=True, text=True)
        log_stream += f"Executing `optimum-cli export onnx` for '{model_path_or_id}'...\n{process.stdout}\n"
        if process.stderr: log_stream += f"[STDERR]\n{process.stderr}\n"
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"Failed during `optimum-cli export onnx`. Error:\n{e.stderr}")
    try:
        quantizer = ORTQuantizer.from_pretrained(onnx_path)
        log_stream += "Performing DYNAMIC quantization...\n"
        dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)
        quantized_path = os.path.join(onnx_path, "quantized-dynamic")
        quantizer.quantize(save_dir=quantized_path, quantization_config=dqconfig)
        log_stream += f"Successfully quantized model to: {quantized_path}\n"
        if not os.path.exists(os.path.join(quantized_path, 'tokenizer_config.json')):
            AutoTokenizer.from_pretrained(model_path_or_id, trust_remote_code=True).save_pretrained(quantized_path)
            log_stream += "Saved new tokenizer files.\n"
        return quantized_path, log_stream
    except Exception as e:
        raise RuntimeError(f"Failed during ONNX quantization step. Error: {e}")

def stage_3_4_gguf_quantize(model_path_or_id: str, original_model_id: str, quantization_strategy: str):
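    """Stages 3 & 4 (GGUF path): convert the model to an FP16 GGUF file with
    llama.cpp's convert.py, then quantize it with the `quantize` binary (or
    just rename the FP16 file when the target is F16).

    Returns the output directory containing model.gguf and a log string.
    """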
    log_stream = "[STAGE 3 & 4] Converting to GGUF using llama.cpp...\n"
    run_id = datetime.now().strftime("%Y%m%d-%H%M%S")
    model_name = original_model_id.replace('/', '_')
    # Use absolute paths for outputs to avoid issues with changing working directories
    gguf_path = os.path.abspath(os.path.join(OUTPUT_DIR, f"{model_name}-{run_id}-gguf"))
    os.makedirs(gguf_path, exist_ok=True)
    f16_gguf_path = os.path.join(gguf_path, "model-f16.gguf")
    quantized_gguf_path = os.path.join(gguf_path, "model.gguf")
    # Use absolute path for model input if it's a local directory
    absolute_model_path = os.path.abspath(model_path_or_id) if os.path.exists(model_path_or_id) else model_path_or_id
    try:
        convert_command = ["python3", "convert.py", absolute_model_path, "--outfile", f16_gguf_path, "--outtype", "f16"]
        process = subprocess.run(convert_command, check=True, capture_output=True, text=True, cwd=str(LLAMA_CPP_DIR))
        log_stream += f"Executing llama.cpp conversion script...\n{process.stdout}\n"
        if process.stderr: log_stream += f"[STDERR]\n{process.stderr}\n"
        quantize_map = {"q4_k_m": "Q4_K_M", "q5_k_m": "Q5_K_M", "q8_0": "Q8_0", "f16": "F16"}
        target_quant_name = quantize_map.get(quantization_strategy.lower(), "Q4_K_M")
        if target_quant_name == "F16":
            log_stream += "Target is F16, renaming file...\n"
            os.rename(f16_gguf_path, quantized_gguf_path)
        else:
            log_stream += f"Quantizing FP16 GGUF to {target_quant_name}...\n"
            quantize_command = ["./quantize", f16_gguf_path, quantized_gguf_path, target_quant_name]
            process = subprocess.run(quantize_command, check=True, capture_output=True, text=True, cwd=str(LLAMA_CPP_DIR))
            log_stream += f"{process.stdout}\n"
            if process.stderr: log_stream += f"[STDERR]\n{process.stderr}\n"
            os.remove(f16_gguf_path)
        return gguf_path, log_stream
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"Failed during llama.cpp execution. Error:\n{e.stderr}")
    except Exception as e:
        raise RuntimeError(f"An unexpected error occurred during GGUF conversion. Error: {e}")

def stage_5_package_and_upload(model_id: str, optimized_model_path: str, pipeline_log: str, options: dict):
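    """Stage 5: Create (or reuse) a Hub repo, render the model card template
    into README.md, and upload the optimized model folder.

    Skips the upload gracefully when HF_TOKEN is not set.
    """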
    log_stream = "[STAGE 5] Packaging and Uploading...\n"
    if not HF_TOKEN:
        return "Skipping upload: HF_TOKEN not found.", log_stream + "Skipping upload: HF_TOKEN not found."
    try:
        repo_name = f"{model_id.split('/')[-1]}-amop-cpu-{options['pipeline_type'].lower()}"
        repo_url = api.create_repo(repo_id=repo_name, exist_ok=True, token=HF_TOKEN)
        template_file = "model_card_template_gguf.md" if options['pipeline_type'] == "GGUF" else "model_card_template.md"
        with open(template_file, "r", encoding="utf-8") as f: template_content = f.read()
        model_card_content = template_content.format(repo_name=repo_name, model_id=model_id, optimization_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"), pruning_status="Enabled" if options.get('prune', False) else "Disabled", pruning_percent=options.get('prune_percent', 0), quant_type=options.get('quant_type', 'N/A'), repo_id=repo_url.repo_id, pipeline_log=pipeline_log)
        with open(os.path.join(optimized_model_path, "README.md"), "w", encoding="utf-8") as f: f.write(model_card_content)
        api.upload_folder(folder_path=optimized_model_path, repo_id=repo_url.repo_id, repo_type="model", token=HF_TOKEN)
        log_stream += "Upload complete.\n"
        return f"Success! Your optimized model is available at: huggingface.co/{repo_url.repo_id}", log_stream
    except Exception as e:
        raise RuntimeError(f"Failed to upload to the Hub. Error: {e}")

def run_amop_pipeline(model_id: str, pipeline_type: str, do_prune: bool, prune_percent: float, onnx_quant_type: str, calibration_file, gguf_quant_type: str):
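    """Orchestrate the full pipeline as a Gradio generator.

    Yields incremental UI updates (status label, live log, buttons) around
    each stage: optional pruning, ONNX or GGUF conversion/quantization, and
    packaging/upload. Any temporary pruned checkpoint is removed in `finally`.
    """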
    if not model_id:
        yield {log_output: "Please enter a Model ID.", final_output: "Idle"}
        return
    initial_log = f"[START] AMOP {pipeline_type} Pipeline Initiated for '{model_id}'.\n"
    yield {run_button: gr.Button(interactive=False, value="πŸš€ Running..."), analyze_button: gr.Button(interactive=False), final_output: f"RUNNING ({pipeline_type})", log_output: initial_log}
    full_log = initial_log
    temp_model_dir = None
    model_path_or_id = model_id
    try:
        whoami = api.whoami(token=HF_TOKEN)
        if not whoami: raise RuntimeError("Could not authenticate with Hugging Face Hub. Check your HF_TOKEN.")
        repo_id_for_link = f"{whoami['name']}/{model_id.split('/')[-1]}-amop-cpu-{pipeline_type.lower()}"
        if do_prune and prune_percent > 0:
            full_log += f"\n[WARNING] Pruning is memory-intensive and may fail for large models.\n"
            yield {final_output: "Loading model (1/5)", log_output: full_log}
            model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
            tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
            full_log += f"Successfully loaded '{model_id}'.\n"
            yield {final_output: "Pruning model (2/5)", log_output: full_log}
            model, log = stage_2_prune_model(model, prune_percent)
            full_log += log
            temp_model_dir = tempfile.mkdtemp()
            model.save_pretrained(temp_model_dir)
            tokenizer.save_pretrained(temp_model_dir)
            model_path_or_id = temp_model_dir
            full_log += f"Saved intermediate pruned model to {temp_model_dir}\n"
        else:
            full_log += "Pruning skipped.\n"
        if pipeline_type == "ONNX":
            yield {final_output: "Converting to ONNX (3/5)", log_output: full_log}
            optimized_path, log = stage_3_4_onnx_quantize(model_path_or_id, onnx_quant_type, calibration_file.name if onnx_quant_type == "Static" and calibration_file else None)
            options = {'pipeline_type': 'ONNX', 'prune': do_prune, 'prune_percent': prune_percent, 'quant_type': onnx_quant_type}
        elif pipeline_type == "GGUF":
            yield {final_output: "Converting to GGUF (3/5)", log_output: full_log}
            optimized_path, log = stage_3_4_gguf_quantize(model_path_or_id, model_id, gguf_quant_type)
            options = {'pipeline_type': 'GGUF', 'prune': do_prune, 'prune_percent': prune_percent, 'quant_type': gguf_quant_type}
        else:
            raise ValueError("Invalid pipeline type selected.")
        full_log += log
        yield {final_output: "Packaging & Uploading (4/5)", log_output: full_log}
        final_message, log = stage_5_package_and_upload(model_id, optimized_path, full_log, options)
        full_log += log
        yield {final_output: gr.update(value="SUCCESS", label="Status"), log_output: full_log, success_box: gr.Markdown(f"βœ… **Success!** Model available: [{repo_id_for_link}](https://huggingface.co/{repo_id_for_link})", visible=True), run_button: gr.Button(interactive=True, value="Run Optimization Pipeline", variant="primary"), analyze_button: gr.Button(interactive=True, value="Analyze Model")}
    except Exception as e:
        logging.error(f"AMOP Pipeline failed. Error: {e}", exc_info=True)
        full_log += f"\n[ERROR] Pipeline failed: {e}"
        yield {final_output: gr.update(value="ERROR", label="Status"), log_output: full_log, success_box: gr.Markdown(f"❌ **An error occurred.** Check logs for details.", visible=True), run_button: gr.Button(interactive=True, value="Run Optimization Pipeline", variant="primary"), analyze_button: gr.Button(interactive=True, value="Analyze Model")}
    finally:
        if temp_model_dir and os.path.exists(temp_model_dir):
            shutil.rmtree(temp_model_dir)

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# πŸš€ AMOP: Adaptive Model Optimization Pipeline")
    if not HF_TOKEN: gr.Warning("HF_TOKEN not set! The final 'upload' step will be skipped.")
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 1. Select a Model")
            model_id_input = gr.Textbox(label="Hugging Face Model ID", placeholder="e.g., gpt2, google/gemma-2b")
            analyze_button = gr.Button("πŸ” Analyze Model", variant="secondary")
            with gr.Accordion("βš™οΈ 2. Configure Optimization", open=False) as optimization_accordion:
                analysis_report_output = gr.Markdown()
                pipeline_type_radio = gr.Radio(["ONNX", "GGUF"], label="Select Optimization Pipeline")
                gr.Warning("Pruning requires high RAM and may fail for models >2B parameters on free Spaces.")
                prune_checkbox = gr.Checkbox(label="Enable Pruning (Optional)", value=False, info="Removes redundant weights before quantization.")
                prune_slider = gr.Slider(minimum=0, maximum=90, value=20, step=5, label="Pruning Percentage (%)", visible=True)
                with gr.Group(visible=False) as onnx_options:
                    gr.Markdown("#### ONNX Options")
                    onnx_quant_radio = gr.Radio(["Dynamic"], label="Quantization Type", value="Dynamic", info="Static quantization via UI is not supported.")
                    calibration_file_upload = gr.File(visible=False)
                with gr.Group(visible=False) as gguf_options:
                    gr.Markdown("#### GGUF Options")
                    gguf_quant_dropdown = gr.Dropdown(["q4_k_m", "q5_k_m", "q8_0", "f16"], label="Quantization Strategy", value="q4_k_m")
                run_button = gr.Button("πŸš€ Run Optimization Pipeline", variant="primary")
        with gr.Column(scale=2):
            gr.Markdown("### Pipeline Status & Logs")
            final_output = gr.Label(value="Idle", label="Status")
            success_box = gr.Markdown(visible=False)
            log_output = gr.Textbox(label="Live Logs", lines=20, interactive=False)
    def update_ui_for_pipeline(pipeline_type):
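        # Show only the options group that matches the selected pipeline.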
        return {onnx_options: gr.Group(visible=pipeline_type=="ONNX"), gguf_options: gr.Group(visible=pipeline_type=="GGUF")}
    pipeline_type_radio.change(fn=update_ui_for_pipeline, inputs=pipeline_type_radio, outputs=[onnx_options, gguf_options])
    analyze_button.click(fn=stage_1_analyze_model, inputs=[model_id_input], outputs=[log_output, analysis_report_output, optimization_accordion])
    run_button.click(fn=run_amop_pipeline, 
                     inputs=[model_id_input, pipeline_type_radio, prune_checkbox, prune_slider, onnx_quant_radio, calibration_file_upload, gguf_quant_dropdown], 
                     outputs=[run_button, analyze_button, final_output, log_output, success_box])

if __name__ == "__main__":
    demo.queue().launch(debug=True)