Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -110,7 +110,7 @@ def stage_3_4_onnx_quantize(model_path: str, calibration_data_path: str):
|
|
| 110 |
logging.error(error_msg, exc_info=True)
|
| 111 |
raise RuntimeError(error_msg)
|
| 112 |
|
| 113 |
-
def stage_3_4_gguf_quantize(model_id: str, quantization_strategy: str):
|
| 114 |
log_stream = f"[STAGE 3 & 4] Converting to GGUF with '{quantization_strategy}' quantization...\n"
|
| 115 |
run_id = datetime.now().strftime("%Y%m%d-%H%M%S")
|
| 116 |
model_name = model_id.replace('/', '_')
|
|
@@ -119,13 +119,13 @@ def stage_3_4_gguf_quantize(model_id: str, quantization_strategy: str):
|
|
| 119 |
output_file = os.path.join(gguf_path, "model.gguf")
|
| 120 |
|
| 121 |
try:
|
| 122 |
-
log_stream += "Executing `optimum-cli
|
| 123 |
export_command = [
|
| 124 |
-
"optimum-cli",
|
| 125 |
-
"--model",
|
| 126 |
-
"--
|
| 127 |
"--trust-remote-code",
|
| 128 |
-
output_file
|
| 129 |
]
|
| 130 |
process = subprocess.run(export_command, check=True, capture_output=True, text=True)
|
| 131 |
log_stream += process.stdout
|
|
@@ -133,7 +133,7 @@ def stage_3_4_gguf_quantize(model_id: str, quantization_strategy: str):
|
|
| 133 |
log_stream += f"Successfully exported and quantized model to GGUF at: {gguf_path}\n"
|
| 134 |
return gguf_path, log_stream
|
| 135 |
except subprocess.CalledProcessError as e:
|
| 136 |
-
error_msg = f"Failed during `optimum-cli
|
| 137 |
logging.error(error_msg)
|
| 138 |
raise RuntimeError(error_msg)
|
| 139 |
|
|
@@ -201,37 +201,36 @@ def run_amop_pipeline(model_id: str, pipeline_type: str, do_prune: bool, prune_p
|
|
| 201 |
raise RuntimeError("Could not authenticate with Hugging Face Hub. Check your HF_TOKEN.")
|
| 202 |
repo_id_for_link = f"{whoami['name']}/{model_id.split('/')[-1]}{repo_name_suffix}"
|
| 203 |
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
full_log += f"Successfully loaded base model '{model_id}'.\n"
|
| 210 |
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
|
|
|
| 223 |
yield {final_output: "Converting to ONNX (3/5)", log_output: full_log}
|
| 224 |
calib_path = calibration_file.name if onnx_quant_type == "Static" and calibration_file else None
|
| 225 |
optimized_path, log = stage_3_4_onnx_quantize(temp_model_dir, calib_path)
|
| 226 |
full_log += log
|
| 227 |
options = {'pipeline_type': 'ONNX', 'prune': do_prune, 'prune_percent': prune_percent, 'quant_type': onnx_quant_type}
|
| 228 |
-
|
| 229 |
elif pipeline_type == "GGUF":
|
| 230 |
-
full_log += "[STAGE 1 & 2] Loading and Pruning are skipped for GGUF pipeline.\n"
|
| 231 |
yield {final_output: "Converting to GGUF (3/5)", log_output: full_log}
|
| 232 |
-
optimized_path, log = stage_3_4_gguf_quantize(model_id, gguf_quant_type)
|
| 233 |
full_log += log
|
| 234 |
-
options = {'pipeline_type': 'GGUF', 'quant_type': gguf_quant_type}
|
| 235 |
|
| 236 |
else:
|
| 237 |
raise ValueError("Invalid pipeline type selected.")
|
|
@@ -289,13 +288,15 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 289 |
|
| 290 |
with gr.Group(visible=False) as onnx_options:
|
| 291 |
gr.Markdown("#### ONNX Pipeline Options")
|
| 292 |
-
|
| 293 |
-
|
| 294 |
onnx_quant_radio = gr.Radio(["Dynamic", "Static"], label="ONNX Quantization Type", value="Dynamic", info="Static may offer better performance but requires calibration data.")
|
| 295 |
calibration_file_upload = gr.File(label="Upload Calibration Data (.txt)", visible=False, file_types=['.txt'])
|
| 296 |
|
| 297 |
with gr.Group(visible=False) as gguf_options:
|
| 298 |
gr.Markdown("#### GGUF Pipeline Options")
|
|
|
|
|
|
|
| 299 |
gguf_quant_dropdown = gr.Dropdown(
|
| 300 |
["q4_k_m", "q5_k_m", "q8_0", "f16"],
|
| 301 |
label="GGUF Quantization Strategy",
|
|
@@ -311,14 +312,29 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 311 |
success_box = gr.Markdown(visible=False)
|
| 312 |
log_output = gr.Textbox(label="Live Logs", lines=20, interactive=False, max_lines=20)
|
| 313 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
def update_ui_for_pipeline(pipeline_type):
|
|
|
|
|
|
|
| 315 |
return {
|
| 316 |
-
onnx_options: gr.Group(visible=
|
| 317 |
-
gguf_options: gr.Group(visible=
|
| 318 |
}
|
| 319 |
|
| 320 |
def update_ui_for_quant_type(quant_type):
|
| 321 |
return gr.File(visible=quant_type == "Static")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 322 |
|
| 323 |
pipeline_type_radio.change(fn=update_ui_for_pipeline, inputs=pipeline_type_radio, outputs=[onnx_options, gguf_options])
|
| 324 |
onnx_quant_radio.change(fn=update_ui_for_quant_type, inputs=onnx_quant_radio, outputs=[calibration_file_upload])
|
|
@@ -330,8 +346,13 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 330 |
)
|
| 331 |
|
| 332 |
run_button.click(
|
| 333 |
-
fn=
|
| 334 |
-
inputs=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 335 |
outputs=[run_button, analyze_button, final_output, log_output, success_box]
|
| 336 |
)
|
| 337 |
|
|
|
|
| 110 |
logging.error(error_msg, exc_info=True)
|
| 111 |
raise RuntimeError(error_msg)
|
| 112 |
|
| 113 |
+
def stage_3_4_gguf_quantize(model_path: str, model_id: str, quantization_strategy: str):
|
| 114 |
log_stream = f"[STAGE 3 & 4] Converting to GGUF with '{quantization_strategy}' quantization...\n"
|
| 115 |
run_id = datetime.now().strftime("%Y%m%d-%H%M%S")
|
| 116 |
model_name = model_id.replace('/', '_')
|
|
|
|
| 119 |
output_file = os.path.join(gguf_path, "model.gguf")
|
| 120 |
|
| 121 |
try:
|
| 122 |
+
log_stream += "Executing `optimum-gguf-cli` via subprocess...\n"
|
| 123 |
export_command = [
|
| 124 |
+
"optimum-gguf-cli",
|
| 125 |
+
"--model", model_path,
|
| 126 |
+
"--quantization-strategy", quantization_strategy,
|
| 127 |
"--trust-remote-code",
|
| 128 |
+
"--output", output_file
|
| 129 |
]
|
| 130 |
process = subprocess.run(export_command, check=True, capture_output=True, text=True)
|
| 131 |
log_stream += process.stdout
|
|
|
|
| 133 |
log_stream += f"Successfully exported and quantized model to GGUF at: {gguf_path}\n"
|
| 134 |
return gguf_path, log_stream
|
| 135 |
except subprocess.CalledProcessError as e:
|
| 136 |
+
error_msg = f"Failed during `optimum-gguf-cli`. Error:\n{e.stderr}"
|
| 137 |
logging.error(error_msg)
|
| 138 |
raise RuntimeError(error_msg)
|
| 139 |
|
|
|
|
| 201 |
raise RuntimeError("Could not authenticate with Hugging Face Hub. Check your HF_TOKEN.")
|
| 202 |
repo_id_for_link = f"{whoami['name']}/{model_id.split('/')[-1]}{repo_name_suffix}"
|
| 203 |
|
| 204 |
+
full_log += "Loading base model...\n"
|
| 205 |
+
yield {final_output: "Loading model (1/5)", log_output: full_log}
|
| 206 |
+
model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
|
| 207 |
+
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
|
| 208 |
+
full_log += f"Successfully loaded base model '{model_id}'.\n"
|
|
|
|
| 209 |
|
| 210 |
+
yield {final_output: "Pruning model (2/5)", log_output: full_log}
|
| 211 |
+
if do_prune:
|
| 212 |
+
model, log = stage_2_prune_model(model, prune_percent)
|
| 213 |
+
full_log += log
|
| 214 |
+
else:
|
| 215 |
+
full_log += "[STAGE 2] Pruning skipped by user.\n"
|
| 216 |
|
| 217 |
+
temp_model_dir = tempfile.mkdtemp()
|
| 218 |
+
model.save_pretrained(temp_model_dir)
|
| 219 |
+
tokenizer.save_pretrained(temp_model_dir)
|
| 220 |
+
full_log += f"Saved intermediate model to temporary directory: {temp_model_dir}\n"
|
| 221 |
+
|
| 222 |
+
if pipeline_type == "ONNX":
|
| 223 |
yield {final_output: "Converting to ONNX (3/5)", log_output: full_log}
|
| 224 |
calib_path = calibration_file.name if onnx_quant_type == "Static" and calibration_file else None
|
| 225 |
optimized_path, log = stage_3_4_onnx_quantize(temp_model_dir, calib_path)
|
| 226 |
full_log += log
|
| 227 |
options = {'pipeline_type': 'ONNX', 'prune': do_prune, 'prune_percent': prune_percent, 'quant_type': onnx_quant_type}
|
| 228 |
+
|
| 229 |
elif pipeline_type == "GGUF":
|
|
|
|
| 230 |
yield {final_output: "Converting to GGUF (3/5)", log_output: full_log}
|
| 231 |
+
optimized_path, log = stage_3_4_gguf_quantize(temp_model_dir, model_id, gguf_quant_type)
|
| 232 |
full_log += log
|
| 233 |
+
options = {'pipeline_type': 'GGUF', 'prune': do_prune, 'prune_percent': prune_percent, 'quant_type': gguf_quant_type}
|
| 234 |
|
| 235 |
else:
|
| 236 |
raise ValueError("Invalid pipeline type selected.")
|
|
|
|
| 288 |
|
| 289 |
with gr.Group(visible=False) as onnx_options:
|
| 290 |
gr.Markdown("#### ONNX Pipeline Options")
|
| 291 |
+
prune_checkbox_onnx = gr.Checkbox(label="Enable Pruning", value=False, info="Removes redundant weights. Applied before ONNX conversion.")
|
| 292 |
+
prune_slider_onnx = gr.Slider(minimum=0, maximum=90, value=20, step=5, label="Pruning Percentage (%)")
|
| 293 |
onnx_quant_radio = gr.Radio(["Dynamic", "Static"], label="ONNX Quantization Type", value="Dynamic", info="Static may offer better performance but requires calibration data.")
|
| 294 |
calibration_file_upload = gr.File(label="Upload Calibration Data (.txt)", visible=False, file_types=['.txt'])
|
| 295 |
|
| 296 |
with gr.Group(visible=False) as gguf_options:
|
| 297 |
gr.Markdown("#### GGUF Pipeline Options")
|
| 298 |
+
prune_checkbox_gguf = gr.Checkbox(label="Enable Pruning", value=False, info="Removes redundant weights. Applied before GGUF conversion.")
|
| 299 |
+
prune_slider_gguf = gr.Slider(minimum=0, maximum=90, value=20, step=5, label="Pruning Percentage (%)")
|
| 300 |
gguf_quant_dropdown = gr.Dropdown(
|
| 301 |
["q4_k_m", "q5_k_m", "q8_0", "f16"],
|
| 302 |
label="GGUF Quantization Strategy",
|
|
|
|
| 312 |
success_box = gr.Markdown(visible=False)
|
| 313 |
log_output = gr.Textbox(label="Live Logs", lines=20, interactive=False, max_lines=20)
|
| 314 |
|
| 315 |
+
# Consolidate pruning controls and pass the correct one based on pipeline type
|
| 316 |
+
# This requires a small change in the main run function and the Gradio UI setup
|
| 317 |
+
prune_checkbox = gr.Checkbox(label="Enable Pruning", value=False, info="Removes redundant weights from the model.", visible=False)
|
| 318 |
+
prune_slider = gr.Slider(minimum=0, maximum=90, value=20, step=5, label="Pruning Percentage (%)", visible=False)
|
| 319 |
+
|
| 320 |
def update_ui_for_pipeline(pipeline_type):
|
| 321 |
+
is_onnx = pipeline_type == "ONNX"
|
| 322 |
+
is_gguf = pipeline_type == "GGUF"
|
| 323 |
return {
|
| 324 |
+
onnx_options: gr.Group(visible=is_onnx),
|
| 325 |
+
gguf_options: gr.Group(visible=is_gguf)
|
| 326 |
}
|
| 327 |
|
| 328 |
def update_ui_for_quant_type(quant_type):
|
| 329 |
return gr.File(visible=quant_type == "Static")
|
| 330 |
+
|
| 331 |
+
def run_wrapper(model_id, pipeline_type, onnx_prune, onnx_prune_p, gguf_prune, gguf_prune_p, onnx_quant, calib_file, gguf_quant):
|
| 332 |
+
# Logic to decide which pruning values to use
|
| 333 |
+
do_prune = onnx_prune if pipeline_type == "ONNX" else gguf_prune
|
| 334 |
+
prune_percent = onnx_prune_p if pipeline_type == "ONNX" else gguf_prune_p
|
| 335 |
+
|
| 336 |
+
# Call the generator-based pipeline runner
|
| 337 |
+
yield from run_amop_pipeline(model_id, pipeline_type, do_prune, prune_percent, onnx_quant, calib_file, gguf_quant)
|
| 338 |
|
| 339 |
pipeline_type_radio.change(fn=update_ui_for_pipeline, inputs=pipeline_type_radio, outputs=[onnx_options, gguf_options])
|
| 340 |
onnx_quant_radio.change(fn=update_ui_for_quant_type, inputs=onnx_quant_radio, outputs=[calibration_file_upload])
|
|
|
|
| 346 |
)
|
| 347 |
|
| 348 |
run_button.click(
|
| 349 |
+
fn=run_wrapper,
|
| 350 |
+
inputs=[
|
| 351 |
+
model_id_input, pipeline_type_radio,
|
| 352 |
+
prune_checkbox_onnx, prune_slider_onnx,
|
| 353 |
+
prune_checkbox_gguf, prune_slider_gguf,
|
| 354 |
+
onnx_quant_radio, calibration_file_upload, gguf_quant_dropdown
|
| 355 |
+
],
|
| 356 |
outputs=[run_button, analyze_button, final_output, log_output, success_box]
|
| 357 |
)
|
| 358 |
|