Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -11,6 +11,7 @@ from huggingface_hub import HfApi
|
|
| 11 |
from transformers import AutoConfig, AutoModel, AutoTokenizer
|
| 12 |
from optimum.onnxruntime import ORTQuantizer
|
| 13 |
from optimum.onnxruntime.configuration import AutoQuantizationConfig
|
|
|
|
| 14 |
import torch.nn.utils.prune as prune
|
| 15 |
|
| 16 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
|
@@ -119,24 +120,21 @@ def stage_3_4_gguf_quantize(model_path: str, model_id: str, quantization_strateg
|
|
| 119 |
output_file = os.path.join(gguf_path, "model.gguf")
|
| 120 |
|
| 121 |
try:
|
| 122 |
-
log_stream += "
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
]
|
| 130 |
-
process = subprocess.run(export_command, check=True, capture_output=True, text=True)
|
| 131 |
-
log_stream += process.stdout
|
| 132 |
-
if process.stderr: log_stream += f"[STDERR]\n{process.stderr}\n"
|
| 133 |
log_stream += f"Successfully exported and quantized model to GGUF at: {gguf_path}\n"
|
| 134 |
return gguf_path, log_stream
|
| 135 |
-
except
|
| 136 |
-
error_msg = f"Failed during
|
| 137 |
-
logging.error(error_msg)
|
| 138 |
raise RuntimeError(error_msg)
|
| 139 |
|
|
|
|
| 140 |
def stage_5_package_and_upload(model_id: str, optimized_model_path: str, pipeline_log: str, options: dict):
|
| 141 |
log_stream = "[STAGE 5] Packaging and Uploading...\n"
|
| 142 |
if not HF_TOKEN:
|
|
@@ -286,17 +284,17 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 286 |
["ONNX", "GGUF"], label="Select Optimization Pipeline", info="GGUF is recommended for LLMs, ONNX for others."
|
| 287 |
)
|
| 288 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
with gr.Group(visible=False) as onnx_options:
|
| 290 |
-
gr.Markdown("#### ONNX
|
| 291 |
-
prune_checkbox_onnx = gr.Checkbox(label="Enable Pruning", value=False, info="Removes redundant weights. Applied before ONNX conversion.")
|
| 292 |
-
prune_slider_onnx = gr.Slider(minimum=0, maximum=90, value=20, step=5, label="Pruning Percentage (%)")
|
| 293 |
onnx_quant_radio = gr.Radio(["Dynamic", "Static"], label="ONNX Quantization Type", value="Dynamic", info="Static may offer better performance but requires calibration data.")
|
| 294 |
calibration_file_upload = gr.File(label="Upload Calibration Data (.txt)", visible=False, file_types=['.txt'])
|
| 295 |
|
| 296 |
with gr.Group(visible=False) as gguf_options:
|
| 297 |
-
gr.Markdown("#### GGUF
|
| 298 |
-
prune_checkbox_gguf = gr.Checkbox(label="Enable Pruning", value=False, info="Removes redundant weights. Applied before GGUF conversion.")
|
| 299 |
-
prune_slider_gguf = gr.Slider(minimum=0, maximum=90, value=20, step=5, label="Pruning Percentage (%)")
|
| 300 |
gguf_quant_dropdown = gr.Dropdown(
|
| 301 |
["q4_k_m", "q5_k_m", "q8_0", "f16"],
|
| 302 |
label="GGUF Quantization Strategy",
|
|
@@ -312,31 +310,21 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 312 |
success_box = gr.Markdown(visible=False)
|
| 313 |
log_output = gr.Textbox(label="Live Logs", lines=20, interactive=False, max_lines=20)
|
| 314 |
|
| 315 |
-
# Consolidate pruning controls and pass the correct one based on pipeline type
|
| 316 |
-
# This requires a small change in the main run function and the Gradio UI setup
|
| 317 |
-
prune_checkbox = gr.Checkbox(label="Enable Pruning", value=False, info="Removes redundant weights from the model.", visible=False)
|
| 318 |
-
prune_slider = gr.Slider(minimum=0, maximum=90, value=20, step=5, label="Pruning Percentage (%)", visible=False)
|
| 319 |
-
|
| 320 |
def update_ui_for_pipeline(pipeline_type):
    """Build visibility updates for the per-pipeline option groups.

    Returns a dict keyed by the ``onnx_options`` and ``gguf_options``
    components. Each value is a ``gr.Group`` whose ``visible`` flag is True
    only when ``pipeline_type`` is exactly "ONNX" or "GGUF" respectively;
    any other value hides both groups.
    """
    updates = {}
    # Exact string match: a group is shown only for its own pipeline.
    updates[onnx_options] = gr.Group(visible=(pipeline_type == "ONNX"))
    updates[gguf_options] = gr.Group(visible=(pipeline_type == "GGUF"))
    return updates
|
| 327 |
|
| 328 |
def update_ui_for_quant_type(quant_type):
    """Return a ``gr.File`` update that is visible only for "Static".

    The result is routed to ``calibration_file_upload`` by the
    ``onnx_quant_radio.change`` listener, so the upload widget appears only
    when the quantization type is exactly "Static".
    """
    show_upload = quant_type == "Static"
    return gr.File(visible=show_upload)
|
| 330 |
-
|
| 331 |
-
def run_wrapper(model_id, pipeline_type, onnx_prune, onnx_prune_p, gguf_prune, gguf_prune_p, onnx_quant, calib_file, gguf_quant):
    """Pick the pruning settings for the chosen pipeline and run it.

    When ``pipeline_type`` is "ONNX" the ONNX pruning flag and percentage
    are used; for any other value the GGUF pair is used. The selected pair
    is forwarded, together with the remaining arguments, to
    ``run_amop_pipeline``.

    This is a generator: it yields everything ``run_amop_pipeline`` yields.
    """
    if pipeline_type == "ONNX":
        selected_pruning = (onnx_prune, onnx_prune_p)
    else:
        selected_pruning = (gguf_prune, gguf_prune_p)
    pruning_enabled, pruning_percent = selected_pruning

    # Delegate to the pipeline generator so its updates pass straight through.
    yield from run_amop_pipeline(
        model_id,
        pipeline_type,
        pruning_enabled,
        pruning_percent,
        onnx_quant,
        calib_file,
        gguf_quant,
    )
|
| 338 |
|
| 339 |
-
pipeline_type_radio.change(fn=update_ui_for_pipeline, inputs=pipeline_type_radio, outputs=[onnx_options, gguf_options])
|
| 340 |
onnx_quant_radio.change(fn=update_ui_for_quant_type, inputs=onnx_quant_radio, outputs=[calibration_file_upload])
|
| 341 |
|
| 342 |
analyze_button.click(
|
|
@@ -346,11 +334,10 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 346 |
)
|
| 347 |
|
| 348 |
run_button.click(
|
| 349 |
-
fn=
|
| 350 |
inputs=[
|
| 351 |
model_id_input, pipeline_type_radio,
|
| 352 |
-
|
| 353 |
-
prune_checkbox_gguf, prune_slider_gguf,
|
| 354 |
onnx_quant_radio, calibration_file_upload, gguf_quant_dropdown
|
| 355 |
],
|
| 356 |
outputs=[run_button, analyze_button, final_output, log_output, success_box]
|
|
|
|
| 11 |
from transformers import AutoConfig, AutoModel, AutoTokenizer
|
| 12 |
from optimum.onnxruntime import ORTQuantizer
|
| 13 |
from optimum.onnxruntime.configuration import AutoQuantizationConfig
|
| 14 |
+
from optimum.exporters.gguf import main_export as gguf_export
|
| 15 |
import torch.nn.utils.prune as prune
|
| 16 |
|
| 17 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
|
|
|
| 120 |
output_file = os.path.join(gguf_path, "model.gguf")
|
| 121 |
|
| 122 |
try:
|
| 123 |
+
log_stream += "Calling `optimum.exporters.gguf.main_export` programmatically...\n"
|
| 124 |
+
gguf_export(
|
| 125 |
+
model_id_or_path=model_path,
|
| 126 |
+
output=output_file,
|
| 127 |
+
quantization_strategy=quantization_strategy,
|
| 128 |
+
trust_remote_code=True
|
| 129 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
log_stream += f"Successfully exported and quantized model to GGUF at: {gguf_path}\n"
|
| 131 |
return gguf_path, log_stream
|
| 132 |
+
except Exception as e:
|
| 133 |
+
error_msg = f"Failed during GGUF conversion. Error: {e}"
|
| 134 |
+
logging.error(error_msg, exc_info=True)
|
| 135 |
raise RuntimeError(error_msg)
|
| 136 |
|
| 137 |
+
|
| 138 |
def stage_5_package_and_upload(model_id: str, optimized_model_path: str, pipeline_log: str, options: dict):
|
| 139 |
log_stream = "[STAGE 5] Packaging and Uploading...\n"
|
| 140 |
if not HF_TOKEN:
|
|
|
|
| 284 |
["ONNX", "GGUF"], label="Select Optimization Pipeline", info="GGUF is recommended for LLMs, ONNX for others."
|
| 285 |
)
|
| 286 |
|
| 287 |
+
# Unified Pruning controls, shown/hidden by parent group
|
| 288 |
+
prune_checkbox = gr.Checkbox(label="Enable Pruning", value=False, info="Removes redundant weights from the model.")
|
| 289 |
+
prune_slider = gr.Slider(minimum=0, maximum=90, value=20, step=5, label="Pruning Percentage (%)")
|
| 290 |
+
|
| 291 |
with gr.Group(visible=False) as onnx_options:
|
| 292 |
+
gr.Markdown("#### ONNX Quantization Options")
|
|
|
|
|
|
|
| 293 |
onnx_quant_radio = gr.Radio(["Dynamic", "Static"], label="ONNX Quantization Type", value="Dynamic", info="Static may offer better performance but requires calibration data.")
|
| 294 |
calibration_file_upload = gr.File(label="Upload Calibration Data (.txt)", visible=False, file_types=['.txt'])
|
| 295 |
|
| 296 |
with gr.Group(visible=False) as gguf_options:
|
| 297 |
+
gr.Markdown("#### GGUF Quantization Options")
|
|
|
|
|
|
|
| 298 |
gguf_quant_dropdown = gr.Dropdown(
|
| 299 |
["q4_k_m", "q5_k_m", "q8_0", "f16"],
|
| 300 |
label="GGUF Quantization Strategy",
|
|
|
|
| 310 |
success_box = gr.Markdown(visible=False)
|
| 311 |
log_output = gr.Textbox(label="Live Logs", lines=20, interactive=False, max_lines=20)
|
| 312 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 313 |
def update_ui_for_pipeline(pipeline_type):
    """Build visibility updates for the option groups and pruning controls.

    Returns a dict keyed by component:

    * ``onnx_options`` -> ``gr.Group`` visible only when ``pipeline_type``
      is "ONNX".
    * ``gguf_options`` -> ``gr.Group`` visible only when ``pipeline_type``
      is "GGUF".
    * ``prune_checkbox`` / ``prune_slider`` -> visible when either of the
      two pipelines is selected.
    """
    show_onnx = pipeline_type == "ONNX"
    show_gguf = pipeline_type == "GGUF"

    updates = {}
    updates[onnx_options] = gr.Group(visible=show_onnx)
    updates[gguf_options] = gr.Group(visible=show_gguf)
    # The shared pruning controls apply to both pipelines, so they are shown
    # as soon as one of them is selected.
    updates[prune_checkbox] = gr.Checkbox(visible=show_onnx or show_gguf)
    updates[prune_slider] = gr.Slider(visible=show_onnx or show_gguf)
    return updates
|
| 323 |
|
| 324 |
def update_ui_for_quant_type(quant_type):
    """Return a ``gr.File`` update that is visible only for "Static".

    The result is routed to ``calibration_file_upload`` by the
    ``onnx_quant_radio.change`` listener, so the upload widget appears only
    when the quantization type is exactly "Static".
    """
    show_upload = quant_type == "Static"
    return gr.File(visible=show_upload)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 326 |
|
| 327 |
+
pipeline_type_radio.change(fn=update_ui_for_pipeline, inputs=pipeline_type_radio, outputs=[onnx_options, gguf_options, prune_checkbox, prune_slider])
|
| 328 |
onnx_quant_radio.change(fn=update_ui_for_quant_type, inputs=onnx_quant_radio, outputs=[calibration_file_upload])
|
| 329 |
|
| 330 |
analyze_button.click(
|
|
|
|
| 334 |
)
|
| 335 |
|
| 336 |
run_button.click(
|
| 337 |
+
fn=run_amop_pipeline,
|
| 338 |
inputs=[
|
| 339 |
model_id_input, pipeline_type_radio,
|
| 340 |
+
prune_checkbox, prune_slider,
|
|
|
|
| 341 |
onnx_quant_radio, calibration_file_upload, gguf_quant_dropdown
|
| 342 |
],
|
| 343 |
outputs=[run_button, analyze_button, final_output, log_output, success_box]
|