Spaces:

broadfield-dev
/

AMOP

Paused

App Files Files Community

broadfield-dev committed on Sep 14

Commit

0ed6d3e

verified ·

1 Parent(s): e5fb1ab

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -38

app.py CHANGED Viewed

@@ -11,6 +11,7 @@ from huggingface_hub import HfApi
 from transformers import AutoConfig, AutoModel, AutoTokenizer
 from optimum.onnxruntime import ORTQuantizer
 from optimum.onnxruntime.configuration import AutoQuantizationConfig
 import torch.nn.utils.prune as prune
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -119,24 +120,21 @@ def stage_3_4_gguf_quantize(model_path: str, model_id: str, quantization_strateg
     output_file = os.path.join(gguf_path, "model.gguf")
     try:
-        log_stream += "Executing `optimum-gguf-cli` via subprocess...\n"
-        export_command = [
-            "optimum-gguf-cli",
-            "--model", model_path,
-            "--quantization-strategy", quantization_strategy,
-            "--trust-remote-code",
-            "--output", output_file
-        ]
-        process = subprocess.run(export_command, check=True, capture_output=True, text=True)
-        log_stream += process.stdout
-        if process.stderr: log_stream += f"[STDERR]\n{process.stderr}\n"
         log_stream += f"Successfully exported and quantized model to GGUF at: {gguf_path}\n"
         return gguf_path, log_stream
-    except subprocess.CalledProcessError as e:
-        error_msg = f"Failed during `optimum-gguf-cli`. Error:\n{e.stderr}"
-        logging.error(error_msg)
         raise RuntimeError(error_msg)
 def stage_5_package_and_upload(model_id: str, optimized_model_path: str, pipeline_log: str, options: dict):
     log_stream = "[STAGE 5] Packaging and Uploading...\n"
     if not HF_TOKEN:
@@ -286,17 +284,17 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                     ["ONNX", "GGUF"], label="Select Optimization Pipeline", info="GGUF is recommended for LLMs, ONNX for others."
                 )
                 with gr.Group(visible=False) as onnx_options:
-                    gr.Markdown("#### ONNX Pipeline Options")
-                    prune_checkbox_onnx = gr.Checkbox(label="Enable Pruning", value=False, info="Removes redundant weights. Applied before ONNX conversion.")
-                    prune_slider_onnx = gr.Slider(minimum=0, maximum=90, value=20, step=5, label="Pruning Percentage (%)")
                     onnx_quant_radio = gr.Radio(["Dynamic", "Static"], label="ONNX Quantization Type", value="Dynamic", info="Static may offer better performance but requires calibration data.")
                     calibration_file_upload = gr.File(label="Upload Calibration Data (.txt)", visible=False, file_types=['.txt'])
                 with gr.Group(visible=False) as gguf_options:
-                    gr.Markdown("#### GGUF Pipeline Options")
-                    prune_checkbox_gguf = gr.Checkbox(label="Enable Pruning", value=False, info="Removes redundant weights. Applied before GGUF conversion.")
-                    prune_slider_gguf = gr.Slider(minimum=0, maximum=90, value=20, step=5, label="Pruning Percentage (%)")
                     gguf_quant_dropdown = gr.Dropdown(
                         ["q4_k_m", "q5_k_m", "q8_0", "f16"],
                         label="GGUF Quantization Strategy",
@@ -312,31 +310,21 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             success_box = gr.Markdown(visible=False)
             log_output = gr.Textbox(label="Live Logs", lines=20, interactive=False, max_lines=20)
-    # Consolidate pruning controls and pass the correct one based on pipeline type
-    # This requires a small change in the main run function and the Gradio UI setup
-    prune_checkbox = gr.Checkbox(label="Enable Pruning", value=False, info="Removes redundant weights from the model.", visible=False)
-    prune_slider = gr.Slider(minimum=0, maximum=90, value=20, step=5, label="Pruning Percentage (%)", visible=False)
     def update_ui_for_pipeline(pipeline_type):
         is_onnx = pipeline_type == "ONNX"
         is_gguf = pipeline_type == "GGUF"
         return {
             onnx_options: gr.Group(visible=is_onnx),
-            gguf_options: gr.Group(visible=is_gguf)
         }
     def update_ui_for_quant_type(quant_type):
         """Return a ``gr.File`` update that is visible only when ``quant_type`` is "Static"."""
         return gr.File(visible=quant_type == "Static")
-    def run_wrapper(model_id, pipeline_type, onnx_prune, onnx_prune_p, gguf_prune, gguf_prune_p, onnx_quant, calib_file, gguf_quant):
-        # Logic to decide which pruning values to use
-        do_prune = onnx_prune if pipeline_type == "ONNX" else gguf_prune
-        prune_percent = onnx_prune_p if pipeline_type == "ONNX" else gguf_prune_p
-        # Call the generator-based pipeline runner
-        yield from run_amop_pipeline(model_id, pipeline_type, do_prune, prune_percent, onnx_quant, calib_file, gguf_quant)
-    pipeline_type_radio.change(fn=update_ui_for_pipeline, inputs=pipeline_type_radio, outputs=[onnx_options, gguf_options])
     onnx_quant_radio.change(fn=update_ui_for_quant_type, inputs=onnx_quant_radio, outputs=[calibration_file_upload])
     analyze_button.click(
@@ -346,11 +334,10 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     )
     run_button.click(
-        fn=run_wrapper,
         inputs=[
             model_id_input, pipeline_type_radio,
-            prune_checkbox_onnx, prune_slider_onnx,
-            prune_checkbox_gguf, prune_slider_gguf,
             onnx_quant_radio, calibration_file_upload, gguf_quant_dropdown
         ],
         outputs=[run_button, analyze_button, final_output, log_output, success_box]

 from transformers import AutoConfig, AutoModel, AutoTokenizer
 from optimum.onnxruntime import ORTQuantizer
 from optimum.onnxruntime.configuration import AutoQuantizationConfig
+from optimum.exporters.gguf import main_export as gguf_export
 import torch.nn.utils.prune as prune
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
     output_file = os.path.join(gguf_path, "model.gguf")
     try:
+        log_stream += "Calling `optimum.exporters.gguf.main_export` programmatically...\n"
+        gguf_export(
+            model_id_or_path=model_path,
+            output=output_file,
+            quantization_strategy=quantization_strategy,
+            trust_remote_code=True
+        )
         log_stream += f"Successfully exported and quantized model to GGUF at: {gguf_path}\n"
         return gguf_path, log_stream
+    except Exception as e:
+        error_msg = f"Failed during GGUF conversion. Error: {e}"
+        logging.error(error_msg, exc_info=True)
         raise RuntimeError(error_msg)
 def stage_5_package_and_upload(model_id: str, optimized_model_path: str, pipeline_log: str, options: dict):
     log_stream = "[STAGE 5] Packaging and Uploading...\n"
     if not HF_TOKEN:
                     ["ONNX", "GGUF"], label="Select Optimization Pipeline", info="GGUF is recommended for LLMs, ONNX for others."
                 )
+                # Unified pruning controls shared by both pipelines; visibility is toggled by update_ui_for_pipeline
+                prune_checkbox = gr.Checkbox(label="Enable Pruning", value=False, info="Removes redundant weights from the model.")
+                prune_slider = gr.Slider(minimum=0, maximum=90, value=20, step=5, label="Pruning Percentage (%)")
                 with gr.Group(visible=False) as onnx_options:
+                    gr.Markdown("#### ONNX Quantization Options")
                     onnx_quant_radio = gr.Radio(["Dynamic", "Static"], label="ONNX Quantization Type", value="Dynamic", info="Static may offer better performance but requires calibration data.")
                     calibration_file_upload = gr.File(label="Upload Calibration Data (.txt)", visible=False, file_types=['.txt'])
                 with gr.Group(visible=False) as gguf_options:
+                    gr.Markdown("#### GGUF Quantization Options")
                     gguf_quant_dropdown = gr.Dropdown(
                         ["q4_k_m", "q5_k_m", "q8_0", "f16"],
                         label="GGUF Quantization Strategy",
             success_box = gr.Markdown(visible=False)
             log_output = gr.Textbox(label="Live Logs", lines=20, interactive=False, max_lines=20)
     def update_ui_for_pipeline(pipeline_type):
         is_onnx = pipeline_type == "ONNX"
         is_gguf = pipeline_type == "GGUF"
+        # Pruning applies to both pipelines, so its controls are visible whenever either one is selected
         return {
             onnx_options: gr.Group(visible=is_onnx),
+            gguf_options: gr.Group(visible=is_gguf),
+            prune_checkbox: gr.Checkbox(visible=is_onnx or is_gguf),
+            prune_slider: gr.Slider(visible=is_onnx or is_gguf)
         }
     def update_ui_for_quant_type(quant_type):
         """Return a ``gr.File`` update that is visible only when ``quant_type`` is "Static"."""
         return gr.File(visible=quant_type == "Static")
+    pipeline_type_radio.change(fn=update_ui_for_pipeline, inputs=pipeline_type_radio, outputs=[onnx_options, gguf_options, prune_checkbox, prune_slider])
     onnx_quant_radio.change(fn=update_ui_for_quant_type, inputs=onnx_quant_radio, outputs=[calibration_file_upload])
     analyze_button.click(
     )
     run_button.click(
+        fn=run_amop_pipeline,
         inputs=[
             model_id_input, pipeline_type_radio,
+            prune_checkbox, prune_slider,
             onnx_quant_radio, calibration_file_upload, gguf_quant_dropdown
         ],
         outputs=[run_button, analyze_button, final_output, log_output, success_box]