Spaces:

broadfield-dev
/

AMOP

Paused

App Files Files Community

broadfield-dev committed on Sep 14

Commit

e5fb1ab

verified ·

1 Parent(s): 641df90

Update app.py

Browse files

Files changed (1) hide show

app.py +55 -34

app.py CHANGED Viewed

@@ -110,7 +110,7 @@ def stage_3_4_onnx_quantize(model_path: str, calibration_data_path: str):
         logging.error(error_msg, exc_info=True)
         raise RuntimeError(error_msg)
-def stage_3_4_gguf_quantize(model_id: str, quantization_strategy: str):
     log_stream = f"[STAGE 3 & 4] Converting to GGUF with '{quantization_strategy}' quantization...\n"
     run_id = datetime.now().strftime("%Y%m%d-%H%M%S")
     model_name = model_id.replace('/', '_')
@@ -119,13 +119,13 @@ def stage_3_4_gguf_quantize(model_id: str, quantization_strategy: str):
     output_file = os.path.join(gguf_path, "model.gguf")
     try:
-        log_stream += "Executing `optimum-cli export gguf` via subprocess...\n"
         export_command = [
-            "optimum-cli", "export", "gguf",
-            "--model", model_id,
-            "--quantization_strategy", quantization_strategy,
             "--trust-remote-code",
-            output_file
         ]
         process = subprocess.run(export_command, check=True, capture_output=True, text=True)
         log_stream += process.stdout
@@ -133,7 +133,7 @@ def stage_3_4_gguf_quantize(model_id: str, quantization_strategy: str):
         log_stream += f"Successfully exported and quantized model to GGUF at: {gguf_path}\n"
         return gguf_path, log_stream
     except subprocess.CalledProcessError as e:
-        error_msg = f"Failed during `optimum-cli export gguf`. Error:\n{e.stderr}"
         logging.error(error_msg)
         raise RuntimeError(error_msg)
@@ -201,37 +201,36 @@ def run_amop_pipeline(model_id: str, pipeline_type: str, do_prune: bool, prune_p
              raise RuntimeError("Could not authenticate with Hugging Face Hub. Check your HF_TOKEN.")
         repo_id_for_link = f"{whoami['name']}/{model_id.split('/')[-1]}{repo_name_suffix}"
-        if pipeline_type == "ONNX":
-            full_log += "Loading base model for pruning...\n"
-            yield {final_output: "Loading model (1/5)", log_output: full_log}
-            model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
-            tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-            full_log += f"Successfully loaded base model '{model_id}'.\n"
-            yield {final_output: "Pruning model (2/5)", log_output: full_log}
-            if do_prune:
-                model, log = stage_2_prune_model(model, prune_percent)
-                full_log += log
-            else:
-                full_log += "[STAGE 2] Pruning skipped by user.\n"
-            temp_model_dir = tempfile.mkdtemp()
-            model.save_pretrained(temp_model_dir)
-            tokenizer.save_pretrained(temp_model_dir)
-            full_log += f"Saved intermediate model to temporary directory: {temp_model_dir}\n"
             yield {final_output: "Converting to ONNX (3/5)", log_output: full_log}
             calib_path = calibration_file.name if onnx_quant_type == "Static" and calibration_file else None
             optimized_path, log = stage_3_4_onnx_quantize(temp_model_dir, calib_path)
             full_log += log
             options = {'pipeline_type': 'ONNX', 'prune': do_prune, 'prune_percent': prune_percent, 'quant_type': onnx_quant_type}
         elif pipeline_type == "GGUF":
-            full_log += "[STAGE 1 & 2] Loading and Pruning are skipped for GGUF pipeline.\n"
             yield {final_output: "Converting to GGUF (3/5)", log_output: full_log}
-            optimized_path, log = stage_3_4_gguf_quantize(model_id, gguf_quant_type)
             full_log += log
-            options = {'pipeline_type': 'GGUF', 'quant_type': gguf_quant_type}
         else:
             raise ValueError("Invalid pipeline type selected.")
@@ -289,13 +288,15 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                 with gr.Group(visible=False) as onnx_options:
                     gr.Markdown("#### ONNX Pipeline Options")
-                    prune_checkbox = gr.Checkbox(label="Enable Pruning", value=False, info="Removes redundant weights. Applied before ONNX conversion.")
-                    prune_slider = gr.Slider(minimum=0, maximum=90, value=20, step=5, label="Pruning Percentage (%)")
                     onnx_quant_radio = gr.Radio(["Dynamic", "Static"], label="ONNX Quantization Type", value="Dynamic", info="Static may offer better performance but requires calibration data.")
                     calibration_file_upload = gr.File(label="Upload Calibration Data (.txt)", visible=False, file_types=['.txt'])
                 with gr.Group(visible=False) as gguf_options:
                     gr.Markdown("#### GGUF Pipeline Options")
                     gguf_quant_dropdown = gr.Dropdown(
                         ["q4_k_m", "q5_k_m", "q8_0", "f16"],
                         label="GGUF Quantization Strategy",
@@ -311,14 +312,29 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             success_box = gr.Markdown(visible=False)
             log_output = gr.Textbox(label="Live Logs", lines=20, interactive=False, max_lines=20)
     def update_ui_for_pipeline(pipeline_type):
         return {
-            onnx_options: gr.Group(visible=pipeline_type == "ONNX"),
-            gguf_options: gr.Group(visible=pipeline_type == "GGUF")
         }
     def update_ui_for_quant_type(quant_type):
         return gr.File(visible=quant_type == "Static")
     pipeline_type_radio.change(fn=update_ui_for_pipeline, inputs=pipeline_type_radio, outputs=[onnx_options, gguf_options])
     onnx_quant_radio.change(fn=update_ui_for_quant_type, inputs=onnx_quant_radio, outputs=[calibration_file_upload])
@@ -330,8 +346,13 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     )
     run_button.click(
-        fn=run_amop_pipeline,
-        inputs=[model_id_input, pipeline_type_radio, prune_checkbox, prune_slider, onnx_quant_radio, calibration_file_upload, gguf_quant_dropdown],
         outputs=[run_button, analyze_button, final_output, log_output, success_box]
     )

         logging.error(error_msg, exc_info=True)
         raise RuntimeError(error_msg)
+def stage_3_4_gguf_quantize(model_path: str, model_id: str, quantization_strategy: str):
     log_stream = f"[STAGE 3 & 4] Converting to GGUF with '{quantization_strategy}' quantization...\n"
     run_id = datetime.now().strftime("%Y%m%d-%H%M%S")
     model_name = model_id.replace('/', '_')
     output_file = os.path.join(gguf_path, "model.gguf")
     try:
+        log_stream += "Executing `optimum-gguf-cli` via subprocess...\n"
         export_command = [
+            "optimum-gguf-cli",
+            "--model", model_path,
+            "--quantization-strategy", quantization_strategy,
             "--trust-remote-code",
+            "--output", output_file
         ]
         process = subprocess.run(export_command, check=True, capture_output=True, text=True)
         log_stream += process.stdout
         log_stream += f"Successfully exported and quantized model to GGUF at: {gguf_path}\n"
         return gguf_path, log_stream
     except subprocess.CalledProcessError as e:
+        error_msg = f"Failed during `optimum-gguf-cli`. Error:\n{e.stderr}"
         logging.error(error_msg)
         raise RuntimeError(error_msg)
              raise RuntimeError("Could not authenticate with Hugging Face Hub. Check your HF_TOKEN.")
         repo_id_for_link = f"{whoami['name']}/{model_id.split('/')[-1]}{repo_name_suffix}"
+        full_log += "Loading base model...\n"
+        yield {final_output: "Loading model (1/5)", log_output: full_log}
+        model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
+        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+        full_log += f"Successfully loaded base model '{model_id}'.\n"
+        yield {final_output: "Pruning model (2/5)", log_output: full_log}
+        if do_prune:
+            model, log = stage_2_prune_model(model, prune_percent)
+            full_log += log
+        else:
+            full_log += "[STAGE 2] Pruning skipped by user.\n"
+        temp_model_dir = tempfile.mkdtemp()
+        model.save_pretrained(temp_model_dir)
+        tokenizer.save_pretrained(temp_model_dir)
+        full_log += f"Saved intermediate model to temporary directory: {temp_model_dir}\n"
+        if pipeline_type == "ONNX":
             yield {final_output: "Converting to ONNX (3/5)", log_output: full_log}
             calib_path = calibration_file.name if onnx_quant_type == "Static" and calibration_file else None
             optimized_path, log = stage_3_4_onnx_quantize(temp_model_dir, calib_path)
             full_log += log
             options = {'pipeline_type': 'ONNX', 'prune': do_prune, 'prune_percent': prune_percent, 'quant_type': onnx_quant_type}
         elif pipeline_type == "GGUF":
             yield {final_output: "Converting to GGUF (3/5)", log_output: full_log}
+            optimized_path, log = stage_3_4_gguf_quantize(temp_model_dir, model_id, gguf_quant_type)
             full_log += log
+            options = {'pipeline_type': 'GGUF', 'prune': do_prune, 'prune_percent': prune_percent, 'quant_type': gguf_quant_type}
         else:
             raise ValueError("Invalid pipeline type selected.")
                 with gr.Group(visible=False) as onnx_options:
                     gr.Markdown("#### ONNX Pipeline Options")
+                    prune_checkbox_onnx = gr.Checkbox(label="Enable Pruning", value=False, info="Removes redundant weights. Applied before ONNX conversion.")
+                    prune_slider_onnx = gr.Slider(minimum=0, maximum=90, value=20, step=5, label="Pruning Percentage (%)")
                     onnx_quant_radio = gr.Radio(["Dynamic", "Static"], label="ONNX Quantization Type", value="Dynamic", info="Static may offer better performance but requires calibration data.")
                     calibration_file_upload = gr.File(label="Upload Calibration Data (.txt)", visible=False, file_types=['.txt'])
                 with gr.Group(visible=False) as gguf_options:
                     gr.Markdown("#### GGUF Pipeline Options")
+                    prune_checkbox_gguf = gr.Checkbox(label="Enable Pruning", value=False, info="Removes redundant weights. Applied before GGUF conversion.")
+                    prune_slider_gguf = gr.Slider(minimum=0, maximum=90, value=20, step=5, label="Pruning Percentage (%)")
                     gguf_quant_dropdown = gr.Dropdown(
                         ["q4_k_m", "q5_k_m", "q8_0", "f16"],
                         label="GGUF Quantization Strategy",
             success_box = gr.Markdown(visible=False)
             log_output = gr.Textbox(label="Live Logs", lines=20, interactive=False, max_lines=20)
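+    # Hidden placeholder pruning controls (visible=False). The run button is wired to the
+    # per-pipeline controls instead (prune_checkbox_onnx/prune_slider_onnx and
+    # prune_checkbox_gguf/prune_slider_gguf); run_wrapper picks the pair matching the pipeline type.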
+    prune_checkbox = gr.Checkbox(label="Enable Pruning", value=False, info="Removes redundant weights from the model.", visible=False)
+    prune_slider = gr.Slider(minimum=0, maximum=90, value=20, step=5, label="Pruning Percentage (%)", visible=False)
     def update_ui_for_pipeline(pipeline_type):
+        is_onnx = pipeline_type == "ONNX"
+        is_gguf = pipeline_type == "GGUF"
         return {
+            onnx_options: gr.Group(visible=is_onnx),
+            gguf_options: gr.Group(visible=is_gguf)
         }
     def update_ui_for_quant_type(quant_type):
         return gr.File(visible=quant_type == "Static")
+    def run_wrapper(model_id, pipeline_type, onnx_prune, onnx_prune_p, gguf_prune, gguf_prune_p, onnx_quant, calib_file, gguf_quant):
+        # Logic to decide which pruning values to use
+        do_prune = onnx_prune if pipeline_type == "ONNX" else gguf_prune
+        prune_percent = onnx_prune_p if pipeline_type == "ONNX" else gguf_prune_p
+        # Call the generator-based pipeline runner
+        yield from run_amop_pipeline(model_id, pipeline_type, do_prune, prune_percent, onnx_quant, calib_file, gguf_quant)
     pipeline_type_radio.change(fn=update_ui_for_pipeline, inputs=pipeline_type_radio, outputs=[onnx_options, gguf_options])
     onnx_quant_radio.change(fn=update_ui_for_quant_type, inputs=onnx_quant_radio, outputs=[calibration_file_upload])
     )
     run_button.click(
+        fn=run_wrapper,
+        inputs=[
+            model_id_input, pipeline_type_radio,
+            prune_checkbox_onnx, prune_slider_onnx,
+            prune_checkbox_gguf, prune_slider_gguf,
+            onnx_quant_radio, calibration_file_upload, gguf_quant_dropdown
+        ],
         outputs=[run_button, analyze_button, final_output, log_output, success_box]
     )