broadfield-dev committed on
Commit
e5fb1ab
·
verified ·
1 Parent(s): 641df90

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -34
app.py CHANGED
@@ -110,7 +110,7 @@ def stage_3_4_onnx_quantize(model_path: str, calibration_data_path: str):
110
  logging.error(error_msg, exc_info=True)
111
  raise RuntimeError(error_msg)
112
 
113
- def stage_3_4_gguf_quantize(model_id: str, quantization_strategy: str):
114
  log_stream = f"[STAGE 3 & 4] Converting to GGUF with '{quantization_strategy}' quantization...\n"
115
  run_id = datetime.now().strftime("%Y%m%d-%H%M%S")
116
  model_name = model_id.replace('/', '_')
@@ -119,13 +119,13 @@ def stage_3_4_gguf_quantize(model_id: str, quantization_strategy: str):
119
  output_file = os.path.join(gguf_path, "model.gguf")
120
 
121
  try:
122
- log_stream += "Executing `optimum-cli export gguf` via subprocess...\n"
123
  export_command = [
124
- "optimum-cli", "export", "gguf",
125
- "--model", model_id,
126
- "--quantization_strategy", quantization_strategy,
127
  "--trust-remote-code",
128
- output_file
129
  ]
130
  process = subprocess.run(export_command, check=True, capture_output=True, text=True)
131
  log_stream += process.stdout
@@ -133,7 +133,7 @@ def stage_3_4_gguf_quantize(model_id: str, quantization_strategy: str):
133
  log_stream += f"Successfully exported and quantized model to GGUF at: {gguf_path}\n"
134
  return gguf_path, log_stream
135
  except subprocess.CalledProcessError as e:
136
- error_msg = f"Failed during `optimum-cli export gguf`. Error:\n{e.stderr}"
137
  logging.error(error_msg)
138
  raise RuntimeError(error_msg)
139
 
@@ -201,37 +201,36 @@ def run_amop_pipeline(model_id: str, pipeline_type: str, do_prune: bool, prune_p
201
  raise RuntimeError("Could not authenticate with Hugging Face Hub. Check your HF_TOKEN.")
202
  repo_id_for_link = f"{whoami['name']}/{model_id.split('/')[-1]}{repo_name_suffix}"
203
 
204
- if pipeline_type == "ONNX":
205
- full_log += "Loading base model for pruning...\n"
206
- yield {final_output: "Loading model (1/5)", log_output: full_log}
207
- model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
208
- tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
209
- full_log += f"Successfully loaded base model '{model_id}'.\n"
210
 
211
- yield {final_output: "Pruning model (2/5)", log_output: full_log}
212
- if do_prune:
213
- model, log = stage_2_prune_model(model, prune_percent)
214
- full_log += log
215
- else:
216
- full_log += "[STAGE 2] Pruning skipped by user.\n"
217
 
218
- temp_model_dir = tempfile.mkdtemp()
219
- model.save_pretrained(temp_model_dir)
220
- tokenizer.save_pretrained(temp_model_dir)
221
- full_log += f"Saved intermediate model to temporary directory: {temp_model_dir}\n"
222
-
 
223
  yield {final_output: "Converting to ONNX (3/5)", log_output: full_log}
224
  calib_path = calibration_file.name if onnx_quant_type == "Static" and calibration_file else None
225
  optimized_path, log = stage_3_4_onnx_quantize(temp_model_dir, calib_path)
226
  full_log += log
227
  options = {'pipeline_type': 'ONNX', 'prune': do_prune, 'prune_percent': prune_percent, 'quant_type': onnx_quant_type}
228
-
229
  elif pipeline_type == "GGUF":
230
- full_log += "[STAGE 1 & 2] Loading and Pruning are skipped for GGUF pipeline.\n"
231
  yield {final_output: "Converting to GGUF (3/5)", log_output: full_log}
232
- optimized_path, log = stage_3_4_gguf_quantize(model_id, gguf_quant_type)
233
  full_log += log
234
- options = {'pipeline_type': 'GGUF', 'quant_type': gguf_quant_type}
235
 
236
  else:
237
  raise ValueError("Invalid pipeline type selected.")
@@ -289,13 +288,15 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
289
 
290
  with gr.Group(visible=False) as onnx_options:
291
  gr.Markdown("#### ONNX Pipeline Options")
292
- prune_checkbox = gr.Checkbox(label="Enable Pruning", value=False, info="Removes redundant weights. Applied before ONNX conversion.")
293
- prune_slider = gr.Slider(minimum=0, maximum=90, value=20, step=5, label="Pruning Percentage (%)")
294
  onnx_quant_radio = gr.Radio(["Dynamic", "Static"], label="ONNX Quantization Type", value="Dynamic", info="Static may offer better performance but requires calibration data.")
295
  calibration_file_upload = gr.File(label="Upload Calibration Data (.txt)", visible=False, file_types=['.txt'])
296
 
297
  with gr.Group(visible=False) as gguf_options:
298
  gr.Markdown("#### GGUF Pipeline Options")
 
 
299
  gguf_quant_dropdown = gr.Dropdown(
300
  ["q4_k_m", "q5_k_m", "q8_0", "f16"],
301
  label="GGUF Quantization Strategy",
@@ -311,14 +312,29 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
311
  success_box = gr.Markdown(visible=False)
312
  log_output = gr.Textbox(label="Live Logs", lines=20, interactive=False, max_lines=20)
313
 
 
 
 
 
 
314
  def update_ui_for_pipeline(pipeline_type):
 
 
315
  return {
316
- onnx_options: gr.Group(visible=pipeline_type == "ONNX"),
317
- gguf_options: gr.Group(visible=pipeline_type == "GGUF")
318
  }
319
 
320
  def update_ui_for_quant_type(quant_type):
321
  return gr.File(visible=quant_type == "Static")
 
 
 
 
 
 
 
 
322
 
323
  pipeline_type_radio.change(fn=update_ui_for_pipeline, inputs=pipeline_type_radio, outputs=[onnx_options, gguf_options])
324
  onnx_quant_radio.change(fn=update_ui_for_quant_type, inputs=onnx_quant_radio, outputs=[calibration_file_upload])
@@ -330,8 +346,13 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
330
  )
331
 
332
  run_button.click(
333
- fn=run_amop_pipeline,
334
- inputs=[model_id_input, pipeline_type_radio, prune_checkbox, prune_slider, onnx_quant_radio, calibration_file_upload, gguf_quant_dropdown],
 
 
 
 
 
335
  outputs=[run_button, analyze_button, final_output, log_output, success_box]
336
  )
337
 
 
110
  logging.error(error_msg, exc_info=True)
111
  raise RuntimeError(error_msg)
112
 
113
+ def stage_3_4_gguf_quantize(model_path: str, model_id: str, quantization_strategy: str):
114
  log_stream = f"[STAGE 3 & 4] Converting to GGUF with '{quantization_strategy}' quantization...\n"
115
  run_id = datetime.now().strftime("%Y%m%d-%H%M%S")
116
  model_name = model_id.replace('/', '_')
 
119
  output_file = os.path.join(gguf_path, "model.gguf")
120
 
121
  try:
122
+ log_stream += "Executing `optimum-gguf-cli` via subprocess...\n"
123
  export_command = [
124
+ "optimum-gguf-cli",
125
+ "--model", model_path,
126
+ "--quantization-strategy", quantization_strategy,
127
  "--trust-remote-code",
128
+ "--output", output_file
129
  ]
130
  process = subprocess.run(export_command, check=True, capture_output=True, text=True)
131
  log_stream += process.stdout
 
133
  log_stream += f"Successfully exported and quantized model to GGUF at: {gguf_path}\n"
134
  return gguf_path, log_stream
135
  except subprocess.CalledProcessError as e:
136
+ error_msg = f"Failed during `optimum-gguf-cli`. Error:\n{e.stderr}"
137
  logging.error(error_msg)
138
  raise RuntimeError(error_msg)
139
 
 
201
  raise RuntimeError("Could not authenticate with Hugging Face Hub. Check your HF_TOKEN.")
202
  repo_id_for_link = f"{whoami['name']}/{model_id.split('/')[-1]}{repo_name_suffix}"
203
 
204
+ full_log += "Loading base model...\n"
205
+ yield {final_output: "Loading model (1/5)", log_output: full_log}
206
+ model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
207
+ tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
208
+ full_log += f"Successfully loaded base model '{model_id}'.\n"
 
209
 
210
+ yield {final_output: "Pruning model (2/5)", log_output: full_log}
211
+ if do_prune:
212
+ model, log = stage_2_prune_model(model, prune_percent)
213
+ full_log += log
214
+ else:
215
+ full_log += "[STAGE 2] Pruning skipped by user.\n"
216
 
217
+ temp_model_dir = tempfile.mkdtemp()
218
+ model.save_pretrained(temp_model_dir)
219
+ tokenizer.save_pretrained(temp_model_dir)
220
+ full_log += f"Saved intermediate model to temporary directory: {temp_model_dir}\n"
221
+
222
+ if pipeline_type == "ONNX":
223
  yield {final_output: "Converting to ONNX (3/5)", log_output: full_log}
224
  calib_path = calibration_file.name if onnx_quant_type == "Static" and calibration_file else None
225
  optimized_path, log = stage_3_4_onnx_quantize(temp_model_dir, calib_path)
226
  full_log += log
227
  options = {'pipeline_type': 'ONNX', 'prune': do_prune, 'prune_percent': prune_percent, 'quant_type': onnx_quant_type}
228
+
229
  elif pipeline_type == "GGUF":
 
230
  yield {final_output: "Converting to GGUF (3/5)", log_output: full_log}
231
+ optimized_path, log = stage_3_4_gguf_quantize(temp_model_dir, model_id, gguf_quant_type)
232
  full_log += log
233
+ options = {'pipeline_type': 'GGUF', 'prune': do_prune, 'prune_percent': prune_percent, 'quant_type': gguf_quant_type}
234
 
235
  else:
236
  raise ValueError("Invalid pipeline type selected.")
 
288
 
289
  with gr.Group(visible=False) as onnx_options:
290
  gr.Markdown("#### ONNX Pipeline Options")
291
+ prune_checkbox_onnx = gr.Checkbox(label="Enable Pruning", value=False, info="Removes redundant weights. Applied before ONNX conversion.")
292
+ prune_slider_onnx = gr.Slider(minimum=0, maximum=90, value=20, step=5, label="Pruning Percentage (%)")
293
  onnx_quant_radio = gr.Radio(["Dynamic", "Static"], label="ONNX Quantization Type", value="Dynamic", info="Static may offer better performance but requires calibration data.")
294
  calibration_file_upload = gr.File(label="Upload Calibration Data (.txt)", visible=False, file_types=['.txt'])
295
 
296
  with gr.Group(visible=False) as gguf_options:
297
  gr.Markdown("#### GGUF Pipeline Options")
298
+ prune_checkbox_gguf = gr.Checkbox(label="Enable Pruning", value=False, info="Removes redundant weights. Applied before GGUF conversion.")
299
+ prune_slider_gguf = gr.Slider(minimum=0, maximum=90, value=20, step=5, label="Pruning Percentage (%)")
300
  gguf_quant_dropdown = gr.Dropdown(
301
  ["q4_k_m", "q5_k_m", "q8_0", "f16"],
302
  label="GGUF Quantization Strategy",
 
312
  success_box = gr.Markdown(visible=False)
313
  log_output = gr.Textbox(label="Live Logs", lines=20, interactive=False, max_lines=20)
314
 
315
+ # Consolidate pruning controls and pass the correct one based on pipeline type
316
+ # This requires a small change in the main run function and the Gradio UI setup
317
+ prune_checkbox = gr.Checkbox(label="Enable Pruning", value=False, info="Removes redundant weights from the model.", visible=False)
318
+ prune_slider = gr.Slider(minimum=0, maximum=90, value=20, step=5, label="Pruning Percentage (%)", visible=False)
319
+
320
  def update_ui_for_pipeline(pipeline_type):
321
+ is_onnx = pipeline_type == "ONNX"
322
+ is_gguf = pipeline_type == "GGUF"
323
  return {
324
+ onnx_options: gr.Group(visible=is_onnx),
325
+ gguf_options: gr.Group(visible=is_gguf)
326
  }
327
 
328
  def update_ui_for_quant_type(quant_type):
329
  return gr.File(visible=quant_type == "Static")
330
+
331
+ def run_wrapper(model_id, pipeline_type, onnx_prune, onnx_prune_p, gguf_prune, gguf_prune_p, onnx_quant, calib_file, gguf_quant):
332
+ # Logic to decide which pruning values to use
333
+ do_prune = onnx_prune if pipeline_type == "ONNX" else gguf_prune
334
+ prune_percent = onnx_prune_p if pipeline_type == "ONNX" else gguf_prune_p
335
+
336
+ # Call the generator-based pipeline runner
337
+ yield from run_amop_pipeline(model_id, pipeline_type, do_prune, prune_percent, onnx_quant, calib_file, gguf_quant)
338
 
339
  pipeline_type_radio.change(fn=update_ui_for_pipeline, inputs=pipeline_type_radio, outputs=[onnx_options, gguf_options])
340
  onnx_quant_radio.change(fn=update_ui_for_quant_type, inputs=onnx_quant_radio, outputs=[calibration_file_upload])
 
346
  )
347
 
348
  run_button.click(
349
+ fn=run_wrapper,
350
+ inputs=[
351
+ model_id_input, pipeline_type_radio,
352
+ prune_checkbox_onnx, prune_slider_onnx,
353
+ prune_checkbox_gguf, prune_slider_gguf,
354
+ onnx_quant_radio, calibration_file_upload, gguf_quant_dropdown
355
+ ],
356
  outputs=[run_button, analyze_button, final_output, log_output, success_box]
357
  )
358