#!/usr/bin/env python3
# Automatic dependency installation
import sys
import subprocess
import importlib
def install_package(package, import_name=None):
    """Install a package via pip if its module cannot be imported already"""
    try:
        importlib.import_module(import_name or package)
    except ImportError:
        print(f"{package} not found. Installing...")
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])
# Required packages as (pip name, import name) pairs; the two differ for
# llama-cpp-python, whose importable module is named llama_cpp
REQUIRED_PACKAGES = [
    ('gradio', 'gradio'),
    ('torch', 'torch'),
    ('transformers', 'transformers'),
    ('huggingface_hub', 'huggingface_hub'),
    ('llama-cpp-python', 'llama_cpp'),
]

# Install any required packages that are missing
for package, import_name in REQUIRED_PACKAGES:
    install_package(package, import_name)
# Now import the installed packages
import gradio as gr
import torch
from huggingface_hub import hf_hub_download
import os
# Efficient GGUF model download and loading
def download_and_load_model(
    repo_id="N-Bot-Int/OpenElla3-Llama3.2B-GGUF",
    filename="unsloth.Q4_K_M.gguf"
):
    """
    Download the GGUF model from the Hugging Face Hub if it is not already cached

    Args:
        repo_id (str): HuggingFace repository ID
        filename (str): Specific GGUF model filename

    Returns:
        tuple: Loaded model and model path
    """
    try:
        # Import llama-cpp directly to ensure it is available
        from llama_cpp import Llama
    except ImportError:
        print("Critical error: llama-cpp-python could not be imported.")
        sys.exit(1)
    # Determine the download directory (use a cache directory)
    cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
    os.makedirs(cache_dir, exist_ok=True)

    # Download the model only if it is not already present
    model_path = os.path.join(cache_dir, filename)
    if not os.path.exists(model_path):
        print(f"Downloading {filename} from {repo_id}...")
        try:
            model_path = hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                local_dir=cache_dir,
                local_dir_use_symlinks=False
            )
        except Exception as e:
            print(f"Error downloading model: {e}")
            sys.exit(1)
    print(f"Using model at: {model_path}")

    # Initialize the model with CPU-oriented settings
    model = Llama(
        model_path=model_path,
        n_ctx=2048,    # Context window size
        n_batch=512,   # Batch size for prompt processing
        n_threads=max(torch.get_num_threads() // 2, 1),  # Use half of the available threads
        n_gpu_layers=0,  # 0 keeps every layer on the CPU (-1 would offload all layers to GPU)
        seed=-1,       # Random seed
        verbose=True   # Verbose logging to confirm the model loaded
    )
    return model, model_path
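
# Illustrative: a different quantization can be loaded by passing another
# filename. The Q8_0 name below is hypothetical; check the repository's file
# listing for the GGUF files that actually exist.
#   model, path = download_and_load_model(filename="unsloth.Q8_0.gguf")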
# Global model initialization
try:
    llm_model, MODEL_PATH = download_and_load_model()
except Exception as e:
    print(f"Fatal error initializing model: {e}")
    sys.exit(1)
def respond(
    message,
    history,
    system_message="You are a friendly Chatbot.",
    max_tokens=512,
    temperature=0.7,
    top_p=0.95,
):
    """
    Generate a response using the GGUF model

    Args:
        message (str): User's input message
        history (list): Chat history as (user, assistant) pairs
        system_message (str): System prompt
        max_tokens (int): Maximum number of tokens to generate
        temperature (float): Sampling temperature
        top_p (float): Nucleus sampling probability threshold

    Returns:
        str: Generated response
    """
    # Prepare the full prompt from the system message, history, and new message
    full_prompt = system_message + "\n\n"

    # Add the chat history
    for user, assistant in history:
        if user:
            full_prompt += f"User: {user}\n"
        if assistant:
            full_prompt += f"Assistant: {assistant}\n"

    # Add the current message
    full_prompt += f"User: {message}\n"
    full_prompt += "Assistant: "
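
    # Illustrative: with one prior exchange, the assembled prompt looks like
    #   You are a friendly Chatbot.
    #
    #   User: Hi
    #   Assistant: Hello! How can I help?
    #   User: <current message>
    #   Assistant: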
    # Generate the response
    try:
        # Call the model directly via its plain text-completion interface
        response = llm_model(
            prompt=full_prompt,
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens
        )['choices'][0]['text']
        return response
    except Exception as e:
        print(f"Error generating response: {e}")
        return f"An error occurred: {e}"
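
# Quick smoke test (illustrative): uncomment to exercise the model pipeline
# without launching the UI.
#   print(respond("Hello!", history=[]))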
# Create the Gradio chat interface
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ]
)
if __name__ == "__main__":
    # Optional: print some system information
    print(f"Available CPU threads: {torch.get_num_threads()}")
    print(f"Model path: {MODEL_PATH}")

    # Launch the Gradio interface
    demo.launch(
        show_api=False,  # Disable the API endpoint
        share=False      # Do not create a public URL
    )