#!/usr/bin/env python3
# Automatic dependency installation
import sys
import subprocess
import importlib

def install_package(package, import_name=None):
    """Automatically install a package if it is not already importable."""
    module_name = import_name or package
    try:
        importlib.import_module(module_name)
    except ImportError:
        print(f"{package} not found. Installing...")
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])

# List of required packages as (pip name, import name) pairs,
# since the distribution name and the module name can differ (e.g. llama-cpp-python).
REQUIRED_PACKAGES = [
    ('gradio', 'gradio'),
    ('torch', 'torch'),
    ('transformers', 'transformers'),
    ('huggingface_hub', 'huggingface_hub'),
    ('llama-cpp-python', 'llama_cpp'),
]

# Install required packages
for package, import_name in REQUIRED_PACKAGES:
    install_package(package, import_name)
# Now import the installed packages
import gradio as gr
import torch
from huggingface_hub import hf_hub_download
import os

# Efficient GGUF model download and loading
def download_and_load_model(
    repo_id="N-Bot-Int/OpenElla3-Llama3.2B-GGUF",
    filename="unsloth.Q4_K_M.gguf"
):
    """
    Download the GGUF model from the Hugging Face Hub if it is not already cached.

    Args:
        repo_id (str): Hugging Face repository ID
        filename (str): Specific GGUF model filename

    Returns:
        tuple: Loaded model and model path
    """
    try:
        # Import llama-cpp directly to ensure it is available
        from llama_cpp import Llama
    except ImportError:
        print("Critical error: llama-cpp-python could not be imported.")
        sys.exit(1)

    # Determine the download directory (use a cache directory)
    cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
    os.makedirs(cache_dir, exist_ok=True)

    # Check whether the model already exists
    model_path = os.path.join(cache_dir, filename)
    if not os.path.exists(model_path):
        print(f"Downloading {filename} from {repo_id}...")
        try:
            model_path = hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                local_dir=cache_dir,
                local_dir_use_symlinks=False
            )
        except Exception as e:
            print(f"Error downloading model: {e}")
            sys.exit(1)

    print(f"Using model at: {model_path}")

    # Initialize the model with CPU-oriented settings
    model = Llama(
        model_path=model_path,
        n_ctx=2048,       # Context window size
        n_batch=512,      # Batch size for prompt processing
        n_threads=max(torch.get_num_threads() // 2, 1),  # Use half of the available threads
        n_gpu_layers=0,   # Keep all layers on the CPU (0 = no GPU offload)
        seed=-1,          # Random seed
        verbose=True      # Enable verbose logging
    )

    return model, model_path
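# NOTE: As an alternative to the manual hf_hub_download flow above, recent versions of
# llama-cpp-python expose Llama.from_pretrained, which downloads a GGUF file from the Hub
# and constructs the model in one call. The helper below is a hedged sketch of that pattern;
# it assumes a llama-cpp-python build that ships from_pretrained and is not used by this app.
def load_model_via_from_pretrained(
    repo_id="N-Bot-Int/OpenElla3-Llama3.2B-GGUF",
    filename="unsloth.Q4_K_M.gguf"
):
    """Sketch: load the GGUF model with Llama.from_pretrained instead of hf_hub_download."""
    from llama_cpp import Llama
    # Extra keyword arguments are forwarded to the Llama constructor.
    return Llama.from_pretrained(
        repo_id=repo_id,
        filename=filename,
        n_ctx=2048,
        n_gpu_layers=0,
    )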
# Global model initialization
try:
    llm_model, MODEL_PATH = download_and_load_model()
except Exception as e:
    print(f"Fatal error initializing model: {e}")
    sys.exit(1)

def respond(
    message,
    history,
    system_message="You are a friendly Chatbot.",
    max_tokens=512,
    temperature=0.7,
    top_p=0.95,
):
    """
    Generate a response using the GGUF model.

    Args:
        message (str): User's input message
        history (list): Chat history as (user, assistant) pairs
        system_message (str): System prompt
        max_tokens (int): Maximum number of tokens to generate
        temperature (float): Sampling temperature
        top_p (float): Nucleus sampling probability threshold

    Returns:
        str: Generated response
    """
    # Prepare the full prompt with the system message and chat history
    full_prompt = system_message + "\n\n"

    # Add the chat history
    for user, assistant in history:
        if user:
            full_prompt += f"User: {user}\n"
        if assistant:
            full_prompt += f"Assistant: {assistant}\n"

    # Add the current message
    full_prompt += f"User: {message}\n"
    full_prompt += "Assistant: "

    # Generate the response
    try:
        # Use the basic text-completion API
        response = llm_model(
            prompt=full_prompt,
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens
        )['choices'][0]['text']
        return response
    except Exception as e:
        print(f"Error generating response: {e}")
        return f"An error occurred: {e}"
# Create the Gradio interface with the updated configuration
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ]
)

if __name__ == "__main__":
    # Optional: print some system information
    print(f"Available CPU threads: {torch.get_num_threads()}")
    print(f"Model path: {MODEL_PATH}")

    # Launch the Gradio interface with compatible parameters
    demo.launch(
        show_api=False,  # Disable the API endpoint
        share=False      # Do not create a public URL
    )
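# To try this Space locally (assuming a recent Python 3 interpreter with pip on PATH),
# save this file as app.py and run `python app.py`; the script installs its own
# dependencies on first run and Gradio serves the chat UI at http://127.0.0.1:7860 by default.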