#!/usr/bin/env python3
# Automatic dependency installation
import sys
import subprocess
import importlib


def install_package(package, import_name=None):
    """Automatically install a package if it is not already present."""
    # The pip package name and the importable module name can differ
    # (e.g. the pip package "llama-cpp-python" is imported as "llama_cpp"),
    # so check for the import name rather than the pip name.
    module = import_name or package
    try:
        importlib.import_module(module)
    except ImportError:
        print(f"{package} not found. Installing...")
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])


# Required packages as (pip name, import name) pairs
REQUIRED_PACKAGES = [
    ('gradio', 'gradio'),
    ('torch', 'torch'),
    ('transformers', 'transformers'),
    ('huggingface_hub', 'huggingface_hub'),
    ('llama-cpp-python', 'llama_cpp'),
]

# Install required packages
for package, import_name in REQUIRED_PACKAGES:
    install_package(package, import_name)

# Now import the installed packages
import gradio as gr
import torch
from huggingface_hub import hf_hub_download
import os


# Efficient GGUF model download and loading
def download_and_load_model(
    repo_id="N-Bot-Int/OpenElla3-Llama3.2B-GGUF",
    filename="unsloth.Q4_K_M.gguf",
):
    """
    Download the GGUF model from Hugging Face if it does not already exist.

    Args:
        repo_id (str): Hugging Face repository ID
        filename (str): Specific GGUF model filename

    Returns:
        tuple: Loaded model and model path
    """
    try:
        # Import llama-cpp here to make sure it is actually available
        from llama_cpp import Llama
    except ImportError:
        print("Critical error: llama-cpp-python could not be imported.")
        sys.exit(1)

    # Determine the download directory (use a cache directory)
    cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
    os.makedirs(cache_dir, exist_ok=True)

    # Check whether the model already exists
    model_path = os.path.join(cache_dir, filename)
    if not os.path.exists(model_path):
        print(f"Downloading {filename} from {repo_id}...")
        try:
            # Note: local_dir_use_symlinks is deprecated in recent
            # huggingface_hub releases, so it is omitted here.
            model_path = hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                local_dir=cache_dir,
            )
        except Exception as e:
            print(f"Error downloading model: {e}")
            sys.exit(1)

    print(f"Using model at: {model_path}")

    # Initialize the model with CPU-oriented settings
    model = Llama(
        model_path=model_path,
        n_ctx=2048,        # Context window size
        n_batch=512,       # Batch size for prompt processing
        n_threads=max(torch.get_num_threads() // 2, 1),  # Use half of the available threads
        n_gpu_layers=0,    # 0 = run entirely on CPU (-1 would offload all layers to GPU)
        seed=-1,           # Random seed
        verbose=True,      # Enable verbose logging for download confirmation
    )

    return model, model_path


# Global model initialization
try:
    llm_model, MODEL_PATH = download_and_load_model()
except Exception as e:
    print(f"Fatal error initializing model: {e}")
    sys.exit(1)


def respond(
    message,
    history,
    system_message="You are a friendly Chatbot.",
    max_tokens=512,
    temperature=0.7,
    top_p=0.95,
):
    """
    Generate a response using the GGUF model.

    Args:
        message (str): User's input message
        history (list): Chat history as (user, assistant) pairs
        system_message (str): System prompt
        max_tokens (int): Maximum number of tokens to generate
        temperature (float): Sampling temperature
        top_p (float): Nucleus sampling probability threshold

    Returns:
        str: Generated response
    """
    # Prepare the full prompt with the system message and history
    full_prompt = system_message + "\n\n"

    # Add chat history (assumes Gradio's tuple-style history format)
    for user, assistant in history:
        if user:
            full_prompt += f"User: {user}\n"
        if assistant:
            full_prompt += f"Assistant: {assistant}\n"

    # Add the current message
    full_prompt += f"User: {message}\n"
    full_prompt += "Assistant: "

    # Generate the response
    try:
        # Use the most basic generation method
        response = llm_model(
            prompt=full_prompt,
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
        )['choices'][0]['text']

        return response
    except Exception as e:
        print(f"Error generating response: {e}")
        return f"An error occurred: {e}"
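
# Illustrative sketch: a quick CLI smoke test for respond() that bypasses the
# Gradio UI entirely. The SMOKE_TEST environment variable and the sample
# history below are assumptions added for demonstration, not part of the
# original interface; they show the tuple-style history format respond() expects.
if os.environ.get("SMOKE_TEST"):
    sample_history = [("Hello!", "Hi there! How can I help?")]
    print(respond("Summarize what you can do in one sentence.", sample_history))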

# Create the Gradio interface with the updated configuration
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)

if __name__ == "__main__":
    # Optional: print some system information before launching
    print(f"Available CPU threads: {torch.get_num_threads()}")
    print(f"Model path: {MODEL_PATH}")

    # Launch the Gradio interface with compatible parameters
    demo.launch(
        show_api=False,  # Disable the API endpoint
        share=False,     # Do not create a public URL
    )
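
# Usage sketch (assumes the script is saved locally as app.py; the filename is
# an assumption for illustration):
#   python app.py
# On startup, the dependency check runs, the GGUF model is downloaded to the
# cache on first use, and Gradio prints the local URL it is serving on
# (by default http://127.0.0.1:7860); open it in a browser to chat.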