#!/usr/bin/env python3
# Automatic dependency installation
import sys
import subprocess
import importlib

def install_package(package, import_name=None):
    """Automatically install a package if it is not already importable."""
    module_name = import_name or package
    try:
        importlib.import_module(module_name)
    except ImportError:
        print(f"{package} not found. Installing...")
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])

# List of required packages as (pip name, import name) pairs,
# since the distribution name and the module name can differ (e.g. llama-cpp-python).
REQUIRED_PACKAGES = [
    ('gradio', 'gradio'),
    ('torch', 'torch'),
    ('transformers', 'transformers'),
    ('huggingface_hub', 'huggingface_hub'),
    ('llama-cpp-python', 'llama_cpp'),
]

# Install required packages
for package, import_name in REQUIRED_PACKAGES:
    install_package(package, import_name)
# Now import the installed packages
import gradio as gr
import torch
from huggingface_hub import hf_hub_download
import os

# Efficient GGUF model download and loading
def download_and_load_model(
    repo_id="N-Bot-Int/OpenElla3-Llama3.2B-GGUF",
    filename="unsloth.Q4_K_M.gguf"
):
    """
    Download the GGUF model from the Hugging Face Hub if it is not already cached.

    Args:
        repo_id (str): Hugging Face repository ID
        filename (str): Specific GGUF model filename

    Returns:
        tuple: Loaded model and model path
    """
    try:
        # Import llama-cpp directly to ensure it is available
        from llama_cpp import Llama
    except ImportError:
        print("Critical error: llama-cpp-python could not be imported.")
        sys.exit(1)

    # Determine the download directory (use a cache directory)
    cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
    os.makedirs(cache_dir, exist_ok=True)

    # Check whether the model already exists
    model_path = os.path.join(cache_dir, filename)
    if not os.path.exists(model_path):
        print(f"Downloading {filename} from {repo_id}...")
        try:
            model_path = hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                local_dir=cache_dir,
                local_dir_use_symlinks=False
            )
        except Exception as e:
            print(f"Error downloading model: {e}")
            sys.exit(1)

    print(f"Using model at: {model_path}")

    # Initialize the model with CPU-oriented settings
    model = Llama(
        model_path=model_path,
        n_ctx=2048,       # Context window size
        n_batch=512,      # Batch size for prompt processing
        n_threads=max(torch.get_num_threads() // 2, 1),  # Use half of the available threads
        n_gpu_layers=0,   # Keep all layers on the CPU (0 = no GPU offload)
        seed=-1,          # Random seed
        verbose=True      # Enable verbose logging
    )

    return model, model_path
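# NOTE: As an alternative to the manual hf_hub_download flow above, recent versions of
# llama-cpp-python expose Llama.from_pretrained, which downloads a GGUF file from the Hub
# and constructs the model in one call. The helper below is a hedged sketch of that pattern;
# it assumes a llama-cpp-python build that ships from_pretrained and is not used by this app.
def load_model_via_from_pretrained(
    repo_id="N-Bot-Int/OpenElla3-Llama3.2B-GGUF",
    filename="unsloth.Q4_K_M.gguf"
):
    """Sketch: load the GGUF model with Llama.from_pretrained instead of hf_hub_download."""
    from llama_cpp import Llama
    # Extra keyword arguments are forwarded to the Llama constructor.
    return Llama.from_pretrained(
        repo_id=repo_id,
        filename=filename,
        n_ctx=2048,
        n_gpu_layers=0,
    )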
# Global model initialization
try:
    llm_model, MODEL_PATH = download_and_load_model()
except Exception as e:
    print(f"Fatal error initializing model: {e}")
    sys.exit(1)

def respond(
    message,
    history,
    system_message="You are a friendly Chatbot.",
    max_tokens=512,
    temperature=0.7,
    top_p=0.95,
):
    """
    Generate a response using the GGUF model.

    Args:
        message (str): User's input message
        history (list): Chat history as (user, assistant) pairs
        system_message (str): System prompt
        max_tokens (int): Maximum number of tokens to generate
        temperature (float): Sampling temperature
        top_p (float): Nucleus sampling probability threshold

    Returns:
        str: Generated response
    """
    # Prepare the full prompt with the system message and chat history
    full_prompt = system_message + "\n\n"

    # Add the chat history
    for user, assistant in history:
        if user:
            full_prompt += f"User: {user}\n"
        if assistant:
            full_prompt += f"Assistant: {assistant}\n"

    # Add the current message
    full_prompt += f"User: {message}\n"
    full_prompt += "Assistant: "

    # Generate the response
    try:
        # Use the basic text-completion API
        response = llm_model(
            prompt=full_prompt,
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens
        )['choices'][0]['text']
        return response
    except Exception as e:
        print(f"Error generating response: {e}")
        return f"An error occurred: {e}"
# Create the Gradio interface with the updated configuration
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ]
)

if __name__ == "__main__":
    # Optional: print some system information
    print(f"Available CPU threads: {torch.get_num_threads()}")
    print(f"Model path: {MODEL_PATH}")

    # Launch the Gradio interface with compatible parameters
    demo.launch(
        show_api=False,  # Disable the API endpoint
        share=False      # Do not create a public URL
    )
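# To try this Space locally (assuming a recent Python 3 interpreter with pip on PATH),
# save this file as app.py and run `python app.py`; the script installs its own
# dependencies on first run and Gradio serves the chat UI at http://127.0.0.1:7860 by default.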