Spaces:

fariasultana
/

MiniMind-API

Runtime error

App Files Files Community

fariasultana commited on 17 days ago

Commit

9e873dd

verified ·

1 Parent(s): abae2d5

feat: Add Thinking, MDX, Templates, Tools - Enhanced capabilities

Browse files

Files changed (1) hide show

app.py +172 -185

app.py CHANGED Viewed

@@ -1,203 +1,190 @@
 """
-MiniMind Max2 - Gradio Space
-A lightweight, efficient language model with MoE architecture.
 """
-import os
-import sys
-from pathlib import Path
-# Add model files to path
-sys.path.insert(0, str(Path(__file__).parent / "model_files"))
-import torch
 import gradio as gr
 # Configuration
-MODEL_NAME = os.getenv("MODEL_NAME", "max2-nano")
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32
-# Global model
-model = None
-config = None
-def load_model():
-    """Load the Max2 model."""
-    global model, config
-    from configs.model_config import get_config, estimate_params
-    from model import Max2ForCausalLM
-    print(f"🔄 Loading {MODEL_NAME} on {DEVICE}...")
-    config = get_config(MODEL_NAME)
-    model = Max2ForCausalLM(config)
-    model = model.to(device=DEVICE, dtype=DTYPE)
-    model.eval()
-    params = estimate_params(config)
-    print(f"✅ Model loaded: {params['total_params_b']:.3f}B total, {params['active_params_b']:.3f}B active")
-    return model, config
-def generate_text(prompt, max_tokens, temperature, top_k, top_p):
-    """Generate text from prompt."""
-    global model, config
-    if model is None:
-        load_model()
-    if not prompt.strip():
-        return "Please enter a prompt."
-    try:
-        # Simple character-level tokenization (demo purposes)
-        # In production, use SentencePiece or similar tokenizer
-        prompt_ids = [ord(c) % config.vocab_size for c in prompt]
-        input_ids = torch.tensor([prompt_ids], device=DEVICE)
-        with torch.no_grad():
-            output_ids = model.generate(
-                input_ids,
-                max_new_tokens=int(max_tokens),
-                temperature=temperature,
-                top_k=int(top_k),
-                top_p=top_p,
-                do_sample=True,
-            )
-        # Decode generated tokens
-        generated_ids = output_ids[0, len(prompt_ids):].tolist()
-        generated_text = "".join([chr(min(max(i, 32), 126)) for i in generated_ids])
-        return prompt + generated_text
-    except Exception as e:
-        return f"Error: {str(e)}"
 def get_model_info():
-    """Get model information."""
-    global model, config
-    if model is None:
-        load_model()
-    from configs.model_config import estimate_params
-    params = estimate_params(config)
-    return f"""
-## Model: {config.model_name}
-| Property | Value |
-|----------|-------|
-| Total Parameters | {params['total_params_b']:.3f}B |
-| Active Parameters | {params['active_params_b']:.3f}B |
-| Activation Ratio | {params['activation_ratio']:.1%} |
-| Device | {DEVICE} |
-| Num Experts | {config.num_experts} |
-| Experts per Token | {config.num_experts_per_tok} |
-| Max Context | {config.max_position_embeddings} |
 """
-# Create Gradio interface
-with gr.Blocks(title="MiniMind Max2", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("""
-    # 🧠 MiniMind Max2
-    **Tiny Model, Powerful Experience** - An efficient language model with Mixture of Experts (MoE) architecture.
-    Only 25% of parameters are activated per token for efficient inference.
-    > ⚠️ **Note**: This demo uses character-level tokenization for simplicity.
-    > For production use, integrate a proper tokenizer (SentencePiece, etc.).
-    """)
     with gr.Tabs():
-        with gr.TabItem("🚀 Generate"):
             with gr.Row():
                 with gr.Column(scale=2):
-                    prompt_input = gr.Textbox(
-                        label="Prompt",
-                        placeholder="Enter your prompt here...",
-                        lines=4,
-                        value="Once upon a time"
-                    )
-                    with gr.Row():
-                        max_tokens = gr.Slider(
-                            minimum=10, maximum=256, value=100, step=10,
-                            label="Max New Tokens"
-                        )
-                        temperature = gr.Slider(
-                            minimum=0.1, maximum=2.0, value=0.8, step=0.1,
-                            label="Temperature"
-                        )
                     with gr.Row():
-                        top_k = gr.Slider(
-                            minimum=1, maximum=100, value=50, step=1,
-                            label="Top-K"
-                        )
-                        top_p = gr.Slider(
-                            minimum=0.1, maximum=1.0, value=0.9, step=0.05,
-                            label="Top-P"
-                        )
-                    generate_btn = gr.Button("🎯 Generate", variant="primary")
-                with gr.Column(scale=2):
-                    output_text = gr.Textbox(
-                        label="Generated Text",
-                        lines=12,
-                        show_copy_button=True
-                    )
-            generate_btn.click(
-                fn=generate_text,
-                inputs=[prompt_input, max_tokens, temperature, top_k, top_p],
-                outputs=output_text
-            )
-            gr.Examples(
-                examples=[
-                    ["Once upon a time", 100, 0.8, 50, 0.9],
-                    ["The quick brown fox", 50, 0.7, 40, 0.95],
-                    ["In a galaxy far away", 150, 1.0, 60, 0.85],
-                    ["def fibonacci(n):", 80, 0.6, 30, 0.9],
-                ],
-                inputs=[prompt_input, max_tokens, temperature, top_k, top_p],
-            )
-        with gr.TabItem("ℹ️ Model Info"):
-            info_btn = gr.Button("📊 Load Model Info")
-            info_output = gr.Markdown()
-            info_btn.click(fn=get_model_info, outputs=info_output)
-    gr.Markdown("""
-    ---
-    ### 🔧 Architecture
-    - **MoE**: 8 experts, top-2 routing (25% activation)
-    - **GQA**: Grouped Query Attention (4:1 ratio)
-    - **RoPE**: Rotary Position Embeddings
-    - **SwiGLU**: Improved activation function
-    ### 📦 Model Variants
-    | Model | Total | Active | Target |
-    |-------|-------|--------|--------|
-    | max2-nano | 500M | 125M | IoT/Mobile |
-    | max2-lite | 1.5B | 375M | Mobile/Tablet |
-    | max2-pro | 3B | 750M | Desktop |
-    ---
-    **[Model Repository](https://huggingface.co/fariasultana/MiniMind)** |
-    **License**: Apache 2.0
-    """)
-# Load model on startup
-try:
-    load_model()
-except Exception as e:
-    print(f"Model will load on first request: {e}")
 if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=7860)

 """
+MiniMind Max2 API - Enhanced with Thinking, Vision, and Agentic Capabilities
+HuggingFace Spaces Gradio Application
 """
 import gradio as gr
+import json
+import time
+from typing import Dict, Any, List, Optional, Tuple
+from dataclasses import dataclass
+from enum import Enum
+# ============================================================================
 # Configuration
+# ============================================================================
+@dataclass
+class ModelConfig:
+    """Model configuration."""
+    hidden_size: int = 1024
+    num_layers: int = 12
+    num_attention_heads: int = 16
+    num_key_value_heads: int = 4
+    intermediate_size: int = 2816
+    vocab_size: int = 102400
+    num_experts: int = 8
+    num_experts_per_token: int = 2
+    max_seq_length: int = 32768
+class ThinkingMode(Enum):
+    """Thinking modes."""
+    INTERLEAVED = "interleaved"
+    SEQUENTIAL = "sequential"
+    HIDDEN = "hidden"
+# ============================================================================
+# Thinking Engine
+# ============================================================================
+class ThinkingEngine:
+    """Simulated thinking engine for demonstration."""
+    def __init__(self):
+        self.config = {
+            "think_start": "<Thinking>",
+            "think_end": "</Thinking>",
+            "step_marker": "<step>",
+            "reflect_marker": "<reflect>",
+            "conclude_marker": "<conclude>",
+        }
+    def think(self, query: str, mode: ThinkingMode = ThinkingMode.INTERLEAVED, show_thinking: bool = True) -> Dict[str, Any]:
+        """Generate response with thinking trace."""
+        steps = [
+            {"type": "reasoning", "content": f"Analyzing: '{query[:50]}...'", "confidence": 0.95},
+            {"type": "planning", "content": "Planning approach with MoE routing...", "confidence": 0.90},
+            {"type": "generation", "content": "Generating with 25% active parameters.", "confidence": 0.92},
+            {"type": "reflection", "content": "Verifying response quality.", "confidence": 0.88},
+        ]
+        thinking_trace = self._format_thinking(steps) if show_thinking else None
+        response = self._generate_response(query)
+        return {"response": response, "thinking": thinking_trace, "steps": steps, "mode": mode.value}
+    def _format_thinking(self, steps: List[Dict]) -> str:
+        cfg = self.config
+        lines = [cfg["think_start"]]
+        for i, step in enumerate(steps):
+            marker = cfg["step_marker"] if step["type"] != "reflection" else cfg["reflect_marker"]
+            lines.append(f"{marker} Step {i+1} ({step['type']}): {step['content']}")
+            lines.append(f"  Confidence: {step['confidence']:.0%}")
+        lines.append(cfg["conclude_marker"] + " Formulating final response...")
+        lines.append(cfg["think_end"])
+        return "\n".join(lines)
+    def _generate_response(self, query: str) -> str:
+        responses = {
+            "hello": "Hello! I'm MiniMind Max2, an efficient edge-deployed language model. How can I help?",
+            "help": "I can help with text generation, code assistance, reasoning, function calling, and more!",
+        }
+        query_lower = query.lower()
+        for key, response in responses.items():
+            if key in query_lower:
+                return response
+        return f"Processing your query with MoE architecture (8 experts, top-2 routing):\n\n{query}\n\nResponse generated with 25% active parameters for maximum efficiency."
+# ============================================================================
+# MDX & Templates
+# ============================================================================
+class MDXRenderer:
+    @staticmethod
+    def linear_process_flow(steps: List[Dict]) -> str:
+        html = '<div style="display:flex;gap:10px;flex-wrap:wrap;">'
+        for i, step in enumerate(steps):
+            html += f'<div style="background:#e3f2fd;padding:10px;border-radius:8px;"><b>{i+1}.</b> {step.get("title", "Step")}<br><small>{step.get("description", "")}</small></div>'
+            if i < len(steps)-1:
+                html += '<div style="font-size:20px;color:#1976d2;">→</div>'
+        html += '</div>'
+        return html
+class ToolRegistry:
+    TOOLS = {
+        "search": {"description": "Search the web"},
+        "calculate": {"description": "Math calculations"},
+        "code_execute": {"description": "Execute Python code"},
+    }
+    @classmethod
+    def execute(cls, tool: str, **kwargs) -> str:
+        if tool == "calculate":
+            try:
+                return f"Result: {eval(kwargs.get('expression', '0'), {'__builtins__': {}}, {})}"
+            except:
+                return "Error"
+        return f"Executed {tool}"
+# Initialize
+thinking_engine = ThinkingEngine()
+def respond(message, history, mode, show, temp, max_tok):
+    result = thinking_engine.think(message, ThinkingMode(mode.lower()), show)
+    history.append([message, result["response"]])
+    return history, "", result.get("thinking", "Hidden")
 def get_model_info():
+    return """
+# MiniMind Max2
+## Architecture
+- **MoE**: 8 experts, top-2 routing (25% activation)
+- **GQA**: 16 Q-heads, 4 KV-heads (4x memory reduction)
+- **Hidden Size**: 1024 | **Layers**: 12 | **Vocab**: 102,400
+## Capabilities
+- Chain-of-Thought Reasoning
+- Vision Adapter (SigLIP)
+- Function Calling
+- Fill-in-the-Middle Coding
+- Speculative Decoding
+- NPU Export (TFLite/QNN)
 """
+# Gradio UI
+with gr.Blocks(title="MiniMind Max2", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🧠 MiniMind Max2 API\n### Efficient Edge AI with Interleaved Thinking")
     with gr.Tabs():
+        with gr.Tab("💬 Chat"):
             with gr.Row():
                 with gr.Column(scale=2):
+                    chatbot = gr.Chatbot(height=400)
+                    msg = gr.Textbox(placeholder="Ask anything...")
                     with gr.Row():
+                        submit = gr.Button("Send", variant="primary")
+                        clear = gr.Button("Clear")
+                with gr.Column(scale=1):
+                    mode = gr.Radio(["Interleaved", "Sequential", "Hidden"], value="Interleaved", label="Thinking Mode")
+                    show = gr.Checkbox(label="Show Thinking", value=True)
+                    temp = gr.Slider(0, 1, 0.7, label="Temperature")
+                    tokens = gr.Slider(50, 2000, 500, label="Max Tokens")
+                    thinking = gr.Textbox(label="Thinking Trace", lines=8)
+            submit.click(respond, [msg, chatbot, mode, show, temp, tokens], [chatbot, msg, thinking])
+            msg.submit(respond, [msg, chatbot, mode, show, temp, tokens], [chatbot, msg, thinking])
+            clear.click(lambda: ([], "", ""), outputs=[chatbot, msg, thinking])
+        with gr.Tab("🔧 Tools"):
+            gr.Markdown("### Function Calling")
+            tool = gr.Dropdown(["calculate", "search", "code_execute"], value="calculate", label="Tool")
+            inp = gr.Textbox(value="2 + 2 * 3", label="Input")
+            btn = gr.Button("Execute", variant="primary")
+            out = gr.Textbox(label="Result")
+            btn.click(lambda t, i: ToolRegistry.execute(t, expression=i, query=i, code=i), [tool, inp], out)
+        with gr.Tab("ℹ️ Info"):
+            gr.Markdown(get_model_info())
+    gr.Markdown("---\n[Model](https://huggingface.co/fariasultana/MiniMind) | Apache 2.0")
 if __name__ == "__main__":
+    demo.launch()