import gradio as gr
import torch
import json
import threading
import os
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TextIteratorStreamer,
)

# ============================================================
# MODEL SETUP
# ============================================================
MODEL_ID = "augtoma/qCammel-13"

# 4-bit quantization (saves GPU memory)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# ============================================================
# MEMORY HANDLING
# ============================================================
MEMORY_FILE = "chat_memory.json"

# Load or initialize chat memory
if os.path.exists(MEMORY_FILE):
    with open(MEMORY_FILE, "r") as f:
        try:
            chat_memory = json.load(f)
        except json.JSONDecodeError:
            chat_memory = []
else:
    chat_memory = []


def save_memory(history):
    """Save chat history persistently."""
    with open(MEMORY_FILE, "w") as f:
        json.dump(history, f, indent=2)


# ============================================================
# SYSTEM PROMPT (doctor personality)
# ============================================================
SYSTEM_PROMPT = (
    "You are Dr. Camel, a professional, empathetic, and helpful medical doctor. "
    "You will respond only when the patient speaks. "
    "Never start the conversation by yourself. "
    "Always reply as 'Doctor:' and never simulate the patient's responses. "
    "Your tone should be calm, supportive, and medically informative. "
    "If symptoms seem serious, politely suggest seeing a healthcare professional."
)
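# ============================================================
# PROMPT FORMAT (illustrative example, not part of the original)
# ============================================================
# With one prior exchange in memory, the prompt builder below produces
# text shaped like this (the dialogue contents are a made-up sample):
#
#   <SYSTEM_PROMPT text>
#
#   Patient: I've had a headache for two days.
#   Doctor: How severe is the pain, and does anything relieve it?
#   Patient: It's mild but constant.
#   Doctor:
#
# The trailing "Doctor:" cues the model to continue in the doctor's
# voice, and only the last six messages are kept to bound prompt length.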
# ============================================================
# CONVERSATION PROMPT BUILDER
# ============================================================
def build_conversation_prompt(history):
    """Builds a memory-aware prompt (doctor only replies after patient)."""
    conversation = SYSTEM_PROMPT + "\n\n"
    for turn in history[-6:]:
        if turn["role"] == "user":
            conversation += f"Patient: {turn['content'].strip()}\n"
        elif turn["role"] == "assistant":
            conversation += f"Doctor: {turn['content'].strip()}\n"
    conversation += "Doctor:"
    return conversation


# ============================================================
# TEXT GENERATION (STREAMING)
# ============================================================
def generate_stream(history, max_new_tokens=512):
    prompt = build_conversation_prompt(history)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = dict(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        repetition_penalty=1.05,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        streamer=streamer,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Run generation in a background thread so tokens can be streamed
    # from the main thread as they arrive.
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    partial = ""
    for new_text in streamer:
        partial += new_text
        # Cut off the reply if the model starts simulating the
        # patient's next turn.
        if "Patient:" in partial:
            partial = partial.split("Patient:")[0]
            yield partial
            break
        yield partial


# ============================================================
# RESPONSE LOGIC
# ============================================================
def respond(user_message, history):
    # `respond` is a generator, so early exits must yield: a bare
    # `return value` inside a generator never reaches Gradio.
    if not user_message.strip():
        yield history, history
        return

    # Prevent the bot from talking first
    if len(history) == 0 and "Doctor" in user_message:
        yield history, history
        return

    history.append({"role": "user", "content": user_message})

    partial = ""
    for partial in generate_stream(history):
        yield history + [{"role": "assistant", "content": partial}], history

    history.append({"role": "assistant", "content": partial.strip()})
    save_memory(history)
    yield history, history


def clear_chat():
    global chat_memory
    chat_memory = []
    save_memory(chat_memory)
    return [], []


# ============================================================
# GRADIO UI
# ============================================================
with gr.Blocks(title="🩺 Dr. Camel — Medical Chatbot", css=".footer {display:none;}") as demo:
    gr.Markdown(
        """
        # 🩺 Dr. Camel — AI Medical Assistant
        Ask about your symptoms or medical concerns, and Dr. Camel will respond with care and clarity.
        *(For demo purposes only — not real medical advice.)*
        """
    )

    chatbot = gr.Chatbot(type="messages", elem_id="chatbot", height=520, value=chat_memory)

    with gr.Row():
        txt = gr.Textbox(
            show_label=False,
            placeholder="Describe your symptoms or ask a question...",
            lines=2,
        )
        clear = gr.Button("🧹 Clear Chat")

    state = gr.State(chat_memory)

    # Clear the input box after each submission.
    txt.submit(respond, [txt, state], [chatbot, state]).then(lambda: "", None, txt)
    clear.click(clear_chat, None, [chatbot, state])

    gr.Markdown(
        "### ⚠️ Disclaimer: This chatbot does not replace a real medical consultation. "
        "Always seek professional medical help for health emergencies."
    )

demo.queue()
demo.launch(server_name="0.0.0.0", server_port=7860)
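# ============================================================
# RUNNING THE APP (assumed setup, not part of the original)
# ============================================================
# 4-bit bitsandbytes quantization requires a CUDA GPU; on a CPU-only
# machine, drop `quantization_config` and load the model in full
# precision instead. A minimal install and launch might look like
# this (assuming the script is saved as app.py):
#
#   pip install gradio transformers accelerate bitsandbytes torch
#   python app.py    # then open http://localhost:7860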