from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import uvicorn

app = FastAPI(
    title="GLM-4.6-FP8 API",
    description="REST API for GLM-4.6-FP8 with multi-language support",
    version="1.0.0"
)

# Module-level cache for the model and tokenizer, populated at startup
model = None
tokenizer = None
device = "cuda" if torch.cuda.is_available() else "cpu"

class ChatRequest(BaseModel):
    message: str
    max_tokens: int = 512
    temperature: float = 0.7
    top_p: float = 0.95

class ChatResponse(BaseModel):
    response: str
    model: str = "GLM-4.6-FP8"
    device: str = device

@app.on_event("startup")
async def startup_event():
    """Load the model and tokenizer once, when the server starts."""
    global model, tokenizer
    try:
        print("Loading GLM-4.6-FP8 model...")
        tokenizer = AutoTokenizer.from_pretrained("zai-org/GLM-4.6-FP8")
        # Note: this is an FP8-quantized checkpoint; loading it through
        # transformers assumes hardware and kernels that support it.
        model = AutoModelForCausalLM.from_pretrained(
            "zai-org/GLM-4.6-FP8",
            device_map="auto",
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
            trust_remote_code=True
        )
        print("Model loaded successfully!")
    except Exception as e:
        print(f"Error loading model: {e}")
        raise

@app.get("/")
async def root():
    return {
        "message": "GLM-4.6-FP8 API",
        "version": "1.0.0",
        "device": device,
        "model_loaded": model is not None,
        "endpoints": {
            "chat": "/chat",
            "generate": "/generate",
            "health": "/health"
        }
    }

@app.get("/health")
async def health():
    return {
        "status": "ok",
        "model_loaded": model is not None,
        "device": device
    }

@app.post("/chat", response_model=ChatResponse)
async def chat(request: ChatRequest):
    if model is None or tokenizer is None:
        raise HTTPException(status_code=503, detail="Model is not loaded")
    try:
        # Tokenize the input and move it to the device the model lives on
        # (with device_map="auto" this may differ from the global `device`)
        inputs = tokenizer(request.message, return_tensors="pt").to(model.device)

        # Generate the response
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=request.max_tokens,
                temperature=request.temperature,
                top_p=request.top_p,
                do_sample=True
            )

        # Decode only the newly generated tokens, skipping the echoed prompt
        new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
        response_text = tokenizer.decode(new_tokens, skip_special_tokens=True)
        return ChatResponse(response=response_text)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Generation error: {e}")

@app.post("/generate", response_model=ChatResponse)
async def generate(request: ChatRequest):
    """Alias for /chat with an alternative route name."""
    return await chat(request)

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
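
# --- Example client call (illustrative sketch, not part of the server) ---
# Assuming the server above is running locally on port 7860, the /chat
# endpoint can be exercised with a small requests-based client like the one
# below; the prompt text and sampling parameters are placeholders.
#
#   import requests
#
#   resp = requests.post(
#       "http://localhost:7860/chat",
#       json={"message": "Write a haiku about GPUs.", "max_tokens": 128},
#       timeout=120,
#   )
#   resp.raise_for_status()
#   print(resp.json()["response"])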