lasagnakanada committed on
Commit
d6282d0
·
1 Parent(s): 1ead44d

fix(llm): retry on 429 with backoff; raise LLMProviderRateLimit; surface as HTTP 429 at endpoint; add tests

Browse files
app/api/v1/endpoints/core.py CHANGED
@@ -9,6 +9,7 @@ from uuid import UUID
9
 
10
  import structlog
11
  from fastapi import APIRouter, Depends, HTTPException, status
 
12
  from sqlalchemy.ext.asyncio import AsyncSession
13
 
14
  from app.core.services import (
@@ -198,6 +199,16 @@ async def moderate_text(
198
  status_code=status.HTTP_400_BAD_REQUEST,
199
  detail={"error": "ValidationError", "message": str(e)},
200
  )
 
 
 
 
 
 
 
 
 
 
201
  except Exception as e:
202
  logger.error("moderate_request_failed", error=str(e), exc_info=True)
203
  raise HTTPException(
 
9
 
10
  import structlog
11
  from fastapi import APIRouter, Depends, HTTPException, status
12
+ from app.core.exceptions import LLMProviderRateLimit
13
  from sqlalchemy.ext.asyncio import AsyncSession
14
 
15
  from app.core.services import (
 
199
  status_code=status.HTTP_400_BAD_REQUEST,
200
  detail={"error": "ValidationError", "message": str(e)},
201
  )
202
+ except LLMProviderRateLimit as e:
203
+ logger.warning("generate_request_provider_rate_limited", error=str(e))
204
+ raise HTTPException(
205
+ status_code=status.HTTP_429_TOO_MANY_REQUESTS,
206
+ detail={
207
+ "error": "UpstreamProviderRateLimited",
208
+ "message": str(e),
209
+ "provider_details": getattr(e, "details", {}),
210
+ },
211
+ )
212
  except Exception as e:
213
  logger.error("moderate_request_failed", error=str(e), exc_info=True)
214
  raise HTTPException(
app/core/exceptions.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ class LLMProviderError(Exception):
2
+ """Base exception for upstream LLM provider failures."""
3
+
4
+
5
+ class LLMProviderRateLimit(LLMProviderError):
6
+ """Raised when upstream LLM provider is rate limited or out of credits."""
7
+
8
+ def __init__(
9
+ self,
10
+ message: str = "Upstream LLM provider rate limit or exhausted credits",
11
+ details: dict | None = None,
12
+ ):
13
+ super().__init__(message)
14
+ self.details = details or {}
app/core/services/generation_service.py CHANGED
@@ -11,6 +11,7 @@ from typing import Dict, List, Optional
11
  import structlog
12
  from sqlalchemy.ext.asyncio import AsyncSession
13
  from app.config import settings
 
14
 
15
  from app.core.services.llm_service import get_llm_service
16
  from app.core.services.moderation_service import get_moderation_service
@@ -182,6 +183,9 @@ class GenerationService:
182
  }
183
 
184
  except Exception as e:
 
 
 
185
  if self.session is not None:
186
  await self.session.rollback()
187
  logger.error(
 
11
  import structlog
12
  from sqlalchemy.ext.asyncio import AsyncSession
13
  from app.config import settings
14
+ from app.core.exceptions import LLMProviderRateLimit
15
 
16
  from app.core.services.llm_service import get_llm_service
17
  from app.core.services.moderation_service import get_moderation_service
 
183
  }
184
 
185
  except Exception as e:
186
+ # propagate upstream provider rate-limit errors so API layer can return 429
187
+ if isinstance(e, LLMProviderRateLimit):
188
+ raise
189
  if self.session is not None:
190
  await self.session.rollback()
191
  logger.error(
app/core/services/llm_service.py CHANGED
@@ -11,7 +11,10 @@ import os
11
  from typing import Dict, List, Optional
12
 
13
  import structlog
14
- from httpx import AsyncClient
 
 
 
15
 
16
  from app.config import get_settings, settings
17
  from app.schemas.core import ChatMessage, TaskType
@@ -413,22 +416,96 @@ class GrokLLMService:
413
  "max_tokens": max_length,
414
  }
415
 
416
- async with AsyncClient(timeout=settings.model_inference_timeout) as client:
417
- resp = await client.post(
418
- f"{self.base_url}/chat/completions", json=payload, headers=headers
419
- )
420
- if resp.status_code >= 400:
421
- # Log full response text to aid debugging (xAI returns useful error JSON)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
422
  try:
423
- err_json = resp.json()
424
- except Exception:
425
- err_json = {"raw": resp.text}
426
- logger.error(
427
- "grok_api_error",
428
- status_code=resp.status_code,
429
- error=err_json,
430
- )
431
- resp.raise_for_status()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
432
  data = resp.json()
433
  # Assuming OpenAI-like response structure
434
  text = data["choices"][0]["message"]["content"].strip()
 
11
  from typing import Dict, List, Optional
12
 
13
  import structlog
14
+ from httpx import AsyncClient, Timeout, ReadTimeout, RequestError
15
+ import asyncio
16
+
17
+ from app.core.exceptions import LLMProviderRateLimit
18
 
19
  from app.config import get_settings, settings
20
  from app.schemas.core import ChatMessage, TaskType
 
416
  "max_tokens": max_length,
417
  }
418
 
419
+ # Use an explicit Timeout so we control connect vs read timeouts.
420
+ # model_inference_timeout is an integer (seconds) from settings.
421
+ # httpx.Timeout requires either a `default` or all four values set.
422
+ # Set connect/read/write/pool explicitly to avoid the library error.
423
+ read_timeout = float(settings.model_inference_timeout)
424
+ timeout = Timeout(connect=5.0, read=read_timeout, write=read_timeout, pool=5.0)
425
+ max_attempts = int(getattr(settings, "grok_retry_attempts", 3))
426
+ backoff_base = float(getattr(settings, "grok_retry_backoff_base", 1.0))
427
+
428
+ logger.info(
429
+ "grok_request_payload",
430
+ base_url=self.base_url,
431
+ endpoint="/chat/completions",
432
+ model=self.model_name,
433
+ messages=[
434
+ {
435
+ "role": m["role"],
436
+ "content": (
437
+ m["content"][:50] + "..." if len(m["content"]) > 50 else m["content"]
438
+ ),
439
+ }
440
+ for m in messages
441
+ ],
442
+ temperature=temperature,
443
+ top_p=top_p,
444
+ max_tokens=max_length,
445
+ )
446
+ async with AsyncClient(timeout=timeout) as client:
447
+ for attempt in range(1, max_attempts + 1):
448
+
449
  try:
450
+ resp = await client.post(
451
+ f"{self.base_url}/chat/completions", json=payload, headers=headers
452
+ )
453
+ logger.info(
454
+ "grok_response_received",
455
+ status_code=resp.status_code,
456
+ headers=dict(resp.headers),
457
+ )
458
+ except ReadTimeout:
459
+ logger.error(
460
+ "grok_request_timeout",
461
+ base_url=self.base_url,
462
+ timeout_seconds=settings.model_inference_timeout,
463
+ )
464
+ raise
465
+ except RequestError as exc:
466
+ logger.error(
467
+ "grok_request_error",
468
+ base_url=self.base_url,
469
+ error=str(exc),
470
+ )
471
+ raise
472
+
473
+ if resp.status_code == 429:
474
+ # rate limit — retry with exponential backoff up to max_attempts
475
+ try:
476
+ err_json = resp.json()
477
+ except Exception:
478
+ err_json = {"raw": resp.text}
479
+
480
+ logger.warning(
481
+ "grok_api_rate_limited",
482
+ attempt=attempt,
483
+ max_attempts=max_attempts,
484
+ error=err_json,
485
+ )
486
+
487
+ if attempt < max_attempts:
488
+ sleep_seconds = backoff_base * (2 ** (attempt - 1))
489
+ await asyncio.sleep(sleep_seconds)
490
+ continue
491
+ else:
492
+ logger.error("grok_api_rate_limit_exhausted", error=err_json)
493
+ raise LLMProviderRateLimit(
494
+ "Upstream LLM provider rate limit or exhausted credits",
495
+ details=err_json,
496
+ )
497
+
498
+ if resp.status_code >= 400:
499
+ try:
500
+ err_json = resp.json()
501
+ except Exception:
502
+ err_json = {"raw": resp.text}
503
+ logger.error(
504
+ "grok_api_error",
505
+ status_code=resp.status_code,
506
+ error=err_json,
507
+ )
508
+ resp.raise_for_status()
509
  data = resp.json()
510
  # Assuming OpenAI-like response structure
511
  text = data["choices"][0]["message"]["content"].strip()
tests/test_llm_rate_limit.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+
3
+ from app.core.services.llm_service import GrokLLMService
4
+ from app.core.exceptions import LLMProviderRateLimit
5
+
6
+
7
class Fake429Resp:
    """Fake httpx response that always reports HTTP 429.

    Mirrors the pieces of ``httpx.Response`` the Grok client touches:
    ``status_code``, ``text``, ``json()`` and ``headers``.
    """

    def __init__(self, text="rate limit", payload=None):
        self.status_code = 429
        self.text = text
        # The production code logs dict(resp.headers) for every response
        # (grok_response_received); without this attribute the fake raises
        # AttributeError instead of exercising the rate-limit path.
        self.headers = {}
        self._payload = payload or {
            "code": "Some resource has been exhausted",
            "error": "out of credits",
        }

    def json(self):
        # Structured error body, like xAI's real 429 responses.
        return self._payload
18
+
19
+
20
class FakeAsyncClient:
    """Minimal async-context-manager double for ``httpx.AsyncClient``.

    Every ``post`` call yields a fresh 429 response, so the Grok retry
    loop is guaranteed to exhaust all of its attempts.
    """

    def __init__(self, *args, **kwargs):
        self._calls = 0  # number of POSTs attempted; handy when debugging

    async def __aenter__(self):
        return self

    async def __aexit__(self, *exc_info):
        # Never suppress exceptions raised inside the context.
        return False

    async def post(self, *args, **kwargs):
        self._calls = self._calls + 1
        return Fake429Resp()
33
+
34
+
35
@pytest.mark.asyncio
async def test_grok_service_raises_rate_limit(monkeypatch):
    """generate() must surface LLMProviderRateLimit after exhausting retries."""
    monkeypatch.setattr("app.core.services.llm_service.AsyncClient", FakeAsyncClient)

    # The retry loop sleeps with exponential backoff between attempts
    # (1s + 2s at the default settings). Stub the sleep so the test
    # finishes in milliseconds instead of ~3 seconds.
    async def _no_sleep(_seconds):
        return None

    monkeypatch.setattr("app.core.services.llm_service.asyncio.sleep", _no_sleep)

    svc = GrokLLMService()
    svc.api_key = "fake"
    svc.base_url = "https://fake"
    svc.model_name = "grok-test"

    with pytest.raises(LLMProviderRateLimit):
        await svc.generate("hi there")