# Unified Dockerfile for Hugging Face Spaces and Koyeb.
# Uses vLLM for high-performance inference on both platforms.
# The start-vllm.sh script auto-detects the deployment environment.

# Tag of the vLLM base image. Defaults to "latest" to preserve the existing
# behaviour; pass --build-arg VLLM_IMAGE_TAG=<release tag> for reproducible builds.
ARG VLLM_IMAGE_TAG=latest
FROM vllm/vllm-openai:${VLLM_IMAGE_TAG}

# Cache bust: force a rebuild on each push (prevents HF Spaces from reusing a
# cached old image). Harmless for Koyeb, where the ARG is simply left unset and
# the build falls back to the current epoch time.
ARG CACHE_BUST
RUN echo "Build timestamp: ${CACHE_BUST:-$(date +%s)}"

# Runtime environment: Hugging Face cache location and vLLM attention backend.
ENV HF_HOME=/tmp/huggingface \
    VLLM_ATTENTION_BACKEND=FLASH_ATTN

# Create the cache directory. It is left world-writable, presumably so that it
# stays usable when the platform runs the container under a non-root UID
# (NOTE(review): confirm against the target platforms before tightening).
RUN mkdir -p /tmp/huggingface && chmod 777 /tmp/huggingface

# Install observability dependencies.
# Note: the vLLM base image already includes most dependencies.
# The requirement specifiers MUST be quoted: unquoted, the shell treats
# ">=2.50.0" as an output redirection, silently dropping the version
# constraint and writing pip's output to a file named "=2.50.0".
RUN pip install --no-cache-dir \
        "langfuse>=2.50.0" \
        "logfire>=0.0.1"

# Copy the startup script (handles both HF Spaces and Koyeb) and make it executable.
COPY start-vllm.sh /start-vllm.sh
RUN chmod +x /start-vllm.sh

# Expose ports: 8000 (default/Koyeb) and 7860 (HF Spaces).
# The start-vllm.sh script auto-detects the environment and uses the correct port.
EXPOSE 8000 7860

# Exec-form ENTRYPOINT: the startup script is always the container's entry
# process, and any CMD / run-time arguments are passed to it instead of
# replacing it. (It can still be replaced explicitly via `--entrypoint`.)
ENTRYPOINT ["/start-vllm.sh"]