Initial commit: migrate intelligence-router files
This commit is contained in:
commit
0e05390be2
13
Dockerfile
Normal file
13
Dockerfile
Normal file
@ -0,0 +1,13 @@
|
||||
FROM python:3.11-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
COPY main.py .
|
||||
|
||||
# Expose the proxy port
|
||||
EXPOSE 9000
|
||||
|
||||
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "9000"]
|
||||
28
docker-compose.yml
Normal file
28
docker-compose.yml
Normal file
@ -0,0 +1,28 @@
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
# The Intelligence Router
|
||||
router:
|
||||
build: ./intelligence-router
|
||||
ports:
|
||||
- "9000:9000"
|
||||
environment:
|
||||
- MAIN_PC_URL=http://10.0.4.x:8080/v1
|
||||
- LOCAL_SLM_URL=http://llama-slm:8080/v1
|
||||
- OPENAI_API_KEY=${OPENAI_API_KEY}
|
||||
depends_on:
|
||||
- llama-slm
|
||||
|
||||
# The Local SLM (Fallback Brain)
|
||||
llama-slm:
|
||||
image: ghcr.io/ggerganov/llama.cpp:server
|
||||
volumes:
|
||||
- ./models:/models
|
||||
# Command to run a small, fast model (e.g., Llama-3-8B GGUF)
|
||||
command: >
|
||||
-m /models/llama-3-8b-instruct.Q4_K_M.gguf
|
||||
--host 0.0.0.0
|
||||
--port 8080
|
||||
--ctx-size 2048
|
||||
ports:
|
||||
- "8081:8080"
|
||||
101
main.py
Normal file
101
main.py
Normal file
@ -0,0 +1,101 @@
|
||||
import os
|
||||
import asyncio
|
||||
import httpx
|
||||
from fastapi import FastAPI, Request, Response, Header
|
||||
from fastapi.responses import StreamingResponse
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
# Configuration from environment variables
|
||||
MAIN_PC_URL = os.getenv("MAIN_PC_URL", "http://10.0.4.x:8080/v1")
|
||||
LOCAL_SLM_URL = os.getenv("LOCAL_SLM_URL", "http://llama-slm:8080/v1")
|
||||
OPENAI_URL = "https://api.openai.com/v1"
|
||||
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
||||
|
||||
# Health check endpoint for the Main PC
|
||||
async def check_main_pc_health():
|
||||
try:
|
||||
# We check a simple endpoint or just attempt a connection to the base URL
|
||||
async with httpx.AsyncClient(timeout=2.0) as client:
|
||||
response = await client.get(f"{MAIN_PC_URL}/models")
|
||||
return response.status_code == 200
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
@app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE"])
|
||||
async def proxy(
|
||||
request: Request,
|
||||
path: str,
|
||||
x_intelligence_level: str = Header(None)
|
||||
):
|
||||
"""
|
||||
Smart Proxy: Routes requests based on target availability and intelligence requirements.
|
||||
"""
|
||||
target_url = None
|
||||
|
||||
# 1. Check for "Turbo" (High Intelligence) request
|
||||
if x_intelligence_level == "High" and OPENAI_API_KEY:
|
||||
target_url = f"{OPENAI_URL}/{path}"
|
||||
|
||||
# 2. Try Primary (Main PC)
|
||||
else:
|
||||
is_main_pc_online = await check_main_pc_health()
|
||||
if is_main_pc_online:
|
||||
target_url = f"{MAIN_PC_URL}/{path}"
|
||||
else:
|
||||
# 3. Fallback to Local SLM (on Docker host)
|
||||
target_url = f"{LOCAL_SLM_URL}/{path}"
|
||||
|
||||
if not target_url:
|
||||
return Response(content="No valid target available (Main PC offline, SLM unavailable, and no OpenAI key)", status_code=503)
|
||||
|
||||
# Prepare request for proxying
|
||||
body = await request.body()
|
||||
headers = dict(request.headers)
|
||||
|
||||
# Update headers for the target
|
||||
headers.pop("host", None)
|
||||
headers.pop("content-length", None)
|
||||
if target_url.startswith("https://api.openai.com"):
|
||||
headers["Authorization"] = f"Bearer {OPENAI_API_KEY}"
|
||||
|
||||
# Execute the request
|
||||
async def stream_generator():
|
||||
async with httpx.AsyncClient(timeout=60.0) as client:
|
||||
async with client.stream(
|
||||
request.method,
|
||||
target_url,
|
||||
content=body,
|
||||
headers=headers,
|
||||
) as resp:
|
||||
async for chunk in resp.aiter_bytes():
|
||||
yield chunk
|
||||
|
||||
# Handle streaming responses (essential for LLM)
|
||||
accept_header = request.headers.get("accept", "")
|
||||
if "text/event-stream" in accept_header or "application/x-ndjson" in accept_header:
|
||||
return StreamingResponse(stream_generator(), status_code=200, background=None)
|
||||
|
||||
# For non-streaming, we'll just use a simple proxy logic
|
||||
async with httpx.AsyncClient(timeout=60.0) as client:
|
||||
try:
|
||||
resp = await client.request(
|
||||
method=request.method,
|
||||
url=target_url,
|
||||
content=body,
|
||||
headers=headers,
|
||||
)
|
||||
return Response(
|
||||
content=resp.content,
|
||||
status_code=resp.status_code,
|
||||
headers=dict(resp.headers)
|
||||
)
|
||||
except Exception as e:
|
||||
return Response(content=str(e), status_code=500)
|
||||
|
||||
@app.get("/health")
|
||||
async def health():
|
||||
return {"status": "router_online"}
|
||||
4
requirements.txt
Normal file
4
requirements.txt
Normal file
@ -0,0 +1,4 @@
|
||||
fastapi
|
||||
uvicorn
|
||||
httpx
|
||||
python-dotenv
|
||||
Loading…
Reference in New Issue
Block a user