From 0e05390be27a47ac12c6b4a174ddf7e3db8b7d01 Mon Sep 17 00:00:00 2001
From: Tudorel Oprisan <tzeusd@gmail.com>
Date: Tue, 9 Jun 2026 11:48:43 +0100
Subject: [PATCH] Initial commit: migrate intelligence-router files

---
 Dockerfile         |  13 ++++++
 docker-compose.yml |  28 +++++++++++++
 main.py            | 101 +++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt   |   4 ++
 4 files changed, 146 insertions(+)
 create mode 100644 Dockerfile
 create mode 100644 docker-compose.yml
 create mode 100644 main.py
 create mode 100644 requirements.txt

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..c75d205
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,13 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+# Install dependencies before copying the source so this layer is reused when only main.py changes
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY main.py .
+
+# Expose the proxy port (must match the --port passed to uvicorn below)
+EXPOSE 9000
+
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "9000"]
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..cc08e57
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,28 @@
+version: '3.8'
+
+services:
+  # The Intelligence Router (built from the Dockerfile that sits next to this compose file)
+  router:
+    build: .
+    ports:
+      - "9000:9000"
+    environment:
+      - MAIN_PC_URL=http://10.0.4.x:8080/v1  # placeholder: replace 10.0.4.x with the Main PC's address
+      - LOCAL_SLM_URL=http://llama-slm:8080/v1
+      - OPENAI_API_KEY=${OPENAI_API_KEY}
+    depends_on:
+      - llama-slm
+
+  # The Local SLM (Fallback Brain)
+  llama-slm:
+    image: ghcr.io/ggerganov/llama.cpp:server
+    volumes:
+      - ./models:/models
+    # Command to run a small, fast model (e.g., Llama-3-8B GGUF)
+    command: >
+      -m /models/llama-3-8b-instruct.Q4_K_M.gguf
+      --host 0.0.0.0
+      --port 8080
+      --ctx-size 2048
+    ports:
+      - "8081:8080"
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..2bc8b38
--- /dev/null
+++ b/main.py
@@ -0,0 +1,101 @@
+import os
+import asyncio
+import httpx
+from fastapi import FastAPI, Request, Response, Header
+from fastapi.responses import StreamingResponse
+from dotenv import load_dotenv
+
+# load_dotenv() (python-dotenv) populates os.environ from a .env file, if one is found
+load_dotenv()
+app = FastAPI()
+
+# Configuration from environment variables; a trailing "/" is dropped because paths are joined as f"{BASE}/{path}"
+MAIN_PC_URL = os.getenv("MAIN_PC_URL", "http://10.0.4.x:8080/v1").rstrip("/")
+LOCAL_SLM_URL = os.getenv("LOCAL_SLM_URL", "http://llama-slm:8080/v1").rstrip("/")
+OPENAI_URL = "https://api.openai.com/v1"
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
+
+async def check_main_pc_health():
+    """Probe the Main PC: True only when GET {MAIN_PC_URL}/models returns HTTP 200."""
+    probe_url = f"{MAIN_PC_URL}/models"
+    try:
+        async with httpx.AsyncClient(timeout=2.0) as http:
+            return (await http.get(probe_url)).status_code == 200
+    except Exception:
+        # Any failure while probing (timeout, refused connection, ...) is reported as offline.
+        return False
+
+@app.get("/health")
+async def health():
+    """Router liveness probe; registered before the catch-all route so it is not proxied."""
+    return {"status": "router_online"}
+
+# Upstream response headers that must not be relayed: httpx hands back an already-decoded
+# body, so the upstream length/encoding no longer describe it; the rest are hop-by-hop.
+_SKIP_RESPONSE_HEADERS = {"connection", "content-encoding", "content-length", "transfer-encoding"}
+
+@app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE"])
+async def proxy(
+    request: Request,
+    path: str,
+    x_intelligence_level: str = Header(None)
+):
+    """
+    Smart Proxy: Routes requests based on target availability and intelligence requirements.
+
+    Target: OpenAI when the X-Intelligence-Level header is "High" and OPENAI_API_KEY is set;
+    otherwise the Main PC if its health probe passes, else the local SLM. The upstream status
+    and headers are relayed; a failed upstream call returns 500 with the error text.
+    """
+    # 1. Check for "Turbo" (High Intelligence) request
+    use_openai = x_intelligence_level == "High" and bool(OPENAI_API_KEY)
+    if use_openai:
+        base_url = OPENAI_URL
+    # 2. Try Primary (Main PC), falling back to 3. the Local SLM
+    elif await check_main_pc_health():
+        base_url = MAIN_PC_URL
+    else:
+        base_url = LOCAL_SLM_URL
+
+    target_url = f"{base_url}/{path}"
+    if request.url.query:
+        # The path parameter excludes the query string, so forward it explicitly.
+        target_url = f"{target_url}?{request.url.query}"
+
+    # Prepare request for proxying
+    body = await request.body()
+    headers = dict(request.headers)  # header names arrive lower-cased
+    headers.pop("host", None)
+    headers.pop("content-length", None)
+    if use_openai:
+        # Replace (rather than duplicate) any credential the caller sent.
+        headers.pop("authorization", None)
+        headers["Authorization"] = f"Bearer {OPENAI_API_KEY}"
+
+    # Handle streaming responses (essential for LLM)
+    accept_header = request.headers.get("accept", "")
+    wants_stream = "text/event-stream" in accept_header or "application/x-ndjson" in accept_header
+    # Send first, answer second, so the real upstream status/headers can be relayed.
+    client = httpx.AsyncClient(timeout=60.0)
+    try:
+        upstream_request = client.build_request(request.method, target_url, content=body, headers=headers)
+        upstream = await client.send(upstream_request, stream=wants_stream)
+    except Exception as e:
+        await client.aclose()
+        return Response(content=str(e), status_code=500)
+
+    relay_headers = {k: v for k, v in upstream.headers.items() if k.lower() not in _SKIP_RESPONSE_HEADERS}
+    if not wants_stream:
+        await client.aclose()
+        return Response(content=upstream.content, status_code=upstream.status_code, headers=relay_headers)
+
+    async def stream_generator():
+        try:
+            async for chunk in upstream.aiter_bytes():
+                yield chunk
+        finally:
+            # Release the upstream connection once the stream is finished or abandoned.
+            await upstream.aclose()
+            await client.aclose()
+
+    return StreamingResponse(stream_generator(), status_code=upstream.status_code, headers=relay_headers)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..3263361
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+fastapi
+uvicorn
+httpx
+python-dotenv