intelligence-router/sidecar/app.py

"""Sidecar FastAPI service — Issue #2 foundation.

Runs on the Main PC, manages llama-server subprocess, serves manifest/profile data.
"""
import os
import asyncio
import signal as signal_module
import threading
from contextlib import asynccontextmanager
from typing import Optional

from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel

from sidecar.manifest import load_manifest

# Configuration.
# MANIFEST_PATH and SIDECAR_PORT can be overridden through environment
# variables of the same name; the values below are the defaults.
MANIFEST_PATH = os.getenv("MANIFEST_PATH", "/home/bigt/AI/llm/manifest.yaml")
# Only logged at startup in this file — presumably the port the sidecar itself
# is served on; confirm against the launcher.
SIDECAR_PORT = int(os.getenv("SIDECAR_PORT", "8081"))
# Port passed to llama-server via --port and polled for readiness.
# NOTE: fixed value, not read from the environment.
LLAMA_SERVER_PORT = 8080

# Global state (module-level, shared by every request handler).
# Handle of the llama-server child process started by this sidecar, or None.
_llama_server_process: Optional[asyncio.subprocess.Process] = None
# Id of the profile whose llama-server last passed the readiness poll, or None.
_active_profile: Optional[str] = None
# Serialises model switches. threading.Lock was chosen for TestClient
# compatibility. It is a thread lock, not an asyncio lock, so a blocking
# acquire from a coroutine stalls the event loop while the lock is contended.
_switch_lock = threading.Lock()  # Use threading.Lock for compatibility with TestClient


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage the sidecar lifecycle; no model is loaded at startup.

    On startup only the effective configuration is logged. On shutdown any
    llama-server child process still tracked by this module is terminated.

    The cleanup runs in a ``finally`` clause so that it also happens when the
    surrounding context is left through an exception or a task cancellation,
    both of which are re-raised at the ``yield``.
    """
    print(f"Sidecar starting, manifest={MANIFEST_PATH}, port={SIDECAR_PORT}")
    try:
        yield
    finally:
        # Cleanup: kill llama-server if running
        if _llama_server_process is not None:
            _kill_llama_server()


# ASGI application object; `lifespan` logs the configuration on startup and
# stops a tracked llama-server on shutdown.
app = FastAPI(lifespan=lifespan)


def _kill_llama_server():
    """Ask the tracked llama-server subprocess to stop and forget about it.

    Sends SIGTERM to the process referenced by ``_llama_server_process`` if it
    is still running, then clears the module-level reference. When called from
    inside a running event loop, a follow-up is scheduled that sends SIGKILL
    if the process has not exited after 5 seconds.

    This function is synchronous and returns immediately: it does NOT wait for
    the process to exit. ``asyncio.subprocess.Process.wait()`` is a coroutine
    and takes no ``timeout`` argument, so it cannot be used for a bounded wait
    from synchronous code. If no event loop is running, or the loop stops
    before the grace period elapses, only SIGTERM is delivered.
    """
    global _llama_server_process

    process = _llama_server_process
    # Drop the reference in every case; a finished process needs no tracking.
    _llama_server_process = None
    if process is None or process.returncode is not None:
        return

    try:
        process.send_signal(signal_module.SIGTERM)
    except ProcessLookupError:
        # The process is already gone; nothing left to do.
        return
    except OSError as exc:
        # Best effort: report the problem instead of failing the caller.
        print(f"Failed to send SIGTERM to llama-server: {exc}")
        return

    def _force_kill():
        # returncode is updated by the event loop once the child has exited.
        if process.returncode is None:
            try:
                process.kill()
            except ProcessLookupError:
                pass

    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        # No running loop, so a delayed escalation cannot be scheduled.
        return
    loop.call_later(5, _force_kill)


async def _start_llama_server(profile: dict):
    """Start llama-server with the given profile's configuration.

    Any llama-server already tracked by this module is stopped first via
    ``_kill_llama_server()``.

    Args:
        profile: Manifest profile. ``profile["model_path"]`` is required.
            ``profile["flags"]`` is an optional mapping; each entry is passed
            as ``--<key> <str(value)>``. A missing or null ``flags`` entry
            means no extra flags.
            NOTE(review): boolean values are passed literally as
            ``True``/``False`` — confirm llama-server accepts that form for
            the flags used in the manifest.

    Returns:
        The ``asyncio.subprocess.Process`` of the new llama-server, which is
        also stored in ``_llama_server_process``.

    Raises:
        KeyError: If ``profile`` has no ``model_path``.
        OSError: If the ``llama-server`` executable cannot be started.
    """
    global _llama_server_process

    # Kill any existing process
    _kill_llama_server()

    # `or {}` also covers a manifest that has `flags:` with a null value.
    flags = profile.get("flags") or {}

    # Build command from profile flags
    cmd = [
        "llama-server",
        "--model", profile["model_path"],
        "--port", str(LLAMA_SERVER_PORT),
    ]
    for key, value in flags.items():
        cmd += [f"--{key}", str(value)]

    print(f"Starting llama-server: {' '.join(cmd)}")
    # The child's output is discarded, so its own error messages are not visible here.
    _llama_server_process = await asyncio.create_subprocess_exec(
        *cmd,
        stdout=asyncio.subprocess.DEVNULL,
        stderr=asyncio.subprocess.DEVNULL,
    )
    return _llama_server_process


async def _poll_llama_server_ready(max_retries: int = 240, interval: float = 0.5):
    """Poll llama-server readiness via its /v1/models endpoint.

    Args:
        max_retries: Maximum number of requests to attempt.
        interval: Seconds to sleep after each unsuccessful attempt. With the
            defaults this is 120 seconds of sleeping in total, plus up to
            2 seconds of request timeout per attempt.

    Returns:
        True as soon as the endpoint answers with HTTP 200, False if it never
        does within ``max_retries`` attempts.
    """
    import httpx

    url = f"http://localhost:{LLAMA_SERVER_PORT}/v1/models"
    # One client for the whole poll instead of a new one per attempt.
    async with httpx.AsyncClient(timeout=2.0) as client:
        for _ in range(max_retries):
            try:
                resp = await client.get(url)
            except Exception:
                # Deliberately broad: connection errors and timeouts are
                # expected while the server is still loading the model.
                pass
            else:
                if resp.status_code == 200:
                    return True
            await asyncio.sleep(interval)
    return False


@app.get("/models/available")
async def get_available_models():
    """Return the profiles read from the manifest at MANIFEST_PATH.

    Raises:
        HTTPException: 500 when ``load_manifest`` returns None.
    """
    manifest_profiles = load_manifest(MANIFEST_PATH)
    if manifest_profiles is not None:
        return manifest_profiles
    raise HTTPException(status_code=500, detail="Failed to parse manifest YAML")


@app.get("/models/status")
async def get_models_status():
    """Report the active profile id and whether the tracked llama-server is alive."""
    proc = _llama_server_process
    # "Running" means a process is tracked and it has not reported an exit code.
    is_running = proc is not None and proc.returncode is None
    return {
        "active_profile": _active_profile,
        "llama_server_running": is_running,
    }


# Request body for POST /models/switch.
class SwitchRequest(BaseModel):
    # Compared against the "id" field of each manifest profile.
    profile_id: str


@app.post("/models/switch")
async def switch_model(payload: SwitchRequest):
    """Stop the current llama-server, start the requested profile, wait for readiness.

    Switches are serialised through ``_switch_lock``; a request that arrives
    while another switch is in progress waits for it to finish.

    Args:
        payload: Request body carrying the ``profile_id`` to activate.

    Returns:
        ``{"status": "ready", "active_profile": <id>}`` on success, or when the
        requested profile is already the active one. Otherwise a JSONResponse
        with ``{"status": "error", "message": ...}``:
        500 if the manifest cannot be loaded, 404 if the profile id is unknown,
        500 if llama-server does not become ready in time.
    """
    global _active_profile

    # _switch_lock is a threading.Lock. A blocking acquire here would freeze
    # the event loop whenever another switch request is suspended at an
    # `await` while holding it, and that holder could then never resume.
    # Acquire without blocking and yield to the loop between attempts.
    while not _switch_lock.acquire(blocking=False):
        await asyncio.sleep(0.05)
    try:
        # Validate profile_id
        profiles = load_manifest(MANIFEST_PATH)
        if profiles is None:
            return JSONResponse(
                status_code=500,
                content={"status": "error", "message": "Failed to load manifest"},
            )

        profile = next((p for p in profiles if p["id"] == payload.profile_id), None)
        if profile is None:
            return JSONResponse(
                status_code=404,
                content={"status": "error", "message": f"Profile '{payload.profile_id}' not found"},
            )

        # Already running this profile — nothing to do.
        # NOTE(review): this trusts _active_profile and does not confirm that
        # the llama-server process is still alive.
        if _active_profile == payload.profile_id:
            return {
                "status": "ready",
                "active_profile": _active_profile,
            }

        # Start the new model. No profile counts as active until it is ready.
        _kill_llama_server()
        _active_profile = None
        await _start_llama_server(profile)

        # Poll for readiness
        if await _poll_llama_server_ready():
            _active_profile = payload.profile_id
            return {
                "status": "ready",
                "active_profile": _active_profile,
            }

        # Do not leave a server running that never became ready.
        _kill_llama_server()
        _active_profile = None
        return JSONResponse(
            status_code=500,
            content={"status": "error", "message": "llama-server failed to become ready"},
        )
    finally:
        _switch_lock.release()
Epic: Model Switching via Sidecar — Issues #2-#3

Issue #2: Manifest schema + Sidecar foundation
- sidecar/manifest.py: YAML manifest loading and profile validation
- sidecar/app.py: FastAPI sidecar service with /models/available, /models/status endpoints
- Router GET /v1/models: proxies to sidecar, returns OpenAI-compatible model list
- Tests: 12 manifest tests, 6 sidecar endpoint tests, 3 router tests (21 total)

Issue #3: Sidecar model switch + Router request queue
- Sidecar POST /models/switch: stops current llama-server, starts new one, polls for readiness
- Switch lock prevents concurrent switches (threading.Lock for TestClient compatibility)
- Router request queue: max 10 requests, 120s hard timeout, 429 when full
- Router automatic model detection: extracts model from chat body, matches against sidecar status
- Full proxy endpoint with Sidecar → Main PC routing and fallback chain
- Tests: 5 sidecar switch tests, 4 queue tests, 3 router integration tests (12 total)

Total: 33 tests, all passing

2026-06-15 03:49:24 +03:00
			`"""Sidecar FastAPI service — Issue #2 foundation.`

			`Runs on the Main PC, manages llama-server subprocess, serves manifest/profile data.`
			`"""`
			`import os`
			`import asyncio`
			`import signal as signal_module`
			`import threading`
			`from contextlib import asynccontextmanager`
			`from typing import Optional`

			`from fastapi import FastAPI, HTTPException`
			`from fastapi.responses import JSONResponse`
			`from pydantic import BaseModel`

			`from sidecar.manifest import load_manifest`

			`# Configuration from environment`
			`MANIFEST_PATH = os.getenv("MANIFEST_PATH", "/home/bigt/AI/llm/manifest.yaml")`
			`SIDECAR_PORT = int(os.getenv("SIDECAR_PORT", "8081"))`
			`LLAMA_SERVER_PORT = 8080`

			`# Global state`
			`_llama_server_process: Optional[asyncio.subprocess.Process] = None`
			`_active_profile: Optional[str] = None`
			`_switch_lock = threading.Lock() # Use threading.Lock for compatibility with TestClient`


			`@asynccontextmanager`
			`async def lifespan(app: FastAPI):`
			`"""Manage sidecar lifecycle — no default model loaded."""`
			`print(f"Sidecar starting, manifest={MANIFEST_PATH}, port={SIDECAR_PORT}")`
			`yield`
			`# Cleanup: kill llama-server if running`
			`global _llama_server_process`
			`if _llama_server_process:`
			`_kill_llama_server()`


			`app = FastAPI(lifespan=lifespan)`


			`def _kill_llama_server():`
			`"""Kill the llama-server subprocess."""`
			`global _llama_server_process`
			`if _llama_server_process and _llama_server_process.returncode is None:`
			`try:`
			`_llama_server_process.send_signal(signal_module.SIGTERM)`
			`try:`
			`_llama_server_process.wait(timeout=5)`
			`except asyncio.TimeoutError:`
			`_llama_server_process.kill()`
			`except Exception:`
			`pass`
			`_llama_server_process = None`


			`async def _start_llama_server(profile: dict):`
			`"""Start llama-server with the given profile's configuration."""`
			`global _llama_server_process`

			`# Kill any existing process`
			`_kill_llama_server()`

			`# Build command from profile flags`
			`cmd = ["llama-server"]`
			`cmd += ["--model", profile["model_path"]]`
			`cmd += ["--port", str(LLAMA_SERVER_PORT)]`
			`for key, value in profile.get("flags", {}).items():`
			`cmd += ["--" + key, str(value)]`

			`print(f"Starting llama-server: {' '.join(cmd)}")`
			`_llama_server_process = await asyncio.create_subprocess_exec(`
			`*cmd,`
			`stdout=asyncio.subprocess.DEVNULL,`
			`stderr=asyncio.subprocess.DEVNULL,`
			`)`
			`return _llama_server_process`


			`async def _poll_llama_server_ready(max_retries: int = 240, interval: float = 0.5):`
			`"""Poll llama-server readiness via /v1/models endpoint."""`
			`import httpx`

			`for _ in range(max_retries):`
			`try:`
			`async with httpx.AsyncClient(timeout=2.0) as client:`
			`resp = await client.get(f"http://localhost:{LLAMA_SERVER_PORT}/v1/models")`
			`if resp.status_code == 200:`
			`return True`
			`except Exception:`
			`pass`
			`await asyncio.sleep(interval)`
			`return False`


			`@app.get("/models/available")`
			`async def get_available_models():`
			`"""Read manifest YAML and return list of profiles."""`
			`profiles = load_manifest(MANIFEST_PATH)`
			`if profiles is None:`
			`raise HTTPException(status_code=500, detail="Failed to parse manifest YAML")`
			`return profiles`


			`@app.get("/models/status")`
			`async def get_models_status():`
			`"""Return current model status."""`
			`global _active_profile`
			`return {`
			`"active_profile": _active_profile,`
			`"llama_server_running": (`
			`_llama_server_process is not None and _llama_server_process.returncode is None`
			`),`
			`}`


			`class SwitchRequest(BaseModel):`
			`profile_id: str`


			`@app.post("/models/switch")`
			`async def switch_model(payload: SwitchRequest):`
			`"""Stop current llama-server, start new one with the given profile, wait for readiness."""`
			`global _active_profile`

			`with _switch_lock:`
			`# Validate profile_id`
			`profiles = load_manifest(MANIFEST_PATH)`
			`if profiles is None:`
			`return JSONResponse(`
			`status_code=500,`
			`content={"status": "error", "message": "Failed to load manifest"},`
			`)`

			`profile = None`
			`for p in profiles:`
			`if p["id"] == payload.profile_id:`
			`profile = p`
			`break`

			`if profile is None:`
			`return JSONResponse(`
			`status_code=404,`
			`content={"status": "error", "message": f"Profile '{payload.profile_id}' not found"},`
			`)`

			`# Already running this profile — just check readiness`
			`if _active_profile == payload.profile_id:`
			`return {`
			`"status": "ready",`
			`"active_profile": _active_profile,`
			`}`

			`# Start the new model`
			`_kill_llama_server()`
			`_active_profile = None`
			`await _start_llama_server(profile)`

			`# Poll for readiness`
			`ready = await _poll_llama_server_ready()`
			`if ready:`
			`_active_profile = payload.profile_id`
			`return {`
			`"status": "ready",`
			`"active_profile": _active_profile,`
			`}`
			`else:`
			`_active_profile = None`
			`return JSONResponse(`
			`status_code=500,`
			`content={"status": "error", "message": "llama-server failed to become ready"},`
			`)`