# intelligence-router/sidecar/app.py

"""Sidecar FastAPI service — Issue #2 foundation.

Runs on the Main PC, manages llama-server subprocess, serves manifest/profile data.
"""
import os
import asyncio
import signal as signal_module
import threading
from contextlib import asynccontextmanager
from typing import Optional

from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel

from sidecar.manifest import load_manifest

# Configuration from environment
MANIFEST_PATH = os.getenv("MANIFEST_PATH", "/home/bigt/AI/llm/manifest.yaml")
SIDECAR_PORT = int(os.getenv("SIDECAR_PORT", "8080"))
# Port passed to llama-server via --port and used for the readiness probe.
# Overridable through the environment like the settings above; defaults to 8081.
LLAMA_SERVER_PORT = int(os.getenv("LLAMA_SERVER_PORT", "8081"))

# Global state
# Handle of the llama-server child process, or None when none has been started.
_llama_server_process: Optional[asyncio.subprocess.Process] = None
# Id of the profile whose server last passed the readiness check, or None.
_active_profile: Optional[str] = None
_switch_lock = threading.Lock()  # Use threading.Lock for compatibility with TestClient


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage sidecar lifecycle — no default model loaded.

    Startup only logs the configuration. On shutdown, a still-running
    llama-server is terminated and awaited so that it does not outlive the
    sidecar.
    """
    print(f"Sidecar starting, manifest={MANIFEST_PATH}, port={SIDECAR_PORT}")
    try:
        yield
    finally:
        # try/finally so the child is cleaned up even if an exception is
        # thrown into the generator at the yield point.
        server = _llama_server_process
        if server is not None and server.returncode is None:
            _kill_llama_server()  # sends SIGTERM and clears the global handle
            try:
                # Give the server a bounded grace period to exit on its own.
                await asyncio.wait_for(server.wait(), timeout=5)
            except asyncio.TimeoutError:
                try:
                    server.kill()
                except ProcessLookupError:
                    pass  # exited between the timeout and the kill
                else:
                    await server.wait()


app = FastAPI(lifespan=lifespan)


# Strong references to pending reaper tasks so they are not garbage-collected
# before they finish.
_REAPER_TASKS = set()


async def _reap_llama_server(proc, grace: float = 5.0):
    """Wait up to *grace* seconds for *proc* to exit, then SIGKILL it."""
    try:
        await asyncio.wait_for(proc.wait(), timeout=grace)
    except asyncio.TimeoutError:
        try:
            proc.kill()
        except ProcessLookupError:
            return  # exited between the timeout and the kill
        await proc.wait()


def _kill_llama_server():
    """Terminate the llama-server subprocess and clear the global handle.

    Sends SIGTERM to a still-running process. Because this function is
    synchronous and ``asyncio.subprocess.Process.wait()`` is a coroutine
    (it accepts no ``timeout``), the wait-then-SIGKILL escalation is handed
    to a background task on the running event loop instead of being done
    inline. Without a running loop only SIGTERM is sent.

    The function does not block and does not guarantee the process has
    exited when it returns. The global handle is always reset to ``None``,
    including when the process had already exited.
    """
    global _llama_server_process
    proc = _llama_server_process
    _llama_server_process = None
    if proc is None or proc.returncode is not None:
        return

    try:
        proc.send_signal(signal_module.SIGTERM)
    except OSError as exc:
        # Best effort: the process is most likely already gone.
        print(f"Failed to signal llama-server: {exc}")
        return

    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        return  # no event loop to run the escalation on

    task = loop.create_task(_reap_llama_server(proc))
    _REAPER_TASKS.add(task)
    task.add_done_callback(_REAPER_TASKS.discard)


async def _start_llama_server(profile: dict):
    """Start llama-server with the given profile's configuration.

    Any previously tracked server is killed first. The command line is the
    server binary, ``--model <profile["model_path"]>``, ``--port`` with
    ``LLAMA_SERVER_PORT``, then one ``--<key> <value>`` pair per entry of the
    optional ``profile["flags"]`` mapping. A missing or null ``flags`` entry
    means no extra flags.

    The binary path can be overridden with the ``LLAMA_SERVER_BIN``
    environment variable.

    Args:
        profile: Profile mapping; must contain ``"model_path"``.

    Returns:
        The spawned process, which is also stored in the module-level handle.

    Raises:
        KeyError: If ``profile`` has no ``"model_path"``.
        OSError: If the server binary cannot be executed.
    """
    global _llama_server_process

    # Kill any existing process
    _kill_llama_server()

    # Build command from profile flags
    binary = os.getenv("LLAMA_SERVER_BIN", "/home/bigt/AI/llama.cpp/bin/llama-server")
    cmd = [binary, "--model", profile["model_path"], "--port", str(LLAMA_SERVER_PORT)]
    # `or {}` also covers an explicit null (e.g. an empty `flags:` key in YAML).
    for key, value in (profile.get("flags") or {}).items():
        cmd += ["--" + key, str(value)]

    print(f"Starting llama-server: {' '.join(cmd)}")
    _llama_server_process = await asyncio.create_subprocess_exec(
        *cmd,
        stdout=asyncio.subprocess.DEVNULL,
        stderr=asyncio.subprocess.DEVNULL,
    )
    return _llama_server_process


async def _poll_llama_server_ready(max_retries: int = 240, interval: float = 0.5):
    """Poll llama-server readiness via /v1/models endpoint.

    Args:
        max_retries: Maximum number of probe attempts.
        interval: Seconds to sleep between attempts.

    Returns:
        True as soon as the endpoint answers with HTTP 200. False when the
        retry budget is exhausted or the tracked server process has exited.
    """
    import httpx

    url = f"http://localhost:{LLAMA_SERVER_PORT}/v1/models"
    # One client for the whole polling session instead of one per attempt.
    async with httpx.AsyncClient(timeout=2.0) as client:
        for _ in range(max_retries):
            proc = _llama_server_process
            if proc is not None and proc.returncode is not None:
                # The server already exited, so it can never become ready.
                return False
            try:
                resp = await client.get(url)
            except httpx.HTTPError:
                pass  # not accepting connections yet; retry after the sleep
            else:
                if resp.status_code == 200:
                    return True
            await asyncio.sleep(interval)
    return False


@app.get("/models/available")
async def get_available_models():
    """Return the profiles loaded from the manifest at MANIFEST_PATH.

    Raises:
        HTTPException: 500 when the manifest loader returns None.
    """
    manifest_profiles = load_manifest(MANIFEST_PATH)
    if manifest_profiles is not None:
        return manifest_profiles
    raise HTTPException(status_code=500, detail="Failed to parse manifest YAML")


@app.get("/models/status")
async def get_models_status():
    """Report the active profile id and whether llama-server is running."""
    server = _llama_server_process
    # Running means a process handle exists and has no exit code yet.
    is_running = server is not None and server.returncode is None
    return {
        "active_profile": _active_profile,
        "llama_server_running": is_running,
    }


# Request body for POST /models/switch.
class SwitchRequest(BaseModel):
    # Compared against each manifest profile's "id" to select the profile.
    profile_id: str


@app.post("/models/switch")
async def switch_model(payload: SwitchRequest):
    """Stop current llama-server, start new one with the given profile, wait for readiness.

    Switches are serialized through ``_switch_lock``.

    Returns:
        ``{"status": "ready", "active_profile": ...}`` on success, or a JSON
        error response: 404 when the profile id is not in the manifest, 500
        when the manifest cannot be loaded or the server never becomes ready.
    """
    global _active_profile

    # _switch_lock is a threading.Lock. Blocking on it inside a coroutine
    # would stall the event loop, and the holder (suspended on an await) could
    # then never release it. Poll without blocking and yield in between.
    while not _switch_lock.acquire(blocking=False):
        await asyncio.sleep(0.05)
    try:
        # Validate profile_id
        profiles = load_manifest(MANIFEST_PATH)
        if profiles is None:
            return JSONResponse(
                status_code=500,
                content={"status": "error", "message": "Failed to load manifest"},
            )

        profile = next((p for p in profiles if p["id"] == payload.profile_id), None)
        if profile is None:
            return JSONResponse(
                status_code=404,
                content={"status": "error", "message": f"Profile '{payload.profile_id}' not found"},
            )

        previous = _llama_server_process
        previous_alive = previous is not None and previous.returncode is None

        # Already running this profile: only a no-op if the server is still
        # alive. A crashed server falls through and is restarted.
        if _active_profile == payload.profile_id and previous_alive:
            return {
                "status": "ready",
                "active_profile": _active_profile,
            }

        # Stop the old server and wait for it to exit, so the readiness probe
        # cannot be answered by the old server on the shared port.
        _active_profile = None
        _kill_llama_server()
        if previous_alive:
            try:
                await asyncio.wait_for(previous.wait(), timeout=5)
            except asyncio.TimeoutError:
                try:
                    previous.kill()
                except ProcessLookupError:
                    pass  # exited between the timeout and the kill
                else:
                    await previous.wait()

        # Start the new model
        await _start_llama_server(profile)

        # Poll for readiness
        if not await _poll_llama_server_ready():
            # Do not leave a server that never became ready running.
            _kill_llama_server()
            return JSONResponse(
                status_code=500,
                content={"status": "error", "message": "llama-server failed to become ready"},
            )

        _active_profile = payload.profile_id
        return {
            "status": "ready",
            "active_profile": _active_profile,
        }
    finally:
        _switch_lock.release()