# intelligence-router/sidecar/app.py

"""Sidecar FastAPI service — Issue #2 foundation.

Runs on the Main PC, manages llama-server subprocess, serves manifest/profile data.
"""
import os
import asyncio
import signal as signal_module
from contextlib import asynccontextmanager
from typing import Optional

from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel

from sidecar.manifest import load_manifest

# Configuration from environment.  Every value has a default so the sidecar
# starts with no environment variables set.
MANIFEST_PATH = os.getenv("MANIFEST_PATH", "/home/bigt/AI/llm/manifest.yaml")
SIDECAR_PORT = int(os.getenv("SIDECAR_PORT", "8080"))
# Port handed to llama-server via --port and polled for readiness.
# Overridable like the other settings; the default is unchanged (8081).
LLAMA_SERVER_PORT = int(os.getenv("LLAMA_SERVER_PORT", "8081"))
# llama-server's stderr is captured in a file that lives next to the manifest.
LLAMA_STDERR_LOG = os.path.join(
    os.path.dirname(MANIFEST_PATH), "llama-server-stderr.log"
)

# Global state
# Handle of the currently managed llama-server subprocess (None when not started).
_llama_server_process: Optional[asyncio.subprocess.Process] = None
# Id of the profile whose server was last confirmed ready (None otherwise).
_active_profile: Optional[str] = None
_switch_lock = asyncio.Lock()  # Use asyncio.Lock to avoid blocking the event loop


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage sidecar lifecycle — no default model is loaded at startup.

    On shutdown the managed llama-server (if any) is stopped.  The cleanup
    lives in a ``finally`` block so it also runs when the application exits
    through an exception or cancellation thrown into this generator; a bare
    ``yield`` would skip it and leave llama-server running.
    """
    print(f"Sidecar starting, manifest={MANIFEST_PATH}, port={SIDECAR_PORT}", flush=True)
    try:
        yield
    finally:
        # _kill_llama_server() returns immediately when nothing is running,
        # so it is safe to call unconditionally.
        await _kill_llama_server()


app = FastAPI(lifespan=lifespan)


def _close_stderr_log():
    """Best-effort close of the stderr log handle attached to the current process.

    Does nothing when no process is tracked, when the process object carries
    no ``_stderr_fh`` attribute, or when the handle is already closed.  Errors
    raised while closing are ignored.
    """
    proc = _llama_server_process
    if proc is None:
        return

    log_handle = getattr(proc, "_stderr_fh", None)
    if log_handle is None or log_handle.closed:
        return

    try:
        log_handle.close()
    except Exception:
        pass


async def _kill_llama_server():
    """Stop the llama-server subprocess and wait for it to terminate.

    Sends SIGTERM and waits up to 10 seconds; if the process is still alive
    it is killed and given another 5 seconds.  This must be a coroutine
    because ``process.wait()`` is one: calling it without ``await`` never
    actually waits, so the old process could still be running when a new
    server is started.

    Afterwards the stderr log handle is closed and the global process
    reference is cleared.  Failures while signalling are logged rather than
    raised, so callers can always proceed to start a new server.
    """
    global _llama_server_process
    proc = _llama_server_process
    if proc is None or proc.returncode is not None:
        # Nothing to stop; just make sure the log handle is released.
        _close_stderr_log()
        return

    try:
        proc.send_signal(signal_module.SIGTERM)
        try:
            await asyncio.wait_for(proc.wait(), timeout=10)
        except asyncio.TimeoutError:
            # Graceful shutdown took too long — escalate.
            proc.kill()
            try:
                await asyncio.wait_for(proc.wait(), timeout=5)
            except asyncio.TimeoutError:
                print(
                    "llama-server did not exit within 5s of being killed",
                    flush=True,
                )
    except ProcessLookupError:
        # The process exited between the returncode check and the signal.
        pass
    except Exception as exc:
        # Best effort: report the problem but still clear our state below.
        print(f"Error while stopping llama-server: {exc!r}", flush=True)
    finally:
        # Close the log handle BEFORE clearing the global: _close_stderr_log()
        # finds the handle through _llama_server_process, so clearing first
        # would leave the file open forever.
        _close_stderr_log()
        _llama_server_process = None


def _flag_value(value) -> str:
    """Render a manifest flag value as a llama-server CLI argument string.

    Python bools (which is what YAML safe_load produces for
    true/false/on/off/yes/no) become 'on' / 'off', the spelling llama-server
    expects for boolean flags.  Every other value is passed through ``str()``.
    """
    if not isinstance(value, bool):
        return str(value)
    if value:
        return "on"
    return "off"


def _flag_key(key: str) -> str:
    """Map a manifest flag key to the llama-server CLI flag name.

    Two steps are applied:

    1. Underscores become hyphens, since llama-server flags are hyphenated
       (--ctx-size, --n-gpu-layers) while YAML keys often use underscores.
    2. Known renames across llama.cpp versions are applied
       (currently only n-ctx → ctx-size).

    The returned name does not include the leading ``--``.
    """
    renamed_flags = {"n-ctx": "ctx-size"}
    hyphenated = "-".join(key.split("_"))
    try:
        return renamed_flags[hyphenated]
    except KeyError:
        return hyphenated


async def _start_llama_server(profile: dict):
    """Start llama-server with the given profile's configuration.

    Any process that is already tracked is stopped first.  The command line
    is built from ``profile["model_path"]`` plus every entry of the optional
    ``profile["flags"]`` mapping, each emitted as ``--<name> <value>``.

    Args:
        profile: Manifest entry; must contain ``model_path`` and may contain
            ``flags`` (a mapping, or null/absent for no extra flags).

    Returns:
        The spawned ``asyncio.subprocess.Process``, which is also stored in
        the module-level ``_llama_server_process``.

    Raises:
        KeyError: if the profile has no ``model_path``.
        OSError: if the stderr log cannot be opened or the binary cannot be
            executed.
    """
    global _llama_server_process

    # Kill any existing process
    await _kill_llama_server()

    # Build command from profile flags
    cmd = [
        "/home/bigt/AI/llama.cpp/build/bin/llama-server",
        "--model", profile["model_path"],
        "--port", str(LLAMA_SERVER_PORT),
        # NOTE(review): 0.0.0.0 makes llama-server listen on all interfaces —
        # confirm this exposure is intended.
        "--host", "0.0.0.0",
    ]
    # `or {}` also covers a manifest that spells out `flags:` with no value
    # (parsed as None), which would otherwise crash on .items().
    for key, value in (profile.get("flags") or {}).items():
        cmd += ["--" + _flag_key(key), _flag_value(value)]

    print(f"Starting llama-server: {' '.join(cmd)}", flush=True)

    # Capture stderr so we can diagnose crashes (model not found, OOM, bad flag)
    stderr_fh = open(LLAMA_STDERR_LOG, "w")
    try:
        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.DEVNULL,
            stderr=stderr_fh,
        )
    except BaseException:
        # Spawn failed (or was cancelled): nobody will ever own this handle,
        # so release it here before propagating the error.
        stderr_fh.close()
        raise

    # Keep a reference so we can close the handle later
    proc._stderr_fh = stderr_fh  # type: ignore[attr-defined]
    _llama_server_process = proc
    return proc


async def _poll_llama_server_ready(max_retries: int = 60, interval: float = 0.5):
    """Poll llama-server readiness via its /v1/models endpoint.

    Args:
        max_retries: Maximum number of polling attempts.
        interval: Seconds to sleep between attempts.

    Returns:
        True as soon as the endpoint answers with HTTP 200.  False when the
        attempts are exhausted or the tracked process has already exited; in
        that case the captured stderr log (and the exit code, if any) is
        printed so the user can see why llama-server failed.
    """
    import httpx

    url = f"http://localhost:{LLAMA_SERVER_PORT}/v1/models"

    # One client for the whole polling loop instead of one per attempt.
    async with httpx.AsyncClient(timeout=2.0) as client:
        for _ in range(max_retries):
            proc = _llama_server_process
            if proc is not None and proc.returncode is not None:
                # The process already died — further polling cannot succeed,
                # so go straight to the diagnostics below.
                break
            try:
                resp = await client.get(url)
                if resp.status_code == 200:
                    return True
            except httpx.HTTPError:
                # Connection refused / timeout while the server is still
                # loading is expected; try again after the sleep.
                pass
            await asyncio.sleep(interval)

    # Flush and close the stderr handle so all data is on disk before we read
    _close_stderr_log()

    # ── Dump stderr for diagnosis ──────────────────────────────────────
    print("llama-server did NOT become ready — dumping stderr:", flush=True)
    try:
        # errors="replace": an undecodable byte in the log must not turn the
        # diagnostic dump itself into a crash.
        with open(LLAMA_STDERR_LOG, errors="replace") as f:
            for line in f:
                print(f"  {line.rstrip()}", flush=True)
    except FileNotFoundError:
        print("  (stderr log not found — process may not have started)", flush=True)
    except OSError as exc:
        print(f"  (could not read stderr log: {exc})", flush=True)

    # Also log exit code if the process died
    proc = _llama_server_process
    if proc is not None and proc.returncode is not None:
        print(
            f"llama-server exited with code {proc.returncode}",
            flush=True,
        )

    return False


@app.get("/models/available")
async def get_available_models():
    """Return the profiles loaded from the manifest file.

    Responds with HTTP 500 when ``load_manifest`` returns ``None``.
    """
    manifest_profiles = load_manifest(MANIFEST_PATH)
    if manifest_profiles is not None:
        return manifest_profiles
    raise HTTPException(status_code=500, detail="Failed to parse manifest YAML")


@app.get("/models/status")
async def get_models_status():
    """Report the active profile id and whether llama-server is running.

    The server counts as running when a process is tracked and it has not
    yet produced a return code.
    """
    proc = _llama_server_process
    is_running = proc is not None and proc.returncode is None
    status = {"active_profile": _active_profile}
    status["llama_server_running"] = is_running
    return status


# Request body accepted by POST /models/switch.
class SwitchRequest(BaseModel):
    # Id of the profile to activate; matched against the "id" of each
    # manifest entry.
    profile_id: str

@app.post("/models/switch")
async def switch_model(payload: SwitchRequest):
    """Stop the current llama-server, start the requested profile, wait for readiness.

    The whole operation runs under ``_switch_lock`` so concurrent requests
    are processed one after another.

    Responses:
        200 ``{"status": "ready", "active_profile": ...}`` — the requested
            profile is running (already, or after a successful switch).
        404 — no manifest entry has the requested id.
        500 — the manifest could not be loaded, llama-server could not be
            started, or it did not become ready in time.
    """
    global _active_profile

    async with _switch_lock:
        # Validate profile_id
        profiles = load_manifest(MANIFEST_PATH)
        if profiles is None:
            return JSONResponse(
                status_code=500,
                content={"status": "error", "message": "Failed to load manifest"},
            )

        # .get() so an entry without an "id" is skipped instead of raising.
        profile = next(
            (p for p in profiles if p.get("id") == payload.profile_id), None
        )
        if profile is None:
            return JSONResponse(
                status_code=404,
                content={"status": "error", "message": f"Profile '{payload.profile_id}' not found"},
            )

        # Already running this profile — but only trust that if the process
        # is still alive.  Without the liveness check a crashed server would
        # be reported as ready forever and could never be restarted with the
        # same profile.
        server_alive = (
            _llama_server_process is not None
            and _llama_server_process.returncode is None
        )
        if server_alive and _active_profile == payload.profile_id:
            return {
                "status": "ready",
                "active_profile": _active_profile,
            }

        # Start the new model
        await _kill_llama_server()
        _active_profile = None
        try:
            await _start_llama_server(profile)
        except (OSError, KeyError) as exc:
            # Missing binary, unwritable log file, or a profile without
            # model_path: answer with the same error shape as other failures.
            print(f"Failed to start llama-server: {exc!r}", flush=True)
            return JSONResponse(
                status_code=500,
                content={"status": "error", "message": "llama-server failed to start"},
            )

        # Poll for readiness
        if await _poll_llama_server_ready():
            _active_profile = payload.profile_id
            return {
                "status": "ready",
                "active_profile": _active_profile,
            }

        # _active_profile is still None from above.
        return JSONResponse(
            status_code=500,
            content={"status": "error", "message": "llama-server failed to become ready"},
        )