"""Sidecar FastAPI service — Issue #2 foundation. Runs on the Main PC, manages llama-server subprocess, serves manifest/profile data. """ import os import asyncio import signal as signal_module import threading from contextlib import asynccontextmanager from typing import Optional from fastapi import FastAPI, HTTPException from fastapi.responses import JSONResponse from pydantic import BaseModel from sidecar.manifest import load_manifest # Configuration from environment MANIFEST_PATH = os.getenv("MANIFEST_PATH", "/home/bigt/AI/llm/manifest.yaml") SIDECAR_PORT = int(os.getenv("SIDECAR_PORT", "8080")) LLAMA_SERVER_PORT = 8081 # Global state _llama_server_process: Optional[asyncio.subprocess.Process] = None _active_profile: Optional[str] = None _switch_lock = threading.Lock() # Use threading.Lock for compatibility with TestClient @asynccontextmanager async def lifespan(app: FastAPI): """Manage sidecar lifecycle — no default model loaded.""" print(f"Sidecar starting, manifest={MANIFEST_PATH}, port={SIDECAR_PORT}", flush=True) yield # Cleanup: kill llama-server if running global _llama_server_process if _llama_server_process: _kill_llama_server() app = FastAPI(lifespan=lifespan) def _kill_llama_server(): """Kill the llama-server subprocess.""" global _llama_server_process if _llama_server_process and _llama_server_process.returncode is None: try: _llama_server_process.send_signal(signal_module.SIGTERM) try: _llama_server_process.wait(timeout=5) except asyncio.TimeoutError: _llama_server_process.kill() except Exception: pass _llama_server_process = None async def _start_llama_server(profile: dict): """Start llama-server with the given profile's configuration.""" global _llama_server_process # Kill any existing process _kill_llama_server() # Build command from profile flags cmd = ["/home/bigt/AI/llama.cpp/build/bin/llama-server"] cmd += ["--model", profile["model_path"]] cmd += ["--port", str(LLAMA_SERVER_PORT)] for key, value in profile.get("flags", {}).items(): cmd += ["--" + key, str(value)] print(f"Starting llama-server: {' '.join(cmd)}", flush=True) _llama_server_process = await asyncio.create_subprocess_exec( *cmd, stdout=asyncio.subprocess.DEVNULL, stderr=asyncio.subprocess.DEVNULL, ) return _llama_server_process async def _poll_llama_server_ready(max_retries: int = 240, interval: float = 0.5): """Poll llama-server readiness via /v1/models endpoint.""" import httpx for _ in range(max_retries): try: async with httpx.AsyncClient(timeout=2.0) as client: resp = await client.get(f"http://localhost:{LLAMA_SERVER_PORT}/v1/models") if resp.status_code == 200: return True except Exception: pass await asyncio.sleep(interval) return False @app.get("/models/available") async def get_available_models(): """Read manifest YAML and return list of profiles.""" profiles = load_manifest(MANIFEST_PATH) if profiles is None: raise HTTPException(status_code=500, detail="Failed to parse manifest YAML") return profiles @app.get("/models/status") async def get_models_status(): """Return current model status.""" global _active_profile return { "active_profile": _active_profile, "llama_server_running": ( _llama_server_process is not None and _llama_server_process.returncode is None ), } class SwitchRequest(BaseModel): profile_id: str @app.post("/models/switch") async def switch_model(payload: SwitchRequest): """Stop current llama-server, start new one with the given profile, wait for readiness.""" global _active_profile with _switch_lock: # Validate profile_id profiles = load_manifest(MANIFEST_PATH) if profiles is None: return JSONResponse( status_code=500, content={"status": "error", "message": "Failed to load manifest"}, ) profile = None for p in profiles: if p["id"] == payload.profile_id: profile = p break if profile is None: return JSONResponse( status_code=404, content={"status": "error", "message": f"Profile '{payload.profile_id}' not found"}, ) # Already running this profile — just check readiness if _active_profile == payload.profile_id: return { "status": "ready", "active_profile": _active_profile, } # Start the new model _kill_llama_server() _active_profile = None await _start_llama_server(profile) # Poll for readiness ready = await _poll_llama_server_ready() if ready: _active_profile = payload.profile_id return { "status": "ready", "active_profile": _active_profile, } else: _active_profile = None return JSONResponse( status_code=500, content={"status": "error", "message": "llama-server failed to become ready"}, )