intelligence-router/sidecar/app.py
root 7e86a30bd8 fix: resolve port conflict between sidecar and llama-server
Sidecar and llama-server were both configured on port 8080, causing
llama-server to fail on startup (port already in use).

- sidecar/app.py: LLAMA_SERVER_PORT → 8081 (sidecar stays on 8080)
- docker-compose.yml: MAIN_PC_URL → port 8081 (router sends chat
  requests to llama-server, not the sidecar)
2026-06-15 15:31:31 +00:00

174 lines
5.3 KiB
Python

"""Sidecar FastAPI service — Issue #2 foundation.
Runs on the Main PC, manages llama-server subprocess, serves manifest/profile data.
"""
import os
import asyncio
import signal as signal_module
import threading
from contextlib import asynccontextmanager
from typing import Optional
from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from sidecar.manifest import load_manifest
# --- Configuration -----------------------------------------------------------
# Manifest location and the sidecar's own port may be overridden through the
# environment; the llama-server port is a fixed constant.
MANIFEST_PATH = os.environ.get("MANIFEST_PATH", "/home/bigt/AI/llm/manifest.yaml")
SIDECAR_PORT = int(os.environ.get("SIDECAR_PORT", "8080"))
LLAMA_SERVER_PORT = 8081  # differs from the sidecar's default port (8080)

# --- Mutable module state ----------------------------------------------------
# Handle of the llama-server child process, or None when none is tracked.
_llama_server_process: Optional[asyncio.subprocess.Process] = None
# Id of the profile that most recently became ready, or None.
_active_profile: Optional[str] = None
# Serialises model switches. A threading.Lock is used (rather than an asyncio
# lock) for compatibility with TestClient.
_switch_lock = threading.Lock()
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage the sidecar lifecycle; no model is loaded at startup.

    Startup only logs the configuration. On shutdown the managed llama-server
    subprocess (if any) is stopped.

    Args:
        app: The FastAPI application this lifespan is attached to (unused).
    """
    print(f"Sidecar starting, manifest={MANIFEST_PATH}, port={SIDECAR_PORT}")
    try:
        yield
    finally:
        # Run the cleanup even when the application exits through an exception
        # or cancellation thrown in at the yield point; otherwise the
        # llama-server child would be left running.
        # _kill_llama_server() is a no-op when no process is tracked.
        _kill_llama_server()


app = FastAPI(lifespan=lifespan)
def _kill_llama_server():
    """Signal the managed llama-server subprocess to stop and forget it.

    Sends SIGTERM to the tracked process if it is still running, then clears
    the module-level handle. The call never blocks and never raises.

    ``asyncio.subprocess.Process.wait()`` is a coroutine and accepts no
    ``timeout`` argument, so it cannot be used from this synchronous function.
    Instead, when an event loop is running, a SIGKILL is scheduled five
    seconds later for the case where the process ignores SIGTERM.
    """
    global _llama_server_process
    proc = _llama_server_process
    _llama_server_process = None
    if proc is None or proc.returncode is not None:
        return
    try:
        proc.send_signal(signal_module.SIGTERM)
    except ProcessLookupError:
        # The process exited between the returncode check and the signal.
        return
    except Exception as exc:
        # Stopping is best-effort, but do not hide the failure entirely.
        print(f"Failed to signal llama-server: {exc!r}")
        return
    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        # No running loop (plain synchronous caller): nothing can be scheduled.
        return
    loop.call_later(5, _force_kill_llama_server, proc)


def _force_kill_llama_server(proc):
    """Send SIGKILL to *proc* if it still has not exited."""
    if proc.returncode is None:
        try:
            proc.kill()
        except ProcessLookupError:
            pass
async def _start_llama_server(profile: dict):
    """Launch llama-server for *profile* and remember the process handle.

    Any process currently tracked is stopped first. The command line consists
    of the llama-server binary, ``--model <profile["model_path"]>``,
    ``--port <LLAMA_SERVER_PORT>`` and one ``--<key> <value>`` pair for every
    entry of the optional ``profile["flags"]`` mapping. The child's stdout and
    stderr are discarded.

    Returns:
        The newly created asyncio subprocess handle.
    """
    global _llama_server_process

    # Stop whatever is currently tracked before building the new command.
    _kill_llama_server()

    argv = [
        "/home/bigt/AI/llama.cpp/bin/llama-server",
        "--model",
        profile["model_path"],
        "--port",
        str(LLAMA_SERVER_PORT),
    ]
    # Each flag is passed as a name/value pair, with the value stringified.
    for flag_name, flag_value in profile.get("flags", {}).items():
        argv.extend(("--" + flag_name, str(flag_value)))

    print(f"Starting llama-server: {' '.join(argv)}")
    spawned = await asyncio.create_subprocess_exec(
        *argv,
        stdout=asyncio.subprocess.DEVNULL,
        stderr=asyncio.subprocess.DEVNULL,
    )
    _llama_server_process = spawned
    return spawned
async def _poll_llama_server_ready(max_retries: int = 240, interval: float = 0.5):
    """Poll llama-server's ``/v1/models`` endpoint until it answers 200.

    Args:
        max_retries: Maximum number of probe attempts.
        interval: Seconds to sleep after each unsuccessful attempt.

    Returns:
        True as soon as a probe returns HTTP 200. False when every attempt
        failed, or when the tracked llama-server process has already exited
        (there is then nothing left to wait for).
    """
    import httpx

    url = f"http://localhost:{LLAMA_SERVER_PORT}/v1/models"
    # One client for the whole polling session instead of one per attempt.
    async with httpx.AsyncClient(timeout=2.0) as client:
        for _ in range(max_retries):
            try:
                resp = await client.get(url)
            except httpx.HTTPError:
                # Connection refused / timeouts are expected while the server
                # is still loading the model.
                pass
            else:
                if resp.status_code == 200:
                    return True
            # Fail fast: if the managed process has died it will never become
            # ready, so do not burn the remaining retries.
            proc = _llama_server_process
            if proc is not None and proc.returncode is not None:
                print(
                    f"llama-server exited with code {proc.returncode} "
                    "before becoming ready"
                )
                return False
            await asyncio.sleep(interval)
    return False
@app.get("/models/available")
async def get_available_models():
    """Return the profiles loaded from the manifest at MANIFEST_PATH.

    Raises:
        HTTPException: 500 when ``load_manifest`` returns None.
    """
    manifest_profiles = load_manifest(MANIFEST_PATH)
    if manifest_profiles is not None:
        return manifest_profiles
    raise HTTPException(status_code=500, detail="Failed to parse manifest YAML")
@app.get("/models/status")
async def get_models_status():
    """Report the active profile id and whether llama-server is running.

    The process counts as running when a handle is tracked and that handle has
    no return code yet.
    """
    proc = _llama_server_process
    is_running = proc is not None and proc.returncode is None
    return {
        "active_profile": _active_profile,
        "llama_server_running": is_running,
    }
# Request body accepted by POST /models/switch.
class SwitchRequest(BaseModel):
    # Id of the manifest profile to activate (compared against each
    # profile's "id" entry by the switch endpoint).
    profile_id: str
async def _wait_for_llama_server_exit(proc, grace: float = 5.0) -> None:
    """Wait until an already-signalled llama-server process has exited.

    If the process is still alive after *grace* seconds it is sent SIGKILL and
    given another *grace* seconds. A process that survives even that is
    reported and abandoned rather than blocking the switch forever.
    """
    if proc is None or proc.returncode is not None:
        return
    try:
        await asyncio.wait_for(proc.wait(), timeout=grace)
        return
    except asyncio.TimeoutError:
        pass
    try:
        proc.kill()
    except ProcessLookupError:
        # Exited on its own between the timeout and the kill.
        pass
    try:
        await asyncio.wait_for(proc.wait(), timeout=grace)
    except asyncio.TimeoutError:
        print("llama-server did not exit after SIGKILL; continuing anyway")


@app.post("/models/switch")
async def switch_model(payload: SwitchRequest):
    """Switch llama-server to the requested profile and wait for readiness.

    Stops the current llama-server, waits for it to exit, starts a new one
    with the requested profile's configuration and polls it until ready.
    Switches are serialised: concurrent requests are handled one at a time.

    Args:
        payload: Request body carrying the ``profile_id`` to activate.

    Returns:
        ``{"status": "ready", "active_profile": ...}`` on success, or when the
        requested profile is already the active one. Otherwise a JSONResponse
        with ``{"status": "error", "message": ...}``: 404 for an unknown
        profile id, 500 when the manifest cannot be loaded or llama-server
        does not become ready.
    """
    global _active_profile
    # _switch_lock is a threading.Lock. A blocking acquire inside a coroutine
    # would freeze the event loop while another request holds the lock across
    # its awaits, and that holder could then never resume to release it.
    # Acquire without blocking and yield to the loop between attempts.
    while not _switch_lock.acquire(blocking=False):
        await asyncio.sleep(0.05)
    try:
        # Validate profile_id against the manifest.
        profiles = load_manifest(MANIFEST_PATH)
        if profiles is None:
            return JSONResponse(
                status_code=500,
                content={"status": "error", "message": "Failed to load manifest"},
            )
        profile = next(
            (p for p in profiles if p["id"] == payload.profile_id), None
        )
        if profile is None:
            return JSONResponse(
                status_code=404,
                content={
                    "status": "error",
                    "message": f"Profile '{payload.profile_id}' not found",
                },
            )
        # Already the active profile: nothing to do.
        if _active_profile == payload.profile_id:
            return {
                "status": "ready",
                "active_profile": _active_profile,
            }
        # Stop the old server and wait until it is really gone. Otherwise it
        # may still hold the port when the new one starts, and could even
        # answer the readiness probe meant for its successor.
        previous = _llama_server_process
        _kill_llama_server()
        _active_profile = None
        await _wait_for_llama_server_exit(previous)
        await _start_llama_server(profile)
        # Poll for readiness.
        if await _poll_llama_server_ready():
            _active_profile = payload.profile_id
            return {
                "status": "ready",
                "active_profile": _active_profile,
            }
        # _active_profile is still None here, so no profile is reported active.
        return JSONResponse(
            status_code=500,
            content={
                "status": "error",
                "message": "llama-server failed to become ready",
            },
        )
    finally:
        _switch_lock.release()