Epic: Model Switching via Sidecar — Issues #2-#3
Issue #2: Manifest schema + Sidecar foundation
- sidecar/manifest.py: YAML manifest loading and profile validation
- sidecar/app.py: FastAPI sidecar service with /models/available, /models/status endpoints
- Router GET /v1/models: proxies to sidecar, returns OpenAI-compatible model list
- Tests: 12 manifest tests, 6 sidecar endpoint tests, 3 router tests (21 total)
Issue #3: Sidecar model switch + Router request queue
- Sidecar POST /models/switch: stops current llama-server, starts new one, polls for readiness
- Switch lock prevents concurrent switches (threading.Lock for TestClient compatibility)
- Router request queue: max 10 requests, 120s hard timeout, 429 when full
- Router automatic model detection: extracts model from chat body, matches against sidecar status
- Full proxy endpoint with Sidecar → Main PC routing and fallback chain
- Tests: 5 sidecar switch tests, 4 queue tests, 3 router integration tests (12 total)
Total: 33 tests, all passing
2026-06-15 03:49:24 +03:00
|
|
|
"""Sidecar FastAPI service — Issue #2 foundation.
|
|
|
|
|
|
|
|
|
|
Runs on the Main PC, manages llama-server subprocess, serves manifest/profile data.
|
|
|
|
|
"""
|
|
|
|
|
import os
|
|
|
|
|
import asyncio
|
|
|
|
|
import signal as signal_module
|
|
|
|
|
import threading
|
|
|
|
|
from contextlib import asynccontextmanager
|
|
|
|
|
from typing import Optional
|
|
|
|
|
|
|
|
|
|
from fastapi import FastAPI, HTTPException
|
|
|
|
|
from fastapi.responses import JSONResponse
|
|
|
|
|
from pydantic import BaseModel
|
|
|
|
|
|
|
|
|
|
from sidecar.manifest import load_manifest
|
|
|
|
|
|
|
|
|
|
# Configuration from environment.
# Every setting can be overridden with an environment variable of the same
# name; the defaults are the values that were previously in effect.
MANIFEST_PATH = os.getenv("MANIFEST_PATH", "/home/bigt/AI/llm/manifest.yaml")
SIDECAR_PORT = int(os.getenv("SIDECAR_PORT", "8080"))
# Port passed to llama-server via --port and used by the readiness poll.
LLAMA_SERVER_PORT = int(os.getenv("LLAMA_SERVER_PORT", "8081"))
# File that llama-server's stderr is redirected to (dumped when startup fails).
LLAMA_STDERR_LOG = os.getenv("LLAMA_STDERR_LOG", "/tmp/llama-server-stderr.log")
|
Epic: Model Switching via Sidecar — Issues #2-#3
Issue #2: Manifest schema + Sidecar foundation
- sidecar/manifest.py: YAML manifest loading and profile validation
- sidecar/app.py: FastAPI sidecar service with /models/available, /models/status endpoints
- Router GET /v1/models: proxies to sidecar, returns OpenAI-compatible model list
- Tests: 12 manifest tests, 6 sidecar endpoint tests, 3 router tests (21 total)
Issue #3: Sidecar model switch + Router request queue
- Sidecar POST /models/switch: stops current llama-server, starts new one, polls for readiness
- Switch lock prevents concurrent switches (threading.Lock for TestClient compatibility)
- Router request queue: max 10 requests, 120s hard timeout, 429 when full
- Router automatic model detection: extracts model from chat body, matches against sidecar status
- Full proxy endpoint with Sidecar → Main PC routing and fallback chain
- Tests: 5 sidecar switch tests, 4 queue tests, 3 router integration tests (12 total)
Total: 33 tests, all passing
2026-06-15 03:49:24 +03:00
|
|
|
|
|
|
|
|
# Global state (module-level; shared by the lifecycle hook and the handlers)

# Handle of the llama-server subprocess started by this sidecar, if any.
_llama_server_process: Optional[asyncio.subprocess.Process] = None

# Id of the profile whose llama-server passed the readiness poll, if any.
_active_profile: Optional[str] = None

# Use threading.Lock for compatibility with TestClient
_switch_lock = threading.Lock()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage sidecar lifecycle — no default model loaded.

    Startup only logs the manifest path and sidecar port. On shutdown the
    llama-server subprocess is stopped if one is tracked.
    """
    print(f"Sidecar starting, manifest={MANIFEST_PATH}, port={SIDECAR_PORT}", flush=True)
    try:
        yield
    finally:
        # Cleanup: kill llama-server if running. Done in ``finally`` so the
        # child is not orphaned when an exception or cancellation is thrown
        # into the generator at the ``yield``.
        if _llama_server_process:
            _kill_llama_server()


app = FastAPI(lifespan=lifespan)
|
|
|
|
|
|
|
|
|
|
|
2026-06-16 03:06:45 +03:00
|
|
|
def _close_stderr_log():
    """Release the stderr log file handle attached to the llama-server process.

    Does nothing when no process is tracked, when the process carries no
    ``_stderr_fh`` attribute, or when that handle is already closed. Errors
    raised while closing are ignored.
    """
    process = _llama_server_process
    if process is None:
        return

    log_handle = getattr(process, "_stderr_fh", None)
    if log_handle is None or log_handle.closed:
        return

    try:
        log_handle.close()
    except Exception:
        # Best-effort cleanup: a failed close must not break shutdown.
        pass
|
|
|
|
|
|
|
|
|
|
|
Epic: Model Switching via Sidecar — Issues #2-#3
Issue #2: Manifest schema + Sidecar foundation
- sidecar/manifest.py: YAML manifest loading and profile validation
- sidecar/app.py: FastAPI sidecar service with /models/available, /models/status endpoints
- Router GET /v1/models: proxies to sidecar, returns OpenAI-compatible model list
- Tests: 12 manifest tests, 6 sidecar endpoint tests, 3 router tests (21 total)
Issue #3: Sidecar model switch + Router request queue
- Sidecar POST /models/switch: stops current llama-server, starts new one, polls for readiness
- Switch lock prevents concurrent switches (threading.Lock for TestClient compatibility)
- Router request queue: max 10 requests, 120s hard timeout, 429 when full
- Router automatic model detection: extracts model from chat body, matches against sidecar status
- Full proxy endpoint with Sidecar → Main PC routing and fallback chain
- Tests: 5 sidecar switch tests, 4 queue tests, 3 router integration tests (12 total)
Total: 33 tests, all passing
2026-06-15 03:49:24 +03:00
|
|
|
def _wait_for_pid_exit(pid: int, timeout: float, poll_interval: float = 0.05) -> bool:
    """Block until child process *pid* has exited or *timeout* seconds elapse.

    The exit status is only observed (``WNOWAIT``), never consumed, so
    whoever owns the child can still reap it afterwards.

    Returns True if the child has exited (or was already reaped), else False.
    Where ``os.waitid`` is unavailable the exit cannot be observed; the full
    timeout is slept and False is returned.
    """
    import time  # local import: only needed for this grace-period wait

    waitid = getattr(os, "waitid", None)
    if waitid is None:
        time.sleep(timeout)
        return False

    deadline = time.monotonic() + timeout
    while True:
        try:
            status = waitid(os.P_PID, pid, os.WEXITED | os.WNOHANG | os.WNOWAIT)
        except ChildProcessError:
            return True  # no such child any more: it was already reaped
        if status is not None:
            return True
        if time.monotonic() >= deadline:
            return False
        time.sleep(poll_interval)


def _kill_llama_server():
    """Kill the llama-server subprocess and close its stderr log handle.

    Sends SIGTERM, waits up to five seconds for the process to exit and then
    escalates to ``kill()``. Afterwards the stderr log handle is closed and
    the module-level process reference is cleared. Does nothing when no
    process is tracked.

    The function is synchronous, so the grace period blocks the caller.
    ``asyncio.subprocess.Process.wait()`` is a coroutine without a timeout
    argument and cannot be used here; the exit is observed through the pid
    instead.
    """
    global _llama_server_process

    proc = _llama_server_process
    if proc is None:
        return

    try:
        if proc.returncode is None:
            try:
                proc.send_signal(signal_module.SIGTERM)
                pid = getattr(proc, "pid", None)
                # Objects without a real integer pid have no OS process to
                # wait for, so there is nothing to escalate either.
                if isinstance(pid, int) and not _wait_for_pid_exit(pid, 5.0):
                    proc.kill()
            except ProcessLookupError:
                pass  # exited between the returncode check and the signal
            except OSError as exc:
                print(f"Failed to stop llama-server cleanly: {exc}", flush=True)
    finally:
        # Close the log handle while the global still points at the process:
        # _close_stderr_log() finds the handle through that reference.
        _close_stderr_log()
        _llama_server_process = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _flag_value(value) -> str:
    """Render a manifest flag value as a llama-server CLI argument string.

    YAML booleans (True/False/on/off/yes/no) arrive as Python bools after
    safe_load; llama-server expects 'on'/'off' for boolean flags rather than
    'True'/'False'. Every other value is passed through ``str``.
    """
    if not isinstance(value, bool):
        return str(value)
    return "on" if value else "off"
|
|
|
|
|
|
Epic: Model Switching via Sidecar — Issues #2-#3
Issue #2: Manifest schema + Sidecar foundation
- sidecar/manifest.py: YAML manifest loading and profile validation
- sidecar/app.py: FastAPI sidecar service with /models/available, /models/status endpoints
- Router GET /v1/models: proxies to sidecar, returns OpenAI-compatible model list
- Tests: 12 manifest tests, 6 sidecar endpoint tests, 3 router tests (21 total)
Issue #3: Sidecar model switch + Router request queue
- Sidecar POST /models/switch: stops current llama-server, starts new one, polls for readiness
- Switch lock prevents concurrent switches (threading.Lock for TestClient compatibility)
- Router request queue: max 10 requests, 120s hard timeout, 429 when full
- Router automatic model detection: extracts model from chat body, matches against sidecar status
- Full proxy endpoint with Sidecar → Main PC routing and fallback chain
- Tests: 5 sidecar switch tests, 4 queue tests, 3 router integration tests (12 total)
Total: 33 tests, all passing
2026-06-15 03:49:24 +03:00
|
|
|
|
|
|
|
|
async def _start_llama_server(profile: dict):
    """Start llama-server with the given profile's configuration.

    Any llama-server that is already tracked is stopped first. The command is
    ``<binary> --model <profile["model_path"]> --port <LLAMA_SERVER_PORT>``
    followed by one ``--<key> <value>`` pair per entry of ``profile["flags"]``.
    stdout is discarded; stderr is written to ``LLAMA_STDERR_LOG``, which is
    truncated on every start.

    The binary path can be overridden with the ``LLAMA_SERVER_BIN``
    environment variable; the default is the previously hard-coded path.

    Returns:
        The asyncio subprocess handle, which is also stored in the
        module-level ``_llama_server_process``.

    Raises:
        KeyError: the profile has no ``"model_path"``.
        OSError: the log file cannot be opened or the binary cannot be run.
    """
    global _llama_server_process

    # Kill any existing process
    _kill_llama_server()

    # Build command from profile flags
    binary = os.getenv("LLAMA_SERVER_BIN", "/home/bigt/AI/llama.cpp/build/bin/llama-server")
    cmd = [binary, "--model", profile["model_path"], "--port", str(LLAMA_SERVER_PORT)]
    # ``or {}``: a "flags" key that is present but null would otherwise fail
    # on ``.items()``.
    for key, value in (profile.get("flags") or {}).items():
        cmd += ["--" + key, _flag_value(value)]

    print(f"Starting llama-server: {' '.join(cmd)}", flush=True)

    # Capture stderr so we can diagnose crashes (model not found, OOM, bad flag)
    stderr_fh = open(LLAMA_STDERR_LOG, "w")
    try:
        process = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.DEVNULL,
            stderr=stderr_fh,
        )
    except BaseException:
        # Spawn failed or was cancelled: nobody else holds the handle, so
        # close it here instead of leaking it.
        stderr_fh.close()
        raise

    # Keep a reference so we can close the handle later
    process._stderr_fh = stderr_fh  # type: ignore[attr-defined]
    _llama_server_process = process
    return process
|
|
|
|
|
|
|
|
|
|
|
2026-06-16 03:06:45 +03:00
|
|
|
async def _poll_llama_server_ready(max_retries: int = 60, interval: float = 0.5):
    """Poll llama-server readiness via /v1/models endpoint.

    Args:
        max_retries: maximum number of GET attempts.
        interval: seconds to sleep between attempts.

    Returns:
        True as soon as the endpoint answers with HTTP 200. False when the
        attempts are exhausted or the tracked process is seen to have exited;
        in that case the captured stderr log (if any) and the exit code (if
        the process died) are printed so the user can see why llama-server
        failed.
    """
    import httpx

    url = f"http://localhost:{LLAMA_SERVER_PORT}/v1/models"

    # One client for the whole poll instead of a new one per attempt.
    async with httpx.AsyncClient(timeout=2.0) as client:
        for _ in range(max_retries):
            proc = _llama_server_process
            # An integer exit code means the process is gone; further
            # polling could only burn the remaining retries.
            if proc is not None and isinstance(proc.returncode, int):
                break
            try:
                resp = await client.get(url)
            except Exception:
                # Deliberate best-effort: any failure (connection refused,
                # timeout, ...) just means "not ready yet".
                pass
            else:
                if resp.status_code == 200:
                    return True
            await asyncio.sleep(interval)

    # ── Dump stderr for diagnosis ──────────────────────────────────────
    print("llama-server did NOT become ready — dumping stderr:", flush=True)
    try:
        # errors="replace": undecodable bytes in the log must not abort the dump.
        with open(LLAMA_STDERR_LOG, errors="replace") as f:
            for line in f:
                print(f"  {line.rstrip()}", flush=True)
    except FileNotFoundError:
        print("  (stderr log not found — process may not have started)", flush=True)

    # Also log exit code if the process died
    proc = _llama_server_process
    if proc and proc.returncode is not None:
        print(
            f"llama-server exited with code {proc.returncode}",
            flush=True,
        )

    return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.get("/models/available")
async def get_available_models():
    """Return the profiles loaded from the manifest at ``MANIFEST_PATH``.

    Responds with HTTP 500 when ``load_manifest`` returns ``None``.
    """
    manifest_profiles = load_manifest(MANIFEST_PATH)
    if manifest_profiles is not None:
        return manifest_profiles
    raise HTTPException(status_code=500, detail="Failed to parse manifest YAML")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.get("/models/status")
async def get_models_status():
    """Report the active profile id and whether llama-server is running."""
    process = _llama_server_process
    # Running means: a process is tracked and it has no exit code yet.
    if process is None:
        is_running = False
    else:
        is_running = process.returncode is None

    return {
        "active_profile": _active_profile,
        "llama_server_running": is_running,
    }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class SwitchRequest(BaseModel):
    """Request body of POST /models/switch."""

    # Matched against the "id" field of the manifest profiles.
    profile_id: str
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.post("/models/switch")
async def switch_model(payload: SwitchRequest):
    """Stop current llama-server, start new one with the given profile, wait for readiness.

    Concurrent switch requests are serialized through ``_switch_lock``.

    Responses:
        200 ``{"status": "ready", "active_profile": ...}`` — the profile is
            being served (freshly started, or already active and not exited).
        404 — ``payload.profile_id`` is not in the manifest.
        500 — the manifest could not be loaded, llama-server could not be
            started, or it did not become ready.
    """
    global _active_profile

    # _switch_lock is a threading.Lock. Blocking on it inside a coroutine
    # would freeze the event loop, and the coroutine holding the lock runs on
    # that same loop, so it could never release it. Try without blocking and
    # yield to the loop between attempts instead.
    while not _switch_lock.acquire(blocking=False):
        await asyncio.sleep(0.05)
    try:
        # Validate profile_id
        profiles = load_manifest(MANIFEST_PATH)
        if profiles is None:
            return JSONResponse(
                status_code=500,
                content={"status": "error", "message": "Failed to load manifest"},
            )

        profile = next((p for p in profiles if p["id"] == payload.profile_id), None)
        if profile is None:
            return JSONResponse(
                status_code=404,
                content={"status": "error", "message": f"Profile '{payload.profile_id}' not found"},
            )

        # Already running this profile — but only trust that if the tracked
        # process has not exited in the meantime; otherwise restart it.
        proc = _llama_server_process
        has_exited = proc is not None and proc.returncode is not None
        if _active_profile == payload.profile_id and not has_exited:
            return {
                "status": "ready",
                "active_profile": _active_profile,
            }

        # Start the new model
        _kill_llama_server()
        _active_profile = None
        try:
            await _start_llama_server(profile)
        except OSError as exc:
            # e.g. binary missing or stderr log not writable
            return JSONResponse(
                status_code=500,
                content={"status": "error", "message": f"Failed to start llama-server: {exc}"},
            )

        # Poll for readiness
        if await _poll_llama_server_ready():
            _active_profile = payload.profile_id
            return {
                "status": "ready",
                "active_profile": _active_profile,
            }

        _active_profile = None
        return JSONResponse(
            status_code=500,
            content={"status": "error", "message": "llama-server failed to become ready"},
        )
    finally:
        _switch_lock.release()
|