fix: convert underscores to hyphens in llama-server flag names, fix n_ctx→ctx-size rename
Two changes to fix 'error: invalid argument: --n-ctx' during model switch: 1. sidecar/app.py: Added _flag_key() converter that normalises underscores to hyphens in flag names and handles the n_ctx→ctx-size rename. The code now converts e.g. n_gpu_layers → n-gpu-layers, top_p → top-p, top_k → top-k, min_p → min-p before passing to llama-server CLI. 2. deploy/manifest.yaml: Updated all 20 profiles to use correct llama-server flag names: n_ctx→ctx-size, n_gpu_layers→n-gpu-layers, top_p→top-p, top_k→top-k, min_p→min-p. All flags now use hyphens, matching what llama-server actually accepts.
This commit is contained in:
parent
1551c281c2
commit
4ee85972ec
@ -17,48 +17,48 @@
|
||||
name: "Qwen3.6-27B Balanced 64K"
|
||||
model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-27B-Q4_K_M.gguf"
|
||||
flags:
|
||||
n_ctx: 65536
|
||||
n_gpu_layers: 999
|
||||
ctx-size: 65536
|
||||
n-gpu-layers: 999
|
||||
cache-type-k: q8_0
|
||||
cache-type-v: q8_0
|
||||
flash-attn: on
|
||||
temp: 0.6
|
||||
top_p: 0.95
|
||||
top_k: 20
|
||||
top-p: 0.95
|
||||
top-k: 20
|
||||
repeat-penalty: 1.0
|
||||
min_p: 0.0
|
||||
min-p: 0.0
|
||||
presence-penalty: 0.0
|
||||
|
||||
- id: qwen36-27b-thinking-64k
|
||||
name: "Qwen3.6-27B Thinking 64K"
|
||||
model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-27B-Q4_K_M.gguf"
|
||||
flags:
|
||||
n_ctx: 65536
|
||||
n_gpu_layers: 999
|
||||
ctx-size: 65536
|
||||
n-gpu-layers: 999
|
||||
cache-type-k: q8_0
|
||||
cache-type-v: q8_0
|
||||
flash-attn: on
|
||||
temp: 1.0
|
||||
top_p: 0.95
|
||||
top_k: 20
|
||||
top-p: 0.95
|
||||
top-k: 20
|
||||
repeat-penalty: 1.0
|
||||
min_p: 0.0
|
||||
min-p: 0.0
|
||||
presence-penalty: 0.0
|
||||
|
||||
- id: qwen36-27b-extended-128k
|
||||
name: "Qwen3.6-27B Extended 128K"
|
||||
model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-27B-Q4_K_M.gguf"
|
||||
flags:
|
||||
n_ctx: 131072
|
||||
n_gpu_layers: 999
|
||||
ctx-size: 131072
|
||||
n-gpu-layers: 999
|
||||
cache-type-k: q4_0
|
||||
cache-type-v: q4_0
|
||||
flash-attn: on
|
||||
temp: 0.6
|
||||
top_p: 0.95
|
||||
top_k: 20
|
||||
top-p: 0.95
|
||||
top-k: 20
|
||||
repeat-penalty: 1.05
|
||||
min_p: 0.0
|
||||
min-p: 0.0
|
||||
presence-penalty: 0.0
|
||||
|
||||
# --- Gemma 4 12B (Q6_K_XL ~8.5 GB) ---
|
||||
@ -67,32 +67,32 @@
|
||||
name: "Gemma4 12B Standard Q6 64K"
|
||||
model_path: "/home/bigt/AI/llm/gemma4/gemma-4-12b-it-UD-Q6_K_XL.gguf"
|
||||
flags:
|
||||
n_ctx: 65536
|
||||
n_gpu_layers: 999
|
||||
ctx-size: 65536
|
||||
n-gpu-layers: 999
|
||||
cache-type-k: q8_0
|
||||
cache-type-v: q8_0
|
||||
flash-attn: on
|
||||
temp: 1.0
|
||||
top_p: 0.95
|
||||
top_k: 64
|
||||
top-p: 0.95
|
||||
top-k: 64
|
||||
repeat-penalty: 1.0
|
||||
min_p: 0.0
|
||||
min-p: 0.0
|
||||
presence-penalty: 0.0
|
||||
|
||||
- id: gemma4-12b-extended-q6-128k
|
||||
name: "Gemma4 12B Extended Q6 128K"
|
||||
model_path: "/home/bigt/AI/llm/gemma4/gemma-4-12b-it-UD-Q6_K_XL.gguf"
|
||||
flags:
|
||||
n_ctx: 131072
|
||||
n_gpu_layers: 999
|
||||
ctx-size: 131072
|
||||
n-gpu-layers: 999
|
||||
cache-type-k: q4_0
|
||||
cache-type-v: q4_0
|
||||
flash-attn: on
|
||||
temp: 1.0
|
||||
top_p: 0.95
|
||||
top_k: 64
|
||||
top-p: 0.95
|
||||
top-k: 64
|
||||
repeat-penalty: 1.0
|
||||
min_p: 0.0
|
||||
min-p: 0.0
|
||||
presence-penalty: 0.0
|
||||
|
||||
# --- Gemma 4 26B-A4B (Q4_K_M ~10.5 GB, IQ4_XS ~6 GB) ---
|
||||
@ -101,64 +101,64 @@
|
||||
name: "Gemma4 26B Balanced 64K"
|
||||
model_path: "/home/bigt/AI/llm/gemma4/gemma-4-26B-A4B-it-UD-Q4_K_M.gguf"
|
||||
flags:
|
||||
n_ctx: 65536
|
||||
n_gpu_layers: 999
|
||||
ctx-size: 65536
|
||||
n-gpu-layers: 999
|
||||
cache-type-k: q8_0
|
||||
cache-type-v: q8_0
|
||||
flash-attn: on
|
||||
temp: 1.0
|
||||
top_p: 0.95
|
||||
top_k: 64
|
||||
top-p: 0.95
|
||||
top-k: 64
|
||||
repeat-penalty: 1.0
|
||||
min_p: 0.0
|
||||
min-p: 0.0
|
||||
presence-penalty: 0.0
|
||||
|
||||
- id: gemma4-26b-extended-128k
|
||||
name: "Gemma4 26B Extended 128K"
|
||||
model_path: "/home/bigt/AI/llm/gemma4/gemma-4-26B-A4B-it-UD-Q4_K_M.gguf"
|
||||
flags:
|
||||
n_ctx: 131072
|
||||
n_gpu_layers: 999
|
||||
ctx-size: 131072
|
||||
n-gpu-layers: 999
|
||||
cache-type-k: q4_0
|
||||
cache-type-v: q4_0
|
||||
flash-attn: on
|
||||
temp: 1.0
|
||||
top_p: 0.95
|
||||
top_k: 64
|
||||
top-p: 0.95
|
||||
top-k: 64
|
||||
repeat-penalty: 1.15
|
||||
min_p: 0.0
|
||||
min-p: 0.0
|
||||
presence-penalty: 0.0
|
||||
|
||||
- id: gemma4-26b-ultra-long-iq4-128k
|
||||
name: "Gemma4 26B Ultra-Long IQ4 128K"
|
||||
model_path: "/home/bigt/AI/llm/gemma4/gemma-4-26B-A4B-it-UD-IQ4_XS.gguf"
|
||||
flags:
|
||||
n_ctx: 131072
|
||||
n_gpu_layers: 999
|
||||
ctx-size: 131072
|
||||
n-gpu-layers: 999
|
||||
cache-type-k: q4_0
|
||||
cache-type-v: q4_0
|
||||
flash-attn: on
|
||||
temp: 1.0
|
||||
top_p: 0.95
|
||||
top_k: 64
|
||||
top-p: 0.95
|
||||
top-k: 64
|
||||
repeat-penalty: 1.0
|
||||
min_p: 0.0
|
||||
min-p: 0.0
|
||||
presence-penalty: 0.0
|
||||
|
||||
- id: gemma4-26b-q5-64k
|
||||
name: "Gemma4 26B Q5 64K"
|
||||
model_path: "/home/bigt/AI/llm/gemma4/google_gemma-4-26B-A4B-it-Q5_K_M.gguf"
|
||||
flags:
|
||||
n_ctx: 65536
|
||||
n_gpu_layers: 999
|
||||
ctx-size: 65536
|
||||
n-gpu-layers: 999
|
||||
cache-type-k: q8_0
|
||||
cache-type-v: q8_0
|
||||
flash-attn: on
|
||||
temp: 1.0
|
||||
top_p: 0.95
|
||||
top_k: 64
|
||||
top-p: 0.95
|
||||
top-k: 64
|
||||
repeat-penalty: 1.0
|
||||
min_p: 0.0
|
||||
min-p: 0.0
|
||||
presence-penalty: 0.0
|
||||
|
||||
# --- Gemma 4 26B Compact (IQ4_XS ~6 GB) ---
|
||||
@ -166,32 +166,32 @@
|
||||
name: "Gemma4 26B Compact IQ4 64K"
|
||||
model_path: "/home/bigt/AI/llm/gemma4/bartowski-google_gemma-4-26B-A4B-it-IQ4_XS.gguf"
|
||||
flags:
|
||||
n_ctx: 65536
|
||||
n_gpu_layers: 999
|
||||
ctx-size: 65536
|
||||
n-gpu-layers: 999
|
||||
cache-type-k: q8_0
|
||||
cache-type-v: q8_0
|
||||
flash-attn: on
|
||||
temp: 1.0
|
||||
top_p: 0.95
|
||||
top_k: 64
|
||||
top-p: 0.95
|
||||
top-k: 64
|
||||
repeat-penalty: 1.0
|
||||
min_p: 0.0
|
||||
min-p: 0.0
|
||||
presence-penalty: 0.0
|
||||
|
||||
- id: gemma4-26b-compact-long-128k
|
||||
name: "Gemma4 26B Compact IQ4 128K"
|
||||
model_path: "/home/bigt/AI/llm/gemma4/bartowski-google_gemma-4-26B-A4B-it-IQ4_XS.gguf"
|
||||
flags:
|
||||
n_ctx: 131072
|
||||
n_gpu_layers: 999
|
||||
ctx-size: 131072
|
||||
n-gpu-layers: 999
|
||||
cache-type-k: q4_0
|
||||
cache-type-v: q4_0
|
||||
flash-attn: on
|
||||
temp: 1.0
|
||||
top_p: 0.95
|
||||
top_k: 64
|
||||
top-p: 0.95
|
||||
top-k: 64
|
||||
repeat-penalty: 1.0
|
||||
min_p: 0.0
|
||||
min-p: 0.0
|
||||
presence-penalty: 0.0
|
||||
|
||||
# --- Qwen3.6-35B-A3B (UD-Q4_K_M ~14 GB) ---
|
||||
@ -201,48 +201,48 @@
|
||||
name: "Qwen3.6-35B Fast 64K"
|
||||
model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"
|
||||
flags:
|
||||
n_ctx: 65536
|
||||
n_gpu_layers: 999
|
||||
ctx-size: 65536
|
||||
n-gpu-layers: 999
|
||||
cache-type-k: q8_0
|
||||
cache-type-v: q8_0
|
||||
flash-attn: on
|
||||
temp: 0.6
|
||||
top_p: 0.95
|
||||
top_k: 20
|
||||
top-p: 0.95
|
||||
top-k: 20
|
||||
repeat-penalty: 1.0
|
||||
min_p: 0.0
|
||||
min-p: 0.0
|
||||
presence-penalty: 0.0
|
||||
|
||||
- id: qwen36-35b-thinking-64k
|
||||
name: "Qwen3.6-35B Thinking 64K"
|
||||
model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"
|
||||
flags:
|
||||
n_ctx: 65536
|
||||
n_gpu_layers: 999
|
||||
ctx-size: 65536
|
||||
n-gpu-layers: 999
|
||||
cache-type-k: q8_0
|
||||
cache-type-v: q8_0
|
||||
flash-attn: on
|
||||
temp: 1.0
|
||||
top_p: 0.95
|
||||
top_k: 20
|
||||
top-p: 0.95
|
||||
top-k: 20
|
||||
repeat-penalty: 1.0
|
||||
min_p: 0.0
|
||||
min-p: 0.0
|
||||
presence-penalty: 0.0
|
||||
|
||||
- id: qwen36-35b-extended-128k
|
||||
name: "Qwen3.6-35B Extended 128K"
|
||||
model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"
|
||||
flags:
|
||||
n_ctx: 131072
|
||||
n_gpu_layers: 999
|
||||
ctx-size: 131072
|
||||
n-gpu-layers: 999
|
||||
cache-type-k: q4_0
|
||||
cache-type-v: q4_0
|
||||
flash-attn: on
|
||||
temp: 0.6
|
||||
top_p: 0.95
|
||||
top_k: 20
|
||||
top-p: 0.95
|
||||
top-k: 20
|
||||
repeat-penalty: 1.0
|
||||
min_p: 0.0
|
||||
min-p: 0.0
|
||||
presence-penalty: 0.0
|
||||
|
||||
# --- Qwen3.6-35B-A3B MTP variant ---
|
||||
@ -250,32 +250,32 @@
|
||||
name: "Qwen3.6-35B MTP Fast 64K"
|
||||
model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-35B-A3B-MTP-UD-Q4_K_M.gguf"
|
||||
flags:
|
||||
n_ctx: 65536
|
||||
n_gpu_layers: 999
|
||||
ctx-size: 65536
|
||||
n-gpu-layers: 999
|
||||
cache-type-k: q8_0
|
||||
cache-type-v: q8_0
|
||||
flash-attn: on
|
||||
temp: 0.6
|
||||
top_p: 0.95
|
||||
top_k: 20
|
||||
top-p: 0.95
|
||||
top-k: 20
|
||||
repeat-penalty: 1.0
|
||||
min_p: 0.0
|
||||
min-p: 0.0
|
||||
presence-penalty: 0.0
|
||||
|
||||
- id: qwen36-35b-mtp-extended-128k
|
||||
name: "Qwen3.6-35B MTP Extended 128K"
|
||||
model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-35B-A3B-MTP-UD-Q4_K_M.gguf"
|
||||
flags:
|
||||
n_ctx: 131072
|
||||
n_gpu_layers: 999
|
||||
ctx-size: 131072
|
||||
n-gpu-layers: 999
|
||||
cache-type-k: q4_0
|
||||
cache-type-v: q4_0
|
||||
flash-attn: on
|
||||
temp: 0.6
|
||||
top_p: 0.95
|
||||
top_k: 20
|
||||
top-p: 0.95
|
||||
top-k: 20
|
||||
repeat-penalty: 1.0
|
||||
min_p: 0.0
|
||||
min-p: 0.0
|
||||
presence-penalty: 0.0
|
||||
|
||||
# --- Uncensored models ---
|
||||
@ -283,62 +283,62 @@
|
||||
name: "Qwen3.6-35B HauhauCS Aggressive 64K"
|
||||
model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-35B-A3B-Uncensored-HauhauCS-Aggressive-Q4_K_P.gguf"
|
||||
flags:
|
||||
n_ctx: 65536
|
||||
n_gpu_layers: 999
|
||||
ctx-size: 65536
|
||||
n-gpu-layers: 999
|
||||
cache-type-k: q8_0
|
||||
cache-type-v: q8_0
|
||||
flash-attn: on
|
||||
temp: 0.6
|
||||
top_p: 0.95
|
||||
top_k: 20
|
||||
top-p: 0.95
|
||||
top-k: 20
|
||||
repeat-penalty: 1.0
|
||||
min_p: 0.0
|
||||
min-p: 0.0
|
||||
presence-penalty: 0.0
|
||||
|
||||
- id: qwen36-35b-genesis-apex-64k
|
||||
name: "Qwen3.6-35B Genesis APEX 64K"
|
||||
model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-35B-A3B-Uncensored-Genesis-APEX.gguf"
|
||||
flags:
|
||||
n_ctx: 65536
|
||||
n_gpu_layers: 999
|
||||
ctx-size: 65536
|
||||
n-gpu-layers: 999
|
||||
cache-type-k: q8_0
|
||||
cache-type-v: q8_0
|
||||
flash-attn: on
|
||||
temp: 0.6
|
||||
top_p: 0.95
|
||||
top_k: 20
|
||||
top-p: 0.95
|
||||
top-k: 20
|
||||
repeat-penalty: 1.0
|
||||
min_p: 0.0
|
||||
min-p: 0.0
|
||||
presence-penalty: 0.0
|
||||
|
||||
- id: qwen36-35b-genesis-mtp-apex-64k
|
||||
name: "Qwen3.6-35B Genesis MTP APEX 64K"
|
||||
model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-35B-A3B-Uncensored-Genesis-MTP-APEX.gguf"
|
||||
flags:
|
||||
n_ctx: 65536
|
||||
n_gpu_layers: 999
|
||||
ctx-size: 65536
|
||||
n-gpu-layers: 999
|
||||
cache-type-k: q8_0
|
||||
cache-type-v: q8_0
|
||||
flash-attn: on
|
||||
temp: 0.6
|
||||
top_p: 0.95
|
||||
top_k: 20
|
||||
top-p: 0.95
|
||||
top-k: 20
|
||||
repeat-penalty: 1.0
|
||||
min_p: 0.0
|
||||
min-p: 0.0
|
||||
presence-penalty: 0.0
|
||||
|
||||
- id: gemma4-26b-hauhau-balanced-64k
|
||||
name: "Gemma4 26B HauhauCS Balanced 64K"
|
||||
model_path: "/home/bigt/AI/llm/gemma4/Gemma4-26B-A4B-Uncensored-HauhauCS-Balanced-Q5_K_M.gguf"
|
||||
flags:
|
||||
n_ctx: 65536
|
||||
n_gpu_layers: 999
|
||||
ctx-size: 65536
|
||||
n-gpu-layers: 999
|
||||
cache-type-k: q8_0
|
||||
cache-type-v: q8_0
|
||||
flash-attn: on
|
||||
temp: 1.0
|
||||
top_p: 0.95
|
||||
top_k: 64
|
||||
top-p: 0.95
|
||||
top-k: 64
|
||||
repeat-penalty: 1.0
|
||||
min_p: 0.0
|
||||
min-p: 0.0
|
||||
presence-penalty: 0.0
|
||||
@ -84,6 +84,22 @@ def _flag_value(value) -> str:
|
||||
return str(value)
|
||||
|
||||
|
||||
def _flag_key(key: str) -> str:
|
||||
"""Convert a manifest flag key to the correct llama-server CLI flag name.
|
||||
|
||||
llama-server uses hyphenated flag names (--ctx-size, --n-gpu-layers),
|
||||
but YAML keys often use underscores. Some flags were also renamed
|
||||
across llama.cpp versions (e.g. --n-ctx → --ctx-size).
|
||||
|
||||
This function normalises underscores to hyphens and applies known renames.
|
||||
"""
|
||||
normalized = key.replace("_", "-")
|
||||
FLAG_RENAMES = {
|
||||
"n-ctx": "ctx-size",
|
||||
}
|
||||
return FLAG_RENAMES.get(normalized, normalized)
|
||||
|
||||
|
||||
async def _start_llama_server(profile: dict):
|
||||
"""Start llama-server with the given profile's configuration."""
|
||||
global _llama_server_process
|
||||
@ -96,7 +112,7 @@ async def _start_llama_server(profile: dict):
|
||||
cmd += ["--model", profile["model_path"]]
|
||||
cmd += ["--port", str(LLAMA_SERVER_PORT)]
|
||||
for key, value in profile.get("flags", {}).items():
|
||||
cmd += ["--" + key, _flag_value(value)]
|
||||
cmd += ["--" + _flag_key(key), _flag_value(value)]
|
||||
|
||||
print(f"Starting llama-server: {' '.join(cmd)}", flush=True)
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user