fix: remove non-existent models from manifest (qwen-3-8b, llama-4-maverick), relabel Gemma 4 IQ4_XS compact profiles from 12B to 26B, add 3 newly discovered models

2026-06-15 16:38:17 +00:00 · 2026-06-15 16:38:17 +00:00 · 95c87a764b
commit 95c87a764b
parent 36abbf573e
1 changed file with 101 additions and 56 deletions
--- a/deploy/manifest.yaml
+++ b/deploy/manifest.yaml
@@ -11,27 +11,6 @@
 # All profiles use flash-attn: on, n-gpu-layers: 999 (offload all)
 # KV cache quantization (q8_0/q4_0) enables 64K+ context within 24GB VRAM

- id: qwen-3-8b
-  name: "Qwen 3 8B"
-  model_path: "/home/bigt/AI/llm/qwen/qwen3-8b-q4.gguf"
-  flags:
-    n_ctx: 8192
-    n_gpu_layers: 35
-
- id: qwen-3-8b-long
-  name: "Qwen 3 8B (Long Context)"
-  model_path: "/home/bigt/AI/llm/qwen/qwen3-8b-q4.gguf"
-  flags:
-    n_ctx: 32768
-    n_gpu_layers: 20
-
- id: llama-4-maverick
-  name: "Llama 4 Maverick"
-  model_path: "/home/bigt/AI/llm/llama4/llama4-maverick-q4.gguf"
-  flags:
-    n_ctx: 8192
-    n_gpu_layers: 35
-
 # --- Qwen3.6-27B (Q4_K_M, ~10.5 GB) ---
 # Sampling: temp 0.6/1.0, top_p 0.95, top_k 20
 - id: qwen36-27b-balanced-64k
@@ -82,7 +61,7 @@
    min_p: 0.0
    presence-penalty: 0.0

-# --- Gemma 4 12B (Q6_K_XL ~8.5 GB, IQ4_XS ~5 GB) ---
+# --- Gemma 4 12B (Q6_K_XL ~8.5 GB) ---
 # Sampling: temp 1.0, top_p 0.95, top_k 64 (Google official)
 - id: gemma4-12b-standard-q6-64k
  name: "Gemma4 12B Standard Q6 64K"
@@ -116,38 +95,6 @@
    min_p: 0.0
    presence-penalty: 0.0

- id: gemma4-12b-compact-iq4-64k
-  name: "Gemma4 12B Compact IQ4 64K"
-  model_path: "/home/bigt/AI/llm/gemma4/bartowski-google_gemma-4-26B-A4B-it-IQ4_XS.gguf"
-  flags:
-    n_ctx: 65536
-    n_gpu_layers: 999
-    cache-type-k: q8_0
-    cache-type-v: q8_0
-    flash-attn: on
-    temp: 1.0
-    top_p: 0.95
-    top_k: 64
-    repeat-penalty: 1.0
-    min_p: 0.0
-    presence-penalty: 0.0
-
- id: gemma4-12b-compact-long-128k
-  name: "Gemma4 12B Compact IQ4 128K"
-  model_path: "/home/bigt/AI/llm/gemma4/bartowski-google_gemma-4-26B-A4B-it-IQ4_XS.gguf"
-  flags:
-    n_ctx: 131072
-    n_gpu_layers: 999
-    cache-type-k: q8_0
-    cache-type-v: q8_0
-    flash-attn: on
-    temp: 1.0
-    top_p: 0.95
-    top_k: 64
-    repeat-penalty: 1.0
-    min_p: 0.0
-    presence-penalty: 0.0
-
 # --- Gemma 4 26B-A4B (Q4_K_M ~10.5 GB, IQ4_XS ~6 GB) ---
 # MoE, 4B active params. Same sampling as 12B family.
 - id: gemma4-26b-balanced-64k
@@ -198,6 +145,55 @@
    min_p: 0.0
    presence-penalty: 0.0

+- id: gemma4-26b-q5-64k
+  name: "Gemma4 26B Q5 64K"
+  model_path: "/home/bigt/AI/llm/gemma4/google_gemma-4-26B-A4B-it-Q5_K_M.gguf"
+  flags:
+    n_ctx: 65536
+    n_gpu_layers: 999
+    cache-type-k: q8_0
+    cache-type-v: q8_0
+    flash-attn: on
+    temp: 1.0
+    top_p: 0.95
+    top_k: 64
+    repeat-penalty: 1.0
+    min_p: 0.0
+    presence-penalty: 0.0
+
+# --- Gemma 4 26B Compact (IQ4_XS ~6 GB) ---
+- id: gemma4-26b-compact-iq4-64k
+  name: "Gemma4 26B Compact IQ4 64K"
+  model_path: "/home/bigt/AI/llm/gemma4/bartowski-google_gemma-4-26B-A4B-it-IQ4_XS.gguf"
+  flags:
+    n_ctx: 65536
+    n_gpu_layers: 999
+    cache-type-k: q8_0
+    cache-type-v: q8_0
+    flash-attn: on
+    temp: 1.0
+    top_p: 0.95
+    top_k: 64
+    repeat-penalty: 1.0
+    min_p: 0.0
+    presence-penalty: 0.0
+
+- id: gemma4-26b-compact-long-128k
+  name: "Gemma4 26B Compact IQ4 128K"
+  model_path: "/home/bigt/AI/llm/gemma4/bartowski-google_gemma-4-26B-A4B-it-IQ4_XS.gguf"
+  flags:
+    n_ctx: 131072
+    n_gpu_layers: 999
+    cache-type-k: q8_0
+    cache-type-v: q8_0
+    flash-attn: on
+    temp: 1.0
+    top_p: 0.95
+    top_k: 64
+    repeat-penalty: 1.0
+    min_p: 0.0
+    presence-penalty: 0.0
+
 # --- Qwen3.6-35B-A3B (UD-Q4_K_M ~14 GB) ---
 # MoE, 3.6B active params. Non-MTP (faster on single GPU per Unsloth benchmark).
 # Sampling: temp 0.6/1.0, top_p 0.95, top_k 20
@@ -249,7 +245,40 @@
    min_p: 0.0
    presence-penalty: 0.0

-# --- Uncensored models (apply censored family params) ---
+# --- Qwen3.6-35B-A3B MTP variant (UD-Q4_K_M) ---
+- id: qwen36-35b-mtp-fast-64k
+  name: "Qwen3.6-35B MTP Fast 64K"
+  model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-35B-A3B-MTP-UD-Q4_K_M.gguf"
+  flags:
+    n_ctx: 65536
+    n_gpu_layers: 999
+    cache-type-k: q8_0
+    cache-type-v: q8_0
+    flash-attn: on
+    temp: 0.6
+    top_p: 0.95
+    top_k: 20
+    repeat-penalty: 1.0
+    min_p: 0.0
+    presence-penalty: 0.0
+
+- id: qwen36-35b-mtp-extended-128k
+  name: "Qwen3.6-35B MTP Extended 128K"
+  model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-35B-A3B-MTP-UD-Q4_K_M.gguf"
+  flags:
+    n_ctx: 131072
+    n_gpu_layers: 999
+    cache-type-k: q4_0
+    cache-type-v: q4_0
+    flash-attn: on
+    temp: 0.6
+    top_p: 0.95
+    top_k: 20
+    repeat-penalty: 1.0
+    min_p: 0.0
+    presence-penalty: 0.0
+
+# --- Uncensored models ---
 - id: qwen36-35b-hauhau-aggressive-64k
  name: "Qwen3.6-35B HauhauCS Aggressive 64K"
  model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-35B-A3B-Uncensored-HauhauCS-Aggressive-Q4_K_P.gguf"
@@ -282,6 +311,22 @@
    min_p: 0.0
    presence-penalty: 0.0

+- id: qwen36-35b-genesis-mtp-apex-64k
+  name: "Qwen3.6-35B Genesis MTP APEX 64K"
+  model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-35B-A3B-Uncensored-Genesis-MTP-APEX.gguf"
+  flags:
+    n_ctx: 65536
+    n_gpu_layers: 999
+    cache-type-k: q8_0
+    cache-type-v: q8_0
+    flash-attn: on
+    temp: 0.6
+    top_p: 0.95
+    top_k: 20
+    repeat-penalty: 1.0
+    min_p: 0.0
+    presence-penalty: 0.0
+
 - id: gemma4-26b-hauhau-balanced-64k
  name: "Gemma4 26B HauhauCS Balanced 64K"
  model_path: "/home/bigt/AI/llm/gemma4/Gemma4-26B-A4B-Uncensored-HauhauCS-Balanced-Q5_K_M.gguf"