diff --git a/deploy/manifest.yaml b/deploy/manifest.yaml
index fa6117a..8255afe 100644
--- a/deploy/manifest.yaml
+++ b/deploy/manifest.yaml
@@ -11,27 +11,6 @@
 # All profiles use flash-attn: on, n-gpu-layers: 999 (offload all)
 # KV cache quantization (q8_0/q4_0) enables 64K+ context within 24GB VRAM
 
-- id: qwen-3-8b
-  name: "Qwen 3 8B"
-  model_path: "/home/bigt/AI/llm/qwen/qwen3-8b-q4.gguf"
-  flags:
-    n_ctx: 8192
-    n_gpu_layers: 35
-
-- id: qwen-3-8b-long
-  name: "Qwen 3 8B (Long Context)"
-  model_path: "/home/bigt/AI/llm/qwen/qwen3-8b-q4.gguf"
-  flags:
-    n_ctx: 32768
-    n_gpu_layers: 20
-
-- id: llama-4-maverick
-  name: "Llama 4 Maverick"
-  model_path: "/home/bigt/AI/llm/llama4/llama4-maverick-q4.gguf"
-  flags:
-    n_ctx: 8192
-    n_gpu_layers: 35
-
 # --- Qwen3.6-27B (Q4_K_M, ~10.5 GB) ---
 # Sampling: temp 0.6/1.0, top_p 0.95, top_k 20
 - id: qwen36-27b-balanced-64k
@@ -82,7 +61,7 @@
     min_p: 0.0
     presence-penalty: 0.0
 
-# --- Gemma 4 12B (Q6_K_XL ~8.5 GB, IQ4_XS ~5 GB) ---
+# --- Gemma 4 12B (Q6_K_XL ~8.5 GB) ---
 # Sampling: temp 1.0, top_p 0.95, top_k 64 (Google official)
 - id: gemma4-12b-standard-q6-64k
   name: "Gemma4 12B Standard Q6 64K"
@@ -116,38 +95,6 @@
     min_p: 0.0
     presence-penalty: 0.0
 
-- id: gemma4-12b-compact-iq4-64k
-  name: "Gemma4 12B Compact IQ4 64K"
-  model_path: "/home/bigt/AI/llm/gemma4/bartowski-google_gemma-4-26B-A4B-it-IQ4_XS.gguf"
-  flags:
-    n_ctx: 65536
-    n_gpu_layers: 999
-    cache-type-k: q8_0
-    cache-type-v: q8_0
-    flash-attn: on
-    temp: 1.0
-    top_p: 0.95
-    top_k: 64
-    repeat-penalty: 1.0
-    min_p: 0.0
-    presence-penalty: 0.0
-
-- id: gemma4-12b-compact-long-128k
-  name: "Gemma4 12B Compact IQ4 128K"
-  model_path: "/home/bigt/AI/llm/gemma4/bartowski-google_gemma-4-26B-A4B-it-IQ4_XS.gguf"
-  flags:
-    n_ctx: 131072
-    n_gpu_layers: 999
-    cache-type-k: q8_0
-    cache-type-v: q8_0
-    flash-attn: on
-    temp: 1.0
-    top_p: 0.95
-    top_k: 64
-    repeat-penalty: 1.0
-    min_p: 0.0
-    presence-penalty: 0.0
-
 # --- Gemma 4 26B-A4B (Q4_K_M ~10.5 GB, IQ4_XS ~6 GB) ---
 # MoE, 4B active params. Same sampling as 12B family.
 - id: gemma4-26b-balanced-64k
@@ -198,6 +145,55 @@
     min_p: 0.0
     presence-penalty: 0.0
 
+- id: gemma4-26b-q5-64k
+  name: "Gemma4 26B Q5 64K"
+  model_path: "/home/bigt/AI/llm/gemma4/google_gemma-4-26B-A4B-it-Q5_K_M.gguf"
+  flags:
+    n_ctx: 65536
+    n_gpu_layers: 999
+    cache-type-k: q8_0
+    cache-type-v: q8_0
+    flash-attn: on
+    temp: 1.0
+    top_p: 0.95
+    top_k: 64
+    repeat-penalty: 1.0
+    min_p: 0.0
+    presence-penalty: 0.0
+
+# --- Gemma 4 26B Compact (IQ4_XS ~6 GB) ---
+- id: gemma4-26b-compact-iq4-64k
+  name: "Gemma4 26B Compact IQ4 64K"
+  model_path: "/home/bigt/AI/llm/gemma4/bartowski-google_gemma-4-26B-A4B-it-IQ4_XS.gguf"
+  flags:
+    n_ctx: 65536
+    n_gpu_layers: 999
+    cache-type-k: q8_0
+    cache-type-v: q8_0
+    flash-attn: on
+    temp: 1.0
+    top_p: 0.95
+    top_k: 64
+    repeat-penalty: 1.0
+    min_p: 0.0
+    presence-penalty: 0.0
+
+- id: gemma4-26b-compact-long-128k
+  name: "Gemma4 26B Compact IQ4 128K"
+  model_path: "/home/bigt/AI/llm/gemma4/bartowski-google_gemma-4-26B-A4B-it-IQ4_XS.gguf"
+  flags:
+    n_ctx: 131072
+    n_gpu_layers: 999
+    cache-type-k: q8_0
+    cache-type-v: q8_0
+    flash-attn: on
+    temp: 1.0
+    top_p: 0.95
+    top_k: 64
+    repeat-penalty: 1.0
+    min_p: 0.0
+    presence-penalty: 0.0
+
 # --- Qwen3.6-35B-A3B (UD-Q4_K_M ~14 GB) ---
 # MoE, 3.6B active params. Non-MTP (faster on single GPU per Unsloth benchmark).
 # Sampling: temp 0.6/1.0, top_p 0.95, top_k 20
@@ -249,7 +245,43 @@
     min_p: 0.0
     presence-penalty: 0.0
 
-# --- Uncensored models (apply censored family params) ---
+# --- Qwen3.6-35B-A3B MTP variant ---
+# Sampling: temp 0.6, top_p 0.95, top_k 20
+# NOTE(review): the non-MTP section above records non-MTP as faster on a
+# single GPU; confirm the "Fast" label on the 64K profile is intended.
+- id: qwen36-35b-mtp-fast-64k
+  name: "Qwen3.6-35B MTP Fast 64K"
+  model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-35B-A3B-MTP-UD-Q4_K_M.gguf"
+  flags:
+    n_ctx: 65536
+    n_gpu_layers: 999
+    cache-type-k: q8_0
+    cache-type-v: q8_0
+    flash-attn: on
+    temp: 0.6
+    top_p: 0.95
+    top_k: 20
+    repeat-penalty: 1.0
+    min_p: 0.0
+    presence-penalty: 0.0
+
+- id: qwen36-35b-mtp-extended-128k
+  name: "Qwen3.6-35B MTP Extended 128K"
+  model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-35B-A3B-MTP-UD-Q4_K_M.gguf"
+  flags:
+    n_ctx: 131072
+    n_gpu_layers: 999
+    cache-type-k: q4_0
+    cache-type-v: q4_0
+    flash-attn: on
+    temp: 0.6
+    top_p: 0.95
+    top_k: 20
+    repeat-penalty: 1.0
+    min_p: 0.0
+    presence-penalty: 0.0
+
+# --- Uncensored models ---
 - id: qwen36-35b-hauhau-aggressive-64k
   name: "Qwen3.6-35B HauhauCS Aggressive 64K"
   model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-35B-A3B-Uncensored-HauhauCS-Aggressive-Q4_K_P.gguf"
@@ -282,6 +314,22 @@
     min_p: 0.0
     presence-penalty: 0.0
 
+- id: qwen36-35b-genesis-mtp-apex-64k
+  name: "Qwen3.6-35B Genesis MTP APEX 64K"
+  model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-35B-A3B-Uncensored-Genesis-MTP-APEX.gguf"
+  flags:
+    n_ctx: 65536
+    n_gpu_layers: 999
+    cache-type-k: q8_0
+    cache-type-v: q8_0
+    flash-attn: on
+    temp: 0.6
+    top_p: 0.95
+    top_k: 20
+    repeat-penalty: 1.0
+    min_p: 0.0
+    presence-penalty: 0.0
+
 - id: gemma4-26b-hauhau-balanced-64k
   name: "Gemma4 26B HauhauCS Balanced 64K"
   model_path: "/home/bigt/AI/llm/gemma4/Gemma4-26B-A4B-Uncensored-HauhauCS-Balanced-Q5_K_M.gguf"