{ "models": { "Qwen/Qwen2.5-Coder-32B-Instruct": { "name": "Qwen2.5-Coder-32B", "configs": [ { "gpuCount": 1, "gpuTypes": ["H100", "H200"], "args": ["--tool-call-parser", "hermes", "--enable-auto-tool-choice"] }, { "gpuCount": 2, "gpuTypes": ["H100", "H200"], "args": ["--tensor-parallel-size", "2", "--tool-call-parser", "hermes", "--enable-auto-tool-choice"] } ] }, "Qwen/Qwen3-Coder-30B-A3B-Instruct": { "name": "Qwen3-Coder-30B", "configs": [ { "gpuCount": 1, "gpuTypes": ["H100", "H200"], "args": ["--enable-auto-tool-choice", "--tool-call-parser", "qwen3_coder"], "notes": "Fits comfortably on single GPU. ~60GB model weight." }, { "gpuCount": 2, "gpuTypes": ["H100", "H200"], "args": [ "--tensor-parallel-size", "2", "--enable-auto-tool-choice", "--tool-call-parser", "qwen3_coder" ], "notes": "For higher throughput/longer context." } ] }, "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8": { "name": "Qwen3-Coder-30B-FP8", "configs": [ { "gpuCount": 1, "gpuTypes": ["H100", "H200"], "args": ["--enable-auto-tool-choice", "--tool-call-parser", "qwen3_coder"], "env": { "VLLM_USE_DEEP_GEMM": "1" }, "notes": "FP8 quantized, ~30GB model weight. Excellent for single GPU deployment." } ] }, "Qwen/Qwen3-Coder-480B-A35B-Instruct": { "name": "Qwen3-Coder-480B", "configs": [ { "gpuCount": 8, "gpuTypes": ["H200", "H20"], "args": [ "--tensor-parallel-size", "8", "--max-model-len", "32000", "--enable-auto-tool-choice", "--tool-call-parser", "qwen3_coder" ], "notes": "Cannot serve full 262K context on single node. Reduce max-model-len or increase gpu-memory-utilization." 
} ] }, "Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8": { "name": "Qwen3-Coder-480B-FP8", "configs": [ { "gpuCount": 8, "gpuTypes": ["H200", "H20"], "args": [ "--max-model-len", "131072", "--enable-expert-parallel", "--data-parallel-size", "8", "--enable-auto-tool-choice", "--tool-call-parser", "qwen3_coder" ], "env": { "VLLM_USE_DEEP_GEMM": "1" }, "notes": "Use data-parallel mode (not tensor-parallel) to avoid weight quantization errors." } ] }, "openai/gpt-oss-20b": { "name": "GPT-OSS-20B", "configs": [ { "gpuCount": 1, "gpuTypes": ["H100", "H200"], "args": ["--async-scheduling"] }, { "gpuCount": 1, "gpuTypes": ["B200"], "args": ["--async-scheduling"], "env": { "VLLM_USE_TRTLLM_ATTENTION": "1", "VLLM_USE_TRTLLM_DECODE_ATTENTION": "1", "VLLM_USE_TRTLLM_CONTEXT_ATTENTION": "1", "VLLM_USE_FLASHINFER_MXFP4_MOE": "1" } } ], "notes": "Tools/function calls only via /v1/responses endpoint." }, "openai/gpt-oss-120b": { "name": "GPT-OSS-120B", "configs": [ { "gpuCount": 1, "gpuTypes": ["H100", "H200"], "args": ["--async-scheduling", "--gpu-memory-utilization", "0.95", "--max-num-batched-tokens", "1024"], "notes": "Single GPU deployment. Tools/function calls only via /v1/responses endpoint." }, { "gpuCount": 2, "gpuTypes": ["H100", "H200"], "args": ["--tensor-parallel-size", "2", "--async-scheduling", "--gpu-memory-utilization", "0.94"], "notes": "Recommended for H100/H200. Tools/function calls only via /v1/responses endpoint." }, { "gpuCount": 4, "gpuTypes": ["H100", "H200"], "args": ["--tensor-parallel-size", "4", "--async-scheduling"], "notes": "Higher throughput. Tools/function calls only via /v1/responses endpoint." }, { "gpuCount": 8, "gpuTypes": ["H100", "H200"], "args": ["--tensor-parallel-size", "8", "--async-scheduling"], "notes": "Maximum throughput for evaluation workloads. Tools/function calls only via /v1/responses endpoint." 
} ] }, "zai-org/GLM-4.5": { "name": "GLM-4.5", "configs": [ { "gpuCount": 16, "gpuTypes": ["H100"], "args": [ "--tensor-parallel-size", "16", "--tool-call-parser", "glm45", "--reasoning-parser", "glm45", "--enable-auto-tool-choice" ] }, { "gpuCount": 8, "gpuTypes": ["H200"], "args": [ "--tensor-parallel-size", "8", "--tool-call-parser", "glm45", "--reasoning-parser", "glm45", "--enable-auto-tool-choice" ] } ], "notes": "Models default to thinking mode. For full 128K context, double the GPU count." }, "zai-org/GLM-4.5-FP8": { "name": "GLM-4.5-FP8", "configs": [ { "gpuCount": 8, "gpuTypes": ["H100"], "args": [ "--tensor-parallel-size", "8", "--tool-call-parser", "glm45", "--reasoning-parser", "glm45", "--enable-auto-tool-choice" ] }, { "gpuCount": 4, "gpuTypes": ["H200"], "args": [ "--tensor-parallel-size", "4", "--tool-call-parser", "glm45", "--reasoning-parser", "glm45", "--enable-auto-tool-choice" ] } ] }, "zai-org/GLM-4.5-Air-FP8": { "name": "GLM-4.5-Air-FP8", "configs": [ { "gpuCount": 2, "gpuTypes": ["H100"], "args": [ "--tensor-parallel-size", "2", "--tool-call-parser", "glm45", "--reasoning-parser", "glm45", "--enable-auto-tool-choice" ], "env": { "VLLM_ATTENTION_BACKEND": "XFORMERS" }, "notes": "FP8 model requires vLLM with proper FP8 support or MTP module." }, { "gpuCount": 1, "gpuTypes": ["H200"], "args": ["--tool-call-parser", "glm45", "--reasoning-parser", "glm45", "--enable-auto-tool-choice"], "env": { "VLLM_ATTENTION_BACKEND": "XFORMERS" }, "notes": "FP8 model requires vLLM with proper FP8 support or MTP module." } ] }, "zai-org/GLM-4.5-Air": { "name": "GLM-4.5-Air", "configs": [ { "gpuCount": 2, "gpuTypes": ["H100", "H200"], "args": [ "--tensor-parallel-size", "2", "--tool-call-parser", "glm45", "--reasoning-parser", "glm45", "--enable-auto-tool-choice" ], "notes": "Non-quantized BF16 version, more compatible." }, { "gpuCount": 1, "gpuTypes": ["H200"], "args": [ "--tool-call-parser", "glm45", "--reasoning-parser", "glm45", "--enable-auto-tool-choice", 
"--gpu-memory-utilization", "0.95" ], "notes": "Single H200 can fit the BF16 model with high memory utilization." } ] }, "moonshotai/Kimi-K2-Instruct": { "name": "Kimi-K2", "configs": [ { "gpuCount": 16, "gpuTypes": ["H200", "H20"], "args": [ "--tensor-parallel-size", "16", "--trust-remote-code", "--enable-auto-tool-choice", "--tool-call-parser", "kimi_k2" ], "notes": "Pure TP mode. For >16 GPUs, combine with pipeline-parallelism." } ], "notes": "Requires vLLM v0.10.0rc1+. Minimum 16 GPUs for FP8 with 128K context." } } }