mirror of
https://github.com/getcompanion-ai/co-mono.git
synced 2026-04-15 23:01:30 +00:00
295 lines
7.1 KiB
JSON
{
  "models": {
    "Qwen/Qwen2.5-Coder-32B-Instruct": {
      "name": "Qwen2.5-Coder-32B",
      "configs": [
        {
          "gpuCount": 1,
          "gpuTypes": ["H100", "H200"],
          "args": ["--tool-call-parser", "hermes", "--enable-auto-tool-choice"]
        },
        {
          "gpuCount": 2,
          "gpuTypes": ["H100", "H200"],
          "args": ["--tensor-parallel-size", "2", "--tool-call-parser", "hermes", "--enable-auto-tool-choice"]
        }
      ]
    },
    "Qwen/Qwen3-Coder-30B-A3B-Instruct": {
      "name": "Qwen3-Coder-30B",
      "configs": [
        {
          "gpuCount": 1,
          "gpuTypes": ["H100", "H200"],
          "args": ["--enable-auto-tool-choice", "--tool-call-parser", "qwen3_coder"],
          "notes": "Fits comfortably on single GPU. ~60GB model weight."
        },
        {
          "gpuCount": 2,
          "gpuTypes": ["H100", "H200"],
          "args": [
            "--tensor-parallel-size",
            "2",
            "--enable-auto-tool-choice",
            "--tool-call-parser",
            "qwen3_coder"
          ],
          "notes": "For higher throughput/longer context."
        }
      ]
    },
    "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8": {
      "name": "Qwen3-Coder-30B-FP8",
      "configs": [
        {
          "gpuCount": 1,
          "gpuTypes": ["H100", "H200"],
          "args": ["--enable-auto-tool-choice", "--tool-call-parser", "qwen3_coder"],
          "env": {
            "VLLM_USE_DEEP_GEMM": "1"
          },
          "notes": "FP8 quantized, ~30GB model weight. Excellent for single GPU deployment."
        }
      ]
    },
    "Qwen/Qwen3-Coder-480B-A35B-Instruct": {
      "name": "Qwen3-Coder-480B",
      "configs": [
        {
          "gpuCount": 8,
          "gpuTypes": ["H200", "H20"],
          "args": [
            "--tensor-parallel-size",
            "8",
            "--max-model-len",
            "32000",
            "--enable-auto-tool-choice",
            "--tool-call-parser",
            "qwen3_coder"
          ],
          "notes": "Cannot serve full 262K context on single node. Reduce max-model-len or increase gpu-memory-utilization."
        }
      ]
    },
    "Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8": {
      "name": "Qwen3-Coder-480B-FP8",
      "configs": [
        {
          "gpuCount": 8,
          "gpuTypes": ["H200", "H20"],
          "args": [
            "--max-model-len",
            "131072",
            "--enable-expert-parallel",
            "--data-parallel-size",
            "8",
            "--enable-auto-tool-choice",
            "--tool-call-parser",
            "qwen3_coder"
          ],
          "env": {
            "VLLM_USE_DEEP_GEMM": "1"
          },
          "notes": "Use data-parallel mode (not tensor-parallel) to avoid weight quantization errors."
        }
      ]
    },
    "openai/gpt-oss-20b": {
      "name": "GPT-OSS-20B",
      "configs": [
        {
          "gpuCount": 1,
          "gpuTypes": ["H100", "H200"],
          "args": ["--async-scheduling"]
        },
        {
          "gpuCount": 1,
          "gpuTypes": ["B200"],
          "args": ["--async-scheduling"],
          "env": {
            "VLLM_USE_TRTLLM_ATTENTION": "1",
            "VLLM_USE_TRTLLM_DECODE_ATTENTION": "1",
            "VLLM_USE_TRTLLM_CONTEXT_ATTENTION": "1",
            "VLLM_USE_FLASHINFER_MXFP4_MOE": "1"
          }
        }
      ],
      "notes": "Tools/function calls only via /v1/responses endpoint."
    },
    "openai/gpt-oss-120b": {
      "name": "GPT-OSS-120B",
      "configs": [
        {
          "gpuCount": 1,
          "gpuTypes": ["H100", "H200"],
          "args": ["--async-scheduling", "--gpu-memory-utilization", "0.95", "--max-num-batched-tokens", "1024"],
          "notes": "Single GPU deployment. Tools/function calls only via /v1/responses endpoint."
        },
        {
          "gpuCount": 2,
          "gpuTypes": ["H100", "H200"],
          "args": ["--tensor-parallel-size", "2", "--async-scheduling", "--gpu-memory-utilization", "0.94"],
          "notes": "Recommended for H100/H200. Tools/function calls only via /v1/responses endpoint."
        },
        {
          "gpuCount": 4,
          "gpuTypes": ["H100", "H200"],
          "args": ["--tensor-parallel-size", "4", "--async-scheduling"],
          "notes": "Higher throughput. Tools/function calls only via /v1/responses endpoint."
        },
        {
          "gpuCount": 8,
          "gpuTypes": ["H100", "H200"],
          "args": ["--tensor-parallel-size", "8", "--async-scheduling"],
          "notes": "Maximum throughput for evaluation workloads. Tools/function calls only via /v1/responses endpoint."
        }
      ]
    },
    "zai-org/GLM-4.5": {
      "name": "GLM-4.5",
      "configs": [
        {
          "gpuCount": 16,
          "gpuTypes": ["H100"],
          "args": [
            "--tensor-parallel-size",
            "16",
            "--tool-call-parser",
            "glm45",
            "--reasoning-parser",
            "glm45",
            "--enable-auto-tool-choice"
          ]
        },
        {
          "gpuCount": 8,
          "gpuTypes": ["H200"],
          "args": [
            "--tensor-parallel-size",
            "8",
            "--tool-call-parser",
            "glm45",
            "--reasoning-parser",
            "glm45",
            "--enable-auto-tool-choice"
          ]
        }
      ],
      "notes": "Models default to thinking mode. For full 128K context, double the GPU count."
    },
    "zai-org/GLM-4.5-FP8": {
      "name": "GLM-4.5-FP8",
      "configs": [
        {
          "gpuCount": 8,
          "gpuTypes": ["H100"],
          "args": [
            "--tensor-parallel-size",
            "8",
            "--tool-call-parser",
            "glm45",
            "--reasoning-parser",
            "glm45",
            "--enable-auto-tool-choice"
          ]
        },
        {
          "gpuCount": 4,
          "gpuTypes": ["H200"],
          "args": [
            "--tensor-parallel-size",
            "4",
            "--tool-call-parser",
            "glm45",
            "--reasoning-parser",
            "glm45",
            "--enable-auto-tool-choice"
          ]
        }
      ]
    },
    "zai-org/GLM-4.5-Air-FP8": {
      "name": "GLM-4.5-Air-FP8",
      "configs": [
        {
          "gpuCount": 2,
          "gpuTypes": ["H100"],
          "args": [
            "--tensor-parallel-size",
            "2",
            "--tool-call-parser",
            "glm45",
            "--reasoning-parser",
            "glm45",
            "--enable-auto-tool-choice"
          ],
          "env": {
            "VLLM_ATTENTION_BACKEND": "XFORMERS"
          },
          "notes": "FP8 model requires vLLM with proper FP8 support or MTP module"
        },
        {
          "gpuCount": 1,
          "gpuTypes": ["H200"],
          "args": ["--tool-call-parser", "glm45", "--reasoning-parser", "glm45", "--enable-auto-tool-choice"],
          "env": {
            "VLLM_ATTENTION_BACKEND": "XFORMERS"
          },
          "notes": "FP8 model requires vLLM with proper FP8 support or MTP module"
        }
      ]
    },
    "zai-org/GLM-4.5-Air": {
      "name": "GLM-4.5-Air",
      "configs": [
        {
          "gpuCount": 2,
          "gpuTypes": ["H100", "H200"],
          "args": [
            "--tensor-parallel-size",
            "2",
            "--tool-call-parser",
            "glm45",
            "--reasoning-parser",
            "glm45",
            "--enable-auto-tool-choice"
          ],
          "notes": "Non-quantized BF16 version, more compatible"
        },
        {
          "gpuCount": 1,
          "gpuTypes": ["H200"],
          "args": [
            "--tool-call-parser",
            "glm45",
            "--reasoning-parser",
            "glm45",
            "--enable-auto-tool-choice",
            "--gpu-memory-utilization",
            "0.95"
          ],
          "notes": "Single H200 can fit the BF16 model with high memory utilization"
        }
      ]
    },
    "moonshotai/Kimi-K2-Instruct": {
      "name": "Kimi-K2",
      "configs": [
        {
          "gpuCount": 16,
          "gpuTypes": ["H200", "H20"],
          "args": [
            "--tensor-parallel-size",
            "16",
            "--trust-remote-code",
            "--enable-auto-tool-choice",
            "--tool-call-parser",
            "kimi_k2"
          ],
          "notes": "Pure TP mode. For >16 GPUs, combine with pipeline-parallelism."
        }
      ],
      "notes": "Requires vLLM v0.10.0rc1+. Minimum 16 GPUs for FP8 with 128k context."
    }
  }
}