mirror of
https://github.com/getcompanion-ai/co-mono.git
synced 2026-04-15 23:01:30 +00:00
295 lines
7.1 KiB
JSON
{
  "models": {
    "Qwen/Qwen2.5-Coder-32B-Instruct": {
      "name": "Qwen2.5-Coder-32B",
      "configs": [
        {
          "gpuCount": 1,
          "gpuTypes": ["H100", "H200"],
          "args": ["--tool-call-parser", "hermes", "--enable-auto-tool-choice"]
        },
        {
          "gpuCount": 2,
          "gpuTypes": ["H100", "H200"],
          "args": ["--tensor-parallel-size", "2", "--tool-call-parser", "hermes", "--enable-auto-tool-choice"]
        }
      ]
    },
    "Qwen/Qwen3-Coder-30B-A3B-Instruct": {
      "name": "Qwen3-Coder-30B",
      "configs": [
        {
          "gpuCount": 1,
          "gpuTypes": ["H100", "H200"],
          "args": ["--enable-auto-tool-choice", "--tool-call-parser", "qwen3_coder"],
          "notes": "Fits comfortably on single GPU. ~60GB model weight."
        },
        {
          "gpuCount": 2,
          "gpuTypes": ["H100", "H200"],
          "args": [
            "--tensor-parallel-size",
            "2",
            "--enable-auto-tool-choice",
            "--tool-call-parser",
            "qwen3_coder"
          ],
          "notes": "For higher throughput/longer context."
        }
      ]
    },
    "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8": {
      "name": "Qwen3-Coder-30B-FP8",
      "configs": [
        {
          "gpuCount": 1,
          "gpuTypes": ["H100", "H200"],
          "args": ["--enable-auto-tool-choice", "--tool-call-parser", "qwen3_coder"],
          "env": {
            "VLLM_USE_DEEP_GEMM": "1"
          },
          "notes": "FP8 quantized, ~30GB model weight. Excellent for single GPU deployment."
        }
      ]
    },
    "Qwen/Qwen3-Coder-480B-A35B-Instruct": {
      "name": "Qwen3-Coder-480B",
      "configs": [
        {
          "gpuCount": 8,
          "gpuTypes": ["H200", "H20"],
          "args": [
            "--tensor-parallel-size",
            "8",
            "--max-model-len",
            "32000",
            "--enable-auto-tool-choice",
            "--tool-call-parser",
            "qwen3_coder"
          ],
          "notes": "Cannot serve full 262K context on single node. Reduce max-model-len or increase gpu-memory-utilization."
        }
      ]
    },
    "Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8": {
      "name": "Qwen3-Coder-480B-FP8",
      "configs": [
        {
          "gpuCount": 8,
          "gpuTypes": ["H200", "H20"],
          "args": [
            "--max-model-len",
            "131072",
            "--enable-expert-parallel",
            "--data-parallel-size",
            "8",
            "--enable-auto-tool-choice",
            "--tool-call-parser",
            "qwen3_coder"
          ],
          "env": {
            "VLLM_USE_DEEP_GEMM": "1"
          },
          "notes": "Use data-parallel mode (not tensor-parallel) to avoid weight quantization errors."
        }
      ]
    },
    "openai/gpt-oss-20b": {
      "name": "GPT-OSS-20B",
      "configs": [
        {
          "gpuCount": 1,
          "gpuTypes": ["H100", "H200"],
          "args": ["--async-scheduling"]
        },
        {
          "gpuCount": 1,
          "gpuTypes": ["B200"],
          "args": ["--async-scheduling"],
          "env": {
            "VLLM_USE_TRTLLM_ATTENTION": "1",
            "VLLM_USE_TRTLLM_DECODE_ATTENTION": "1",
            "VLLM_USE_TRTLLM_CONTEXT_ATTENTION": "1",
            "VLLM_USE_FLASHINFER_MXFP4_MOE": "1"
          }
        }
      ],
      "notes": "Tools/function calls only via /v1/responses endpoint."
    },
    "openai/gpt-oss-120b": {
      "name": "GPT-OSS-120B",
      "configs": [
        {
          "gpuCount": 1,
          "gpuTypes": ["H100", "H200"],
          "args": ["--async-scheduling", "--gpu-memory-utilization", "0.95", "--max-num-batched-tokens", "1024"],
          "notes": "Single GPU deployment. Tools/function calls only via /v1/responses endpoint."
        },
        {
          "gpuCount": 2,
          "gpuTypes": ["H100", "H200"],
          "args": ["--tensor-parallel-size", "2", "--async-scheduling", "--gpu-memory-utilization", "0.94"],
          "notes": "Recommended for H100/H200. Tools/function calls only via /v1/responses endpoint."
        },
        {
          "gpuCount": 4,
          "gpuTypes": ["H100", "H200"],
          "args": ["--tensor-parallel-size", "4", "--async-scheduling"],
          "notes": "Higher throughput. Tools/function calls only via /v1/responses endpoint."
        },
        {
          "gpuCount": 8,
          "gpuTypes": ["H100", "H200"],
          "args": ["--tensor-parallel-size", "8", "--async-scheduling"],
          "notes": "Maximum throughput for evaluation workloads. Tools/function calls only via /v1/responses endpoint."
        }
      ]
    },
    "zai-org/GLM-4.5": {
      "name": "GLM-4.5",
      "configs": [
        {
          "gpuCount": 16,
          "gpuTypes": ["H100"],
          "args": [
            "--tensor-parallel-size",
            "16",
            "--tool-call-parser",
            "glm45",
            "--reasoning-parser",
            "glm45",
            "--enable-auto-tool-choice"
          ]
        },
        {
          "gpuCount": 8,
          "gpuTypes": ["H200"],
          "args": [
            "--tensor-parallel-size",
            "8",
            "--tool-call-parser",
            "glm45",
            "--reasoning-parser",
            "glm45",
            "--enable-auto-tool-choice"
          ]
        }
      ],
      "notes": "Models default to thinking mode. For full 128K context, double the GPU count."
    },
    "zai-org/GLM-4.5-FP8": {
      "name": "GLM-4.5-FP8",
      "configs": [
        {
          "gpuCount": 8,
          "gpuTypes": ["H100"],
          "args": [
            "--tensor-parallel-size",
            "8",
            "--tool-call-parser",
            "glm45",
            "--reasoning-parser",
            "glm45",
            "--enable-auto-tool-choice"
          ]
        },
        {
          "gpuCount": 4,
          "gpuTypes": ["H200"],
          "args": [
            "--tensor-parallel-size",
            "4",
            "--tool-call-parser",
            "glm45",
            "--reasoning-parser",
            "glm45",
            "--enable-auto-tool-choice"
          ]
        }
      ]
    },
    "zai-org/GLM-4.5-Air-FP8": {
      "name": "GLM-4.5-Air-FP8",
      "configs": [
        {
          "gpuCount": 2,
          "gpuTypes": ["H100"],
          "args": [
            "--tensor-parallel-size",
            "2",
            "--tool-call-parser",
            "glm45",
            "--reasoning-parser",
            "glm45",
            "--enable-auto-tool-choice"
          ],
          "env": {
            "VLLM_ATTENTION_BACKEND": "XFORMERS"
          },
          "notes": "FP8 model requires vLLM with proper FP8 support or MTP module"
        },
        {
          "gpuCount": 1,
          "gpuTypes": ["H200"],
          "args": ["--tool-call-parser", "glm45", "--reasoning-parser", "glm45", "--enable-auto-tool-choice"],
          "env": {
            "VLLM_ATTENTION_BACKEND": "XFORMERS"
          },
          "notes": "FP8 model requires vLLM with proper FP8 support or MTP module"
        }
      ]
    },
    "zai-org/GLM-4.5-Air": {
      "name": "GLM-4.5-Air",
      "configs": [
        {
          "gpuCount": 2,
          "gpuTypes": ["H100", "H200"],
          "args": [
            "--tensor-parallel-size",
            "2",
            "--tool-call-parser",
            "glm45",
            "--reasoning-parser",
            "glm45",
            "--enable-auto-tool-choice"
          ],
          "notes": "Non-quantized BF16 version, more compatible"
        },
        {
          "gpuCount": 1,
          "gpuTypes": ["H200"],
          "args": [
            "--tool-call-parser",
            "glm45",
            "--reasoning-parser",
            "glm45",
            "--enable-auto-tool-choice",
            "--gpu-memory-utilization",
            "0.95"
          ],
          "notes": "Single H200 can fit the BF16 model with high memory utilization"
        }
      ]
    },
    "moonshotai/Kimi-K2-Instruct": {
      "name": "Kimi-K2",
      "configs": [
        {
          "gpuCount": 16,
          "gpuTypes": ["H200", "H20"],
          "args": [
            "--tensor-parallel-size",
            "16",
            "--trust-remote-code",
            "--enable-auto-tool-choice",
            "--tool-call-parser",
            "kimi_k2"
          ],
          "notes": "Pure TP mode. For >16 GPUs, combine with pipeline-parallelism."
        }
      ],
      "notes": "Requires vLLM v0.10.0rc1+. Minimum 16 GPUs for FP8 with 128k context."
    }
  }
}