mirror of
https://github.com/getcompanion-ai/co-mono.git
synced 2026-04-20 05:04:44 +00:00
Initial monorepo setup with npm workspaces and dual TypeScript configuration
- Set up npm workspaces for three packages: pi-tui, pi-agent, and pi (pods) - Implemented dual TypeScript configuration: - Root tsconfig.json with path mappings for development and type checking - Package-specific tsconfig.build.json for clean production builds - Configured lockstep versioning with sync script for inter-package dependencies - Added comprehensive documentation for development and publishing workflows - All packages at version 0.5.0 ready for npm publishing
This commit is contained in:
commit
a74c5da112
63 changed files with 14558 additions and 0 deletions
305
packages/pods/src/models.json
Normal file
305
packages/pods/src/models.json
Normal file
|
|
@ -0,0 +1,305 @@
|
|||
{
|
||||
"models": {
|
||||
"Qwen/Qwen2.5-Coder-32B-Instruct": {
|
||||
"name": "Qwen2.5-Coder-32B",
|
||||
"configs": [
|
||||
{
|
||||
"gpuCount": 1,
|
||||
"gpuTypes": ["H100", "H200"],
|
||||
"args": ["--tool-call-parser", "hermes", "--enable-auto-tool-choice"]
|
||||
},
|
||||
{
|
||||
"gpuCount": 2,
|
||||
"gpuTypes": ["H100", "H200"],
|
||||
"args": ["--tensor-parallel-size", "2", "--tool-call-parser", "hermes", "--enable-auto-tool-choice"]
|
||||
}
|
||||
]
|
||||
},
|
||||
"Qwen/Qwen3-Coder-30B-A3B-Instruct": {
|
||||
"name": "Qwen3-Coder-30B",
|
||||
"configs": [
|
||||
{
|
||||
"gpuCount": 1,
|
||||
"gpuTypes": ["H100", "H200"],
|
||||
"args": ["--enable-auto-tool-choice", "--tool-call-parser", "qwen3_coder"],
|
||||
"notes": "Fits comfortably on single GPU. ~60GB model weight."
|
||||
},
|
||||
{
|
||||
"gpuCount": 2,
|
||||
"gpuTypes": ["H100", "H200"],
|
||||
"args": [
|
||||
"--tensor-parallel-size",
|
||||
"2",
|
||||
"--enable-auto-tool-choice",
|
||||
"--tool-call-parser",
|
||||
"qwen3_coder"
|
||||
],
|
||||
"notes": "For higher throughput/longer context."
|
||||
}
|
||||
]
|
||||
},
|
||||
"Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8": {
|
||||
"name": "Qwen3-Coder-30B-FP8",
|
||||
"configs": [
|
||||
{
|
||||
"gpuCount": 1,
|
||||
"gpuTypes": ["H100", "H200"],
|
||||
"args": ["--enable-auto-tool-choice", "--tool-call-parser", "qwen3_coder"],
|
||||
"env": {
|
||||
"VLLM_USE_DEEP_GEMM": "1"
|
||||
},
|
||||
"notes": "FP8 quantized, ~30GB model weight. Excellent for single GPU deployment."
|
||||
}
|
||||
]
|
||||
},
|
||||
"Qwen/Qwen3-Coder-480B-A35B-Instruct": {
|
||||
"name": "Qwen3-Coder-480B",
|
||||
"configs": [
|
||||
{
|
||||
"gpuCount": 8,
|
||||
"gpuTypes": ["H200", "H20"],
|
||||
"args": [
|
||||
"--tensor-parallel-size",
|
||||
"8",
|
||||
"--max-model-len",
|
||||
"32000",
|
||||
"--enable-auto-tool-choice",
|
||||
"--tool-call-parser",
|
||||
"qwen3_coder"
|
||||
],
|
||||
"notes": "Cannot serve full 262K context on single node. Reduce max-model-len or increase gpu-memory-utilization."
|
||||
}
|
||||
]
|
||||
},
|
||||
"Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8": {
|
||||
"name": "Qwen3-Coder-480B-FP8",
|
||||
"configs": [
|
||||
{
|
||||
"gpuCount": 8,
|
||||
"gpuTypes": ["H200", "H20"],
|
||||
"args": [
|
||||
"--max-model-len",
|
||||
"131072",
|
||||
"--enable-expert-parallel",
|
||||
"--data-parallel-size",
|
||||
"8",
|
||||
"--enable-auto-tool-choice",
|
||||
"--tool-call-parser",
|
||||
"qwen3_coder"
|
||||
],
|
||||
"env": {
|
||||
"VLLM_USE_DEEP_GEMM": "1"
|
||||
},
|
||||
"notes": "Use data-parallel mode (not tensor-parallel) to avoid weight quantization errors."
|
||||
}
|
||||
]
|
||||
},
|
||||
"openai/gpt-oss-20b": {
|
||||
"name": "GPT-OSS-20B",
|
||||
"configs": [
|
||||
{
|
||||
"gpuCount": 1,
|
||||
"gpuTypes": ["H100", "H200"],
|
||||
"args": ["--async-scheduling"]
|
||||
},
|
||||
{
|
||||
"gpuCount": 1,
|
||||
"gpuTypes": ["B200"],
|
||||
"args": ["--async-scheduling"],
|
||||
"env": {
|
||||
"VLLM_USE_TRTLLM_ATTENTION": "1",
|
||||
"VLLM_USE_TRTLLM_DECODE_ATTENTION": "1",
|
||||
"VLLM_USE_TRTLLM_CONTEXT_ATTENTION": "1",
|
||||
"VLLM_USE_FLASHINFER_MXFP4_MOE": "1"
|
||||
}
|
||||
}
|
||||
],
|
||||
"notes": "Requires vLLM 0.10.1+gptoss. Tools/function calls only via /v1/responses endpoint."
|
||||
},
|
||||
"openai/gpt-oss-120b": {
|
||||
"name": "GPT-OSS-120B",
|
||||
"configs": [
|
||||
{
|
||||
"gpuCount": 1,
|
||||
"gpuTypes": ["H100", "H200"],
|
||||
"args": ["--async-scheduling", "--gpu-memory-utilization", "0.95", "--max-num-batched-tokens", "1024"],
|
||||
"notes": "Single GPU deployment. Requires vLLM 0.10.1+gptoss. Tools/function calls only via /v1/responses endpoint."
|
||||
},
|
||||
{
|
||||
"gpuCount": 2,
|
||||
"gpuTypes": ["H100", "H200"],
|
||||
"args": ["--tensor-parallel-size", "2", "--async-scheduling", "--gpu-memory-utilization", "0.94"],
|
||||
"notes": "Recommended for H100/H200. Requires vLLM 0.10.1+gptoss. Tools/function calls only via /v1/responses endpoint."
|
||||
},
|
||||
{
|
||||
"gpuCount": 4,
|
||||
"gpuTypes": ["H100", "H200"],
|
||||
"args": ["--tensor-parallel-size", "4", "--async-scheduling"],
|
||||
"notes": "Higher throughput. Requires vLLM 0.10.1+gptoss. Tools/function calls only via /v1/responses endpoint."
|
||||
},
|
||||
{
|
||||
"gpuCount": 8,
|
||||
"gpuTypes": ["H100", "H200"],
|
||||
"args": ["--tensor-parallel-size", "8", "--async-scheduling"],
|
||||
"notes": "Maximum throughput for evaluation workloads. Requires vLLM 0.10.1+gptoss. Tools/function calls only via /v1/responses endpoint."
|
||||
}
|
||||
]
|
||||
},
|
||||
"zai-org/GLM-4.5": {
|
||||
"name": "GLM-4.5",
|
||||
"configs": [
|
||||
{
|
||||
"gpuCount": 16,
|
||||
"gpuTypes": ["H100"],
|
||||
"args": [
|
||||
"--tensor-parallel-size",
|
||||
"16",
|
||||
"--tool-call-parser",
|
||||
"glm4_moe",
|
||||
"--reasoning-parser",
|
||||
"glm4_moe",
|
||||
"--enable-auto-tool-choice"
|
||||
]
|
||||
},
|
||||
{
|
||||
"gpuCount": 8,
|
||||
"gpuTypes": ["H200"],
|
||||
"args": [
|
||||
"--tensor-parallel-size",
|
||||
"8",
|
||||
"--tool-call-parser",
|
||||
"glm4_moe",
|
||||
"--reasoning-parser",
|
||||
"glm4_moe",
|
||||
"--enable-auto-tool-choice"
|
||||
]
|
||||
}
|
||||
],
|
||||
"notes": "Models default to thinking mode. For full 128K context, double the GPU count."
|
||||
},
|
||||
"zai-org/GLM-4.5-FP8": {
|
||||
"name": "GLM-4.5-FP8",
|
||||
"configs": [
|
||||
{
|
||||
"gpuCount": 8,
|
||||
"gpuTypes": ["H100"],
|
||||
"args": [
|
||||
"--tensor-parallel-size",
|
||||
"8",
|
||||
"--tool-call-parser",
|
||||
"glm4_moe",
|
||||
"--reasoning-parser",
|
||||
"glm4_moe",
|
||||
"--enable-auto-tool-choice"
|
||||
]
|
||||
},
|
||||
{
|
||||
"gpuCount": 4,
|
||||
"gpuTypes": ["H200"],
|
||||
"args": [
|
||||
"--tensor-parallel-size",
|
||||
"4",
|
||||
"--tool-call-parser",
|
||||
"glm4_moe",
|
||||
"--reasoning-parser",
|
||||
"glm4_moe",
|
||||
"--enable-auto-tool-choice"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"zai-org/GLM-4.5-Air-FP8": {
|
||||
"name": "GLM-4.5-Air-FP8",
|
||||
"configs": [
|
||||
{
|
||||
"gpuCount": 2,
|
||||
"gpuTypes": ["H100"],
|
||||
"args": [
|
||||
"--tensor-parallel-size",
|
||||
"2",
|
||||
"--tool-call-parser",
|
||||
"glm4_moe",
|
||||
"--reasoning-parser",
|
||||
"glm4_moe",
|
||||
"--enable-auto-tool-choice",
|
||||
"--quantization",
|
||||
"fp8"
|
||||
],
|
||||
"env": {
|
||||
"VLLM_ATTENTION_BACKEND": "XFORMERS"
|
||||
},
|
||||
"notes": "FP8 model requires vLLM with proper FP8 support or MTP module"
|
||||
},
|
||||
{
|
||||
"gpuCount": 1,
|
||||
"gpuTypes": ["H200"],
|
||||
"args": [
|
||||
"--tool-call-parser",
|
||||
"glm4_moe",
|
||||
"--reasoning-parser",
|
||||
"glm4_moe",
|
||||
"--enable-auto-tool-choice",
|
||||
"--quantization",
|
||||
"fp8"
|
||||
],
|
||||
"env": {
|
||||
"VLLM_ATTENTION_BACKEND": "XFORMERS"
|
||||
},
|
||||
"notes": "FP8 model requires vLLM with proper FP8 support or MTP module"
|
||||
}
|
||||
]
|
||||
},
|
||||
"zai-org/GLM-4.5-Air": {
|
||||
"name": "GLM-4.5-Air",
|
||||
"configs": [
|
||||
{
|
||||
"gpuCount": 2,
|
||||
"gpuTypes": ["H100", "H200"],
|
||||
"args": [
|
||||
"--tensor-parallel-size",
|
||||
"2",
|
||||
"--tool-call-parser",
|
||||
"glm4_moe",
|
||||
"--reasoning-parser",
|
||||
"glm4_moe",
|
||||
"--enable-auto-tool-choice"
|
||||
],
|
||||
"notes": "Non-quantized BF16 version, more compatible"
|
||||
},
|
||||
{
|
||||
"gpuCount": 1,
|
||||
"gpuTypes": ["H200"],
|
||||
"args": [
|
||||
"--tool-call-parser",
|
||||
"glm4_moe",
|
||||
"--reasoning-parser",
|
||||
"glm4_moe",
|
||||
"--enable-auto-tool-choice",
|
||||
"--gpu-memory-utilization",
|
||||
"0.95"
|
||||
],
|
||||
"notes": "Single H200 can fit the BF16 model with high memory utilization"
|
||||
}
|
||||
]
|
||||
},
|
||||
"moonshotai/Kimi-K2-Instruct": {
|
||||
"name": "Kimi-K2",
|
||||
"configs": [
|
||||
{
|
||||
"gpuCount": 16,
|
||||
"gpuTypes": ["H200", "H20"],
|
||||
"args": [
|
||||
"--tensor-parallel-size",
|
||||
"16",
|
||||
"--trust-remote-code",
|
||||
"--enable-auto-tool-choice",
|
||||
"--tool-call-parser",
|
||||
"kimi_k2"
|
||||
],
|
||||
"notes": "Pure TP mode. For >16 GPUs, combine with pipeline-parallelism."
|
||||
}
|
||||
],
|
||||
"notes": "Requires vLLM v0.10.0rc1+. Minimum 16 GPUs for FP8 with 128k context."
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue