Initial monorepo setup with npm workspaces and dual TypeScript configuration

- Set up npm workspaces for three packages: pi-tui, pi-agent, and pi (pods)
- Implemented dual TypeScript configuration:
  - Root tsconfig.json with path mappings for development and type checking
  - Package-specific tsconfig.build.json for clean production builds
- Configured lockstep versioning with a sync script for inter-package dependencies (see the sketch after this list)
- Added comprehensive documentation for development and publishing workflows
- All packages at version 0.5.0, ready for npm publishing
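
A minimal sketch of what the version sync script might look like, assuming it runs from the repo root and that the root package.json lists the three packages under a workspaces array; the file name and layout are assumptions, not taken from this commit:

// sync-versions.ts (hypothetical name): lockstep version sync sketch.
// Reads the root package.json version and rewrites every workspace package
// to match, including version ranges of inter-package dependencies.
import { readFileSync, writeFileSync } from "node:fs";
import { join } from "node:path";

const root = JSON.parse(readFileSync("package.json", "utf8"));
const version: string = root.version;         // single source of truth, e.g. "0.5.0"
const workspaces: string[] = root.workspaces; // e.g. ["packages/pi-tui", "packages/pi-agent", "packages/pi"]

// Collect workspace package names so inter-package deps can be recognized.
const names = new Set(
  workspaces.map((dir) => JSON.parse(readFileSync(join(dir, "package.json"), "utf8")).name as string)
);

for (const dir of workspaces) {
  const pkgPath = join(dir, "package.json");
  const pkg = JSON.parse(readFileSync(pkgPath, "utf8"));
  pkg.version = version;
  // Pin inter-package dependencies to the same lockstep version.
  for (const field of ["dependencies", "devDependencies"] as const) {
    for (const dep of Object.keys(pkg[field] ?? {})) {
      if (names.has(dep)) pkg[field][dep] = version;
    }
  }
  writeFileSync(pkgPath, JSON.stringify(pkg, null, 2) + "\n");
}
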
Commit a74c5da112 by Mario Zechner, 2025-08-09 17:18:38 +02:00
63 changed files with 14558 additions and 0 deletions

New file (305 lines): vLLM model deployment configuration

{
"models": {
"Qwen/Qwen2.5-Coder-32B-Instruct": {
"name": "Qwen2.5-Coder-32B",
"configs": [
{
"gpuCount": 1,
"gpuTypes": ["H100", "H200"],
"args": ["--tool-call-parser", "hermes", "--enable-auto-tool-choice"]
},
{
"gpuCount": 2,
"gpuTypes": ["H100", "H200"],
"args": ["--tensor-parallel-size", "2", "--tool-call-parser", "hermes", "--enable-auto-tool-choice"]
}
]
},
"Qwen/Qwen3-Coder-30B-A3B-Instruct": {
"name": "Qwen3-Coder-30B",
"configs": [
{
"gpuCount": 1,
"gpuTypes": ["H100", "H200"],
"args": ["--enable-auto-tool-choice", "--tool-call-parser", "qwen3_coder"],
"notes": "Fits comfortably on single GPU. ~60GB model weight."
},
{
"gpuCount": 2,
"gpuTypes": ["H100", "H200"],
"args": [
"--tensor-parallel-size",
"2",
"--enable-auto-tool-choice",
"--tool-call-parser",
"qwen3_coder"
],
"notes": "For higher throughput/longer context."
}
]
},
"Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8": {
"name": "Qwen3-Coder-30B-FP8",
"configs": [
{
"gpuCount": 1,
"gpuTypes": ["H100", "H200"],
"args": ["--enable-auto-tool-choice", "--tool-call-parser", "qwen3_coder"],
"env": {
"VLLM_USE_DEEP_GEMM": "1"
},
"notes": "FP8 quantized, ~30GB model weight. Excellent for single GPU deployment."
}
]
},
"Qwen/Qwen3-Coder-480B-A35B-Instruct": {
"name": "Qwen3-Coder-480B",
"configs": [
{
"gpuCount": 8,
"gpuTypes": ["H200", "H20"],
"args": [
"--tensor-parallel-size",
"8",
"--max-model-len",
"32000",
"--enable-auto-tool-choice",
"--tool-call-parser",
"qwen3_coder"
],
"notes": "Cannot serve full 262K context on single node. Reduce max-model-len or increase gpu-memory-utilization."
}
]
},
"Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8": {
"name": "Qwen3-Coder-480B-FP8",
"configs": [
{
"gpuCount": 8,
"gpuTypes": ["H200", "H20"],
"args": [
"--max-model-len",
"131072",
"--enable-expert-parallel",
"--data-parallel-size",
"8",
"--enable-auto-tool-choice",
"--tool-call-parser",
"qwen3_coder"
],
"env": {
"VLLM_USE_DEEP_GEMM": "1"
},
"notes": "Use data-parallel mode (not tensor-parallel) to avoid weight quantization errors."
}
]
},
"openai/gpt-oss-20b": {
"name": "GPT-OSS-20B",
"configs": [
{
"gpuCount": 1,
"gpuTypes": ["H100", "H200"],
"args": ["--async-scheduling"]
},
{
"gpuCount": 1,
"gpuTypes": ["B200"],
"args": ["--async-scheduling"],
"env": {
"VLLM_USE_TRTLLM_ATTENTION": "1",
"VLLM_USE_TRTLLM_DECODE_ATTENTION": "1",
"VLLM_USE_TRTLLM_CONTEXT_ATTENTION": "1",
"VLLM_USE_FLASHINFER_MXFP4_MOE": "1"
}
}
],
"notes": "Requires vLLM 0.10.1+gptoss. Tools/functoin calls only via /v1/responses endpoint."
},
"openai/gpt-oss-120b": {
"name": "GPT-OSS-120B",
"configs": [
{
"gpuCount": 1,
"gpuTypes": ["H100", "H200"],
"args": ["--async-scheduling", "--gpu-memory-utilization", "0.95", "--max-num-batched-tokens", "1024"],
"notes": "Single GPU deployment. Requires vLLM 0.10.1+gptoss. Tools/function calls only via /v1/responses endpoint."
},
{
"gpuCount": 2,
"gpuTypes": ["H100", "H200"],
"args": ["--tensor-parallel-size", "2", "--async-scheduling", "--gpu-memory-utilization", "0.94"],
"notes": "Recommended for H100/H200. Requires vLLM 0.10.1+gptoss. Tools/function calls only via /v1/responses endpoint."
},
{
"gpuCount": 4,
"gpuTypes": ["H100", "H200"],
"args": ["--tensor-parallel-size", "4", "--async-scheduling"],
"notes": "Higher throughput. Requires vLLM 0.10.1+gptoss. Tools/function calls only via /v1/responses endpoint."
},
{
"gpuCount": 8,
"gpuTypes": ["H100", "H200"],
"args": ["--tensor-parallel-size", "8", "--async-scheduling"],
"notes": "Maximum throughput for evaluation workloads. Requires vLLM 0.10.1+gptoss. Tools/function calls only via /v1/responses endpoint."
}
]
},
"zai-org/GLM-4.5": {
"name": "GLM-4.5",
"configs": [
{
"gpuCount": 16,
"gpuTypes": ["H100"],
"args": [
"--tensor-parallel-size",
"16",
"--tool-call-parser",
"glm4_moe",
"--reasoning-parser",
"glm4_moe",
"--enable-auto-tool-choice"
]
},
{
"gpuCount": 8,
"gpuTypes": ["H200"],
"args": [
"--tensor-parallel-size",
"8",
"--tool-call-parser",
"glm4_moe",
"--reasoning-parser",
"glm4_moe",
"--enable-auto-tool-choice"
]
}
],
"notes": "Models default to thinking mode. For full 128K context, double the GPU count."
},
"zai-org/GLM-4.5-FP8": {
"name": "GLM-4.5-FP8",
"configs": [
{
"gpuCount": 8,
"gpuTypes": ["H100"],
"args": [
"--tensor-parallel-size",
"8",
"--tool-call-parser",
"glm4_moe",
"--reasoning-parser",
"glm4_moe",
"--enable-auto-tool-choice"
]
},
{
"gpuCount": 4,
"gpuTypes": ["H200"],
"args": [
"--tensor-parallel-size",
"4",
"--tool-call-parser",
"glm4_moe",
"--reasoning-parser",
"glm4_moe",
"--enable-auto-tool-choice"
]
}
]
},
"zai-org/GLM-4.5-Air-FP8": {
"name": "GLM-4.5-Air-FP8",
"configs": [
{
"gpuCount": 2,
"gpuTypes": ["H100"],
"args": [
"--tensor-parallel-size",
"2",
"--tool-call-parser",
"glm4_moe",
"--reasoning-parser",
"glm4_moe",
"--enable-auto-tool-choice",
"--quantization",
"fp8"
],
"env": {
"VLLM_ATTENTION_BACKEND": "XFORMERS"
},
"notes": "FP8 model requires vLLM with proper FP8 support or MTP module"
},
{
"gpuCount": 1,
"gpuTypes": ["H200"],
"args": [
"--tool-call-parser",
"glm4_moe",
"--reasoning-parser",
"glm4_moe",
"--enable-auto-tool-choice",
"--quantization",
"fp8"
],
"env": {
"VLLM_ATTENTION_BACKEND": "XFORMERS"
},
"notes": "FP8 model requires vLLM with proper FP8 support or MTP module"
}
]
},
"zai-org/GLM-4.5-Air": {
"name": "GLM-4.5-Air",
"configs": [
{
"gpuCount": 2,
"gpuTypes": ["H100", "H200"],
"args": [
"--tensor-parallel-size",
"2",
"--tool-call-parser",
"glm4_moe",
"--reasoning-parser",
"glm4_moe",
"--enable-auto-tool-choice"
],
"notes": "Non-quantized BF16 version, more compatible"
},
{
"gpuCount": 1,
"gpuTypes": ["H200"],
"args": [
"--tool-call-parser",
"glm4_moe",
"--reasoning-parser",
"glm4_moe",
"--enable-auto-tool-choice",
"--gpu-memory-utilization",
"0.95"
],
"notes": "Single H200 can fit the BF16 model with high memory utilization"
}
]
},
"moonshotai/Kimi-K2-Instruct": {
"name": "Kimi-K2",
"configs": [
{
"gpuCount": 16,
"gpuTypes": ["H200", "H20"],
"args": [
"--tensor-parallel-size",
"16",
"--trust-remote-code",
"--enable-auto-tool-choice",
"--tool-call-parser",
"kimi_k2"
],
"notes": "Pure TP mode. For >16 GPUs, combine with pipeline-parallelism."
}
],
"notes": "Requires vLLM v0.10.0rc1+. Minimum 16 GPUs for FP8 with 128k context."
}
}
}
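
To show how a deployment tool might consume an entry from this file, here is a hedged TypeScript sketch that assembles a vllm serve command line from a config; the interfaces mirror the JSON shape above, but buildLaunch and its output format are illustrative assumptions, not code from this commit:

// Hypothetical consumer of the model config above.
interface GpuConfig {
  gpuCount: number;
  gpuTypes: string[];
  args: string[];
  env?: Record<string, string>;
  notes?: string;
}

interface ModelEntry {
  name: string;
  configs: GpuConfig[];
  notes?: string;
}

// Pick the config matching the available hardware, then prepend env vars
// to a `vllm serve` invocation.
function buildLaunch(modelId: string, entry: ModelEntry, gpuType: string, gpuCount: number): string {
  const cfg = entry.configs.find((c) => c.gpuCount === gpuCount && c.gpuTypes.includes(gpuType));
  if (!cfg) throw new Error(`no config for ${entry.name} on ${gpuCount}x ${gpuType}`);
  const env = Object.entries(cfg.env ?? {}).map(([k, v]) => `${k}=${v}`);
  return [...env, "vllm", "serve", modelId, ...cfg.args].join(" ");
}

// For example, the single-GPU FP8 Qwen3-Coder entry on an H100 yields:
// VLLM_USE_DEEP_GEMM=1 vllm serve Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8
//   --enable-auto-tool-choice --tool-call-parser qwen3_coder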