Initial monorepo setup with npm workspaces and dual TypeScript configuration

- Set up npm workspaces for three packages: pi-tui, pi-agent, and pi (pods)
- Implemented dual TypeScript configuration:
  - Root tsconfig.json with path mappings for development and type checking
  - Package-specific tsconfig.build.json for clean production builds
- Configured lockstep versioning with a sync script for inter-package dependencies (see the sketch after this list)
- Added comprehensive documentation for development and publishing workflows
- All packages at version 0.5.0, ready for npm publishing
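
A minimal sketch of what the version sync script might look like, assuming it runs from the repo root and that the root package.json lists the three packages under a workspaces array; the file name and layout are assumptions, not taken from this commit:

// sync-versions.ts (hypothetical name): lockstep version sync sketch.
// Reads the root package.json version and rewrites every workspace package
// to match, including version ranges of inter-package dependencies.
import { readFileSync, writeFileSync } from "node:fs";
import { join } from "node:path";

const root = JSON.parse(readFileSync("package.json", "utf8"));
const version: string = root.version;         // single source of truth, e.g. "0.5.0"
const workspaces: string[] = root.workspaces; // e.g. ["packages/pi-tui", "packages/pi-agent", "packages/pi"]

// Collect workspace package names so inter-package deps can be recognized.
const names = new Set(
  workspaces.map((dir) => JSON.parse(readFileSync(join(dir, "package.json"), "utf8")).name as string)
);

for (const dir of workspaces) {
  const pkgPath = join(dir, "package.json");
  const pkg = JSON.parse(readFileSync(pkgPath, "utf8"));
  pkg.version = version;
  // Pin inter-package dependencies to the same lockstep version.
  for (const field of ["dependencies", "devDependencies"] as const) {
    for (const dep of Object.keys(pkg[field] ?? {})) {
      if (names.has(dep)) pkg[field][dep] = version;
    }
  }
  writeFileSync(pkgPath, JSON.stringify(pkg, null, 2) + "\n");
}
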
Commit a74c5da112 by Mario Zechner, 2025-08-09 17:18:38 +02:00
63 changed files with 14558 additions and 0 deletions

New file (305 lines): vLLM model deployment configuration

{
"models": {
"Qwen/Qwen2.5-Coder-32B-Instruct": {
"name": "Qwen2.5-Coder-32B",
"configs": [
{
"gpuCount": 1,
"gpuTypes": ["H100", "H200"],
"args": ["--tool-call-parser", "hermes", "--enable-auto-tool-choice"]
},
{
"gpuCount": 2,
"gpuTypes": ["H100", "H200"],
"args": ["--tensor-parallel-size", "2", "--tool-call-parser", "hermes", "--enable-auto-tool-choice"]
}
]
},
"Qwen/Qwen3-Coder-30B-A3B-Instruct": {
"name": "Qwen3-Coder-30B",
"configs": [
{
"gpuCount": 1,
"gpuTypes": ["H100", "H200"],
"args": ["--enable-auto-tool-choice", "--tool-call-parser", "qwen3_coder"],
"notes": "Fits comfortably on single GPU. ~60GB model weight."
},
{
"gpuCount": 2,
"gpuTypes": ["H100", "H200"],
"args": [
"--tensor-parallel-size",
"2",
"--enable-auto-tool-choice",
"--tool-call-parser",
"qwen3_coder"
],
"notes": "For higher throughput/longer context."
}
]
},
"Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8": {
"name": "Qwen3-Coder-30B-FP8",
"configs": [
{
"gpuCount": 1,
"gpuTypes": ["H100", "H200"],
"args": ["--enable-auto-tool-choice", "--tool-call-parser", "qwen3_coder"],
"env": {
"VLLM_USE_DEEP_GEMM": "1"
},
"notes": "FP8 quantized, ~30GB model weight. Excellent for single GPU deployment."
}
]
},
"Qwen/Qwen3-Coder-480B-A35B-Instruct": {
"name": "Qwen3-Coder-480B",
"configs": [
{
"gpuCount": 8,
"gpuTypes": ["H200", "H20"],
"args": [
"--tensor-parallel-size",
"8",
"--max-model-len",
"32000",
"--enable-auto-tool-choice",
"--tool-call-parser",
"qwen3_coder"
],
"notes": "Cannot serve full 262K context on single node. Reduce max-model-len or increase gpu-memory-utilization."
}
]
},
"Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8": {
"name": "Qwen3-Coder-480B-FP8",
"configs": [
{
"gpuCount": 8,
"gpuTypes": ["H200", "H20"],
"args": [
"--max-model-len",
"131072",
"--enable-expert-parallel",
"--data-parallel-size",
"8",
"--enable-auto-tool-choice",
"--tool-call-parser",
"qwen3_coder"
],
"env": {
"VLLM_USE_DEEP_GEMM": "1"
},
"notes": "Use data-parallel mode (not tensor-parallel) to avoid weight quantization errors."
}
]
},
"openai/gpt-oss-20b": {
"name": "GPT-OSS-20B",
"configs": [
{
"gpuCount": 1,
"gpuTypes": ["H100", "H200"],
"args": ["--async-scheduling"]
},
{
"gpuCount": 1,
"gpuTypes": ["B200"],
"args": ["--async-scheduling"],
"env": {
"VLLM_USE_TRTLLM_ATTENTION": "1",
"VLLM_USE_TRTLLM_DECODE_ATTENTION": "1",
"VLLM_USE_TRTLLM_CONTEXT_ATTENTION": "1",
"VLLM_USE_FLASHINFER_MXFP4_MOE": "1"
}
}
],
"notes": "Requires vLLM 0.10.1+gptoss. Tools/functoin calls only via /v1/responses endpoint."
},
"openai/gpt-oss-120b": {
"name": "GPT-OSS-120B",
"configs": [
{
"gpuCount": 1,
"gpuTypes": ["H100", "H200"],
"args": ["--async-scheduling", "--gpu-memory-utilization", "0.95", "--max-num-batched-tokens", "1024"],
"notes": "Single GPU deployment. Requires vLLM 0.10.1+gptoss. Tools/function calls only via /v1/responses endpoint."
},
{
"gpuCount": 2,
"gpuTypes": ["H100", "H200"],
"args": ["--tensor-parallel-size", "2", "--async-scheduling", "--gpu-memory-utilization", "0.94"],
"notes": "Recommended for H100/H200. Requires vLLM 0.10.1+gptoss. Tools/function calls only via /v1/responses endpoint."
},
{
"gpuCount": 4,
"gpuTypes": ["H100", "H200"],
"args": ["--tensor-parallel-size", "4", "--async-scheduling"],
"notes": "Higher throughput. Requires vLLM 0.10.1+gptoss. Tools/function calls only via /v1/responses endpoint."
},
{
"gpuCount": 8,
"gpuTypes": ["H100", "H200"],
"args": ["--tensor-parallel-size", "8", "--async-scheduling"],
"notes": "Maximum throughput for evaluation workloads. Requires vLLM 0.10.1+gptoss. Tools/function calls only via /v1/responses endpoint."
}
]
},
"zai-org/GLM-4.5": {
"name": "GLM-4.5",
"configs": [
{
"gpuCount": 16,
"gpuTypes": ["H100"],
"args": [
"--tensor-parallel-size",
"16",
"--tool-call-parser",
"glm4_moe",
"--reasoning-parser",
"glm4_moe",
"--enable-auto-tool-choice"
]
},
{
"gpuCount": 8,
"gpuTypes": ["H200"],
"args": [
"--tensor-parallel-size",
"8",
"--tool-call-parser",
"glm4_moe",
"--reasoning-parser",
"glm4_moe",
"--enable-auto-tool-choice"
]
}
],
"notes": "Models default to thinking mode. For full 128K context, double the GPU count."
},
"zai-org/GLM-4.5-FP8": {
"name": "GLM-4.5-FP8",
"configs": [
{
"gpuCount": 8,
"gpuTypes": ["H100"],
"args": [
"--tensor-parallel-size",
"8",
"--tool-call-parser",
"glm4_moe",
"--reasoning-parser",
"glm4_moe",
"--enable-auto-tool-choice"
]
},
{
"gpuCount": 4,
"gpuTypes": ["H200"],
"args": [
"--tensor-parallel-size",
"4",
"--tool-call-parser",
"glm4_moe",
"--reasoning-parser",
"glm4_moe",
"--enable-auto-tool-choice"
]
}
]
},
"zai-org/GLM-4.5-Air-FP8": {
"name": "GLM-4.5-Air-FP8",
"configs": [
{
"gpuCount": 2,
"gpuTypes": ["H100"],
"args": [
"--tensor-parallel-size",
"2",
"--tool-call-parser",
"glm4_moe",
"--reasoning-parser",
"glm4_moe",
"--enable-auto-tool-choice",
"--quantization",
"fp8"
],
"env": {
"VLLM_ATTENTION_BACKEND": "XFORMERS"
},
"notes": "FP8 model requires vLLM with proper FP8 support or MTP module"
},
{
"gpuCount": 1,
"gpuTypes": ["H200"],
"args": [
"--tool-call-parser",
"glm4_moe",
"--reasoning-parser",
"glm4_moe",
"--enable-auto-tool-choice",
"--quantization",
"fp8"
],
"env": {
"VLLM_ATTENTION_BACKEND": "XFORMERS"
},
"notes": "FP8 model requires vLLM with proper FP8 support or MTP module"
}
]
},
"zai-org/GLM-4.5-Air": {
"name": "GLM-4.5-Air",
"configs": [
{
"gpuCount": 2,
"gpuTypes": ["H100", "H200"],
"args": [
"--tensor-parallel-size",
"2",
"--tool-call-parser",
"glm4_moe",
"--reasoning-parser",
"glm4_moe",
"--enable-auto-tool-choice"
],
"notes": "Non-quantized BF16 version, more compatible"
},
{
"gpuCount": 1,
"gpuTypes": ["H200"],
"args": [
"--tool-call-parser",
"glm4_moe",
"--reasoning-parser",
"glm4_moe",
"--enable-auto-tool-choice",
"--gpu-memory-utilization",
"0.95"
],
"notes": "Single H200 can fit the BF16 model with high memory utilization"
}
]
},
"moonshotai/Kimi-K2-Instruct": {
"name": "Kimi-K2",
"configs": [
{
"gpuCount": 16,
"gpuTypes": ["H200", "H20"],
"args": [
"--tensor-parallel-size",
"16",
"--trust-remote-code",
"--enable-auto-tool-choice",
"--tool-call-parser",
"kimi_k2"
],
"notes": "Pure TP mode. For >16 GPUs, combine with pipeline-parallelism."
}
],
"notes": "Requires vLLM v0.10.0rc1+. Minimum 16 GPUs for FP8 with 128k context."
}
}
}
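
To show how a deployment tool might consume an entry from this file, here is a hedged TypeScript sketch that assembles a vllm serve command line from a config; the interfaces mirror the JSON shape above, but buildLaunch and its output format are illustrative assumptions, not code from this commit:

// Hypothetical consumer of the model config above.
interface GpuConfig {
  gpuCount: number;
  gpuTypes: string[];
  args: string[];
  env?: Record<string, string>;
  notes?: string;
}

interface ModelEntry {
  name: string;
  configs: GpuConfig[];
  notes?: string;
}

// Pick the config matching the available hardware, then prepend env vars
// to a `vllm serve` invocation.
function buildLaunch(modelId: string, entry: ModelEntry, gpuType: string, gpuCount: number): string {
  const cfg = entry.configs.find((c) => c.gpuCount === gpuCount && c.gpuTypes.includes(gpuType));
  if (!cfg) throw new Error(`no config for ${entry.name} on ${gpuCount}x ${gpuType}`);
  const env = Object.entries(cfg.env ?? {}).map(([k, v]) => `${k}=${v}`);
  return [...env, "vllm", "serve", modelId, ...cfg.args].join(" ");
}

// For example, the single-GPU FP8 Qwen3-Coder entry on an H100 yields:
// VLLM_USE_DEEP_GEMM=1 vllm serve Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8
//   --enable-auto-tool-choice --tool-call-parser qwen3_coder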