This repository was archived by the owner on Apr 20, 2026. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 37
Expand file tree
/
Copy pathsrtslurm.yaml.example
More file actions
66 lines (61 loc) · 2.57 KB
/
srtslurm.yaml.example
File metadata and controls
66 lines (61 loc) · 2.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
---
# Example cluster configuration (srtslurm.yaml)
# Copy this to srtslurm.yaml and customize for your cluster
# This file is gitignored to keep cluster-specific settings private
#
# Discovery: srtctl searches cwd and up to 2 parent directories.
# For deep directory structures, set SRTSLURM_CONFIG in your shell:
#   export SRTSLURM_CONFIG="/path/to/srt-slurm/srtslurm.yaml"

# SLURM defaults
# (quoted so "04:00:00" stays a string — bare digit:colon values hit the
# YAML 1.1 sexagesimal-integer trap on some parsers)
default_account: "your-gpu-account"
default_partition: "gpu"
default_time_limit: "04:00:00"

# SLURM directive compatibility
# Set to false if your cluster doesn't support --gpus-per-node
use_gpus_per_node_directive: true  # Default: true
# Set to false if your cluster doesn't support --segment for segment-based scheduling
use_segment_sbatch_directive: true  # Default: true
# Set to true if your cluster requires --exclusive to be set.
use_exclusive_sbatch_directive: false  # Default: false

# Container registry
# Aliases map short names to squashfs image paths; default_container is used
# when a job config names no container.
default_container: "/shared/containers/sglang-latest.sqsh"
containers:
  sglang-latest: "/shared/containers/sglang-v0.4.sqsh"
  sglang-dev: "/shared/containers/sglang-dev.sqsh"
  sglang-fp4: "/shared/containers/sglang-fp4.sqsh"

# Model path aliases
model_paths:
  deepseek-r1: "/shared/models/deepseek/DeepSeek-R1"
  deepseek-r1-distill: "/shared/models/deepseek/DeepSeek-R1-Distill-Qwen-32B"
  llama-3-70b: "/shared/models/meta/llama-3-70b"
  llama-3-405b: "/shared/models/meta/llama-3-405b"

# AI-powered failure analysis (optional)
# When enabled, Claude Code CLI analyzes benchmark failures and writes ai_analysis.md
# Uses OpenRouter for authentication (works well in headless/automated environments)
# See: https://openrouter.ai/docs/guides/claude-code-integration
# ai_analysis:
#   enabled: true
#
#   # Authentication (required - can also use env vars OPENROUTER_API_KEY / GH_TOKEN)
#   openrouter_api_key: "sk-or-v1-..."
#   gh_token: "ghp_..."
#
#   # GitHub repos to search for related PRs when analyzing failures
#   repos_to_search:
#     - sgl-project/sglang
#     - ai-dynamo/dynamo
#   pr_search_days: 14
#
#   # Custom prompt template (optional - uses sensible default if omitted)
#   # Available variables: {log_dir}, {repos}, {pr_days}
#   # prompt: |
#   #   You are analyzing benchmark failure logs for an LLM serving system.
#   #   ...

# Usage:
# In your job config, you can now use:
#   model:
#     path: "deepseek-r1"      # Resolves to /shared/models/deepseek/DeepSeek-R1
#     container: "sglang-fp4"  # Resolves to /shared/containers/sglang-fp4.sqsh
#
# And you can omit SLURM fields:
#   slurm:
#     time_limit: "02:00:00"   # account and partition filled from defaults