Skip to content

Commit 3a8428e

Browse files
authored
[router] Expose worker startup interval (#3019)
1 parent 0311ce8 commit 3a8428e

File tree

5 files changed: 72 additions (+72) and 13 deletions (-13)

sgl-router/py_src/sglang_router/launch_router.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ class RouterArgs:
3434
# Routing policy
3535
policy: str = "cache_aware"
3636
worker_startup_timeout_secs: int = 300
37+
worker_startup_check_interval: int = 10
3738
cache_threshold: float = 0.5
3839
balance_abs_threshold: int = 32
3940
balance_rel_threshold: float = 1.0001
@@ -94,6 +95,12 @@ def add_cli_args(
9495
default=RouterArgs.worker_startup_timeout_secs,
9596
help="Timeout in seconds for worker startup",
9697
)
98+
parser.add_argument(
99+
f"--{prefix}worker-startup-check-interval",
100+
type=int,
101+
default=RouterArgs.worker_startup_check_interval,
102+
help="Interval in seconds between checks for worker startup",
103+
)
97104
parser.add_argument(
98105
f"--{prefix}cache-threshold",
99106
type=float,
@@ -157,6 +164,9 @@ def from_cli_args(
157164
worker_startup_timeout_secs=getattr(
158165
args, f"{prefix}worker_startup_timeout_secs"
159166
),
167+
worker_startup_check_interval=getattr(
168+
args, f"{prefix}worker_startup_check_interval"
169+
),
160170
cache_threshold=getattr(args, f"{prefix}cache_threshold"),
161171
balance_abs_threshold=getattr(args, f"{prefix}balance_abs_threshold"),
162172
balance_rel_threshold=getattr(args, f"{prefix}balance_rel_threshold"),
@@ -202,6 +212,7 @@ def launch_router(args: argparse.Namespace) -> Optional[Router]:
202212
port=router_args.port,
203213
policy=policy_from_str(router_args.policy),
204214
worker_startup_timeout_secs=router_args.worker_startup_timeout_secs,
215+
worker_startup_check_interval=router_args.worker_startup_check_interval,
205216
cache_threshold=router_args.cache_threshold,
206217
balance_abs_threshold=router_args.balance_abs_threshold,
207218
balance_rel_threshold=router_args.balance_rel_threshold,

sgl-router/py_src/sglang_router/router.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ class Router:
1818
host: Host address to bind the router server. Default: '127.0.0.1'
1919
port: Port number to bind the router server. Default: 3001
2020
worker_startup_timeout_secs: Timeout in seconds for worker startup. Default: 300
21+
worker_startup_check_interval: Interval in seconds between checks for worker startup. Default: 10
2122
cache_threshold: Cache threshold (0.0-1.0) for cache-aware routing. Routes to cached worker
2223
if the match rate exceeds threshold, otherwise routes to the worker with the smallest
2324
tree. Default: 0.5
@@ -39,6 +40,7 @@ def __init__(
3940
host: str = "127.0.0.1",
4041
port: int = 3001,
4142
worker_startup_timeout_secs: int = 300,
43+
worker_startup_check_interval: int = 10,
4244
cache_threshold: float = 0.50,
4345
balance_abs_threshold: int = 32,
4446
balance_rel_threshold: float = 1.0001,
@@ -53,6 +55,7 @@ def __init__(
5355
host=host,
5456
port=port,
5557
worker_startup_timeout_secs=worker_startup_timeout_secs,
58+
worker_startup_check_interval=worker_startup_check_interval,
5659
cache_threshold=cache_threshold,
5760
balance_abs_threshold=balance_abs_threshold,
5861
balance_rel_threshold=balance_rel_threshold,

sgl-router/py_test/test_launch_router.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ def setUp(self):
2929
port=30000,
3030
policy="cache_aware",
3131
worker_startup_timeout_secs=600,
32+
worker_startup_check_interval=10,
3233
cache_threshold=0.5,
3334
balance_abs_threshold=32,
3435
balance_rel_threshold=1.0001,

sgl-router/src/lib.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ struct Router {
1818
worker_urls: Vec<String>,
1919
policy: PolicyType,
2020
worker_startup_timeout_secs: u64,
21+
worker_startup_check_interval: u64,
2122
cache_threshold: f32,
2223
balance_abs_threshold: usize,
2324
balance_rel_threshold: f32,
@@ -36,6 +37,7 @@ impl Router {
3637
host = String::from("127.0.0.1"),
3738
port = 3001,
3839
worker_startup_timeout_secs = 300,
40+
worker_startup_check_interval = 10,
3941
cache_threshold = 0.50,
4042
balance_abs_threshold = 32,
4143
balance_rel_threshold = 1.0001,
@@ -50,6 +52,7 @@ impl Router {
5052
host: String,
5153
port: u16,
5254
worker_startup_timeout_secs: u64,
55+
worker_startup_check_interval: u64,
5356
cache_threshold: f32,
5457
balance_abs_threshold: usize,
5558
balance_rel_threshold: f32,
@@ -64,6 +67,7 @@ impl Router {
6467
worker_urls,
6568
policy,
6669
worker_startup_timeout_secs,
70+
worker_startup_check_interval,
6771
cache_threshold,
6872
balance_abs_threshold,
6973
balance_rel_threshold,
@@ -78,12 +82,15 @@ impl Router {
7882
let policy_config = match &self.policy {
7983
PolicyType::Random => router::PolicyConfig::RandomConfig {
8084
timeout_secs: self.worker_startup_timeout_secs,
85+
interval_secs: self.worker_startup_check_interval,
8186
},
8287
PolicyType::RoundRobin => router::PolicyConfig::RoundRobinConfig {
8388
timeout_secs: self.worker_startup_timeout_secs,
89+
interval_secs: self.worker_startup_check_interval,
8490
},
8591
PolicyType::CacheAware => router::PolicyConfig::CacheAwareConfig {
8692
timeout_secs: self.worker_startup_timeout_secs,
93+
interval_secs: self.worker_startup_check_interval,
8794
cache_threshold: self.cache_threshold,
8895
balance_abs_threshold: self.balance_abs_threshold,
8996
balance_rel_threshold: self.balance_rel_threshold,

sgl-router/src/router.rs

Lines changed: 50 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,12 @@ pub enum Router {
1818
worker_urls: Arc<RwLock<Vec<String>>>,
1919
current_index: AtomicUsize,
2020
timeout_secs: u64,
21+
interval_secs: u64,
2122
},
2223
Random {
2324
worker_urls: Arc<RwLock<Vec<String>>>,
2425
timeout_secs: u64,
26+
interval_secs: u64,
2527
},
2628
CacheAware {
2729
/*
@@ -92,6 +94,7 @@ pub enum Router {
9294
balance_abs_threshold: usize,
9395
balance_rel_threshold: f32,
9496
timeout_secs: u64,
97+
interval_secs: u64,
9598
_eviction_thread: Option<thread::JoinHandle<()>>,
9699
},
97100
}
@@ -100,9 +103,11 @@ pub enum Router {
100103
pub enum PolicyConfig {
101104
RandomConfig {
102105
timeout_secs: u64,
106+
interval_secs: u64,
103107
},
104108
RoundRobinConfig {
105109
timeout_secs: u64,
110+
interval_secs: u64,
106111
},
107112
CacheAwareConfig {
108113
cache_threshold: f32,
@@ -111,31 +116,50 @@ pub enum PolicyConfig {
111116
eviction_interval_secs: u64,
112117
max_tree_size: usize,
113118
timeout_secs: u64,
119+
interval_secs: u64,
114120
},
115121
}
116122

117123
impl Router {
118124
pub fn new(worker_urls: Vec<String>, policy_config: PolicyConfig) -> Result<Self, String> {
119-
// Get timeout from policy config
120-
let timeout_secs = match &policy_config {
121-
PolicyConfig::RandomConfig { timeout_secs } => *timeout_secs,
122-
PolicyConfig::RoundRobinConfig { timeout_secs } => *timeout_secs,
123-
PolicyConfig::CacheAwareConfig { timeout_secs, .. } => *timeout_secs,
125+
// Get timeout and interval from policy config
126+
let (timeout_secs, interval_secs) = match &policy_config {
127+
PolicyConfig::RandomConfig {
128+
timeout_secs,
129+
interval_secs,
130+
} => (*timeout_secs, *interval_secs),
131+
PolicyConfig::RoundRobinConfig {
132+
timeout_secs,
133+
interval_secs,
134+
} => (*timeout_secs, *interval_secs),
135+
PolicyConfig::CacheAwareConfig {
136+
timeout_secs,
137+
interval_secs,
138+
..
139+
} => (*timeout_secs, *interval_secs),
124140
};
125141

126142
// Wait until all workers are healthy
127-
Self::wait_for_healthy_workers(&worker_urls, timeout_secs, 10)?;
143+
Self::wait_for_healthy_workers(&worker_urls, timeout_secs, interval_secs)?;
128144

129145
// Create router based on policy...
130146
Ok(match policy_config {
131-
PolicyConfig::RandomConfig { timeout_secs } => Router::Random {
147+
PolicyConfig::RandomConfig {
148+
timeout_secs,
149+
interval_secs,
150+
} => Router::Random {
132151
worker_urls: Arc::new(RwLock::new(worker_urls)),
133152
timeout_secs,
153+
interval_secs,
134154
},
135-
PolicyConfig::RoundRobinConfig { timeout_secs } => Router::RoundRobin {
155+
PolicyConfig::RoundRobinConfig {
156+
timeout_secs,
157+
interval_secs,
158+
} => Router::RoundRobin {
136159
worker_urls: Arc::new(RwLock::new(worker_urls)),
137160
current_index: std::sync::atomic::AtomicUsize::new(0),
138161
timeout_secs,
162+
interval_secs,
139163
},
140164
PolicyConfig::CacheAwareConfig {
141165
cache_threshold,
@@ -144,6 +168,7 @@ impl Router {
144168
eviction_interval_secs,
145169
max_tree_size,
146170
timeout_secs,
171+
interval_secs,
147172
} => {
148173
let mut running_queue = HashMap::new();
149174
for url in &worker_urls {
@@ -195,6 +220,7 @@ impl Router {
195220
balance_abs_threshold,
196221
balance_rel_threshold,
197222
timeout_secs,
223+
interval_secs,
198224
_eviction_thread: Some(eviction_thread),
199225
}
200226
}
@@ -594,11 +620,22 @@ impl Router {
594620
}
595621

596622
pub async fn add_worker(&self, worker_url: &str) -> Result<String, String> {
597-
let interval_secs = 10; // check every 10 seconds
598-
let timeout_secs = match self {
599-
Router::Random { timeout_secs, .. } => *timeout_secs,
600-
Router::RoundRobin { timeout_secs, .. } => *timeout_secs,
601-
Router::CacheAware { timeout_secs, .. } => *timeout_secs,
623+
let (timeout_secs, interval_secs) = match self {
624+
Router::Random {
625+
timeout_secs,
626+
interval_secs,
627+
..
628+
} => (*timeout_secs, *interval_secs),
629+
Router::RoundRobin {
630+
timeout_secs,
631+
interval_secs,
632+
..
633+
} => (*timeout_secs, *interval_secs),
634+
Router::CacheAware {
635+
timeout_secs,
636+
interval_secs,
637+
..
638+
} => (*timeout_secs, *interval_secs),
602639
};
603640

604641
let start_time = std::time::Instant::now();

0 commit comments

Comments
 (0)