Skip to content

Commit 0311ce8

Browse files
authored
[router] Expose worker startup secs & Return error instead of panic for router init (#3016)
1 parent 5dfcacf commit 0311ce8

File tree

7 files changed

+124
-47
lines changed

7 files changed

+124
-47
lines changed

sgl-router/py_src/sglang_router/launch_router.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ class RouterArgs:
3333

3434
# Routing policy
3535
policy: str = "cache_aware"
36+
worker_startup_timeout_secs: int = 300
3637
cache_threshold: float = 0.5
3738
balance_abs_threshold: int = 32
3839
balance_rel_threshold: float = 1.0001
@@ -87,6 +88,12 @@ def add_cli_args(
8788
choices=["random", "round_robin", "cache_aware"],
8889
help="Load balancing policy to use",
8990
)
91+
parser.add_argument(
92+
f"--{prefix}worker-startup-timeout-secs",
93+
type=int,
94+
default=RouterArgs.worker_startup_timeout_secs,
95+
help="Timeout in seconds for worker startup",
96+
)
9097
parser.add_argument(
9198
f"--{prefix}cache-threshold",
9299
type=float,
@@ -147,6 +154,9 @@ def from_cli_args(
147154
host=args.host,
148155
port=args.port,
149156
policy=getattr(args, f"{prefix}policy"),
157+
worker_startup_timeout_secs=getattr(
158+
args, f"{prefix}worker_startup_timeout_secs"
159+
),
150160
cache_threshold=getattr(args, f"{prefix}cache_threshold"),
151161
balance_abs_threshold=getattr(args, f"{prefix}balance_abs_threshold"),
152162
balance_rel_threshold=getattr(args, f"{prefix}balance_rel_threshold"),
@@ -188,9 +198,10 @@ def launch_router(args: argparse.Namespace) -> Optional[Router]:
188198

189199
router = Router(
190200
worker_urls=router_args.worker_urls,
191-
policy=policy_from_str(router_args.policy),
192201
host=router_args.host,
193202
port=router_args.port,
203+
policy=policy_from_str(router_args.policy),
204+
worker_startup_timeout_secs=router_args.worker_startup_timeout_secs,
194205
cache_threshold=router_args.cache_threshold,
195206
balance_abs_threshold=router_args.balance_abs_threshold,
196207
balance_rel_threshold=router_args.balance_rel_threshold,
@@ -205,7 +216,7 @@ def launch_router(args: argparse.Namespace) -> Optional[Router]:
205216

206217
except Exception as e:
207218
logger.error(f"Error starting router: {e}")
208-
return None
219+
raise e
209220

210221

211222
class CustomHelpFormatter(
@@ -239,10 +250,7 @@ def parse_router_args(args: List[str]) -> RouterArgs:
239250

240251
def main() -> None:
241252
router_args = parse_router_args(sys.argv[1:])
242-
router = launch_router(router_args)
243-
244-
if router is None:
245-
sys.exit(1)
253+
launch_router(router_args)
246254

247255

248256
if __name__ == "__main__":

sgl-router/py_src/sglang_router/launch_server.py

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ def run_server(server_args, dp_rank):
6868
# create new process group
6969
os.setpgrp()
7070

71-
setproctitle(f"sglang::server")
71+
setproctitle("sglang::server")
7272
# Set SGLANG_DP_RANK environment variable
7373
os.environ["SGLANG_DP_RANK"] = str(dp_rank)
7474

@@ -120,9 +120,26 @@ def find_available_ports(base_port: int, count: int) -> List[int]:
120120

121121
def cleanup_processes(processes: List[mp.Process]):
122122
for process in processes:
123-
logger.info(f"Terminating process {process.pid}")
124-
process.terminate()
125-
logger.info("All processes terminated")
123+
logger.info(f"Terminating process group {process.pid}")
124+
try:
125+
os.killpg(process.pid, signal.SIGTERM)
126+
except ProcessLookupError:
127+
# Process group may already be terminated
128+
pass
129+
130+
# Wait for processes to terminate
131+
for process in processes:
132+
process.join(timeout=5)
133+
if process.is_alive():
134+
logger.warning(
135+
f"Process {process.pid} did not terminate gracefully, forcing kill"
136+
)
137+
try:
138+
os.killpg(process.pid, signal.SIGKILL)
139+
except ProcessLookupError:
140+
pass
141+
142+
logger.info("All process groups terminated")
126143

127144

128145
def main():
@@ -173,7 +190,12 @@ def main():
173190
]
174191

175192
# Start the router
176-
router = launch_router(router_args)
193+
try:
194+
launch_router(router_args)
195+
except Exception as e:
196+
logger.error(f"Failed to start router: {e}")
197+
cleanup_processes(server_processes)
198+
sys.exit(1)
177199

178200

179201
if __name__ == "__main__":

sgl-router/py_src/sglang_router/router.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ class Router:
1717
- PolicyType.CacheAware: Distribute requests based on cache state and load balance
1818
host: Host address to bind the router server. Default: '127.0.0.1'
1919
port: Port number to bind the router server. Default: 3001
20+
worker_startup_timeout_secs: Timeout in seconds for worker startup. Default: 300
2021
cache_threshold: Cache threshold (0.0-1.0) for cache-aware routing. Routes to cached worker
2122
if the match rate exceeds threshold, otherwise routes to the worker with the smallest
2223
tree. Default: 0.5
@@ -37,6 +38,7 @@ def __init__(
3738
policy: PolicyType = PolicyType.RoundRobin,
3839
host: str = "127.0.0.1",
3940
port: int = 3001,
41+
worker_startup_timeout_secs: int = 300,
4042
cache_threshold: float = 0.50,
4143
balance_abs_threshold: int = 32,
4244
balance_rel_threshold: float = 1.0001,
@@ -50,6 +52,7 @@ def __init__(
5052
policy=policy,
5153
host=host,
5254
port=port,
55+
worker_startup_timeout_secs=worker_startup_timeout_secs,
5356
cache_threshold=cache_threshold,
5457
balance_abs_threshold=balance_abs_threshold,
5558
balance_rel_threshold=balance_rel_threshold,

sgl-router/py_test/test_launch_router.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ def setUp(self):
2828
host="127.0.0.1",
2929
port=30000,
3030
policy="cache_aware",
31+
worker_startup_timeout_secs=600,
3132
cache_threshold=0.5,
3233
balance_abs_threshold=32,
3334
balance_rel_threshold=1.0001,

sgl-router/src/lib.rs

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ struct Router {
1717
port: u16,
1818
worker_urls: Vec<String>,
1919
policy: PolicyType,
20+
worker_startup_timeout_secs: u64,
2021
cache_threshold: f32,
2122
balance_abs_threshold: usize,
2223
balance_rel_threshold: f32,
@@ -34,6 +35,7 @@ impl Router {
3435
policy = PolicyType::RoundRobin,
3536
host = String::from("127.0.0.1"),
3637
port = 3001,
38+
worker_startup_timeout_secs = 300,
3739
cache_threshold = 0.50,
3840
balance_abs_threshold = 32,
3941
balance_rel_threshold = 1.0001,
@@ -47,6 +49,7 @@ impl Router {
4749
policy: PolicyType,
4850
host: String,
4951
port: u16,
52+
worker_startup_timeout_secs: u64,
5053
cache_threshold: f32,
5154
balance_abs_threshold: usize,
5255
balance_rel_threshold: f32,
@@ -60,6 +63,7 @@ impl Router {
6063
port,
6164
worker_urls,
6265
policy,
66+
worker_startup_timeout_secs,
6367
cache_threshold,
6468
balance_abs_threshold,
6569
balance_rel_threshold,
@@ -72,9 +76,14 @@ impl Router {
7276

7377
fn start(&self) -> PyResult<()> {
7478
let policy_config = match &self.policy {
75-
PolicyType::Random => router::PolicyConfig::RandomConfig,
76-
PolicyType::RoundRobin => router::PolicyConfig::RoundRobinConfig,
79+
PolicyType::Random => router::PolicyConfig::RandomConfig {
80+
timeout_secs: self.worker_startup_timeout_secs,
81+
},
82+
PolicyType::RoundRobin => router::PolicyConfig::RoundRobinConfig {
83+
timeout_secs: self.worker_startup_timeout_secs,
84+
},
7785
PolicyType::CacheAware => router::PolicyConfig::CacheAwareConfig {
86+
timeout_secs: self.worker_startup_timeout_secs,
7887
cache_threshold: self.cache_threshold,
7988
balance_abs_threshold: self.balance_abs_threshold,
8089
balance_rel_threshold: self.balance_rel_threshold,
@@ -93,10 +102,9 @@ impl Router {
93102
max_payload_size: self.max_payload_size,
94103
})
95104
.await
96-
.unwrap();
97-
});
98-
99-
Ok(())
105+
.map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(e.to_string()))?;
106+
Ok(())
107+
})
100108
}
101109
}
102110

sgl-router/src/router.rs

Lines changed: 43 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ use actix_web::http::header::{HeaderValue, CONTENT_TYPE};
33
use actix_web::{HttpRequest, HttpResponse};
44
use bytes::Bytes;
55
use futures_util::{StreamExt, TryStreamExt};
6-
use log::{debug, info, warn};
6+
use log::{debug, error, info, warn};
77
use std::collections::HashMap;
88
use std::fmt::Debug;
99
use std::sync::atomic::AtomicUsize;
@@ -17,9 +17,11 @@ pub enum Router {
1717
RoundRobin {
1818
worker_urls: Arc<RwLock<Vec<String>>>,
1919
current_index: AtomicUsize,
20+
timeout_secs: u64,
2021
},
2122
Random {
2223
worker_urls: Arc<RwLock<Vec<String>>>,
24+
timeout_secs: u64,
2325
},
2426
CacheAware {
2527
/*
@@ -89,43 +91,59 @@ pub enum Router {
8991
cache_threshold: f32,
9092
balance_abs_threshold: usize,
9193
balance_rel_threshold: f32,
94+
timeout_secs: u64,
9295
_eviction_thread: Option<thread::JoinHandle<()>>,
9396
},
9497
}
9598

9699
#[derive(Debug, Clone)]
97100
pub enum PolicyConfig {
98-
RandomConfig,
99-
RoundRobinConfig,
101+
RandomConfig {
102+
timeout_secs: u64,
103+
},
104+
RoundRobinConfig {
105+
timeout_secs: u64,
106+
},
100107
CacheAwareConfig {
101108
cache_threshold: f32,
102109
balance_abs_threshold: usize,
103110
balance_rel_threshold: f32,
104111
eviction_interval_secs: u64,
105112
max_tree_size: usize,
113+
timeout_secs: u64,
106114
},
107115
}
108116

109117
impl Router {
110118
pub fn new(worker_urls: Vec<String>, policy_config: PolicyConfig) -> Result<Self, String> {
119+
// Get timeout from policy config
120+
let timeout_secs = match &policy_config {
121+
PolicyConfig::RandomConfig { timeout_secs } => *timeout_secs,
122+
PolicyConfig::RoundRobinConfig { timeout_secs } => *timeout_secs,
123+
PolicyConfig::CacheAwareConfig { timeout_secs, .. } => *timeout_secs,
124+
};
125+
111126
// Wait until all workers are healthy
112-
Self::wait_for_healthy_workers(&worker_urls, 300, 10)?;
127+
Self::wait_for_healthy_workers(&worker_urls, timeout_secs, 10)?;
113128

114129
// Create router based on policy...
115130
Ok(match policy_config {
116-
PolicyConfig::RandomConfig => Router::Random {
131+
PolicyConfig::RandomConfig { timeout_secs } => Router::Random {
117132
worker_urls: Arc::new(RwLock::new(worker_urls)),
133+
timeout_secs,
118134
},
119-
PolicyConfig::RoundRobinConfig => Router::RoundRobin {
135+
PolicyConfig::RoundRobinConfig { timeout_secs } => Router::RoundRobin {
120136
worker_urls: Arc::new(RwLock::new(worker_urls)),
121137
current_index: std::sync::atomic::AtomicUsize::new(0),
138+
timeout_secs,
122139
},
123140
PolicyConfig::CacheAwareConfig {
124141
cache_threshold,
125142
balance_abs_threshold,
126143
balance_rel_threshold,
127144
eviction_interval_secs,
128145
max_tree_size,
146+
timeout_secs,
129147
} => {
130148
let mut running_queue = HashMap::new();
131149
for url in &worker_urls {
@@ -176,6 +194,7 @@ impl Router {
176194
cache_threshold,
177195
balance_abs_threshold,
178196
balance_rel_threshold,
197+
timeout_secs,
179198
_eviction_thread: Some(eviction_thread),
180199
}
181200
}
@@ -192,6 +211,10 @@ impl Router {
192211

193212
loop {
194213
if start_time.elapsed() > Duration::from_secs(timeout_secs) {
214+
error!(
215+
"Timeout {}s waiting for workers to become healthy",
216+
timeout_secs
217+
);
195218
return Err(format!(
196219
"Timeout {}s waiting for workers to become healthy",
197220
timeout_secs
@@ -238,7 +261,7 @@ impl Router {
238261
fn select_first_worker(&self) -> Result<String, String> {
239262
match self {
240263
Router::RoundRobin { worker_urls, .. }
241-
| Router::Random { worker_urls }
264+
| Router::Random { worker_urls, .. }
242265
| Router::CacheAware { worker_urls, .. } => {
243266
if worker_urls.read().unwrap().is_empty() {
244267
Err("No workers are available".to_string())
@@ -349,6 +372,7 @@ impl Router {
349372
Router::RoundRobin {
350373
worker_urls,
351374
current_index,
375+
..
352376
} => {
353377
let idx = current_index
354378
.fetch_update(
@@ -360,7 +384,7 @@ impl Router {
360384
worker_urls.read().unwrap()[idx].clone()
361385
}
362386

363-
Router::Random { worker_urls } => worker_urls.read().unwrap()
387+
Router::Random { worker_urls, .. } => worker_urls.read().unwrap()
364388
[rand::random::<usize>() % worker_urls.read().unwrap().len()]
365389
.clone(),
366390

@@ -571,13 +595,21 @@ impl Router {
571595

572596
pub async fn add_worker(&self, worker_url: &str) -> Result<String, String> {
573597
let interval_secs = 10; // check every 10 seconds
574-
let timeout_secs = 300; // 5 minutes
598+
let timeout_secs = match self {
599+
Router::Random { timeout_secs, .. } => *timeout_secs,
600+
Router::RoundRobin { timeout_secs, .. } => *timeout_secs,
601+
Router::CacheAware { timeout_secs, .. } => *timeout_secs,
602+
};
575603

576604
let start_time = std::time::Instant::now();
577605
let client = reqwest::Client::new();
578606

579607
loop {
580608
if start_time.elapsed() > Duration::from_secs(timeout_secs) {
609+
error!(
610+
"Timeout {}s waiting for worker {} to become healthy",
611+
timeout_secs, worker_url
612+
);
581613
return Err(format!(
582614
"Timeout {}s waiting for worker {} to become healthy",
583615
timeout_secs, worker_url
@@ -589,7 +621,7 @@ impl Router {
589621
if res.status().is_success() {
590622
match self {
591623
Router::RoundRobin { worker_urls, .. }
592-
| Router::Random { worker_urls }
624+
| Router::Random { worker_urls, .. }
593625
| Router::CacheAware { worker_urls, .. } => {
594626
info!("Worker {} health check passed", worker_url);
595627
let mut urls = worker_urls.write().unwrap();
@@ -663,7 +695,7 @@ impl Router {
663695
pub fn remove_worker(&self, worker_url: &str) {
664696
match self {
665697
Router::RoundRobin { worker_urls, .. }
666-
| Router::Random { worker_urls }
698+
| Router::Random { worker_urls, .. }
667699
| Router::CacheAware { worker_urls, .. } => {
668700
let mut urls = worker_urls.write().unwrap();
669701
if let Some(index) = urls.iter().position(|url| url == &worker_url) {

0 commit comments

Comments
 (0)