
Commit b738755

[train] Add Torch process group shutdown timeout (#56182)
Shutting down a healthy torch process group, which we may want to do for reasons like restarting a group of workers if an async checkpoint upload fails, can hang. This commit adds a shutdown timeout as a workaround until we figure out how to avoid the hang. When the timeout is hit, `before_worker_group_shutdown` finishes and the workers are then killed by `ray.kill`: https://github.com/ray-project/ray/blob/master/python/ray/train/v2/_internal/execution/worker_group/state.py#L127

Signed-off-by: Timothy Seah <tseah@anyscale.com>
1 parent aac861a commit b738755
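
As a usage sketch (not part of the diff below): the timeout is controlled by the `TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S` environment variable and defaults to 30 seconds, so it can be tuned by setting the variable in the process that runs the Torch backend's `on_shutdown`; the value shown here is only illustrative.

import os

# Illustrative only: raise the process group shutdown timeout to 60 seconds.
# The variable must be visible to the process that executes on_shutdown,
# where it is read via ray_constants.env_integer.
os.environ["TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S"] = "60"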

3 files changed: +44 -2 lines changed

python/ray/train/constants.py

Lines changed: 7 additions & 0 deletions
@@ -125,6 +125,12 @@ def _v2_migration_warnings_enabled() -> bool:
     "TUNE_ONLY_STORE_CHECKPOINT_SCORE_ATTRIBUTE"
 )
 
+# Seconds to wait for torch process group to shut down.
+# Shutting down a healthy torch process group, which we may want to do for reasons
+# like restarting a group of workers if an async checkpoint upload fails, can hang.
+# This is a workaround until we figure out how to avoid this hang.
+TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S = "TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S"
+DEFAULT_TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S = 30
 
 # NOTE: When adding a new environment variable, please track it in this list.
 TRAIN_ENV_VARS = {
@@ -137,6 +143,7 @@ def _v2_migration_warnings_enabled() -> bool:
     RAY_TRAIN_COUNT_PREEMPTION_AS_FAILURE,
     RAY_TRAIN_ENABLE_STATE_TRACKING,
     TUNE_ONLY_STORE_CHECKPOINT_SCORE_ATTRIBUTE,
+    TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S,
 }
 
 # Key for AIR Checkpoint metadata in TrainingResult metadata

python/ray/train/tests/test_backend.py

Lines changed: 19 additions & 0 deletions
@@ -28,6 +28,7 @@
 from ray.train.constants import (
     ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV,
     ENABLE_SHARE_NEURON_CORES_ACCELERATOR_ENV,
+    TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S,
     TRAIN_ENABLE_WORKER_SPREAD_ENV,
 )
 from ray.train.torch import TorchConfig
@@ -364,6 +365,24 @@ def check_process_group():
     assert not any(e.finish_training())
 
 
+@pytest.mark.parametrize(
+    "init_method, timeout_s", [("env", 5), ("tcp", 5), ("env", 0), ("tcp", 0)]
+)
+def test_torch_process_group_shutdown_timeout(
+    ray_start_2_cpus, monkeypatch, init_method, timeout_s
+):
+    monkeypatch.setenv(TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S, timeout_s)
+    torch_config = TorchConfig(backend="gloo", init_method=init_method)
+    e = BackendExecutor(torch_config, num_workers=2)
+    e.start()
+
+    _start_training(e, lambda: 1)
+    assert e.finish_training() == [1, 1]
+
+    # Verify that we do not raise an exception even if we time out
+    e._backend.on_shutdown(e.worker_group, e._backend_config)
+
+
 @pytest.mark.parametrize(
     "worker_results",
     [

python/ray/train/torch/config.py

Lines changed: 18 additions & 2 deletions
@@ -10,10 +10,16 @@
 
 import ray
 from ray._common.network_utils import build_address
+from ray._private import ray_constants
 from ray.air._internal.device_manager import register_custom_torch_dist_backend
+from ray.exceptions import GetTimeoutError
 from ray.train._internal.utils import get_address_and_port
 from ray.train._internal.worker_group import WorkerGroup
 from ray.train.backend import Backend, BackendConfig
+from ray.train.constants import (
+    DEFAULT_TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S,
+    TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S,
+)
 from ray.util import PublicAPI
 
 logger = logging.getLogger(__name__)
@@ -202,11 +208,21 @@ def set_env_vars(addr, port):
         else:
             raise RuntimeError("Distributed torch is not available.")
 
-    def on_shutdown(self, worker_group: WorkerGroup, backend_config: TorchConfig):
-        worker_group.execute(
+    def on_shutdown(self, worker_group, backend_config):
+        futures = worker_group.execute_async(
             _shutdown_torch,
             destroy_process_group=len(worker_group) > 1,
         )
+        timeout_s = ray_constants.env_integer(
+            TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S,
+            DEFAULT_TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S,
+        )
+        try:
+            ray.get(futures, timeout=timeout_s)
+        except GetTimeoutError:
+            logger.warning(
+                f"Torch process group shutdown timed out after {timeout_s} seconds"
+            )
 
     def on_training_start(
         self, worker_group: WorkerGroup, backend_config: BackendConfig

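For context, here is a minimal standalone sketch of the timeout pattern that the new `on_shutdown` uses, with a hypothetical `_slow_shutdown` task standing in for the per-worker `_shutdown_torch` call: `ray.get(..., timeout=...)` raises `GetTimeoutError` if the tasks do not finish in time, and the caller logs (here, prints) and moves on instead of hanging.

import time

import ray
from ray.exceptions import GetTimeoutError


@ray.remote
def _slow_shutdown():
    # Hypothetical stand-in for a per-worker _shutdown_torch call that hangs.
    time.sleep(60)


ray.init()
refs = [_slow_shutdown.remote() for _ in range(2)]
try:
    # Mirrors ray.get(futures, timeout=timeout_s) in on_shutdown above.
    ray.get(refs, timeout=5)
except GetTimeoutError:
    print("Shutdown timed out; continuing so the worker group can be torn down.")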