sgl-project · xiezhq-hermann · Sep 3, 2025 · Sep 2, 2025 · Sep 2, 2025 · Sep 2, 2025
@@ -468,9 +468,9 @@ def check_prefetch_progress(self, req_id: str) -> bool:
 
         # todo: more policies for prefetch progress such as timeout
         # the current policy is to prefetch with best effort and terminate when queuing is over
-        last_host_node, token_ids, host_indices, operation = self.ongoing_prefetch.pop(
+        last_host_node, token_ids, host_indices, operation = self.ongoing_prefetch[
             req_id
-        )
+        ]
 
         if operation.host_indices is None:
             # prefetch has not been issued due to insufficient host memory
@@ -512,6 +512,7 @@ def check_prefetch_progress(self, req_id: str) -> bool:
             host_indices[min_completed_tokens:completed_tokens]
         )
         last_host_node.release_host()
+        del self.ongoing_prefetch[req_id]
         self.cache_controller.prefetch_tokens_occupied -= len(token_ids)
 
         return True
@@ -775,15 +776,14 @@ def release_aborted_request(self, rid: str):
         if rid not in self.ongoing_prefetch:
             return
 
-        last_host_node, token_ids, host_indices, operation = self.ongoing_prefetch.pop(
-            rid
-        )
+        last_host_node, token_ids, host_indices, operation = self.ongoing_prefetch[rid]
         if operation.host_indices is None:
             return
 
         completed_tokens, _ = self.cache_controller.terminate_prefetch(operation)
         if self.tp_world_size > 1:
             torch.distributed.barrier(group=self.tp_group)
         last_host_node.release_host()
+        del self.ongoing_prefetch[rid]
         self.cache_controller.append_host_mem_release(host_indices[:completed_tokens])
         self.cache_controller.prefetch_tokens_occupied -= len(token_ids)