Only fast-exit for non-shared-memory (non-shm) cases

tgerdesnv · tgerdesnv · commit 2ca6e64c70d0 · 2024-06-05T08:26:48.000-05:00
diff --git a/src/c++/perf_analyzer/infer_context.cc b/src/c++/perf_analyzer/infer_context.cc
@@ -298,7 +298,7 @@ InferContext::AsyncCallbackFuncImpl(cb::InferResult* result)
     // Add the request record to thread request records vector with
     // proper locking
     std::lock_guard<std::mutex> lock(thread_stat_->mu_);
-    if (exiting_) {
+    if (exiting_ && fast_exit_) {
       return;
     }
 
diff --git a/src/c++/perf_analyzer/infer_context.h b/src/c++/perf_analyzer/infer_context.h
@@ -105,7 +105,11 @@ class InferContext {
   void Init();
 
   // Signal to the context to stop working and exit
-  void Exit() { exiting_ = true; }
+  void Exit(bool fast_exit)
+  {
+    exiting_ = true;
+    fast_exit_ = fast_exit;
+  }
 
   // Send a single inference request to the server
   void SendInferRequest(bool delayed = false);
@@ -196,6 +200,7 @@ class InferContext {
   const uint32_t id_{0};
   const size_t thread_id_{0};
   bool exiting_{false};
+  bool fast_exit_{false};
 
   size_t GetNumActiveThreads() { return num_active_threads_; }
 
diff --git a/src/c++/perf_analyzer/iworker.h b/src/c++/perf_analyzer/iworker.h
@@ -33,7 +33,7 @@ namespace triton { namespace perfanalyzer {
 class IWorker {
  public:
   virtual void Infer() = 0;
-  virtual void Exit() = 0;
+  virtual void Exit(bool fast_exit) = 0;
 };
 
 }}  // namespace triton::perfanalyzer
diff --git a/src/c++/perf_analyzer/load_manager.cc b/src/c++/perf_analyzer/load_manager.cc
@@ -164,8 +164,8 @@ LoadManager::LoadManager(
     const std::unordered_map<std::string, cb::RequestParameter>&
         request_parameters)
     : async_(async), streaming_(streaming), batch_size_(batch_size),
-      max_threads_(max_threads), parser_(parser), factory_(factory),
-      using_json_data_(false)
+      max_threads_(max_threads), shared_memory_type_{shared_memory_type},
+      parser_(parser), factory_(factory), using_json_data_(false)
 {
   on_sequence_model_ =
       ((parser_->SchedulerType() == ModelParser::SEQUENCE) ||
@@ -248,9 +248,11 @@ LoadManager::InitManagerInputs(
 void
 LoadManager::StopWorkerThreads()
 {
+  bool fast_exit = shared_memory_type_ == SharedMemoryType::NO_SHARED_MEMORY;
+
   // FIXME do I need to acquire the lock first?
   for (auto& worker : workers_) {
-    worker->Exit();
+    worker->Exit(fast_exit);
   }
 
   {
diff --git a/src/c++/perf_analyzer/load_manager.h b/src/c++/perf_analyzer/load_manager.h
@@ -140,6 +140,7 @@ class LoadManager {
   size_t batch_size_;
   size_t max_threads_;
   bool on_sequence_model_;
+  SharedMemoryType shared_memory_type_;
 
   std::shared_ptr<ModelParser> parser_;
   std::shared_ptr<cb::ClientBackendFactory> factory_;
diff --git a/src/c++/perf_analyzer/load_worker.cc b/src/c++/perf_analyzer/load_worker.cc
@@ -35,13 +35,14 @@
 namespace triton { namespace perfanalyzer {
 
 void
-LoadWorker::Exit()
+LoadWorker::Exit(bool fast_exit)
 {
   for (auto ctx : ctxs_) {
-    ctx->Exit();
+    ctx->Exit(fast_exit);
   }
 
   exiting_ = true;
+  fast_exit_ = fast_exit;
 
   {
     std::lock_guard<std::mutex> lk(cb_mtx_);
@@ -67,6 +68,9 @@ LoadWorker::HandleExitConditions()
 {
   if (ShouldExit()) {
     CompleteOngoingSequences();
+    if (!fast_exit_) {
+      WaitForOngoingRequests();
+    }
     return true;
   }
   return false;
@@ -86,7 +90,7 @@ LoadWorker::CompleteOngoingSequences()
 void
 LoadWorker::WaitForOngoingRequests()
 {
-  while (GetNumOngoingRequests() != 0) {
+  while (GetNumOngoingRequests() != 0 && !fast_exit_) {
     std::this_thread::sleep_for(std::chrono::milliseconds(50));
   }
 }
diff --git a/src/c++/perf_analyzer/load_worker.h b/src/c++/perf_analyzer/load_worker.h
@@ -69,7 +69,7 @@ class LoadWorker : public IWorker {
 
   virtual ~LoadWorker() = default;
 
-  virtual void Exit() override;
+  virtual void Exit(bool fast_exit) override;
 
  protected:
   // Return the total number of async requests that have started and not
@@ -120,6 +120,7 @@ class LoadWorker : public IWorker {
   void AsyncCallbackFinalize(uint32_t ctx_id);
 
   bool exiting_ = false;
+  bool fast_exit_ = false;
 
   uint32_t id_;
 
diff --git a/src/c++/perf_analyzer/test_concurrency_manager.cc b/src/c++/perf_analyzer/test_concurrency_manager.cc
@@ -474,7 +474,7 @@ TEST_CASE("concurrency_free_ctx_ids")
 
   std::this_thread::sleep_for(std::chrono::milliseconds(15));
 
-  worker->Exit();
+  worker->Exit(false);
   infer_future.get();
 
   // The first sequence should only be called two times, once at the very start,
@@ -590,7 +590,7 @@ TEST_CASE("Concurrency - shared memory infer input calls")
 
   std::this_thread::sleep_for(std::chrono::milliseconds(18));
 
-  worker->Exit();
+  worker->Exit(false);
   infer_future.get();
 
   const auto& actual_append_raw_calls{tcm.stats_->num_append_raw_calls};
diff --git a/src/c++/perf_analyzer/test_request_rate_manager.cc b/src/c++/perf_analyzer/test_request_rate_manager.cc
@@ -975,7 +975,7 @@ TEST_CASE("request_rate_streaming: test that streaming-specific logic works")
   std::dynamic_pointer_cast<IScheduler>(worker)->SetSchedule(schedule);
   std::future<void> infer_future{std::async(&IWorker::Infer, worker)};
 
-  worker->Exit();
+  worker->Exit(false);
   infer_future.get();
 
   CHECK(
@@ -1825,7 +1825,7 @@ TEST_CASE("Request rate - Shared memory infer input calls")
 
   std::this_thread::sleep_for(milliseconds(18));
 
-  worker->Exit();
+  worker->Exit(false);
   infer_future.get();
 
   const auto& actual_append_raw_calls{trrm.stats_->num_append_raw_calls};

Original file line number	Diff line number	Diff line change
`@@ -298,7 +298,7 @@ InferContext::AsyncCallbackFuncImpl(cb::InferResult* result)`
`298`	`298`	`// Add the request record to thread request records vector with`
`299`	`299`	`// proper locking`
`300`	`300`	`std::lock_guard<std::mutex> lock(thread_stat_->mu_);`
`301`		`- if (exiting_) {`
	`301`	`+ if (exiting_ && fast_exit_) {`
`302`	`302`	`return;`
`303`	`303`	`}`
`304`	`304`
Original file line number	Diff line number	Diff line change
`@@ -164,8 +164,8 @@ LoadManager::LoadManager(`
`164`	`164`	`const std::unordered_map<std::string, cb::RequestParameter>&`
`165`	`165`	`request_parameters)`
`166`	`166`	`: async_(async), streaming_(streaming), batch_size_(batch_size),`
`167`		`- max_threads_(max_threads), parser_(parser), factory_(factory),`
`168`		`- using_json_data_(false)`
	`167`	`+ max_threads_(max_threads), shared_memory_type_{shared_memory_type},`
	`168`	`+ parser_(parser), factory_(factory), using_json_data_(false)`
`169`	`169`	`{`
`170`	`170`	`on_sequence_model_ =`
`171`	`171`	`((parser_->SchedulerType() == ModelParser::SEQUENCE) \|\|`
`@@ -248,9 +248,11 @@ LoadManager::InitManagerInputs(`
`248`	`248`	`void`
`249`	`249`	`LoadManager::StopWorkerThreads()`
`250`	`250`	`{`
	`251`	`+ bool fast_exit = shared_memory_type_ == SharedMemoryType::NO_SHARED_MEMORY;`
	`252`	`+`
`251`	`253`	`// FIXME do I need to acquire the lock first?`
`252`	`254`	`for (auto& worker : workers_) {`
`253`		`- worker->Exit();`
	`255`	`+ worker->Exit(fast_exit);`
`254`	`256`	`}`
`255`	`257`
`256`	`258`	`{`
Original file line number	Diff line number	Diff line change
`@@ -35,13 +35,14 @@`
`35`	`35`	`namespace triton { namespace perfanalyzer {`
`36`	`36`
`37`	`37`	`void`
`38`		`-LoadWorker::Exit()`
	`38`	`+LoadWorker::Exit(bool fast_exit)`
`39`	`39`	`{`
`40`	`40`	`for (auto ctx : ctxs_) {`
`41`		`- ctx->Exit();`
	`41`	`+ ctx->Exit(fast_exit);`
`42`	`42`	`}`
`43`	`43`
`44`	`44`	`exiting_ = true;`
	`45`	`+ fast_exit_ = fast_exit;`
`45`	`46`
`46`	`47`	`{`
`47`	`48`	`std::lock_guard<std::mutex> lk(cb_mtx_);`
`@@ -67,6 +68,9 @@ LoadWorker::HandleExitConditions()`
`67`	`68`	`{`
`68`	`69`	`if (ShouldExit()) {`
`69`	`70`	`CompleteOngoingSequences();`
	`71`	`+ if (!fast_exit_) {`
	`72`	`+ WaitForOngoingRequests();`
	`73`	`+ }`
`70`	`74`	`return true;`
`71`	`75`	`}`
`72`	`76`	`return false;`
`@@ -86,7 +90,7 @@ LoadWorker::CompleteOngoingSequences()`
`86`	`90`	`void`
`87`	`91`	`LoadWorker::WaitForOngoingRequests()`
`88`	`92`	`{`
`89`		`- while (GetNumOngoingRequests() != 0) {`
	`93`	`+ while (GetNumOngoingRequests() != 0 && !fast_exit_) {`
`90`	`94`	`std::this_thread::sleep_for(std::chrono::milliseconds(50));`
`91`	`95`	`}`
`92`	`96`	`}`