
Commit 28bb3ff

[KV Chunk] Parallelize prefill model compilation and disable KV chunking when prefix caching is enabled.
Signed-off-by: intelgaoxiong <xiong.gao@intel.com>
Parent: 6d070d6

2 files changed: 24 additions, 15 deletions


src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp

Lines changed: 14 additions & 6 deletions
@@ -5,6 +5,7 @@
 
 #include <cstddef>
 #include <cstdint>
+#include <openvino/core/parallel.hpp>
 #include <string>
 
 #include "llm_infer_request.hpp"
@@ -899,6 +900,11 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
     m_max_lora_rank = m_cfg.get<::intel_npu::NPUW_LLM_MAX_LORA_RANK>();
 
     bool chunk_kv = true;
+    if (m_enable_prefix_caching) {
+        // Do not use KV chunking together with prefix caching: it is hard to predict
+        // which prefill model will be used to restore the KV from the cache.
+        chunk_kv = false;
+    }
     std::vector<std::shared_ptr<ov::Model>> prefill_models;
     if (m_use_chunk_prefill) {
         if (!chunk_kv) {
@@ -1036,16 +1042,18 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
         ov::npuw::ICompiledModel::create(kvcache_model, plugin, generate_config));
     NPUW_ASSERT(m_kvcache_compiled && "Can't create ov::npuw::CompiledModel for passed kvcache "
                                       "model and its config, please check passed config.");
-    for (auto prefill_model : prefill_models) {
+
+    m_prefill_compiled.resize(prefill_models.size());
+    ov::parallel_for(prefill_models.size(), [&](size_t idx) {
+        auto prefill_model = prefill_models[idx];
         std::cout << "Start to compile prefill model: " << prefill_model->get_friendly_name() << std::endl;
         auto compiled = std::dynamic_pointer_cast<ov::npuw::CompiledModel>(
             ov::npuw::ICompiledModel::create(prefill_model, plugin, prefill_config));
-        NPUW_ASSERT(compiled && "Can't create ov::npuw::CompiledModel for passed prefill "
-                                "model and its config, please check passed config.");
-        m_prefill_compiled.push_back(std::move(compiled));
-
+        NPUW_ASSERT(compiled && "Can't create ov::npuw::CompiledModel for passed prefill model and its config, please "
+                                "check passed config.");
+        m_prefill_compiled[idx] = std::move(compiled);
         std::cout << "Finished compilation for prefill model: " << prefill_model->get_friendly_name() << std::endl;
-    }
+    });
 
     if (lm_head_model) {
         auto lm_head_config = get_default_lm_head_config(npudesc);
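
Why the loop became ov::parallel_for: calling push_back on m_prefill_compiled from concurrent workers would be a data race, so the vector is pre-sized and each worker writes its own slot by index. Below is a minimal standalone sketch of the same pattern; the model names and the string transform are hypothetical stand-ins for the real compilation call.

#include <openvino/core/parallel.hpp>

#include <string>
#include <vector>

int main() {
    // Hypothetical inputs standing in for the prefill model list.
    std::vector<std::string> models = {"prefill_128", "prefill_1024", "prefill_2048"};

    // Pre-size the result vector so every worker owns a distinct slot;
    // concurrent push_back on a std::vector is not thread-safe.
    std::vector<std::string> compiled(models.size());

    ov::parallel_for(models.size(), [&](size_t idx) {
        // Stand-in for ov::npuw::ICompiledModel::create(...).
        compiled[idx] = "compiled:" + models[idx];
    });
    return 0;
}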

src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp

Lines changed: 10 additions & 9 deletions
@@ -840,8 +840,8 @@ uint64_t ov::npuw::LLMInferRequest::restore_cached_blocks(
                           ? 3u
                           : kvcache_desc.dim;
 
-    // Fixme: support prefix-caching with KV Chunk
-    auto kv_dst_tensor = m_prefill_requests.front()->get_tensor(m_prefill_in_ports.front().at(kv_in_name));
+    // There is a single model for prefix-caching
+    auto kv_dst_tensor = m_prefill_requests.back()->get_tensor(m_prefill_in_ports.back().at(kv_in_name));
     auto kv_dst_slice = make_tensor_slice(kv_dst_tensor,
                                           kv_dim,
                                           static_cast<uint32_t>(token_start),
@@ -875,8 +875,8 @@ void ov::npuw::LLMInferRequest::store_blocks_in_cache(size_t chunk_size,
     }
 
     auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc;
-    // Fixme: support prefix-caching with KV Chunk
-    const auto& prefill_compiled = m_prefill_requests.front()->get_compiled_model();
+    // There is a single model for prefix-caching
+    const auto& prefill_compiled = m_prefill_requests.back()->get_compiled_model();
 
     // Process input chunk in blocks
     const uint64_t chunk_prompt_len = m_npuw_llm_compiled_model->m_prefill_chunk_size;
@@ -903,8 +903,8 @@ void ov::npuw::LLMInferRequest::store_blocks_in_cache(size_t chunk_size,
                           ? 3u
                           : kvcache_desc.dim;
 
-        // Fixme: support prefix-caching with KV Chunk
-        auto kv_src_tensor = m_prefill_requests.front()->get_tensor(m_prefill_out_ports.front().at(output_name));
+        // There is a single model for prefix-caching
+        auto kv_src_tensor = m_prefill_requests.back()->get_tensor(m_prefill_out_ports.back().at(output_name));
         auto kv_src_slice = make_tensor_slice(kv_src_tensor,
                                               kv_dim,
                                               static_cast<uint32_t>(block_start),
@@ -993,9 +993,10 @@ void ov::npuw::LLMInferRequest::infer_chunked_prefill(ov::SoPtr<ov::ITensor> inp
     prompt_hashes = calculate_hashes(input_ids);
 
     // Create output to input ports name for convenience
-    // Fixme: support prefix-caching with KV Chunk
-    const auto& prefill_compiled = m_prefill_requests.front()->get_compiled_model();
-    input_name_map = create_output_to_input_name_mapping(prefill_compiled, m_prefill_in_ports.front());
+
+    // There is a single model for prefix-caching
+    const auto& prefill_compiled = m_prefill_requests.back()->get_compiled_model();
+    input_name_map = create_output_to_input_name_mapping(prefill_compiled, m_prefill_in_ports.back());
 
     // Try to restore prefilled prompts from cache
     auto restored_token_num =
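
Why .front() becomes .back(): with the companion change above, KV chunking is disabled whenever prefix caching is enabled, so on the prefix-caching path m_prefill_requests and its port maps hold exactly one prefill entry, and front() and back() refer to the same element. A tiny sketch of that invariant, using a plain int vector as a hypothetical stand-in for the request list:

#include <cassert>
#include <vector>

int main() {
    // With KV chunking disabled, the prefix-caching path sees a single
    // prefill request, so front() and back() alias the same element.
    std::vector<int> prefill_requests = {42};
    assert(&prefill_requests.front() == &prefill_requests.back());
    return 0;
}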
