@@ -8,6 +8,7 @@
 #include <openvino/core/parallel.hpp>
 #include <string>
 
+#include "intel_npu/config/npuw.hpp"
 #include "llm_infer_request.hpp"
 #include "logging.hpp"
 #include "openvino/op/group_query_attention.hpp"
@@ -875,6 +876,10 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
         }
         max_prompt_len = align_to(max_prompt_len, static_cast<uint32_t>(m_prefill_chunk_size));
     }
+    }
+
+    if (m_use_chunk_prefill) {
+        m_enable_kv_chunk = m_cfg.get<::intel_npu::NPUW_LLM_PREFILL_ENABLE_KV_CHUNK>();
 
     m_enable_prefix_caching = m_cfg.get<::intel_npu::NPUW_LLM_ENABLE_PREFIX_CACHING>();
     if (m_enable_prefix_caching) {
@@ -888,26 +893,25 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
         m_prefix_caching_max_num_blocks = m_cfg.get<::intel_npu::NPUW_LLM_PREFIX_CACHING_MAX_NUM_BLOCKS>();
         LOG_INFO("Prefix caching block size: " << m_prefix_caching_block_size);
         LOG_INFO("Prefix caching maximum number of blocks: " << m_prefix_caching_max_num_blocks);
+
+        // Not using KV chunking with prefix caching because it's hard to predict which prefill model to be used to
+        // restore KV from the cache
+        m_enable_kv_chunk = false;
     }
     }
 
     LOG_VERB("Enabled prefill chunking: " << m_use_chunk_prefill);
+    LOG_VERB("Enabled KV chunking: " << m_enable_kv_chunk);
     LOG_VERB("Prefill chunk size: " << m_prefill_chunk_size);
     LOG_VERB("Maximum prompt length: " << max_prompt_len);
 
     m_kvcache_desc = KVCacheDesc{max_prompt_len, max_prompt_len + min_response_len, 0u, seq_len_dim};
     LOG_DEBUG("Make prefill model with static shapes");
     m_max_lora_rank = m_cfg.get<::intel_npu::NPUW_LLM_MAX_LORA_RANK>();
 
-    bool chunk_kv = true;
-    if (m_enable_prefix_caching) {
-        // Not using KV chunking with prefix caching because it's hard to predict which prefill model to be used to
-        // restore KV from the cache
-        chunk_kv = false;
-    }
     std::vector<std::shared_ptr<ov::Model>> prefill_models;
     if (m_use_chunk_prefill) {
-        if (!chunk_kv) {
+        if (!m_enable_kv_chunk) {
             reshape_to_static(prefill_model,
                               static_cast<uint32_t>(m_prefill_chunk_size),
                               m_kvcache_desc.max_prompt_size,
@@ -935,7 +939,6 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
             }
         }
     } else {
-        chunk_kv = false;
         reshape_to_static(prefill_model,
                           m_kvcache_desc.max_prompt_size,
                           m_kvcache_desc.max_prompt_size,
@@ -1168,6 +1171,7 @@ void ov::npuw::LLMCompiledModel::serialize(std::ostream& stream, const ov::npuw:
         write(model_stream, m_kvcache_desc.v_tensors_transposed);
         write(model_stream, m_prefill_chunk_size);
         write(model_stream, m_use_chunk_prefill);
+        write(model_stream, m_enable_kv_chunk);
         write(model_stream, m_max_lora_rank);
         write(model_stream, m_enable_prefix_caching);
         write(model_stream, m_prefix_caching_block_size);
@@ -1388,6 +1392,7 @@ std::shared_ptr<ov::npuw::LLMCompiledModel> ov::npuw::LLMCompiledModel::deserial
         read(model_stream, compiled->m_kvcache_desc.v_tensors_transposed);
         read(model_stream, compiled->m_prefill_chunk_size);
         read(model_stream, compiled->m_use_chunk_prefill);
+        read(model_stream, compiled->m_enable_kv_chunk);
         read(model_stream, compiled->m_max_lora_rank);
         read(model_stream, compiled->m_enable_prefix_caching);
         read(model_stream, compiled->m_prefix_caching_block_size);
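Note on the serialization change: m_enable_kv_chunk is written in serialize() and read back in deserialize() at the same position in the stream, right after m_use_chunk_prefill, so the two functions stay in lockstep. Below is a minimal sketch of that pattern, using hypothetical write()/read() helpers rather than the actual NPUW serialization utilities:

#include <iostream>
#include <sstream>
#include <type_traits>

// Hypothetical helpers mirroring the write()/read() pattern above: each field is
// streamed as raw bytes, so the read order in deserialize() must match the write
// order in serialize() exactly.
template <typename T>
void write(std::ostream& stream, const T& value) {
    static_assert(std::is_trivially_copyable<T>::value, "POD fields only in this sketch");
    stream.write(reinterpret_cast<const char*>(&value), sizeof(T));
}

template <typename T>
void read(std::istream& stream, T& value) {
    stream.read(reinterpret_cast<char*>(&value), sizeof(T));
}

int main() {
    std::stringstream model_stream;

    // serialize(): the new field is written right after m_use_chunk_prefill...
    bool use_chunk_prefill = true, enable_kv_chunk = false;
    write(model_stream, use_chunk_prefill);
    write(model_stream, enable_kv_chunk);

    // deserialize(): ...so it must also be read right after m_use_chunk_prefill.
    bool restored_use_chunk_prefill = false, restored_enable_kv_chunk = true;
    read(model_stream, restored_use_chunk_prefill);
    read(model_stream, restored_enable_kv_chunk);

    std::cout << restored_use_chunk_prefill << " " << restored_enable_kv_chunk << "\n";
    return 0;
}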