Commit a6abb36

[KV Chunk] Add NPUW_LLM_PREFILL_ENABLE_KV_CHUNK.

Signed-off-by: intelgaoxiong <xiong.gao@intel.com>

1 parent 28bb3ff
5 files changed (+25 -8 lines)

src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp

Lines changed: 1 addition & 0 deletions

@@ -111,6 +111,7 @@ DEFINE_OPT(NPUW_LLM_MIN_RESPONSE_LEN, uint32_t, 128, npuw::llm::min_response_len
 DEFINE_OPT(NPUW_LLM_OPTIMIZE_V_TENSORS, bool, true, npuw::llm::optimize_v_tensors, RunTime);
 DEFINE_OPT(NPUW_LLM_CACHE_ROPE, bool, true, npuw::llm::cache_rope, CompileTime);
 DEFINE_OPT(NPUW_LLM_PREFILL_CHUNK_SIZE, uint64_t, 1024, npuw::llm::prefill_chunk_size, RunTime);
+DEFINE_OPT(NPUW_LLM_PREFILL_ENABLE_KV_CHUNK, bool, false, npuw::llm::prefill_enable_kv_chunk, RunTime);
 DEFINE_OPT(NPUW_LLM_SHARED_HEAD, bool, true, npuw::llm::shared_lm_head, CompileTime);
 DEFINE_OPT(NPUW_LLM_MAX_LORA_RANK, uint32_t, 32, npuw::llm::max_lora_rank, RunTime);
 DEFINE_OPT(NPUW_LLM_ENABLE_PREFIX_CACHING, bool, false, npuw::llm::enable_prefix_caching, RunTime);
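The new entry follows the DEFINE_OPT pattern visible in the surrounding lines: property name, value type, default value (false), the npuw::llm accessor, and the option stage (RunTime), mirroring NPUW_LLM_ENABLE_PREFIX_CACHING just below it.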

src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp

Lines changed: 9 additions & 0 deletions

@@ -466,6 +466,15 @@ static constexpr ov::Property<bool> cache_rope{"NPUW_LLM_CACHE_ROPE"};
  */
 static constexpr ov::Property<uint64_t> prefill_chunk_size{"NPUW_LLM_PREFILL_CHUNK_SIZE"};
 
+/**
+ * @brief
+ * Type: bool.
+ * Enable KV chunking during chunked prefill.
+ * By default, only the Q tensor is chunked in chunked prefill; this option enables chunking of the K and V tensors as well.
+ * Default value: false.
+ */
+static constexpr ov::Property<bool> prefill_enable_kv_chunk{"NPUW_LLM_PREFILL_ENABLE_KV_CHUNK"};
+
 /**
  * @brief
  * Type: bool.
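For orientation, a minimal sketch of passing this property at compile time. The model path is a placeholder, and the NPU_USE_NPUW/NPUW_LLM keys are assumed prerequisites for routing through the NPUW LLM pipeline rather than part of this commit:

```cpp
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    ov::AnyMap config{
        {"NPU_USE_NPUW", "YES"},                     // assumption: route through NPUW
        {"NPUW_LLM", "YES"},                         // assumption: enable the LLM pipeline
        {"NPUW_LLM_PREFILL_CHUNK_SIZE", 1024},       // default chunk size, shown explicitly
        {"NPUW_LLM_PREFILL_ENABLE_KV_CHUNK", "YES"}  // the option added by this commit
    };
    auto compiled = core.compile_model("model.xml", "NPU", config);  // placeholder path
    return 0;
}
```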

src/plugins/intel_npu/src/al/src/config/npuw.cpp

Lines changed: 1 addition & 0 deletions

@@ -66,6 +66,7 @@ void intel_npu::registerNPUWLLMOptions(OptionsDesc& desc) {
     desc.add<NPUW_LLM_OPTIMIZE_V_TENSORS>();
     desc.add<NPUW_LLM_CACHE_ROPE>();
     desc.add<NPUW_LLM_PREFILL_CHUNK_SIZE>();
+    desc.add<NPUW_LLM_PREFILL_ENABLE_KV_CHUNK>();
     desc.add<NPUW_LLM_ENABLE_PREFIX_CACHING>();
     desc.add<NPUW_LLM_PREFIX_CACHING_BLOCK_SIZE>();
     desc.add<NPUW_LLM_PREFIX_CACHING_MAX_NUM_BLOCKS>();

src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp

Lines changed: 13 additions & 8 deletions

@@ -8,6 +8,7 @@
 #include <openvino/core/parallel.hpp>
 #include <string>
 
+#include "intel_npu/config/npuw.hpp"
 #include "llm_infer_request.hpp"
 #include "logging.hpp"
 #include "openvino/op/group_query_attention.hpp"
@@ -875,6 +876,10 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
             }
             max_prompt_len = align_to(max_prompt_len, static_cast<uint32_t>(m_prefill_chunk_size));
         }
+    }
+
+    if (m_use_chunk_prefill) {
+        m_enable_kv_chunk = m_cfg.get<::intel_npu::NPUW_LLM_PREFILL_ENABLE_KV_CHUNK>();
 
         m_enable_prefix_caching = m_cfg.get<::intel_npu::NPUW_LLM_ENABLE_PREFIX_CACHING>();
         if (m_enable_prefix_caching) {
@@ -888,26 +893,25 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
             m_prefix_caching_max_num_blocks = m_cfg.get<::intel_npu::NPUW_LLM_PREFIX_CACHING_MAX_NUM_BLOCKS>();
             LOG_INFO("Prefix caching block size: " << m_prefix_caching_block_size);
             LOG_INFO("Prefix caching maximum number of blocks: " << m_prefix_caching_max_num_blocks);
+
+            // Not using KV chunking with prefix caching because it's hard to predict which prefill model will be
+            // used to restore KV from the cache
+            m_enable_kv_chunk = false;
         }
     }
 
     LOG_VERB("Enabled prefill chunking: " << m_use_chunk_prefill);
+    LOG_VERB("Enabled KV chunking: " << m_enable_kv_chunk);
     LOG_VERB("Prefill chunk size: " << m_prefill_chunk_size);
     LOG_VERB("Maximum prompt length: " << max_prompt_len);
 
     m_kvcache_desc = KVCacheDesc{max_prompt_len, max_prompt_len + min_response_len, 0u, seq_len_dim};
     LOG_DEBUG("Make prefill model with static shapes");
     m_max_lora_rank = m_cfg.get<::intel_npu::NPUW_LLM_MAX_LORA_RANK>();
 
-    bool chunk_kv = true;
-    if (m_enable_prefix_caching) {
-        // Not using KV chunking with prefix caching because it's hard to predict which prefill model will be
-        // used to restore KV from the cache
-        chunk_kv = false;
-    }
     std::vector<std::shared_ptr<ov::Model>> prefill_models;
     if (m_use_chunk_prefill) {
-        if (!chunk_kv) {
+        if (!m_enable_kv_chunk) {
             reshape_to_static(prefill_model,
                               static_cast<uint32_t>(m_prefill_chunk_size),
                               m_kvcache_desc.max_prompt_size,
@@ -935,7 +939,6 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
             }
         }
     } else {
-        chunk_kv = false;
         reshape_to_static(prefill_model,
                           m_kvcache_desc.max_prompt_size,
                           m_kvcache_desc.max_prompt_size,
@@ -1168,6 +1171,7 @@ void ov::npuw::LLMCompiledModel::serialize(std::ostream& stream, const ov::npuw:
    write(model_stream, m_kvcache_desc.v_tensors_transposed);
    write(model_stream, m_prefill_chunk_size);
    write(model_stream, m_use_chunk_prefill);
+   write(model_stream, m_enable_kv_chunk);
    write(model_stream, m_max_lora_rank);
    write(model_stream, m_enable_prefix_caching);
    write(model_stream, m_prefix_caching_block_size);
@@ -1388,6 +1392,7 @@ std::shared_ptr<ov::npuw::LLMCompiledModel> ov::npuw::LLMCompiledModel::deserial
    read(model_stream, compiled->m_kvcache_desc.v_tensors_transposed);
    read(model_stream, compiled->m_prefill_chunk_size);
    read(model_stream, compiled->m_use_chunk_prefill);
+   read(model_stream, compiled->m_enable_kv_chunk);
    read(model_stream, compiled->m_max_lora_rank);
    read(model_stream, compiled->m_enable_prefix_caching);
    read(model_stream, compiled->m_prefix_caching_block_size);
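Taken together, the constructor changes reduce to a small decision: KV chunking is opt-in, only meaningful under chunked prefill, and overridden when prefix caching is active. A standalone sketch of that gating (the helper function is illustrative, not part of the plugin):

```cpp
// Resolution order for the KV-chunking flag; the helper is hypothetical,
// but the rules mirror the diff above.
bool resolve_kv_chunk(bool use_chunk_prefill, bool kv_chunk_opt, bool prefix_caching) {
    if (!use_chunk_prefill) {
        return false;  // KV chunking only applies when chunked prefill is on
    }
    if (prefix_caching) {
        // Hard to predict which prefill model will be used to restore KV
        // from the cache, so KV chunking is disabled.
        return false;
    }
    return kv_chunk_opt;  // NPUW_LLM_PREFILL_ENABLE_KV_CHUNK (default: false)
}
```

The serialize/deserialize hunks insert the new field at matching positions, so the write and read orders stay symmetric and the blob layout remains consistent.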

src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp

Lines changed: 1 addition & 0 deletions

@@ -72,6 +72,7 @@ class LLMCompiledModel : public ov::npuw::ICompiledModel {
     KVCacheDesc m_kvcache_desc;
     uint64_t m_prefill_chunk_size = 0;
     bool m_use_chunk_prefill = false;
+    bool m_enable_kv_chunk = false;
     std::shared_ptr<ov::npuw::CompiledModel> m_kvcache_compiled;
     // For KV chunking, we will have multiple prefill models with different past KV shapes
     // For others, we will have a single prefill model
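The header comment hints at why this flag fans out into multiple compiled models: with K and V chunked too, each prefill chunk runs against a different static past-KV length, and each distinct length needs its own statically shaped model. A sketch of the shape enumeration this implies (the helper is an assumption for illustration, not the plugin's code):

```cpp
#include <cstdint>
#include <vector>

// Enumerate the past-KV lengths a chunked prefill would encounter when K/V
// are chunked as well: chunk i starts with i * chunk_size tokens already
// cached. (Illustrative helper under that assumption.)
std::vector<uint64_t> past_kv_lengths(uint64_t max_prompt_len, uint64_t chunk_size) {
    std::vector<uint64_t> lengths;
    for (uint64_t past = 0; past < max_prompt_len; past += chunk_size) {
        lengths.push_back(past);
    }
    return lengths;  // e.g. {0, 1024, 2048, ...} for chunk_size = 1024
}
```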
