@@ -8,6 +8,7 @@
 #include <openvino/core/parallel.hpp>
 #include <string>
 
+#include "intel_npu/config/npuw.hpp"
 #include "llm_infer_request.hpp"
 #include "logging.hpp"
 #include "openvino/op/group_query_attention.hpp"
@@ -875,6 +876,10 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
         }
         max_prompt_len = align_to(max_prompt_len, static_cast<uint32_t>(m_prefill_chunk_size));
     }
+    }
+
+    if (m_use_chunk_prefill) {
+        m_enable_kv_chunk = m_cfg.get<::intel_npu::NPUW_LLM_PREFILL_ENABLE_KV_CHUNK>();
 
     m_enable_prefix_caching = m_cfg.get<::intel_npu::NPUW_LLM_ENABLE_PREFIX_CACHING>();
     if (m_enable_prefix_caching) {
@@ -888,26 +893,25 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
         m_prefix_caching_max_num_blocks = m_cfg.get<::intel_npu::NPUW_LLM_PREFIX_CACHING_MAX_NUM_BLOCKS>();
         LOG_INFO("Prefix caching block size: " << m_prefix_caching_block_size);
         LOG_INFO("Prefix caching maximum number of blocks: " << m_prefix_caching_max_num_blocks);
+
+        // Not using KV chunking with prefix caching because it's hard to predict which prefill model to be used to
+        // restore KV from the cache
+        m_enable_kv_chunk = false;
     }
     }
 
     LOG_VERB("Enabled prefill chunking: " << m_use_chunk_prefill);
+    LOG_VERB("Enabled KV chunking: " << m_enable_kv_chunk);
     LOG_VERB("Prefill chunk size: " << m_prefill_chunk_size);
     LOG_VERB("Maximum prompt length: " << max_prompt_len);
 
     m_kvcache_desc = KVCacheDesc{max_prompt_len, max_prompt_len + min_response_len, 0u, seq_len_dim};
     LOG_DEBUG("Make prefill model with static shapes");
     m_max_lora_rank = m_cfg.get<::intel_npu::NPUW_LLM_MAX_LORA_RANK>();
 
-    bool chunk_kv = true;
-    if (m_enable_prefix_caching) {
-        // Not using KV chunking with prefix caching because it's hard to predict which prefill model to be used to
-        // restore KV from the cache
-        chunk_kv = false;
-    }
     std::vector<std::shared_ptr<ov::Model>> prefill_models;
     if (m_use_chunk_prefill) {
-        if (!chunk_kv) {
+        if (!m_enable_kv_chunk) {
             reshape_to_static(prefill_model,
                               static_cast<uint32_t>(m_prefill_chunk_size),
                               m_kvcache_desc.max_prompt_size,
@@ -935,7 +939,6 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
             }
         }
     } else {
-        chunk_kv = false;
         reshape_to_static(prefill_model,
                           m_kvcache_desc.max_prompt_size,
                           m_kvcache_desc.max_prompt_size,
@@ -1168,6 +1171,7 @@ void ov::npuw::LLMCompiledModel::serialize(std::ostream& stream, const ov::npuw:
         write(model_stream, m_kvcache_desc.v_tensors_transposed);
         write(model_stream, m_prefill_chunk_size);
         write(model_stream, m_use_chunk_prefill);
+        write(model_stream, m_enable_kv_chunk);
         write(model_stream, m_max_lora_rank);
         write(model_stream, m_enable_prefix_caching);
         write(model_stream, m_prefix_caching_block_size);
@@ -1388,6 +1392,7 @@ std::shared_ptr<ov::npuw::LLMCompiledModel> ov::npuw::LLMCompiledModel::deserial
         read(model_stream, compiled->m_kvcache_desc.v_tensors_transposed);
         read(model_stream, compiled->m_prefill_chunk_size);
         read(model_stream, compiled->m_use_chunk_prefill);
+        read(model_stream, compiled->m_enable_kv_chunk);
         read(model_stream, compiled->m_max_lora_rank);
         read(model_stream, compiled->m_enable_prefix_caching);
         read(model_stream, compiled->m_prefix_caching_block_size);
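Note on the serialization change: m_enable_kv_chunk is written in serialize() and read back in deserialize() at the same position in the stream, right after m_use_chunk_prefill, so the two functions stay in lockstep. Below is a minimal sketch of that pattern, using hypothetical write()/read() helpers rather than the actual NPUW serialization utilities:

#include <iostream>
#include <sstream>
#include <type_traits>

// Hypothetical helpers mirroring the write()/read() pattern above: each field is
// streamed as raw bytes, so the read order in deserialize() must match the write
// order in serialize() exactly.
template <typename T>
void write(std::ostream& stream, const T& value) {
    static_assert(std::is_trivially_copyable<T>::value, "POD fields only in this sketch");
    stream.write(reinterpret_cast<const char*>(&value), sizeof(T));
}

template <typename T>
void read(std::istream& stream, T& value) {
    stream.read(reinterpret_cast<char*>(&value), sizeof(T));
}

int main() {
    std::stringstream model_stream;

    // serialize(): the new field is written right after m_use_chunk_prefill...
    bool use_chunk_prefill = true, enable_kv_chunk = false;
    write(model_stream, use_chunk_prefill);
    write(model_stream, enable_kv_chunk);

    // deserialize(): ...so it must also be read right after m_use_chunk_prefill.
    bool restored_use_chunk_prefill = false, restored_enable_kv_chunk = true;
    read(model_stream, restored_use_chunk_prefill);
    read(model_stream, restored_enable_kv_chunk);

    std::cout << restored_use_chunk_prefill << " " << restored_enable_kv_chunk << "\n";
    return 0;
}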