
Commit 28bb3ff

[KV Chunk] Parallelize prefill model compilation and disable KV chunking when prefix caching is enabled.
Signed-off-by: intelgaoxiong <xiong.gao@intel.com>
Parent: 6d070d6

2 files changed: 24 additions, 15 deletions


src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp

Lines changed: 14 additions & 6 deletions
@@ -5,6 +5,7 @@
 
 #include <cstddef>
 #include <cstdint>
+#include <openvino/core/parallel.hpp>
 #include <string>
 
 #include "llm_infer_request.hpp"
@@ -899,6 +900,11 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
     m_max_lora_rank = m_cfg.get<::intel_npu::NPUW_LLM_MAX_LORA_RANK>();
 
     bool chunk_kv = true;
+    if (m_enable_prefix_caching) {
+        // Do not use KV chunking together with prefix caching: it is hard to predict
+        // which prefill model will be used to restore the KV from the cache.
+        chunk_kv = false;
+    }
     std::vector<std::shared_ptr<ov::Model>> prefill_models;
     if (m_use_chunk_prefill) {
         if (!chunk_kv) {
@@ -1036,16 +1042,18 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
         ov::npuw::ICompiledModel::create(kvcache_model, plugin, generate_config));
     NPUW_ASSERT(m_kvcache_compiled && "Can't create ov::npuw::CompiledModel for passed kvcache "
                                       "model and its config, please check passed config.");
-    for (auto prefill_model : prefill_models) {
+
+    m_prefill_compiled.resize(prefill_models.size());
+    ov::parallel_for(prefill_models.size(), [&](size_t idx) {
+        auto prefill_model = prefill_models[idx];
         std::cout << "Start to compile prefill model: " << prefill_model->get_friendly_name() << std::endl;
         auto compiled = std::dynamic_pointer_cast<ov::npuw::CompiledModel>(
             ov::npuw::ICompiledModel::create(prefill_model, plugin, prefill_config));
-        NPUW_ASSERT(compiled && "Can't create ov::npuw::CompiledModel for passed prefill "
-                                "model and its config, please check passed config.");
-        m_prefill_compiled.push_back(std::move(compiled));
-
+        NPUW_ASSERT(compiled && "Can't create ov::npuw::CompiledModel for passed prefill model and its config, please "
+                                "check passed config.");
+        m_prefill_compiled[idx] = std::move(compiled);
         std::cout << "Finished compilation for prefill model: " << prefill_model->get_friendly_name() << std::endl;
-    }
+    });
 
     if (lm_head_model) {
         auto lm_head_config = get_default_lm_head_config(npudesc);
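
Why the loop became ov::parallel_for: calling push_back on m_prefill_compiled from concurrent workers would be a data race, so the vector is pre-sized and each worker writes its own slot by index. Below is a minimal standalone sketch of the same pattern; the model names and the string transform are hypothetical stand-ins for the real compilation call.

#include <openvino/core/parallel.hpp>

#include <string>
#include <vector>

int main() {
    // Hypothetical inputs standing in for the prefill model list.
    std::vector<std::string> models = {"prefill_128", "prefill_1024", "prefill_2048"};

    // Pre-size the result vector so every worker owns a distinct slot;
    // concurrent push_back on a std::vector is not thread-safe.
    std::vector<std::string> compiled(models.size());

    ov::parallel_for(models.size(), [&](size_t idx) {
        // Stand-in for ov::npuw::ICompiledModel::create(...).
        compiled[idx] = "compiled:" + models[idx];
    });
    return 0;
}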

src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp

Lines changed: 10 additions & 9 deletions
@@ -840,8 +840,8 @@ uint64_t ov::npuw::LLMInferRequest::restore_cached_blocks(
                           ? 3u
                           : kvcache_desc.dim;
 
-    // Fixme: support prefix-caching with KV Chunk
-    auto kv_dst_tensor = m_prefill_requests.front()->get_tensor(m_prefill_in_ports.front().at(kv_in_name));
+    // There is a single model for prefix-caching
+    auto kv_dst_tensor = m_prefill_requests.back()->get_tensor(m_prefill_in_ports.back().at(kv_in_name));
     auto kv_dst_slice = make_tensor_slice(kv_dst_tensor,
                                           kv_dim,
                                           static_cast<uint32_t>(token_start),
@@ -875,8 +875,8 @@ void ov::npuw::LLMInferRequest::store_blocks_in_cache(size_t chunk_size,
     }
 
     auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc;
-    // Fixme: support prefix-caching with KV Chunk
-    const auto& prefill_compiled = m_prefill_requests.front()->get_compiled_model();
+    // There is a single model for prefix-caching
+    const auto& prefill_compiled = m_prefill_requests.back()->get_compiled_model();
 
     // Process input chunk in blocks
     const uint64_t chunk_prompt_len = m_npuw_llm_compiled_model->m_prefill_chunk_size;
@@ -903,8 +903,8 @@ void ov::npuw::LLMInferRequest::store_blocks_in_cache(size_t chunk_size,
                           ? 3u
                           : kvcache_desc.dim;
 
-        // Fixme: support prefix-caching with KV Chunk
-        auto kv_src_tensor = m_prefill_requests.front()->get_tensor(m_prefill_out_ports.front().at(output_name));
+        // There is a single model for prefix-caching
+        auto kv_src_tensor = m_prefill_requests.back()->get_tensor(m_prefill_out_ports.back().at(output_name));
         auto kv_src_slice = make_tensor_slice(kv_src_tensor,
                                               kv_dim,
                                               static_cast<uint32_t>(block_start),
@@ -993,9 +993,10 @@ void ov::npuw::LLMInferRequest::infer_chunked_prefill(ov::SoPtr<ov::ITensor> inp
     prompt_hashes = calculate_hashes(input_ids);
 
     // Create output to input ports name for convenience
-    // Fixme: support prefix-caching with KV Chunk
-    const auto& prefill_compiled = m_prefill_requests.front()->get_compiled_model();
-    input_name_map = create_output_to_input_name_mapping(prefill_compiled, m_prefill_in_ports.front());
+
+    // There is a single model for prefix-caching
+    const auto& prefill_compiled = m_prefill_requests.back()->get_compiled_model();
+    input_name_map = create_output_to_input_name_mapping(prefill_compiled, m_prefill_in_ports.back());
 
     // Try to restore prefilled prompts from cache
     auto restored_token_num =
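
Why .front() becomes .back(): with the companion change above, KV chunking is disabled whenever prefix caching is enabled, so on the prefix-caching path m_prefill_requests and its port maps hold exactly one prefill entry, and front() and back() refer to the same element. A tiny sketch of that invariant, using a plain int vector as a hypothetical stand-in for the request list:

#include <cassert>
#include <vector>

int main() {
    // With KV chunking disabled, the prefix-caching path sees a single
    // prefill request, so front() and back() alias the same element.
    std::vector<int> prefill_requests = {42};
    assert(&prefill_requests.front() == &prefill_requests.back());
    return 0;
}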
