@@ -840,8 +840,8 @@ uint64_t ov::npuw::LLMInferRequest::restore_cached_blocks(
840
840
? 3u
841
841
: kvcache_desc.dim ;
842
842
843
- // Fixme: support prefix-caching with KV Chunk
844
- auto kv_dst_tensor = m_prefill_requests.front ()->get_tensor (m_prefill_in_ports.front ().at (kv_in_name));
843
+ // There is a single model for prefix-caching
844
+ auto kv_dst_tensor = m_prefill_requests.back ()->get_tensor (m_prefill_in_ports.back ().at (kv_in_name));
845
845
auto kv_dst_slice = make_tensor_slice (kv_dst_tensor,
846
846
kv_dim,
847
847
static_cast <uint32_t >(token_start),
@@ -875,8 +875,8 @@ void ov::npuw::LLMInferRequest::store_blocks_in_cache(size_t chunk_size,
875
875
}
876
876
877
877
auto & kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc ;
878
- // Fixme: support prefix-caching with KV Chunk
879
- const auto & prefill_compiled = m_prefill_requests.front ()->get_compiled_model ();
878
+ // There is a single model for prefix-caching
879
+ const auto & prefill_compiled = m_prefill_requests.back ()->get_compiled_model ();
880
880
881
881
// Process input chunk in blocks
882
882
const uint64_t chunk_prompt_len = m_npuw_llm_compiled_model->m_prefill_chunk_size ;
@@ -903,8 +903,8 @@ void ov::npuw::LLMInferRequest::store_blocks_in_cache(size_t chunk_size,
903
903
? 3u
904
904
: kvcache_desc.dim ;
905
905
906
- // Fixme: support prefix-caching with KV Chunk
907
- auto kv_src_tensor = m_prefill_requests.front ()->get_tensor (m_prefill_out_ports.front ().at (output_name));
906
+ // There is a single model for prefix-caching
907
+ auto kv_src_tensor = m_prefill_requests.back ()->get_tensor (m_prefill_out_ports.back ().at (output_name));
908
908
auto kv_src_slice = make_tensor_slice (kv_src_tensor,
909
909
kv_dim,
910
910
static_cast <uint32_t >(block_start),
@@ -993,9 +993,10 @@ void ov::npuw::LLMInferRequest::infer_chunked_prefill(ov::SoPtr<ov::ITensor> inp
993
993
prompt_hashes = calculate_hashes (input_ids);
994
994
995
995
// Create output to input ports name for convenience
996
- // Fixme: support prefix-caching with KV Chunk
997
- const auto & prefill_compiled = m_prefill_requests.front ()->get_compiled_model ();
998
- input_name_map = create_output_to_input_name_mapping (prefill_compiled, m_prefill_in_ports.front ());
996
+
997
+ // There is a single model for prefix-caching
998
+ const auto & prefill_compiled = m_prefill_requests.back ()->get_compiled_model ();
999
+ input_name_map = create_output_to_input_name_mapping (prefill_compiled, m_prefill_in_ports.back ());
999
1000
1000
1001
// Try to restore prefilled prompts from cache
1001
1002
auto restored_token_num =
0 commit comments