6 changes: 4 additions & 2 deletions src/plugins/intel_npu/src/backend/include/zero_device.hpp
@@ -32,8 +32,10 @@ class ZeroDevice : public IDevice {
std::map<ov::element::Type, float> getGops() const override;
ov::device::Type getDeviceType() const override;

std::shared_ptr<SyncInferRequest> createInferRequest(const std::shared_ptr<const ICompiledModel>& compiledModel,
const Config& config) override;
std::shared_ptr<ov::IInferRequest> createInferRequest(const std::shared_ptr<const ICompiledModel>& compiledModel,
const Config& config,
std::function<void(void)>& inferAsyncF,
std::function<void(void)>& getResultF) override;
void updateInfo(const Config& config) override {
log.setLevel(config.get<LOG_LEVEL>());
}
181 changes: 172 additions & 9 deletions src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp
@@ -8,7 +8,6 @@
#include <ze_graph_ext.h>

#include "intel_npu/common/npu.hpp"
#include "intel_npu/common/sync_infer_request.hpp"
#include "intel_npu/utils/logger/logger.hpp"
#include "intel_npu/utils/zero/zero_remote_tensor.hpp"
#include "intel_npu/utils/zero/zero_utils.hpp"
@@ -18,23 +17,162 @@

namespace intel_npu {

class ZeroInferRequest final : public SyncInferRequest {
class ZeroInferRequest final : public ov::IInferRequest {
public:
explicit ZeroInferRequest(const std::shared_ptr<ZeroInitStructsHolder>& initStructs,
const std::shared_ptr<const ICompiledModel>& compiledModel,
const Config& config);

/**
* @brief Gets an input/output tensor for inference.
* @note If the tensor with the specified @p port is not found, an exception is thrown.
* @param port Port of the tensor to get.
* @return Tensor for the port @p port.
*/
ov::SoPtr<ov::ITensor> get_tensor(const ov::Output<const ov::Node>& port) const override;

/**
* @brief Sets an input/output tensor to infer.
* @param port Port of the input or output tensor.
* @param tensor Reference to a tensor. The element_type and shape of a tensor must match
* the model's input/output element_type and size.
*/
void set_tensor(const ov::Output<const ov::Node>& port, const ov::SoPtr<ov::ITensor>& tensor) override;

/**
* @brief Gets an input/output tensor for inference.
* @note If the tensor with the specified @p port is not found, an exception is thrown.
* @param port Port of the batched tensors to get.
* @return Vector of batched tensors for the input port @p port or empty vector if port is output.
*/
std::vector<ov::SoPtr<ov::ITensor>> get_tensors(const ov::Output<const ov::Node>& port) const override;

/**
* @brief Sets batched input tensors to infer
* @param port Port of the batched input tensor.
* @param tensors Vector of references to batched tensors. The element_type and shape of each must match.
* @note Batched tensors for outputs are not supported.
* @note If a single-element vector is provided for the @p tensors parameter, a fallback to the "set_tensor" function will occur.
*/
void set_tensors(const ov::Output<const ov::Node>& port,
const std::vector<ov::SoPtr<ov::ITensor>>& tensors) override;

/**
* @brief Gets inputs for infer request
*
* @return vector of input ports
*/
const std::vector<ov::Output<const ov::Node>>& get_inputs() const override;

/**
* @brief Gets outputs for infer request
*
* @return vector of output ports
*/
const std::vector<ov::Output<const ov::Node>>& get_outputs() const override;

/**
* @brief Gets pointer to compiled model (usually synchronous request holds the compiled model)
*
* @return Pointer to the compiled model
*/
const std::shared_ptr<const ov::ICompiledModel>& get_compiled_model() const override;

/**
* @brief Calls "infer_async" then "get_result"
*/
void infer() override;
void infer_async() override;

void get_result() override;
/**
* @brief Used for executing the inference.
*/
void infer_async();

/**
* @brief Used for retrieving the prediction's result.
*/
void get_result();

/**
* @brief Used for retrieving the current values of the network's variables.
* @return Vector of each state value
*/
std::vector<ov::SoPtr<ov::IVariableState>> query_state() const override;

/**
* @brief Initializes the tensor values corresponding to the state variables.
* @details The initial values are usually all 0s.
*/
void initialize_states();

private:
/**
* @see ov::ISyncInferRequest
*/
struct FoundPort {
size_t idx;
enum class Type { NOT_FOUND = 0, INPUT, OUTPUT } type;

bool found() {
return type != Type::NOT_FOUND;
}
bool is_input() {
return type == Type::INPUT;
}
bool is_output() {
return type == Type::OUTPUT;
}
};

/**
* @brief Finds input or output port
* @return Structure which contains the index of the input/output, or reports that the port wasn't found
* @see ov::ISyncInferRequest
*/
FoundPort find_port(const ov::Output<const ov::Node>& port) const;

/**
* @brief Basic checks for input/output tensor
*
* @param port Input/Output port
* @param tensor Input/Output tensor
*/
void check_tensor(const ov::Output<const ov::Node>& port, const ov::SoPtr<ov::ITensor>& tensor) const;

/**
* @brief Basic checks for batched input tensors
*
* @param port Input port
* @param tensors Input tensors
*/
void check_batched_tensors(const ov::Output<const ov::Node>& port,
const std::vector<ov::SoPtr<ov::ITensor>>& tensors) const;

/**
* @brief Checks that all tensors are valid. Throws an exception if they are not.
*/
void check_tensors() const override;

/**
* @brief Allocates a tensor on host and stores the reference inside multiple attributes.
* @param descriptor Tensor's metadata
* @param index The index which the allocated tensor shall use.
* @param isInput Determines the containers in which the newly allocated tensors will be stored.
* @param allocator If provided, the tensor uses the custom allocator instead of using the default one.
* @param batchSize If provided, the value of the shape on the 0th axis is overridden with this value.
* @return Pointer towards the allocated tensor
*/
std::shared_ptr<ov::ITensor> allocate_tensor(const IODescriptor& descriptor,
const size_t index,
const bool isInput,
const ov::Allocator& allocator = {},
const std::optional<std::size_t> batchSize = std::nullopt) const;

bool is_batched_input(size_t idx) const;

ov::SoPtr<ov::ITensor>& get_user_input(size_t index) const;
std::vector<ov::SoPtr<ov::ITensor>>& get_user_inputs(size_t index) const;

std::vector<ov::ProfilingInfo> get_profiling_info() const override;

/**
@@ -55,23 +193,32 @@ class ZeroInferRequest final : public SyncInferRequest {
const size_t index,
const bool isInput);

void check_network_precision(const ov::element::Type_t precision) const override;
/**
* @brief Checks if the provided precision value is supported by the current backend; throws an error
* otherwise.
* @param precision The precision value to be checked.
*/
void check_network_precision(const ov::element::Type_t precision) const;
void create_pipeline();

std::shared_ptr<ov::ITensor>& get_level_zero_input(size_t index, size_t tensorNo = 0) const;
std::vector<std::shared_ptr<ov::ITensor>>& get_level_zero_inputs(size_t index) const;

std::shared_ptr<ov::ITensor> create_tensor(ov::element::Type type,
const ov::Shape& shape,
const ov::Allocator& allocator = {}) const override;
std::shared_ptr<ZeroTensor> create_tensor(ov::element::Type type,
const ov::Shape& shape,
const ov::Allocator& allocator = {}) const;

void add_state(const IODescriptor& descriptor, size_t tensorIndex) const override;
void add_state(const IODescriptor& descriptor, size_t tensorIndex) const;

void update_pipeline_if_memory_changed();
void update_states_if_memory_changed();

const std::shared_ptr<ZeroInitStructsHolder> _initStructs;
const std::shared_ptr<IGraph> _graph;
NetworkMetadata _metadata;
// This is an intel_npu::ICompiledModel pointer, but the OV base class must be used because
// ov::IInferRequest::get_compiled_model() returns a reference to a shared_ptr!
std::shared_ptr<const ov::ICompiledModel> _compiledModel;
const Config _config;
Logger _logger;

@@ -83,6 +230,12 @@ class ZeroInferRequest final : public SyncInferRequest {
mutable std::vector<std::vector<std::shared_ptr<ov::ITensor>>> _levelZeroInputTensors;
mutable std::vector<std::shared_ptr<ov::ITensor>> _levelZeroOutputTensors;

// In case set_tensors is called, we receive a vector with N tensors; otherwise, only 1 tensor is needed
mutable std::vector<std::vector<ov::SoPtr<ov::ITensor>>> _userInputTensors;
mutable std::vector<ov::SoPtr<ov::ITensor>> _userOutputTensors;

mutable std::vector<ov::SoPtr<ov::IVariableState>> _variableStates;

std::shared_ptr<const zeroMemory::HostMemAllocator> _inputAllocator;
std::shared_ptr<const zeroMemory::HostMemAllocator> _outputAllocator;

@@ -91,6 +244,16 @@
bool _pipelineIsCreated = false;
bool _dynamicBatchValueChanged = false;
bool _externalMemoryStandardAllocationSupported = false;

/**
* @see ov::ISyncInferRequest
*/
mutable std::unordered_map<size_t, FoundPort> _cachedPorts;

/**
* @see ov::ISyncInferRequest
*/
mutable std::mutex _cacheMutex;
};

} // namespace intel_npu
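
Since ZeroInferRequest now implements ov::IInferRequest directly, the port-lookup and user-tensor members declared above take over what intel_npu::SyncInferRequest used to provide. The sketch below only illustrates how the new public entry points could sit on top of those members; the function bodies are assumptions, not this PR's actual implementation.

// Sketch only: member definitions assumed from the header shown above; the real code may differ.
#include "zero_infer_request.hpp"
#include "openvino/core/except.hpp"

namespace intel_npu {

// The synchronous path simply chains the two stages, as documented on infer().
void ZeroInferRequest::infer() {
    infer_async();
    get_result();
}

// A plausible get_tensor(): resolve the port once (cached inside find_port), then return the
// matching user tensor from the containers declared in the header.
ov::SoPtr<ov::ITensor> ZeroInferRequest::get_tensor(const ov::Output<const ov::Node>& port) const {
    auto foundPort = find_port(port);
    OPENVINO_ASSERT(foundPort.found(), "Cannot find tensor for port!");

    if (foundPort.is_input()) {
        return get_user_input(foundPort.idx);
    }
    return _userOutputTensors.at(foundPort.idx);
}

}  // namespace intel_npu
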
@@ -38,7 +38,7 @@ class HostMemAllocator {
* @param handle Pointer to allocated data
* @return false if handle cannot be released, otherwise - true.
*/
bool deallocate(void* handle, const size_t bytes, size_t alignment = utils::STANDARD_PAGE_SIZE) noexcept;
virtual bool deallocate(void* handle, const size_t bytes, size_t alignment = utils::STANDARD_PAGE_SIZE) noexcept;

bool is_equal(const HostMemAllocator& other) const;

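
Making deallocate() virtual lets a specialized allocator be substituted behind the HostMemAllocator interface. The snippet below is a minimal sketch of that possibility; the derived class and the inherited-constructor usage are hypothetical and not part of this PR.

// Sketch only: hypothetical derived allocator illustrating the now-virtual deallocate().
// (Include the header that declares zeroMemory::HostMemAllocator.)
namespace intel_npu {

class TrackingHostMemAllocator final : public zeroMemory::HostMemAllocator {
public:
    using HostMemAllocator::HostMemAllocator;  // assumption: the base constructors are usable as-is

    bool deallocate(void* handle, const size_t bytes, size_t alignment = utils::STANDARD_PAGE_SIZE) noexcept override {
        // Custom bookkeeping (logging, statistics, ...) could run before releasing the memory.
        return HostMemAllocator::deallocate(handle, bytes, alignment);
    }
};

}  // namespace intel_npu
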
21 changes: 18 additions & 3 deletions src/plugins/intel_npu/src/backend/src/zero_device.cpp
@@ -169,8 +169,23 @@ ov::device::Type ZeroDevice::getDeviceType() const {
return ov::device::Type::INTEGRATED;
}

std::shared_ptr<SyncInferRequest> ZeroDevice::createInferRequest(
std::shared_ptr<ov::IInferRequest> ZeroDevice::createInferRequest(
const std::shared_ptr<const ICompiledModel>& compiledModel,
const Config& config) {
return std::make_shared<ZeroInferRequest>(_initStructs, compiledModel, config);
const Config& config,
std::function<void(void)>& inferAsyncF,
std::function<void(void)>& getResultF) {
auto inferRequest = std::make_shared<ZeroInferRequest>(_initStructs, compiledModel, config);
    // Capture a weak_ptr: the local shared_ptr goes out of scope when this function returns,
    // so capturing it by reference would leave the callbacks with a dangling reference.
    std::weak_ptr<ZeroInferRequest> weakInferRequest = inferRequest;
    inferAsyncF = [weakInferRequest]() {
        auto request = weakInferRequest.lock();
        if (!request) {
            OPENVINO_THROW("ZeroInferRequest object was destroyed!");
        }
        request->infer_async();
    };
    getResultF = [weakInferRequest]() {
        auto request = weakInferRequest.lock();
        if (!request) {
            OPENVINO_THROW("ZeroInferRequest object was destroyed!");
        }
        request->get_result();
    };
return inferRequest;
}
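
For context, here is a rough sketch of how a caller might consume the two callbacks handed back through the out-parameters; the plugin-side consumer is not part of this diff, and the function and variable names below are purely illustrative.

#include <functional>
#include <memory>

// Sketch only: hypothetical caller; run_single_inference and its arguments are illustrative.
void run_single_inference(intel_npu::ZeroDevice& device,
                          const std::shared_ptr<const intel_npu::ICompiledModel>& compiledModel,
                          const intel_npu::Config& config) {
    std::function<void(void)> inferAsyncF;
    std::function<void(void)> getResultF;

    // Keeping the returned request alive is what keeps the two callbacks valid.
    auto request = device.createInferRequest(compiledModel, config, inferAsyncF, getResultF);

    inferAsyncF();  // start executing the inference
    getResultF();   // wait for and retrieve the prediction's result
}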