#include <ze_graph_ext.h>

#include "intel_npu/common/npu.hpp"
- #include "intel_npu/common/sync_infer_request.hpp"
#include "intel_npu/utils/logger/logger.hpp"
#include "intel_npu/utils/zero/zero_remote_tensor.hpp"
#include "intel_npu/utils/zero/zero_utils.hpp"

namespace intel_npu {

- class ZeroInferRequest final : public SyncInferRequest {
+ class ZeroInferRequest final : public ov::IInferRequest {
public:
    explicit ZeroInferRequest(const std::shared_ptr<ZeroInitStructsHolder>& initStructs,
                              const std::shared_ptr<const ICompiledModel>& compiledModel,
                              const Config& config);

+    /**
+     * @brief Gets an input/output tensor for inference.
+     * @note If the tensor with the specified @p port is not found, an exception is thrown.
+     * @param port Port of the tensor to get.
+     * @return Tensor for the port @p port.
+     */
    ov::SoPtr<ov::ITensor> get_tensor(const ov::Output<const ov::Node>& port) const override;
+
+    /**
+     * @brief Sets an input/output tensor to infer.
+     * @param port Port of the input or output tensor.
+     * @param tensor Reference to a tensor. The element_type and shape of a tensor must match
+     * the model's input/output element_type and size.
+     */
    void set_tensor(const ov::Output<const ov::Node>& port, const ov::SoPtr<ov::ITensor>& tensor) override;
+
+    /**
+     * @brief Gets an input/output tensor for inference.
+     * @note If the tensor with the specified @p port is not found, an exception is thrown.
+     * @param port Port of the batched tensors to get.
+     * @return Vector of batched tensors for the input port @p port, or an empty vector if the port is an output.
+     */
+    std::vector<ov::SoPtr<ov::ITensor>> get_tensors(const ov::Output<const ov::Node>& port) const override;
+
+    /**
+     * @brief Sets batched input tensors to infer.
+     * @param port Port of the batched input tensor.
+     * @param tensors Vector of references to batched tensors. The element_type and shape of each must match.
+     * @note Batched tensors for outputs are not supported.
+     * @note If a single-element vector is provided for the @p tensors param, a fallback to the "set_tensor" function will occur.
+     */
    void set_tensors(const ov::Output<const ov::Node>& port,
                     const std::vector<ov::SoPtr<ov::ITensor>>& tensors) override;

+    /**
+     * @brief Gets inputs for infer request
+     *
+     * @return vector of input ports
+     */
+    const std::vector<ov::Output<const ov::Node>>& get_inputs() const override;
+
+    /**
+     * @brief Gets outputs for infer request
+     *
+     * @return vector of output ports
+     */
+    const std::vector<ov::Output<const ov::Node>>& get_outputs() const override;
+
+    /**
+     * @brief Gets a pointer to the compiled model (usually the synchronous request holds the compiled model)
+     *
+     * @return Pointer to the compiled model
+     */
+    const std::shared_ptr<const ov::ICompiledModel>& get_compiled_model() const override;
+
+    /**
+     * @brief Calls "infer_async" and then "get_result".
+     */
    void infer() override;
-    void infer_async() override;

-    void get_result() override;
+    /**
+     * @brief Used for executing the inference.
+     */
+    void infer_async();
+
+    /**
+     * @brief Used for retrieving the prediction's result.
+     */
+    void get_result();
+
+    /**
+     * @brief Used for retrieving the current values of the network's variables.
+     * @return Vector of each state value
+     */
+    std::vector<ov::SoPtr<ov::IVariableState>> query_state() const override;
+
+    /**
+     * @brief Initializes the tensor values corresponding to the state variables.
+     * @details The initial values are usually all 0s.
+     */
+    void initialize_states();

private:
+    /**
+     * @see ov::ISyncInferRequest
+     */
+    struct FoundPort {
+        size_t idx;
+        enum class Type { NOT_FOUND = 0, INPUT, OUTPUT } type;
+
+        bool found() {
+            return type != Type::NOT_FOUND;
+        }
+        bool is_input() {
+            return type == Type::INPUT;
+        }
+        bool is_output() {
+            return type == Type::OUTPUT;
+        }
+    };
+
+    /**
+     * @brief Finds an input or output port
+     * @return A structure which contains the index of the input/output, or reports that the port wasn't found
+     * @see ov::ISyncInferRequest
+     */
+    FoundPort find_port(const ov::Output<const ov::Node>& port) const;
+
+    /**
+     * @brief Basic checks for an input/output tensor
+     *
+     * @param port Input/Output port
+     * @param tensor Input/Output tensor
+     */
+    void check_tensor(const ov::Output<const ov::Node>& port, const ov::SoPtr<ov::ITensor>& tensor) const;
+
+    /**
+     * @brief Basic checks for input tensors
+     *
+     * @param port Input port
+     * @param tensors Input tensors
+     */
+    void check_batched_tensors(const ov::Output<const ov::Node>& port,
+                               const std::vector<ov::SoPtr<ov::ITensor>>& tensors) const;
+
+    /**
+     * @brief Checks that all tensors are valid. Throws an exception if they are not.
+     */
+    void check_tensors() const override;
+
+    /**
+     * @brief Allocates a tensor on host and stores the reference inside multiple attributes.
+     * @param descriptor Tensor's metadata
+     * @param index The index which the allocated tensor shall use.
+     * @param isInput Determines the containers in which the newly allocated tensors will be stored.
+     * @param allocator If provided, the tensor uses the custom allocator instead of using the default one.
+     * @param batchSize If provided, the value of the shape on the 0th axis is overridden with this value.
+     * @return Pointer towards the allocated tensor
+     */
+    std::shared_ptr<ov::ITensor> allocate_tensor(const IODescriptor& descriptor,
+                                                 const size_t index,
+                                                 const bool isInput,
+                                                 const ov::Allocator& allocator = {},
+                                                 const std::optional<std::size_t> batchSize = std::nullopt) const;
+
+    // Accessors for the user-provided tensors stored in _userInputTensors/_userOutputTensors below
+    bool is_batched_input(size_t idx) const;
+
+    ov::SoPtr<ov::ITensor>& get_user_input(size_t index) const;
+    std::vector<ov::SoPtr<ov::ITensor>>& get_user_inputs(size_t index) const;
+
    std::vector<ov::ProfilingInfo> get_profiling_info() const override;

    /**
@@ -55,23 +193,32 @@ class ZeroInferRequest final : public SyncInferRequest {
                     const size_t index,
                     const bool isInput);

-    void check_network_precision(const ov::element::Type_t precision) const override;
+    /**
+     * @brief Checks whether the provided precision value is supported by the current backend; throws an error
+     * otherwise.
+     * @param precision The precision value to be checked.
+     */
+    void check_network_precision(const ov::element::Type_t precision) const;
    void create_pipeline();

    std::shared_ptr<ov::ITensor>& get_level_zero_input(size_t index, size_t tensorNo = 0) const;
    std::vector<std::shared_ptr<ov::ITensor>>& get_level_zero_inputs(size_t index) const;

-    std::shared_ptr<ov::ITensor> create_tensor(ov::element::Type type,
-                                               const ov::Shape& shape,
-                                               const ov::Allocator& allocator = {}) const override;
+    std::shared_ptr<ZeroTensor> create_tensor(ov::element::Type type,
+                                              const ov::Shape& shape,
+                                              const ov::Allocator& allocator = {}) const;

-    void add_state(const IODescriptor& descriptor, size_t tensorIndex) const override;
+    void add_state(const IODescriptor& descriptor, size_t tensorIndex) const;

    void update_pipeline_if_memory_changed();
    void update_states_if_memory_changed();

    const std::shared_ptr<ZeroInitStructsHolder> _initStructs;
    const std::shared_ptr<IGraph> _graph;
+    NetworkMetadata _metadata;
+    // This is an intel_npu::ICompiledModel pointer, but the OV base class must be used because
+    // ov::IInferRequest::get_compiled_model returns a reference to the shared_ptr!
+    std::shared_ptr<const ov::ICompiledModel> _compiledModel;
    const Config _config;
    Logger _logger;

@@ -83,6 +230,12 @@ class ZeroInferRequest final : public SyncInferRequest {
    mutable std::vector<std::vector<std::shared_ptr<ov::ITensor>>> _levelZeroInputTensors;
    mutable std::vector<std::shared_ptr<ov::ITensor>> _levelZeroOutputTensors;

+    // In case set_tensors is called, we receive a vector with N tensors; otherwise only 1 tensor is needed
+    mutable std::vector<std::vector<ov::SoPtr<ov::ITensor>>> _userInputTensors;
+    mutable std::vector<ov::SoPtr<ov::ITensor>> _userOutputTensors;
+
+    mutable std::vector<ov::SoPtr<ov::IVariableState>> _variableStates;
+
    std::shared_ptr<const zeroMemory::HostMemAllocator> _inputAllocator;
    std::shared_ptr<const zeroMemory::HostMemAllocator> _outputAllocator;

@@ -91,6 +244,16 @@ class ZeroInferRequest final : public SyncInferRequest {
    bool _pipelineIsCreated = false;
    bool _dynamicBatchValueChanged = false;
    bool _externalMemoryStandardAllocationSupported = false;
+
+    /**
+     * @see ov::ISyncInferRequest
+     */
+    mutable std::unordered_map<size_t, FoundPort> _cachedPorts;
+
+    /**
+     * @see ov::ISyncInferRequest
+     */
+    mutable std::mutex _cacheMutex;
};

} // namespace intel_npu
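
For orientation, applications never construct ZeroInferRequest directly; it is created by the NPU plugin and driven through the public ov::InferRequest facade. The sketch below shows the usual public-API path that ultimately dispatches into the methods declared above (set_tensor, infer, get_tensor); the model path, device name usage, and the single-input/single-output assumption are illustrative placeholders, not part of this change.

// Minimal usage sketch, assuming a single-input/single-output model at "model.xml".
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    // Compile the model for the NPU device ("NPU" is the device name registered by this plugin).
    ov::CompiledModel compiled = core.compile_model("model.xml", "NPU");

    // The public InferRequest forwards to the plugin-side request implementation.
    ov::InferRequest request = compiled.create_infer_request();

    // Bind an input tensor that matches the model's single input port.
    ov::Tensor input(compiled.input().get_element_type(), compiled.input().get_shape());
    request.set_tensor(compiled.input(), input);

    // Synchronous inference; start_async() + wait() could be used for the asynchronous path.
    request.infer();

    // Read back the result bound to the single output port.
    ov::Tensor output = request.get_tensor(compiled.output());
    (void)output;
    return 0;
}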