Skip to content

Commit 4c248f6

Browse files
[GPU] Optimize node's memdeps to reduce memory footprints. (#29237)
[GPU] Reduce memory footprint by optimizing node's memdeps. std::unordered_set<size_t> was originally used as program_node's memory_dependency and primitive_inst's runtime_memory_dependency for better memory pool performance; however, it is not memory efficient. This optimization takes advantage of the fact that the runtime memdeps (of primitive_inst instances) are initialized from the compile-stage memdeps (of program_node instances) and are only appended to during some runtime skip passes. ### Details: - [x] Change the memdeps set element type from size_t to uint32_t - [x] Reserve unordered_set memory in Serializer to reduce the memory overhead of unordered_set when importing from cache_dir - [x] Remove unnecessary memory dependencies, such as those of constant nodes - [x] Reduce memory dependencies of ReadValue nodes when they are optimized out by reusing the Variable's memory. - [x] Split "initial" and "runtime" memory dependencies to reduce overhead in program_node and primitive_inst ### Tickets: - *CVS-160820, CVS-163673*
1 parent 3423040 commit 4c248f6

File tree

12 files changed

+91
-76
lines changed

12 files changed

+91
-76
lines changed

src/plugins/intel_gpu/include/intel_gpu/graph/serialization/set_serializer.hpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ class Serializer<BufferType, std::unordered_set<T>, typename std::enable_if<std:
5555
static void load(BufferType& buffer, std::unordered_set<T>& set) {
5656
typename std::unordered_set<T>::size_type set_size = 0UL;
5757
buffer >> set_size;
58-
58+
if (set.empty()) set.reserve(set_size);
5959
for (long unsigned int i = 0; i < set_size; i++) {
6060
T el;
6161
buffer >> el;

src/plugins/intel_gpu/include/intel_gpu/runtime/memory_pool.hpp

+45-9
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,47 @@ class engine;
2626
using primitive_id = std::string;
2727
using memory_ptr = std::shared_ptr<memory>;
2828

29+
/// Two-tier set of memory restrictions: a borrowed, read-only base set plus
/// an owned set that receives the keys added later through insert().
///
/// The base set is held through a non-owning pointer, so it is never copied
/// and never modified by this class. The caller must keep the pointed-to set
/// alive for as long as this object is used. A default-constructed object has
/// no base set and behaves like a plain set holding only the inserted keys.
///
/// @tparam Key      element type stored in both sets
/// @tparam Hash     hash functor forwarded to std::unordered_set
/// @tparam KeyEqual equality functor forwarded to std::unordered_set
template<typename Key, typename Hash = std::hash<Key>, typename KeyEqual = std::equal_to<Key>>
class memory_restricter {
private:
    using set_type = std::unordered_set<Key, Hash, KeyEqual>;

    const set_type* set1;  // Non-owning pointer to the read-only base set; nullptr when there is none
    set_type set2;         // Owned set of keys added through insert()

    // True when a base set is attached and it holds `key`.
    // Centralizes the null check so no member dereferences a null set1.
    bool in_base(const Key& key) const {
        return set1 != nullptr && set1->find(key) != set1->end();
    }

public:
    /// Creates a restricter without a base set; only inserted keys are tracked.
    memory_restricter() : set1(nullptr) {}

    /// Creates a restricter layered on top of `externalSet`.
    /// @param externalSet borrowed base set (may be nullptr); not copied, not owned.
    explicit memory_restricter(const std::unordered_set<Key, Hash, KeyEqual>* externalSet)
        : set1(externalSet) {}

    /// Adds `key` to the owned set unless the base set already holds it,
    /// which keeps the two sets disjoint so size() does not double count.
    void insert(const Key& key) {
        if (!in_base(key))
            set2.insert(key);
    }

    /// @return true when `key` is present in the base set or in the owned set.
    bool contains(const Key& key) const {
        return in_base(key) || set2.find(key) != set2.end();
    }

    /// @return number of keys in the base set (0 when absent) plus the owned set.
    size_t size() const {
        return (set1 != nullptr ? set1->size() : 0) + set2.size();
    }

    /// @return true when neither the base set nor the owned set holds any key.
    bool empty() const {
        return (set1 == nullptr || set1->empty()) && set2.empty();
    }

    /// Invokes `func(key)` for every key of the base set, then for every key
    /// of the owned set. Accepts any callable taking `const Key&`, including
    /// plain function pointers and capturing lambdas.
    template <typename Func>
    void for_each(Func&& func) const {
        if (set1 != nullptr) {
            // set1 is a pointer: iterate over the pointed-to set, not the pointer.
            for (const auto& key : *set1) func(key);
        }
        for (const auto& key : set2) func(key);
    }
};  // end of memory_restricter
69+
2970
struct memory_user {
3071
size_t _unique_id;
3172
uint32_t _network_id;
@@ -112,7 +153,7 @@ struct padded_pool_comparer {
112153

113154
class memory_pool {
114155
memory_ptr alloc_memory(const layout& layout, allocation_type type, bool reset = true);
115-
static bool has_conflict(const memory_set&, const std::unordered_set<size_t>&, uint32_t network_id);
156+
static bool has_conflict(const memory_set&, const memory_restricter<uint32_t>&);
116157

117158
std::multimap<uint64_t, memory_record> _non_padded_pool;
118159
std::map<layout, std::list<memory_record>, padded_pool_comparer> _padded_pool;
@@ -127,7 +168,7 @@ class memory_pool {
127168
const primitive_id& id,
128169
size_t unique_id,
129170
uint32_t network_id,
130-
const std::unordered_set<size_t>& restrictions,
171+
const memory_restricter<uint32_t>& restrictions,
131172
allocation_type type,
132173
bool reusable = true,
133174
bool reset = true,
@@ -137,21 +178,16 @@ class memory_pool {
137178
const primitive_id& prim_id,
138179
size_t unique_id,
139180
uint32_t network_id,
140-
const std::unordered_set<size_t>&,
181+
const memory_restricter<uint32_t>&,
141182
allocation_type type,
142183
bool reset = true,
143184
bool is_dynamic = false);
144185
memory_ptr get_from_padded_pool(const layout& layout,
145186
const primitive_id& prim_id,
146187
size_t unique_id,
147188
uint32_t network_id,
148-
const std::unordered_set<size_t>& restrictions,
189+
const memory_restricter<uint32_t>& restrictions,
149190
allocation_type type);
150-
memory_ptr get_from_across_networks_pool(const layout& layout,
151-
const primitive_id& id,
152-
size_t unique_id,
153-
uint32_t network_id,
154-
allocation_type type);
155191
void clear_pool_for_network(uint32_t network_id);
156192
void release_memory(memory* memory, const size_t& unique_id, primitive_id prim_id, uint32_t network_id);
157193

src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_quantization.cpp

+4-4
Original file line numberDiff line numberDiff line change
@@ -288,10 +288,10 @@ void prepare_quantization::prepare_scale_shift_opt(program &p, quantize_node& qu
288288
p.add_connection(in_shift_node, new_quantize_node);
289289
p.add_connection(out_scale_node, new_quantize_node);
290290
p.add_connection(out_shift_node, new_quantize_node);
291-
new_quantize_node.add_memory_dependency(in_scale_node.get_unique_id());
292-
new_quantize_node.add_memory_dependency(in_shift_node.get_unique_id());
293-
new_quantize_node.add_memory_dependency(out_scale_node.get_unique_id());
294-
new_quantize_node.add_memory_dependency(out_shift_node.get_unique_id());
291+
new_quantize_node.add_memory_dependency(in_scale_node);
292+
new_quantize_node.add_memory_dependency(in_shift_node);
293+
new_quantize_node.add_memory_dependency(out_scale_node);
294+
new_quantize_node.add_memory_dependency(out_shift_node);
295295
p.get_processing_order().insert(&new_quantize_node, &in_shift_node);
296296
p.get_processing_order().insert(&new_quantize_node, &in_scale_node);
297297
p.get_processing_order().insert(&new_quantize_node, &out_shift_node);

src/plugins/intel_gpu/src/graph/graph_optimizer/skipped_branch_memory_dependencies.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ void skipped_branch_memory_dependencies::run(program& p) {
1919
while (itrB != processing_order.end()) {
2020
auto& nodeB = *itrB;
2121
auto itrA = ++itrB;
22-
if (nodeB->is_constant())
22+
if (!nodeB->may_use_mempool())
2323
continue;
2424
if (nodeB->get_users().size() == 0)
2525
continue;

src/plugins/intel_gpu/src/graph/include/pass_manager.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -322,10 +322,10 @@ class memory_dependency_pass : public base_pass {
322322

323323
if ((!dep->can_be_optimized() || !dep->is_runtime_skippable()) && ((node->can_be_optimized() && !node->is_runtime_skippable())
324324
|| !dep->can_be_optimized())) {
325-
node->add_memory_dependency(static_cast<int32_t>(dep->get_unique_id()));
325+
node->add_memory_dependency(*dep);
326326
} else {
327327
if (node->is_runtime_skippable() || dep->is_runtime_skippable() || dep->can_be_optimized()) {
328-
node->add_memory_dependency(static_cast<int32_t>(dep->get_unique_id()));
328+
node->add_memory_dependency(*dep);
329329
}
330330

331331
for (const auto& subdep : dep->get_dependencies()) {

src/plugins/intel_gpu/src/graph/include/primitive_inst.h

+3-3
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,7 @@ class primitive_inst {
222222
_users = _network.get_primitives(users);
223223
}
224224

225-
const std::unordered_set<size_t>& get_runtime_memory_dependencies() const { return _runtime_memory_dependencies; }
225+
const memory_restricter<uint32_t>& get_runtime_memory_dependencies() const { return _runtime_memory_dependencies; }
226226

227227
const kernel_impl_params* get_impl_params() const { return _impl_params.get(); }
228228
// return pointer to const to prevent arbitrary 'execute' call -> use primitive_inst.execute() instead
@@ -307,7 +307,7 @@ class primitive_inst {
307307
memory_pool& pool,
308308
const program_node& _node,
309309
const kernel_impl_params& impl_params,
310-
const std::unordered_set<size_t>& memory_dependencies,
310+
const memory_restricter<uint32_t>& memory_dependencies,
311311
uint32_t net_id,
312312
bool is_internal,
313313
size_t idx = 0,
@@ -379,7 +379,7 @@ class primitive_inst {
379379
std::vector<primitive_inst*> _exec_deps;
380380

381381
// List of primitive ids that this primitive can't share memory buffers with
382-
std::unordered_set<size_t> _runtime_memory_dependencies;
382+
memory_restricter<uint32_t> _runtime_memory_dependencies;
383383

384384
// This is sub-network generated on demand to execute unfused primitives sequence instead of single fused primitive
385385
// Needed for dynamic path only, as fusion in some cases may be illegal, but it can't be checked on program build phase,

src/plugins/intel_gpu/src/graph/include/program_node.h

+10-3
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include "intel_gpu/graph/fused_primitive_desc.hpp"
1313
#include "intel_gpu/graph/kernel_impl_params.hpp"
1414
#include "intel_gpu/primitives/reorder.hpp"
15+
#include "intel_gpu/primitives/read_value.hpp"
1516
#include "intel_gpu/runtime/utils.hpp"
1617

1718
#include <set>
@@ -207,9 +208,15 @@ struct program_node {
207208
size_t get_dependency_index(const program_node& node) const;
208209
size_t get_user_index(const program_node& node) const;
209210

210-
std::unordered_set<size_t> get_memory_dependencies() const;
211-
void add_memory_dependency(size_t);
211+
const std::unordered_set<uint32_t>& get_memory_dependencies() const;
212+
212213
void add_memory_dependency(std::vector<size_t>);
214+
void add_memory_dependency(const program_node& node);
215+
216+
// At least the following scenarios are not allocating from memory pool:
217+
// 1. constant nodes
218+
// 2. read_value nodes that are optimized out to reuse from Variables.
219+
bool may_use_mempool() const { return !(is_constant() || (is_type<read_value>() && optimized)); }
213220

214221
template <class PType>
215222
bool have_user_with_type() const {
@@ -497,7 +504,7 @@ struct program_node {
497504
std::list<program_node*> users;
498505

499506
// list of primitives that can't reuse the same memory buffers due to execution order conflicts
500-
std::unordered_set<size_t> memory_dependencies;
507+
std::unordered_set<uint32_t> memory_dependencies;
501508

502509
impl_types impl_type = impl_types::any;
503510
impl_types forced_impl_type = impl_types::any;

src/plugins/intel_gpu/src/graph/primitive_inst.cpp

+4-4
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ static memory::ptr get_memory_from_pool(engine& _engine,
157157
const layout& layout,
158158
allocation_type type,
159159
bool reusable_across_network,
160-
const std::unordered_set<size_t>& memory_dependencies,
160+
const memory_restricter<uint32_t>& memory_dependencies,
161161
bool reset = true,
162162
memory* curr_memory = nullptr) {
163163
OPENVINO_ASSERT(!layout.is_dynamic() || layout.has_upper_bound(),
@@ -1340,7 +1340,7 @@ void primitive_inst::do_runtime_skip_reorder() {
13401340
update_memory_dependencies = [&](std::vector<primitive_inst*> users) {
13411341
for (auto& user : users) {
13421342
GPU_DEBUG_TRACE_DETAIL << "[do runtime skip reorder] add " << id() << " to restriction list of " << user->id() << std::endl;
1343-
user->_runtime_memory_dependencies.insert(get_node().get_unique_id());
1343+
user->_runtime_memory_dependencies.insert(static_cast<uint32_t>(get_node().get_unique_id()));
13441344
if (user->can_be_optimized())
13451345
update_memory_dependencies(user->get_user_insts());
13461346
}
@@ -2085,7 +2085,7 @@ primitive_inst::primitive_inst(network & network, program_node const& node, bool
20852085
, _use_shared_kernels(node.get_program().get_config().get_enable_kernels_reuse())
20862086
, _impl_params(node.get_kernel_impl_params())
20872087
, _impl(node.get_selected_impl() ? node.get_selected_impl()->clone() : nullptr)
2088-
, _runtime_memory_dependencies(node.get_memory_dependencies())
2088+
, _runtime_memory_dependencies(&node.get_memory_dependencies())
20892089
, _outputs({})
20902090
, _reordered_weights_cache(network.get_weights_cache_capacity())
20912091
, _is_dynamic(node.is_dynamic())
@@ -2390,7 +2390,7 @@ memory::ptr primitive_inst::allocate_output(engine& _engine,
23902390
memory_pool& pool,
23912391
const program_node& _node,
23922392
const kernel_impl_params& impl_params,
2393-
const std::unordered_set<size_t>& memory_dependencies,
2393+
const memory_restricter<uint32_t>& memory_dependencies,
23942394
uint32_t net_id,
23952395
bool is_internal,
23962396
size_t idx,

src/plugins/intel_gpu/src/graph/program.cpp

+3-3
Original file line numberDiff line numberDiff line change
@@ -748,7 +748,7 @@ void program::prepare_memory_dependencies() {
748748
if (!_config.get_enable_memory_pool())
749749
return;
750750
for (auto& node : get_processing_order()) {
751-
node->add_memory_dependency(node->get_unique_id());
751+
node->add_memory_dependency(*node);
752752
}
753753
apply_opt_pass<basic_memory_dependencies>();
754754
apply_opt_pass<skipped_branch_memory_dependencies>();
@@ -766,7 +766,7 @@ std::string program::get_memory_dependencies_string() const {
766766
.append("(unique_id:")
767767
.append(std::to_string(node->get_unique_id()))
768768
.append(") restricted list: ");
769-
for (auto it : node->get_memory_dependencies())
769+
for (const auto& it : node->get_memory_dependencies())
770770
mem_dep = mem_dep.append(std::to_string(it)).append(",");
771771
mem_dep = mem_dep.append("\n");
772772
}
@@ -1715,7 +1715,7 @@ std::pair<int64_t, int64_t> program::get_estimated_device_mem_usage() {
17151715
pool,
17161716
*node,
17171717
*node->get_kernel_impl_params(),
1718-
node->get_memory_dependencies(),
1718+
memory_restricter<uint32_t>(&node->get_memory_dependencies()),
17191719
0,
17201720
false,
17211721
0,

src/plugins/intel_gpu/src/graph/program_node.cpp

+9-4
Original file line numberDiff line numberDiff line change
@@ -195,12 +195,17 @@ void program_node::remove_dependency(size_t idx) {
195195
dependencies.erase(dependencies.begin() + idx);
196196
}
197197

198-
std::unordered_set<size_t> program_node::get_memory_dependencies() const { return memory_dependencies; }
199-
200-
void program_node::add_memory_dependency(size_t prim) { memory_dependencies.insert(prim); }
198+
const std::unordered_set<uint32_t>& program_node::get_memory_dependencies() const { return memory_dependencies; }
201199

202200
void program_node::add_memory_dependency(std::vector<size_t> prim_list) {
203-
memory_dependencies.insert(prim_list.begin(), prim_list.end());
201+
for (size_t val : prim_list) {
202+
memory_dependencies.insert(static_cast<uint32_t>(val));
203+
}
204+
}
205+
206+
void program_node::add_memory_dependency(const program_node& dep) {
207+
if (dep.may_use_mempool() && may_use_mempool())
208+
memory_dependencies.insert(static_cast<uint32_t>(dep.get_unique_id()));
204209
}
205210

206211
std::unique_ptr<json_composite> program_node::desc_to_json() const {

0 commit comments

Comments
 (0)