forked from oneapi-src/unified-runtime
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkernel.hpp
171 lines (144 loc) · 5.68 KB
/
kernel.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
//===--------------- kernel.hpp - Native CPU Adapter ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#pragma once
#include "common.hpp"
#include "nativecpu_state.hpp"
#include "program.hpp"
#include <cstring>
#include <ur_api.h>
#include <utility>
using nativecpu_kernel_t = void(void *const *, native_cpu::state *);
using nativecpu_ptr_t = nativecpu_kernel_t *;
using nativecpu_task_t = std::function<nativecpu_kernel_t>;
struct local_arg_info_t {
uint32_t argIndex;
size_t argSize;
local_arg_info_t(uint32_t argIndex, size_t argSize)
: argIndex(argIndex), argSize(argSize) {}
};
struct ur_kernel_handle_t_ : RefCounted {
ur_kernel_handle_t_(const ur_kernel_handle_t_ &other)
: Args(other.Args), hProgram(other.hProgram), _name(other._name),
_subhandler(other._subhandler), _localArgInfo(other._localArgInfo),
_localMemPool(other._localMemPool),
_localMemPoolSize(other._localMemPoolSize),
ReqdWGSize(other.ReqdWGSize), NDRangeKernel(other.NDRangeKernel) {
incrementReferenceCount();
}
~ur_kernel_handle_t_() {
if (decrementReferenceCount() == 0) {
free(_localMemPool);
Args.deallocate();
}
}
ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name,
nativecpu_task_t subhandler,
std::optional<native_cpu::WGSize_t> ReqdWGSize,
std::optional<native_cpu::WGSize_t> MaxWGSize,
std::optional<uint64_t> MaxLinearWGSize,
bool isNDRangeKernel)
: hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)},
ReqdWGSize(ReqdWGSize), MaxWGSize(MaxWGSize),
MaxLinearWGSize(MaxLinearWGSize), NDRangeKernel(isNDRangeKernel) {}
struct arguments {
using args_index_t = std::vector<void *>;
args_index_t Indices;
std::vector<size_t> ParamSizes;
std::vector<bool> OwnsMem;
static constexpr size_t MaxAlign = 16 * sizeof(double);
/// Add an argument to the kernel.
/// If the argument existed before, it is replaced.
/// Otherwise, it is added.
/// Gaps are filled with empty arguments.
/// Implicit offset argument is kept at the back of the indices collection.
void addArg(size_t Index, size_t Size, const void *Arg) {
if (Index + 1 > Indices.size()) {
Indices.resize(Index + 1);
OwnsMem.resize(Index + 1);
ParamSizes.resize(Index + 1);
// Update the stored value for the argument
Indices[Index] = native_cpu::aligned_malloc(MaxAlign, Size);
OwnsMem[Index] = true;
ParamSizes[Index] = Size;
} else {
if (ParamSizes[Index] != Size) {
Indices[Index] = realloc(Indices[Index], Size);
ParamSizes[Index] = Size;
}
}
std::memcpy(Indices[Index], Arg, Size);
}
void addPtrArg(size_t Index, void *Arg) {
if (Index + 1 > Indices.size()) {
Indices.resize(Index + 1);
OwnsMem.resize(Index + 1);
ParamSizes.resize(Index + 1);
OwnsMem[Index] = false;
ParamSizes[Index] = sizeof(uint8_t *);
}
Indices[Index] = Arg;
}
// This is called by the destructor of ur_kernel_handle_t_, since
// ur_kernel_handle_t_ implements reference counting and we want
// to deallocate only when the reference count is 0.
void deallocate() {
assert(OwnsMem.size() == Indices.size() && "Size mismatch");
for (size_t Index = 0; Index < Indices.size(); Index++) {
if (OwnsMem[Index])
native_cpu::aligned_free(Indices[Index]);
}
}
const args_index_t &getIndices() const noexcept { return Indices; }
} Args;
ur_program_handle_t hProgram;
std::string _name;
nativecpu_task_t _subhandler;
std::vector<local_arg_info_t> _localArgInfo;
std::optional<native_cpu::WGSize_t> getReqdWGSize() const {
return ReqdWGSize;
}
std::optional<native_cpu::WGSize_t> getMaxWGSize() const { return MaxWGSize; }
std::optional<uint64_t> getMaxLinearWGSize() const { return MaxLinearWGSize; }
void updateMemPool(size_t numParallelThreads) {
// compute requested size.
size_t reqSize = 0;
for (auto &entry : _localArgInfo) {
reqSize += entry.argSize * numParallelThreads;
}
if (reqSize == 0 || reqSize == _localMemPoolSize) {
return;
}
// realloc handles nullptr case
_localMemPool = (char *)realloc(_localMemPool, reqSize);
_localMemPoolSize = reqSize;
}
// To be called before executing a work group
void handleLocalArgs(size_t numParallelThread, size_t threadId) {
// For each local argument we have size*numthreads
size_t offset = 0;
for (auto &entry : _localArgInfo) {
Args.Indices[entry.argIndex] =
_localMemPool + offset + (entry.argSize * threadId);
// update offset in the memory pool
offset += entry.argSize * numParallelThread;
}
}
const std::vector<void *> &getArgs() const { return Args.getIndices(); }
void addArg(const void *Ptr, size_t Index, size_t Size) {
Args.addArg(Index, Size, Ptr);
}
void addPtrArg(void *Ptr, size_t Index) { Args.addPtrArg(Index, Ptr); }
bool isNDRangeKernel() const { return NDRangeKernel; }
private:
char *_localMemPool = nullptr;
size_t _localMemPoolSize = 0;
std::optional<native_cpu::WGSize_t> ReqdWGSize = std::nullopt;
std::optional<native_cpu::WGSize_t> MaxWGSize = std::nullopt;
std::optional<uint64_t> MaxLinearWGSize = std::nullopt;
const bool NDRangeKernel = false;
};