/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#pragma once
#include "cutlass/arch/barrier.h"
#include "cutlass/pipeline/pipeline.hpp"
#include "cutlass/gemm/kernel/sm90_tile_scheduler_group.hpp"
#include "cutlass/gemm/kernel/sm100_tile_scheduler.hpp"
#include "cutlass/gemm/kernel/tile_scheduler_params.h"
////////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass::gemm::kernel::detail {
//////////////////// Blackwell Grouped Static Scheduler /////////////////////////
// This tile scheduler is an SM100 wrapper around the SM90 group tile scheduler.
// It enables reuse of the SM90 group tile scheduling capability (e.g., support for CTA rasterization)
// in SM100 kernels.
// For Grouped GEMM, the most common use case has the problem shapes of all groups available only on the device.
// Therefore, we don't know how many tiles the scheduler will have to hand out.
// Hence, we use an SM90-style static group scheduler that launches the largest grid possible.
// If we had access to the host-side problem shapes, we could use them to figure out the grid shape
// and thereafter use the CLC query (which can then be linearized and mapped to an appropriate tile coord).
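//
// Rough usage sketch (illustrative pseudocode, not part of this header): a persistent SM100 kernel layer
// would typically alias this scheduler and drive it through the same Params / WorkTileInfo flow as the
// other tile schedulers. The surrounding variable names below are hypothetical.
//
//   using Scheduler = PersistentTileSchedulerSm100Group<GroupProblemShape>;
//   // Host side: build scheduler params and the launch grid.
//   auto sched_params = Scheduler::to_underlying_arguments(
//       problem_shapes, tile_shape_mnk, atom_thr_shape_mnk, cluster_shape_mnk, hw_info, scheduler_args);
//   dim3 grid = Scheduler::get_grid_shape(
//       sched_params, problem_shapes, cta_shape, atom_thr_shape_mnk, cluster_shape_mnk, hw_info);
//   // Device side: persistent loop over work tiles.
//   Scheduler scheduler(/* clc_response_ptr = */ nullptr, sched_params);
//   auto work_tile_info = scheduler.initial_work_tile_info(cluster_shape);
//   while (work_tile_info.is_valid()) {
//     auto cta_coord = Scheduler::work_tile_to_cta_coord(work_tile_info);
//     // ... mainloop / epilogue for this tile ...
//     work_tile_info = ...; // advance via fetch_next_work(...)
//   }
//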
template<class GroupProblemShape>
class PersistentTileSchedulerSm100Group {
public:
using UnderlyingScheduler = PersistentTileSchedulerSm90Group<GroupProblemShape>;
using UnderlyingProblemShape = typename GroupProblemShape::UnderlyingProblemShape;
using Params = PersistentTileSchedulerSm100GroupParams<UnderlyingProblemShape>;
using WorkTileInfo = typename UnderlyingScheduler::WorkTileInfo;
using Arguments = typename UnderlyingScheduler::Arguments;
using RasterOrder = typename Params::RasterOrder;
using RasterOrderOptions = typename Params::RasterOrderOptions;
struct CLCResponse { uint32_t data[4]; };
static constexpr bool IsDynamicPersistent = UnderlyingScheduler::IsDynamicPersistent;
private:
UnderlyingScheduler scheduler_sm90;
public:
template <class TileShape, class AtomThrShape, class ClusterShape>
static Params
to_underlying_arguments(
GroupProblemShape problem_shapes,
TileShape tile_shape_mnk,
AtomThrShape atom_thr_shape_mnk,
ClusterShape cluster_shape_mnk,
KernelHardwareInfo const& hw_info,
Arguments const& args,
void* workspace = nullptr) {
// We only need the tile and cluster shapes during scheduler setup, so let function template argument deduction (FTAD) do the rest
static_assert(cute::is_static<TileShape>::value);
auto selected_cluster_shape = cutlass::detail::select_cluster_shape(cluster_shape_mnk, hw_info.cluster_shape);
auto cta_shape = cute::conditional_return<not cute::is_static_v<ClusterShape>>(
shape_div(tile_shape_mnk, atom_thr_shape_mnk), // Dynamic cluster: for 2SM kernels, use the CTA tile shape for the underlying scheduler
shape_div(tile_shape_mnk, selected_cluster_shape)); // Static cluster: Blackwell builders expect TileShape to be the cluster's tile shape; Hopper's don't
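// Illustrative example (hypothetical shapes): for a 2SM kernel with tile_shape_mnk = (256,128,64) and
// atom_thr_shape_mnk = (2,1,1), the dynamic-cluster branch yields a per-CTA shape of (128,128,64);
// with a static cluster_shape_mnk = (2,1,1), dividing the cluster-level tile by the cluster shape
// produces the same per-CTA shape, which is what the SM90-style scheduler works with.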
dim3 problem_blocks = get_tiled_cta_shape_mnl(
problem_shapes.groups(),
problem_shapes,
hw_info,
cta_shape, selected_cluster_shape);
Params params;
params.initialize(
problem_blocks,
problem_shapes.groups(),
problem_shapes.problem_shapes,
problem_shapes.host_problem_shapes,
to_gemm_coord(cta_shape),
to_gemm_coord(selected_cluster_shape),
hw_info,
args.max_swizzle_size,
args.raster_order
);
return params;
}
static bool
can_implement(Arguments const& args) {
return true;
}
CUTLASS_DEVICE
PersistentTileSchedulerSm100Group() { }
CUTLASS_DEVICE
PersistentTileSchedulerSm100Group(CLCResponse* /* clc_response_ptr */, Params const& params)
: scheduler_params(params),
scheduler_sm90(params.params_sm90_) { }
CUTLASS_DEVICE
PersistentTileSchedulerSm100Group(CLCResponse* /* clc_response_ptr */, Params const& params, dim3 /* block_id_in_cluster */)
: scheduler_params(params),
scheduler_sm90(params.params_sm90_) { }
template <class ClusterShape>
CUTLASS_DEVICE
WorkTileInfo
initial_work_tile_info(ClusterShape cluster_shape) {
return scheduler_sm90.initial_work_tile_info(cluster_shape);
}
template<class BlockShape, class ClusterShape>
CUTLASS_HOST_DEVICE static
dim3
get_tiled_cta_shape_mnl(int groups, GroupProblemShape problem_shapes, KernelHardwareInfo hw_info, BlockShape cta_shape, ClusterShape cluster_shape) {
return UnderlyingScheduler::get_tiled_cta_shape_mnl(groups, problem_shapes, hw_info, cta_shape, cluster_shape);
}
// Given the inputs, computes the physical grid we should launch.
template<class BlockShape, class AtomThrShape, class ClusterShape>
CUTLASS_HOST_DEVICE
static dim3
get_grid_shape(
Params const& params,
GroupProblemShape problem_shapes,
BlockShape cta_shape,
[[maybe_unused]] AtomThrShape atom_thr_shape,
ClusterShape cluster_shape,
KernelHardwareInfo hw_info) {
dim3 problem_blocks = get_tiled_cta_shape_mnl(
problem_shapes.groups(),
problem_shapes,
hw_info,
cta_shape,
cluster_shape);
// Given the device SM count, set the grid size such that we do not launch more thread blocks than can run concurrently
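// The swizzle size and raster order are recovered from the already-initialized SM90 params so that
// Params::get_grid_shape sees the same scheduling arguments that were supplied to to_underlying_arguments.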
Arguments args{};
if constexpr (!std::is_const_v<decltype(args.max_swizzle_size)>) {
args.max_swizzle_size = 1 << params.params_sm90_.log_swizzle_size_;
}
args.raster_order = params.params_sm90_.raster_order_ == RasterOrder::AlongN ? RasterOrderOptions::AlongN : RasterOrderOptions::AlongM;
return Params::get_grid_shape(
problem_blocks,
to_gemm_coord(cluster_shape),
hw_info,
args.max_swizzle_size,
args.raster_order,
/* truncate_by_problem_size = */true,
cute::is_static_v<ClusterShape>
);
}
CUTLASS_DEVICE
static auto
work_tile_to_cta_coord(WorkTileInfo work_tile_info) {
// SM90 static scheduler implicitly handles CTA coord in a Cluster
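// The cute::_ in the third position is left as a placeholder: the SM90-style scheduler already hands out
// per-CTA tile offsets in M_idx/N_idx, so no separate CTA-in-cluster index is returned here.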
return make_coord(
work_tile_info.M_idx,
work_tile_info.N_idx,
_,
work_tile_info.L_idx
);
}
//
// K Tile API
//
template <class ProblemShape, class TileShape, class Shape>
CUTLASS_DEVICE
auto
get_k_tile_iterator(WorkTileInfo const& work_tile_info, ProblemShape problem_shape_MNKL, TileShape tile_shape, Shape) {
auto k_tiles = cute::ceil_div(cute::get<2>(problem_shape_MNKL), cute::get<2>(tile_shape));
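// Illustrative (hypothetical numbers): a K extent of 4096 with a 64-deep K tile yields 64 k tiles,
// and the returned iterator walks the k-tile indices [0, 64).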
return cute::make_coord_iterator(k_tiles);
}
// Returns whether the block assigned this work should compute the epilogue for the corresponding
// output tile. For the Group tile scheduler, this is always true.
CUTLASS_HOST_DEVICE
static bool
compute_epilogue(WorkTileInfo const&, Params const&) {
return true;
}
CUTLASS_HOST_DEVICE
static bool
compute_epilogue(WorkTileInfo const&) {
return true;
}
// Returns whether fixup is needed for `work_tile_info`. None of the work units returned by
// this scheduler require fixup, since none of the work units partition the reduction extent.
CUTLASS_HOST_DEVICE
static bool
requires_fixup(Params const& params, WorkTileInfo const work_tile_info) {
return false;
}
// Performs the reduction across splits for a given output tile. No fixup is required for
// work units returned by this scheduler.
template <class FrgTensorC>
CUTLASS_DEVICE
void
fixup(WorkTileInfo const&, FrgTensorC&, uint32_t, uint32_t, uint32_t = 1) const { }
template <class ProblemShape, class ElementAccumulator>
static size_t
get_workspace_size(Arguments const& args, ProblemShape problem_shape, KernelHardwareInfo const& hw_info, uint32_t, uint32_t = 1, uint32_t = 1) {
return 0;
}
template <class ElementAccumulator, class ProblemShape, class TileShapeMNK, class AtomThrShape, class ClusterShape>
static size_t
get_workspace_size(Arguments const& args, ProblemShape problem_shape, TileShapeMNK, AtomThrShape, ClusterShape, KernelHardwareInfo const& hw_info,
uint32_t reduction_warp_groups, uint32_t num_accumulator_mtxs = 1) {
return 0;
}
template <class ProblemShape, class TileShape>
CUTLASS_HOST_DEVICE
static int
get_work_k_tile_count(WorkTileInfo const& work_tile_info, ProblemShape problem_shape_MNKL, TileShape tile_shape) {
// All work units returned by this scheduler cover the entire K iteration
// space of the output tile assigned to the work unit.
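// Equivalently, this is the same k-tile count that get_k_tile_iterator above iterates over.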
return cute::size(cute::ceil_div(cute::get<2>(problem_shape_MNKL), cute::get<2>(tile_shape)));
}
CUTLASS_HOST_DEVICE
static uint32_t
get_work_k_tile_start(WorkTileInfo const&) {
// All work units returned by this scheduler start from K tile 0
return 0u;
}
template <class ProblemShape, class ElementAccumulator>
static cutlass::Status
initialize_workspace(Arguments const&, void*, cudaStream_t, ProblemShape const&, KernelHardwareInfo const&, uint32_t, uint32_t = 1, uint32_t = 1, CudaHostAdapter *cuda_adapter = nullptr) {
return cutlass::Status::kSuccess;
}
template <class ElementAccumulator, class ProblemShape, class TileShapeMNK, class AtomThrShape, class ClusterShape>
static cutlass::Status
initialize_workspace(Arguments const&, void*, cudaStream_t, ProblemShape const&, TileShapeMNK, AtomThrShape, ClusterShape, KernelHardwareInfo const&,
uint32_t, uint32_t = 1, CudaHostAdapter *cuda_adapter = nullptr) {
return cutlass::Status::kSuccess;
}
// Kernel helper function to get the next work tile. The CLC pipeline arguments are unused here;
// scheduling is delegated to the underlying SM90 group scheduler.
template <class CLCPipeline, class CLCPipelineState>
CUTLASS_DEVICE
auto
fetch_next_work(
WorkTileInfo work_tile_info,
[[maybe_unused]] CLCPipeline& clc_pipeline,
[[maybe_unused]] CLCPipelineState clc_pipe_consumer_state) {
return scheduler_sm90.fetch_next_work(work_tile_info);
}
private:
//
// Methods
//
[[nodiscard]] CUTLASS_DEVICE
static CLCResponse
load_query_response(uint32_t smem_ptr) {
return UnderlyingScheduler::load_query_response(smem_ptr);
}
//
// Storage
//
CLCResponse *clc_response_ptr_ = nullptr;
Params scheduler_params;
};
///////////////////////////////////////////////////////////////////////////////
} // end namespace cutlass::gemm::kernel::detail