@@ -25,7 +25,7 @@ void index_kernel(TensorIteratorBase& iter, IntArrayRef index_size, IntArrayRef
   AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(kComplexHalf, kHalf, kBool, kBFloat16,
     iter.dtype(), "index_cpu", [&] {
     cpu_index_kernel<scalar_t>(iter, index_size, index_stride, [](char* dst, char* src, int64_t offset) {
-      *(scalar_t*)dst = *(scalar_t*)(src + offset);
+      *(scalar_t*)dst = c10::load((scalar_t*)(src + offset));
     });
   });
 }
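Note on the pattern above (repeated throughout this diff): a plain dereference through a cast pointer is replaced with `c10::load`, so that reading an element whose scalar type is `bool` stays well defined even when the underlying byte is neither 0 nor 1. The snippet below is a minimal standalone sketch of that idea using a hypothetical `safe_load` helper; it is not the actual `c10::load` implementation (which lives in `c10/util/Load.h`).

```cpp
#include <cstring>
#include <iostream>

// Hypothetical helper, not the actual c10::load implementation: the idea is
// to avoid dereferencing a bool* whose storage byte may be neither 0 nor 1.
template <typename T>
T safe_load(const void* src) {
  T value;
  std::memcpy(&value, src, sizeof(T));  // well-defined for arithmetic types
  return value;
}

// bool is special-cased: read the raw byte and normalize, so any non-zero
// byte is treated as true instead of relying on a direct bool dereference.
template <>
bool safe_load<bool>(const void* src) {
  return *static_cast<const unsigned char*>(src) != 0;
}

int main() {
  unsigned char storage = 0x02;  // a "bool" byte that is neither 0 nor 1
  // *reinterpret_cast<bool*>(&storage) would be unreliable here; the
  // normalized load still yields a usable true/false value.
  std::cout << std::boolalpha << safe_load<bool>(&storage) << '\n';  // true
}
```

For ordinary arithmetic types the `memcpy`-based path is equivalent to the plain dereference, which is why the substitution is behaviour-preserving for non-bool dtypes.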
@@ -128,14 +128,14 @@ void put_kernel(
       // Unlike the non-accumulate case, this needs to be thread-safe.
       cpu_take_put_kernel<scalar_t>(iter, self, true,
           [](scalar_t& iterated, scalar_t* indexed, const int64_t idx) {
-            indexed[idx] += iterated;
+            indexed[idx] += c10::load(&iterated);
           },
           /*serial_execution=*/true);
     }
   } else {
     cpu_take_put_kernel<scalar_t>(iter, self, true,
         [](scalar_t& iterated, scalar_t* indexed, const int64_t idx) {
-          indexed[idx] = iterated;
+          indexed[idx] = c10::load(&iterated);
         });
   }
 });
@@ -148,7 +148,7 @@ void take_kernel(
     iter.dtype(), "take_cpu", [&] {
       cpu_take_put_kernel<scalar_t>(iter, input, false,
           [](scalar_t& iterated, const scalar_t* indexed, const int64_t idx) {
-            iterated = indexed[idx];
+            iterated = c10::load(&(indexed[idx]));
           });
     });
 }
@@ -174,12 +174,12 @@ void index_put_kernel(TensorIterator& iter, IntArrayRef index_size, IntArrayRef
       // TODO: investigate parallelization of the accumulate kernel. Unlike the non-accumulate case,
       // this needs to be thread-safe.
       cpu_index_kernel<scalar_t>(iter, index_size, index_stride, [](char* dst, char* src, int64_t offset) {
-        *(scalar_t*)(dst + offset) += *(scalar_t*)src;
+        *(scalar_t*)(dst + offset) += c10::load(reinterpret_cast<scalar_t*>(src));
       }, /*serial_execution=*/true);
     }
   } else {
     cpu_index_kernel<scalar_t>(iter, index_size, index_stride, [](char* dst, char* src, int64_t offset) {
-      *(scalar_t*)(dst + offset) = *(scalar_t*)src;
+      *(scalar_t*)(dst + offset) = c10::load(reinterpret_cast<scalar_t*>(src));
     }, /*serial_execution=*/is_deterministic);
   }
 }),
@@ -270,7 +270,7 @@ void index_copy_kernel(
             "index_copy_(): index ", idx, " is out of bounds for dimension ",
             dim, " with size ", self_dim_size);
 
-        self_data[idx * self_dim_stride] = *source_data;
+        self_data[idx * self_dim_stride] = c10::load(source_data);
 
         self_data_bytes += strides[0];
         index_data_bytes += strides[1];
@@ -289,7 +289,7 @@ void index_copy_kernel(
         auto* self_data = reinterpret_cast<scalar_t*>(self_data_bytes);
         auto* source_data = reinterpret_cast<scalar_t*>(source_data_bytes);
 
-        self_data[idx * self_dim_stride] = *source_data;
+        self_data[idx * self_dim_stride] = c10::load(source_data);
 
         self_data_bytes += strides[0];
         source_data_bytes += strides[2];
@@ -320,7 +320,7 @@ void cpu_masked_fill_kernel(TensorIterator& iter, scalar_t value) {
     char* dst = data[0];
     char* mask = data[1];
     for (const auto i : c10::irange(n)) {
-      bool mask_value = *reinterpret_cast<bool*>(mask + strides[1] * i);
+      bool mask_value = c10::load(reinterpret_cast<bool*>(mask + strides[1] * i));
 
       if (mask_value) {
         *(scalar_t*)(dst + strides[0] * i) = value;
@@ -353,10 +353,11 @@ void cpu_masked_scatter_kernel(TensorIterator& iter, const TensorBase& source) {
     char* mask = data[1];
     const int64_t mask_stride = strides[1];
     for (const auto i : c10::irange(n)) {
-      auto mask_value = *reinterpret_cast<bool*>(mask + mask_stride * i);
+      auto mask_value = c10::load(reinterpret_cast<bool*>(mask + mask_stride * i));
+
       if (mask_value) {
         TORCH_CHECK(source_cntr < numel, "Number of elements of source < number of ones in mask");
-        *(scalar_t*)(dst + dst_stride * i) = *(source_ptr);
+        *(scalar_t*)(dst + dst_stride * i) = c10::load(source_ptr);
         source_ptr++;
         source_cntr++;
       }
@@ -387,7 +388,7 @@ void cpu_masked_select_serial_kernel(TensorIterator& iter, const func_t& f) {
     char* src = data[1];
     char* mask = data[2];
     for (const auto i : c10::irange(n)) {
-      mask_t mask_value = *(mask_t*)(mask + strides[2] * i);
+      mask_t mask_value = c10::load((mask_t*)(mask + strides[2] * i));
       if constexpr (!std::is_same_v<mask_t, bool>) {
         TORCH_CHECK(mask_value == 0 || mask_value == 1, "Mask tensor can take 0 and 1 values only");
       }
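For context on the hunk above: `cpu_masked_select_serial_kernel` is instantiated with both `bool` and `unsigned char` masks (see `masked_select_serial_kernel` below). For the byte-mask case a plain one-byte read is already well defined and out-of-range values are rejected by the `TORCH_CHECK` that follows, so the switch to `c10::load` mainly matters for the `bool` instantiation. A standalone sketch of that load-then-validate pattern, with invented names rather than PyTorch API:

```cpp
#include <cstdint>
#include <iostream>
#include <stdexcept>

// Invented helper mirroring the byte-mask path: read the i-th mask element
// from a strided byte buffer, then enforce the 0/1 constraint that the
// kernel checks with TORCH_CHECK.
using mask_t = unsigned char;

mask_t load_byte_mask(const char* mask, int64_t stride, int64_t i) {
  mask_t value = *reinterpret_cast<const mask_t*>(mask + stride * i);
  if (value != 0 && value != 1) {
    throw std::runtime_error("Mask tensor can take 0 and 1 values only");
  }
  return value;
}

int main() {
  unsigned char mask[3] = {0, 1, 1};
  for (int64_t i = 0; i < 3; ++i) {
    std::cout << static_cast<int>(
        load_byte_mask(reinterpret_cast<const char*>(mask), 1, i)) << '\n';
  }
}
```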
@@ -406,11 +407,11 @@ void masked_select_serial_kernel(TensorIterator& iter, int64_t result_stride) {
     auto mask_dtype = iter.input_dtype(1);
     if (mask_dtype == ScalarType::Bool) {
       cpu_masked_select_serial_kernel<scalar_t, bool>(iter, [result_stride](char* dst, char* src, int64_t offset) {
-        *(scalar_t*)(dst + offset*result_stride) = *(scalar_t*)src;
+        *(scalar_t*)(dst + offset*result_stride) = c10::load((scalar_t*)src);
       });
     } else {
       cpu_masked_select_serial_kernel<scalar_t, unsigned char>(iter, [result_stride](char* dst, char* src, int64_t offset) {
-        *(scalar_t*)(dst + offset*result_stride) = *(scalar_t*)src;
+        *(scalar_t*)(dst + offset*result_stride) = c10::load((scalar_t*)src);
       });
     }
   }),
@@ -430,7 +431,7 @@ void cpu_masked_select_kernel(TensorIterator& iter, const func_t& f) {
     char* mask = data[2];
     char* mask_prefix_sum = data[3];
     for (const auto i : c10::irange(n)) {
-      mask_t mask_value = *(mask_t*)(mask + strides[2] * i);
+      mask_t mask_value = c10::load((mask_t*)(mask + strides[2] * i));
       if constexpr (!std::is_same_v<mask_t, bool>) {
         TORCH_CHECK(mask_value == 0 || mask_value == 1, "Mask tensor can take 0 and 1 values only");
       }
@@ -449,7 +450,7 @@ void masked_select_kernel(TensorIterator& iter, int64_t result_stride) {
     auto mask_dtype = iter.input_dtype(1);
     if (mask_dtype == ScalarType::Bool) {
       cpu_masked_select_kernel<scalar_t, bool>(iter, [result_stride](char* dst, char* src, int64_t offset) {
-        *(scalar_t*)(dst + offset*result_stride) = *(scalar_t*)src;
+        *(scalar_t*)(dst + offset*result_stride) = c10::load((scalar_t*)src);
       });
     } else {
       cpu_masked_select_kernel<scalar_t, unsigned char>(iter, [result_stride](char* dst, char* src, int64_t offset) {
@@ -501,7 +502,7 @@ void cpu_hflip_vec(at::TensorIterator& iter) {
       offset = (offset >= n) ? n : offset;
       for (; i < offset; i++) {
         scalar_t* out_ptr = (scalar_t*)(data[0] - i * stride);
-        *out_ptr = *(scalar_t*)(data[1] + i * stride);
+        *out_ptr = c10::load((scalar_t*)(data[1] + i * stride));
       }
       // Empirically found that it is faster to process 3 data items together vs 2 or 4
       for (; i <= n - 3 * Vec::size(); i += 3 * Vec::size()) {
@@ -519,7 +520,7 @@ void cpu_hflip_vec(at::TensorIterator& iter) {
       if (i < n) {
        for (; i < n; i++) {
          scalar_t* out_ptr = (scalar_t*)(data[0] - i * stride);
-          *out_ptr = *(scalar_t*)(data[1] + i * stride);
+          *out_ptr = c10::load((scalar_t*)(data[1] + i * stride));
        }
      }
 