Skip to content

Commit e997452

Browse files
committed
avx2 update
1 parent 459ef00 commit e997452

File tree

3 files changed

+11
-11
lines changed

3 files changed

+11
-11
lines changed

week08/Lab08.pptx

-73.4 KB
Binary file not shown.

week08/examples/main.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,12 @@ using namespace std;
1515
int main(int argc, char ** argv)
1616
{
1717
size_t nSize = 200000000;
18-
//float * p1 = new float[nSize](); //the memory is not aligned
19-
//float * p2 = new float[nSize](); //the memory is not aligned
18+
float * p1 = new float[nSize](); //the memory is not aligned
19+
float * p2 = new float[nSize](); //the memory is not aligned
2020

21-
//256-byte aligned (aligned_alloc takes the alignment in bytes), C++17 standard
22-
float * p1 = static_cast<float*>(aligned_alloc(256, nSize*sizeof(float)));
23-
float * p2 = static_cast<float*>(aligned_alloc(256, nSize*sizeof(float)));
21+
// // 256-byte aligned (aligned_alloc takes the alignment in bytes), C++17 standard
22+
// float * p1 = static_cast<float*>(aligned_alloc(256, nSize*sizeof(float)));
23+
// float * p2 = static_cast<float*>(aligned_alloc(256, nSize*sizeof(float)));
2424
float result = 0.0f;
2525

2626
p1[2] = 2.3f;

week08/examples/matoperation.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -62,11 +62,11 @@ float dotproduct_avx2(const float *p1, const float * p2, size_t n)
6262

6363
for (size_t i = 0; i < n; i+=8)
6464
{
65-
a = _mm256_load_ps(p1 + i);
66-
b = _mm256_load_ps(p2 + i);
65+
a = _mm256_loadu_ps(p1 + i);
66+
b = _mm256_loadu_ps(p2 + i);
6767
c = _mm256_add_ps(c, _mm256_mul_ps(a, b));
6868
}
69-
_mm256_store_ps(sum, c);
69+
_mm256_storeu_ps(sum, c);
7070
return (sum[0]+sum[1]+sum[2]+sum[3]+sum[4]+sum[5]+sum[6]+sum[7]);
7171
#else
7272
std::cerr << "AVX2 is not supported" << std::endl;
@@ -90,11 +90,11 @@ float dotproduct_avx2_omp(const float *p1, const float * p2, size_t n)
9090
#pragma omp parallel for
9191
for (size_t i = 0; i < n; i+=8)
9292
{
93-
a = _mm256_load_ps(p1 + i);
94-
b = _mm256_load_ps(p2 + i);
93+
a = _mm256_loadu_ps(p1 + i);
94+
b = _mm256_loadu_ps(p2 + i);
9595
c = _mm256_add_ps(c, _mm256_mul_ps(a, b));
9696
}
97-
_mm256_store_ps(sum, c);
97+
_mm256_storeu_ps(sum, c);
9898
return (sum[0]+sum[1]+sum[2]+sum[3]+sum[4]+sum[5]+sum[6]+sum[7]);
9999
#else
100100
std::cerr << "AVX2 is not supported" << std::endl;

0 commit comments

Comments
 (0)