Skip to content

Commit d546091

Browse files
committed
Squashed commit of the following:
commit 66abeaf Author: Carl Pearson <[email protected]> Date: Mon Apr 6 13:03:49 2020 -0500 . commit 7cc7288 Author: Carl Pearson <[email protected]> Date: Mon Apr 6 11:29:04 2020 -0500 . commit 547d794 Author: Carl Pearson <[email protected]> Date: Mon Apr 6 11:04:08 2020 -0500 . commit 15db5cf Author: Carl Pearson <[email protected]> Date: Mon Apr 6 07:46:41 2020 -0500 . commit 83e4e4f Author: Carl Pearson <[email protected]> Date: Mon Apr 6 07:39:01 2020 -0500 cuda 10.1 commit 9bf48c6 Author: Carl Pearson <[email protected]> Date: Fri Apr 3 08:12:21 2020 -0500 . commit 48d957e Author: Carl Pearson <[email protected]> Date: Fri Apr 3 07:54:28 2020 -0500 remove install files commit efc8283 Merge: cc817f8 dfbe32f Author: Carl Pearson <[email protected]> Date: Thu Apr 2 13:40:04 2020 -0500 Merge branch 'sgemm' of github.com:cwpearson/nvidia-performance-tools into sgemm commit cc817f8 Author: Carl Pearson <[email protected]> Date: Thu Apr 2 13:39:58 2020 -0500 sgemm working commit d2e5b26 Author: Carl Pearson <[email protected]> Date: Thu Apr 2 13:29:11 2020 -0500 cpu test commit dfbe32f Author: Carl Pearson <[email protected]> Date: Thu Apr 2 07:58:12 2020 -0500 add rai_build commit 468757f Author: Carl Pearson <[email protected]> Date: Thu Apr 2 07:40:29 2020 -0500 . commit cc2ace8 Author: Carl Pearson <[email protected]> Date: Wed Apr 1 17:44:48 2020 -0500 . commit b205b6a Author: Carl Pearson <[email protected]> Date: Wed Apr 1 17:44:27 2020 -0500 . commit 141a794 Author: Carl Pearson <[email protected]> Date: Wed Apr 1 17:31:04 2020 -0500 . commit 7c6caff Author: Carl Pearson <[email protected]> Date: Wed Apr 1 17:25:30 2020 -0500 . commit 0b4dd5d Author: Carl Pearson <[email protected]> Date: Wed Apr 1 17:24:02 2020 -0500 . commit 8bdf7ce Author: Carl Pearson <[email protected]> Date: Wed Apr 1 17:19:41 2020 -0500 add sgemm to travis commit 108561d Author: Carl Pearson <[email protected]> Date: Wed Apr 1 17:12:42 2020 -0500 travis commit ac7fe77 Author: Carl Pearson <[email protected]> Date: Wed Apr 1 17:08:18 2020 -0500 switch row/col major commit e5789b3 Author: Carl Pearson <[email protected]> Date: Wed Apr 1 17:00:45 2020 -0500 all sgemms compile commit 5c782a6 Author: Carl Pearson <[email protected]> Date: Wed Apr 1 16:39:33 2020 -0500 steal tiled code from webgpu commit 34775a4 Author: Carl Pearson <[email protected]> Date: Wed Apr 1 15:20:18 2020 -0500 add basic sgemm impl, update argparse commit 05a321c Author: Carl Pearson <[email protected]> Date: Wed Apr 1 10:16:56 2020 -0500 basic sgemm kernel and copy regtiled from 508
1 parent f80c8e7 commit d546091

18 files changed

+1000
-34
lines changed

.dockerignore

Whitespace-only changes.

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
sgemm/build
2+
*.deb
3+
*.run

.travis.yml

+10-4
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,16 @@
11
dist: bionic
22
language: minimal
3-
arch:
4-
- ppc64le
5-
- amd64
63

7-
docker: true
4+
jobs:
5+
include:
6+
- arch: ppc64le
7+
env: BUILD_DOCKER=1
8+
docker: true
9+
- arch: amd64
10+
env: BUILD_DOCKER=1
11+
docker: true
12+
- env: BUILD_TYPE=Release
13+
- env: BUILD_TYPE=Debug
814

915
before_script:
1016
- ci/install_deps.sh || travis_terminate 1;

README.md

+29-1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,23 @@ docker pull cwpearson/nvidia-performance-tools/latest-ppc64le
1515

1616
[ECE 408 Spring 2020 - Introduction to Nvidia Performance Tools](https://docs.google.com/presentation/d/1A5i3Zdh7ltOLdW7qHZ2tviXYcyl1sKvM7kRpnzOD7tQ/edit?usp=sharing)
1717

18+
## nvcc
19+
20+
```
21+
--profile (-pg)
22+
Instrument generated code/executable for use by gprof (Linux only).
23+
24+
--debug (-g)
25+
Generate debug information for host code.
26+
27+
--device-debug (-G)
28+
Generate debug information for device code. Turns off all optimizations.
29+
Don't use for profiling; use -lineinfo instead.
30+
31+
--generate-line-info (-lineinfo)
32+
Generate line-number information for device code.
33+
```
34+
1835
## Nsight Compute
1936

2037
```bash
@@ -29,12 +46,23 @@ nv-nsight-cu-cli --csv a.out
2946
nsys profile a.out
3047
```
3148

32-
3349
## Managing docker images
3450

3551
* `docker ps -a`
3652
* `docker rm `docker ps -a -q``
3753

54+
Run a profiling container:
55+
```bash
56+
docker run cwpearson/nvidia-performance-tools:latest-amd64
57+
```
58+
59+
Resume a previously exited container:
60+
```bash
61+
* docker ps -a # find the ID
62+
* docker start <ID> # resume the exited container
63+
* docker attach <ID> # attach a terminal to the container
64+
```
65+
3866
## Resources
3967

4068
* [Using Nvidia Nsight Systems in Containers and the Cloud](https://devblogs.nvidia.com/nvidia-nsight-systems-containers-cloud/)

docker/amd64.dockerfile renamed to amd64_10-1.dockerfile

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM nvidia/cuda:10.2-devel-ubuntu18.04
1+
FROM nvidia/cuda:10.1-devel-ubuntu18.04
22

33
# Set one or more individual labels
44
LABEL maintainer="Carl Pearson"
@@ -20,6 +20,8 @@ COPY NVIDIA_Nsight_Systems_Linux_2020.2.1.71.deb nsight_systems.deb
2020
RUN chmod +x nsight_compute.run
2121
RUN TERM=xterm ./nsight_compute.run --quiet -- -noprompt -targetpath=/usr/local/NVIDIA-Nsight-Compute
2222
ENV PATH=$PATH:/usr/local/NVIDIA-Nsight-Compute
23+
RUN rm nsight_compute.run
2324

2425
# install Nsight Systems
2526
RUN dpkg -i nsight_systems.deb
27+
RUN rm nsight_systems.deb

ci/build.sh

+27-18
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,38 @@
11
set -x
2-
32
set -e
43

5-
cd docker
6-
ls -halt
4+
source ci/env.sh
5+
6+
if [[ $BUILD_DOCKER == "1" ]]; then
7+
cd $TRAVIS_BUILD_DIR
78

8-
echo $DOCKER_PASS | docker login -u $DOCKER_USER --password-stdin
9+
echo $DOCKER_PASS | docker login -u $DOCKER_USER --password-stdin
910

10-
TRAVIS_COMMIT=${TRAVIS_COMMIT:0:7}
11-
DOCKER_REPO=nvidia-performance-tools
12-
DOCKER_SLUG=$DOCKER_USER/$DOCKER_REPO
13-
DOCKER_TAG=$TRAVIS_CPU_ARCH-10.2-$TRAVIS_BRANCH-$TRAVIS_COMMIT
11+
TRAVIS_COMMIT=${TRAVIS_COMMIT:0:7}
12+
DOCKER_REPO=nvidia-performance-tools
13+
DOCKER_SLUG=$DOCKER_USER/$DOCKER_REPO
14+
DOCKER_TAG=${TRAVIS_CPU_ARCH}-10.1-$TRAVIS_BRANCH-$TRAVIS_COMMIT
1415

1516

16-
docker build -f $TRAVIS_CPU_ARCH.dockerfile -t $DOCKER_SLUG:$DOCKER_TAG .
17-
docker push $DOCKER_SLUG:$DOCKER_TAG
17+
docker build -f ${TRAVIS_CPU_ARCH}_10-1.dockerfile -t $DOCKER_SLUG:$DOCKER_TAG .
18+
docker push $DOCKER_SLUG:$DOCKER_TAG
1819

1920

20-
if [[ $TRAVIS_BRANCH == master ]]; then
21-
docker tag $DOCKER_SLUG:$DOCKER_TAG $DOCKER_SLUG:latest-$TRAVIS_CPU_ARCH
22-
docker push $DOCKER_SLUG:latest-$TRAVIS_CPU_ARCH
23-
else
24-
docker tag $DOCKER_SLUG:$DOCKER_TAG $DOCKER_SLUG:$TRAVIS_BRANCH-$TRAVIS_CPU_ARCH
25-
docker push $DOCKER_SLUG:$TRAVIS_BRANCH-$TRAVIS_CPU_ARCH
21+
if [[ $TRAVIS_BRANCH == master ]]; then
22+
docker tag $DOCKER_SLUG:$DOCKER_TAG $DOCKER_SLUG:latest-${TRAVIS_CPU_ARCH}
23+
docker push $DOCKER_SLUG:latest-${TRAVIS_CPU_ARCH}
24+
else
25+
docker tag $DOCKER_SLUG:$DOCKER_TAG $DOCKER_SLUG:$TRAVIS_BRANCH-$TRAVIS_CPU_ARCH
26+
docker push $DOCKER_SLUG:$TRAVIS_BRANCH-${TRAVIS_CPU_ARCH}
27+
fi
2628
fi
2729

28-
# remove the login key from the image
29-
rm -fv $HOME/.docker/config.json
30+
31+
if [[ $BUILD_TYPE != '' ]]; then
32+
cd $TRAVIS_BUILD_DIR
33+
cd sgemm
34+
mkdir -p build
35+
cd build
36+
cmake .. -DCMAKE_BUILD_TYPE=$BUILD_TYPE
37+
make VERBOSE=1
38+
fi

ci/env.sh

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
CMAKE_PREFIX=$HOME/cmake
2+
3+
export PATH=/usr/local/cuda/bin:$PATH
4+
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
5+
6+
export PATH=$CMAKE_PREFIX/bin:$PATH

ci/install_deps.sh

+34-9
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,38 @@
11
set -x
22
set -e
33

4-
sudo apt-get update
5-
sudo apt-get install -q -y wget
6-
7-
if [[ $TRAVIS_CPU_ARCH == ppc64le ]]; then
8-
wget -qSL https://uofi.box.com/shared/static/vfxflckdjixxkc524qltme4sx8kt3w9d.deb -O docker/NVIDIA_Nsight_Systems_Power_CLI_Only_2020.2.1.71.deb;
9-
wget -qSL https://uofi.box.com/shared/static/swjp2bjr7xj153vzw8mvutv2tqomypxu.run -O docker/nsight-compute-PPC64LE-2019.5.0.14-27346997.run;
10-
elif [[ $TRAVIS_CPU_ARCH == amd64 ]]; then
11-
wget -qSL https://uofi.box.com/shared/static/zjsv2rayiotyrdix6a6yd3w8cre56lo0.deb -O docker/NVIDIA_Nsight_Systems_Linux_2020.2.1.71.deb;
12-
wget -qSL https://uofi.box.com/shared/static/4fuf3wws1uplhf29ndcq4s91kl3jyl7z.run -O docker/nsight-compute-linux-2019.5.0.14-27346997.run;
4+
source ci/env.sh
5+
6+
# deps for building docker images
7+
if [[ $BUILD_DOCKER == "1" ]]; then
8+
cd $TRAVIS_BUILD_DIR
9+
10+
if [[ $TRAVIS_CPU_ARCH == ppc64le ]]; then
11+
wget -qSL https://uofi.box.com/shared/static/vfxflckdjixxkc524qltme4sx8kt3w9d.deb -O NVIDIA_Nsight_Systems_Power_CLI_Only_2020.2.1.71.deb;
12+
wget -qSL https://uofi.box.com/shared/static/swjp2bjr7xj153vzw8mvutv2tqomypxu.run -O nsight-compute-PPC64LE-2019.5.0.14-27346997.run;
13+
elif [[ $TRAVIS_CPU_ARCH == amd64 ]]; then
14+
wget -qSL https://uofi.box.com/shared/static/zjsv2rayiotyrdix6a6yd3w8cre56lo0.deb -O NVIDIA_Nsight_Systems_Linux_2020.2.1.71.deb;
15+
wget -qSL https://uofi.box.com/shared/static/4fuf3wws1uplhf29ndcq4s91kl3jyl7z.run -O nsight-compute-linux-2019.5.0.14-27346997.run;
16+
fi
17+
fi
18+
19+
# deps for building code
20+
if [[ $BUILD_TYPE != '' ]]; then
21+
cs $HOME
22+
23+
## install CMake
24+
wget -qSL https://github.com/Kitware/CMake/releases/download/v3.8.2/cmake-3.8.2-Linux-x86_64.tar.gz -O cmake.tar.gz
25+
mkdir -p $CMAKE_PREFIX
26+
tar -xf cmake.tar.gz --strip-components=1 -C $CMAKE_PREFIX
27+
rm cmake.tar.gz
28+
29+
## install CUDA
30+
sudo apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub
31+
CUDA102="http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-repo-ubuntu1804_10.2.89-1_amd64.deb"
32+
wget -SL $CUDA102 -O cuda.deb
33+
sudo dpkg -i cuda.deb
34+
sudo apt-get update
35+
sudo apt-get install -y --no-install-recommends \
36+
cuda-toolkit-10-2
37+
rm cuda.deb
1338
fi

docker/ppc64le.dockerfile renamed to ppc64le_10-1.dockerfile

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM nvidia/cuda-ppc64le:10.2-devel-ubuntu18.04
1+
FROM nvidia/cuda-ppc64le:10.1-devel-ubuntu18.04
22

33
# Set one or more individual labels
44
LABEL maintainer="Carl Pearson"
@@ -20,6 +20,8 @@ COPY NVIDIA_Nsight_Systems_Power_CLI_Only_2020.2.1.71.deb nsight_systems.deb
2020
RUN chmod +x nsight_compute.run
2121
RUN TERM=xterm ./nsight_compute.run --quiet -- -noprompt -targetpath=/usr/local/NVIDIA-Nsight-Compute
2222
ENV PATH=$PATH:/usr/local/NVIDIA-Nsight-Compute
23+
RUN rm nsight_compute.run
2324

2425
# install Nsight Systems
2526
RUN dpkg -i nsight_systems.deb
27+
RUN rm nsight_systems.deb

sgemm/CMakeLists.txt

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
project(sgemm LANGUAGES CXX CUDA)
2+
3+
# 3.8+ for CUDA
4+
cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
5+
6+
if(NOT CMAKE_BUILD_TYPE)
7+
set(CMAKE_BUILD_TYPE "Release")
8+
message(STATUS "Setting CMAKE_BUILD_TYPE=Release")
9+
endif()
10+
11+
set(CMAKE_CUDA_STANDARD 11)
12+
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
13+
14+
include_directories(PUBLIC SYSTEM include)
15+
16+
# Add line info to binaries to help with profiling
17+
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo")
18+
19+
add_executable(sgemm-cpu cpu.cpp)
20+
add_executable(sgemm-basic basic.cu)
21+
add_executable(sgemm-tiled tiled.cu)
22+
add_executable(sgemm-regtiled-coarsened regtiled_coarsened.cu)

sgemm/basic.cu

+143
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
#include <algorithm>
2+
3+
#include <argparse/argparse.hpp>
4+
5+
#include "common.hpp"
6+
7+
/* NOTE: A and C are column major, B is row major
8+
*/
9+
__global__ void mygemm(float *c, //<! [out] and MxN matrix
10+
const float *a, //<! [in] an MxK matrix
11+
const float *b, //<! [in] an KxN matrix
12+
const int M, const int N, const int K) {
13+
14+
#define A(_i, _j) a[(_i) + (_j)*M]
15+
#define B(_i, _j) b[(_i)*N + (_j)]
16+
#define C(_i, _j) c[(_i) + (_j)*M]
17+
18+
int gidx = blockDim.x * blockIdx.x + threadIdx.x;
19+
int gidy = blockDim.y * blockIdx.y + threadIdx.y;
20+
21+
for (int i = gidy; i < M; i += gridDim.y * blockDim.y) {
22+
for (int j = gidx; j < N; j += gridDim.x * blockDim.x) {
23+
float acc = 0;
24+
for (int k = 0; k < K; ++k) {
25+
acc += A(i, k) * B(k, j);
26+
}
27+
C(i, j) = acc;
28+
}
29+
}
30+
31+
#undef A
32+
#undef B
33+
#undef C
34+
}
35+
36+
int main(int argc, char **argv) {
37+
38+
argparse::Parser parser;
39+
40+
// default matrix sizes:
41+
// A: 307 x 313
42+
// B: 313 x 311
43+
// C: 307 x 311
44+
int m = 307;
45+
int n = 311;
46+
int k = 313;
47+
48+
int nIters = 5;
49+
int nWarmup = 5;
50+
parser.add_positional(m);
51+
parser.add_positional(n);
52+
parser.add_positional(k);
53+
parser.add_option(nIters, "--iters");
54+
parser.add_option(nWarmup, "--warmup");
55+
56+
if (!parser.parse(argc, argv)) {
57+
parser.help();
58+
exit(EXIT_FAILURE);
59+
}
60+
61+
const int64_t flop = m * n * k * 2;
62+
63+
// initialize host data
64+
std::vector<float> aHost(m * k), bHost(k * n), cHost(m * n), cExpected(m * n);
65+
std::generate(aHost.begin(), aHost.end(), random_int);
66+
std::generate(bHost.begin(), bHost.end(), random_int);
67+
68+
// allocate device data
69+
float *aDev, *bDev, *cDev;
70+
CUDA_RUNTIME(cudaMalloc(&aDev, aHost.size() * sizeof(float)));
71+
CUDA_RUNTIME(cudaMalloc(&bDev, bHost.size() * sizeof(float)));
72+
CUDA_RUNTIME(cudaMalloc(&cDev, cHost.size() * sizeof(float)));
73+
74+
// copy data to device
75+
CUDA_RUNTIME(cudaMemcpy(aDev, aHost.data(), aHost.size() * sizeof(float),
76+
cudaMemcpyDefault));
77+
CUDA_RUNTIME(cudaMemcpy(bDev, bHost.data(), bHost.size() * sizeof(float),
78+
cudaMemcpyDefault));
79+
80+
// create events to time GPU kernel
81+
cudaEvent_t start, stop;
82+
CUDA_RUNTIME(cudaEventCreate(&start));
83+
CUDA_RUNTIME(cudaEventCreate(&stop));
84+
85+
// GPU kernel launch parameters
86+
dim3 dimBlock(32, 8);
87+
dim3 dimGrid;
88+
dimGrid.x = (n + dimBlock.x - 1) / dimBlock.x;
89+
dimGrid.y = (m + dimBlock.y - 1) / dimBlock.y;
90+
91+
// total elapsed time
92+
float elapsed = 0;
93+
94+
/* Launch the kernel nIters + nWarmup times
95+
Check for correctness on the first time.
96+
Record the time after nWarmup runs complete.
97+
*/
98+
for (int i = 0; i < nIters + nWarmup; ++i) {
99+
CUDA_RUNTIME(cudaEventRecord(start));
100+
mygemm<<<dimGrid, dimBlock>>>(cDev, aDev, bDev, m, n, k);
101+
CUDA_RUNTIME(cudaEventRecord(stop));
102+
CUDA_RUNTIME(cudaEventSynchronize(stop));
103+
104+
// check result once
105+
if (i == 0) {
106+
// copy result to host
107+
CUDA_RUNTIME(cudaMemcpy(cHost.data(), cDev, cHost.size() * sizeof(float),
108+
cudaMemcpyDefault));
109+
110+
// check result on host
111+
cpu_gemm(cExpected.data(), aHost.data(), bHost.data(), m, n, k);
112+
113+
for (size_t i = 0; i < cExpected.size(); ++i) {
114+
if (!equal(cExpected[i], cHost[i], 1e-6)) {
115+
std::cerr << "Error!\n";
116+
exit(EXIT_FAILURE);
117+
}
118+
}
119+
}
120+
121+
float millis;
122+
CUDA_RUNTIME(cudaEventElapsedTime(&millis, start, stop));
123+
std::cerr << i << ": " << millis << (i >= nWarmup ? " *" : " ") << "\n";
124+
125+
// record time after warmup runs
126+
if (i >= nWarmup) {
127+
elapsed += millis;
128+
}
129+
}
130+
131+
// print results
132+
double gflops = flop / ((elapsed / nIters) / 1000) / 1e9;
133+
std::cerr << gflops << "GFLOPS (" << flop << " flop, "
134+
<< (elapsed / nIters) / 1000 << "s)\n";
135+
136+
// release resources
137+
CUDA_RUNTIME(cudaEventDestroy(start));
138+
CUDA_RUNTIME(cudaEventDestroy(stop));
139+
CUDA_RUNTIME(cudaFree(aDev));
140+
CUDA_RUNTIME(cudaFree(bDev));
141+
CUDA_RUNTIME(cudaFree(cDev));
142+
return 0;
143+
}

0 commit comments

Comments
 (0)