cwpearson
diff --git a/‎.dockerignore b/‎.dockerignore
diff --git a/‎.gitignore
+3 b/‎.gitignore
+3
diff --git a/‎.travis.yml
+10-4 b/‎.travis.yml
+10-4
diff --git a/‎README.md
+29-1 b/‎README.md
+29-1
diff --git a/‎docker/amd64.dockerfile renamed to ‎amd64_10-1.dockerfile
+3-1 b/‎docker/amd64.dockerfile renamed to ‎amd64_10-1.dockerfile
+3-1
diff --git a/‎ci/build.sh
+27-18 b/‎ci/build.sh
+27-18
diff --git a/‎ci/env.sh
+6 b/‎ci/env.sh
+6
diff --git a/‎ci/install_deps.sh
+34-9 b/‎ci/install_deps.sh
+34-9
diff --git a/‎docker/ppc64le.dockerfile renamed to ‎ppc64le_10-1.dockerfile
+3-1 b/‎docker/ppc64le.dockerfile renamed to ‎ppc64le_10-1.dockerfile
+3-1
diff --git a/‎sgemm/CMakeLists.txt
+22 b/‎sgemm/CMakeLists.txt
+22
diff --git a/‎sgemm/basic.cu
+143 b/‎sgemm/basic.cu
+143
@@ -0,0 +1,3 @@
+sgemm/build
+*.deb
+*.run
@@ -1,10 +1,16 @@
 dist: bionic
 language: minimal
-arch:
-  - ppc64le
-  - amd64
 
-docker: true
+jobs:
+  include:
+    - arch: ppc64le
+      env: BUILD_DOCKER=1
+      docker: true
+    - arch: amd64
+      env: BUILD_DOCKER=1
+      docker: true
+    - env: BUILD_TYPE=Release
+    - env: BUILD_TYPE=Debug
 
 before_script:
   - ci/install_deps.sh || travis_terminate 1;
 
@@ -15,6 +15,23 @@ docker pull cwpearson/nvidia-performance-tools/latest-ppc64le
 
 [ECE 408 Spring 2020 - Introduction to Nvidia Performance Tools](https://docs.google.com/presentation/d/1A5i3Zdh7ltOLdW7qHZ2tviXYcyl1sKvM7kRpnzOD7tQ/edit?usp=sharing)
 
+## nvcc
+
+```
+--profile                                       (-pg)                           
+        Instrument generated code/executable for use by gprof (Linux only).
+
+--debug                                         (-g)                            
+        Generate debug information for host code.
+
+--device-debug                                  (-G)                            
+        Generate debug information for device code. Turns off all optimizations.
+        Don't use for profiling; use -lineinfo instead.
+
+--generate-line-info                            (-lineinfo)                     
+        Generate line-number information for device code.
+```
+
 ## Nsight Compute
 
 ```bash
@@ -29,12 +46,23 @@ nv-nsight-cu-cli --csv a.out
 nsys profile a.out
 ```
 
-
 ## Managing docker images
 
 * `docker ps -a`
 * `docker rm `docker ps -a -q``
 
+Run a profiling container:
+```bash
+docker run cwpearson/nvidia-performance-tools:latest-amd64
+```
+
+Resume a previously exited container:
+```bash
+* docker ps -a       # find the ID
+* docker start <ID>  # resume the exited container
+* docker attach <ID> # attach a terminal to the container
+```
+
 ## Resources
 
 * [Using Nvidia Nsight Systems in Containers and the Cloud](https://devblogs.nvidia.com/nvidia-nsight-systems-containers-cloud/)
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:10.2-devel-ubuntu18.04
+FROM nvidia/cuda:10.1-devel-ubuntu18.04
 
 # Set one or more individual labels
 LABEL maintainer="Carl Pearson"
@@ -20,6 +20,8 @@ COPY NVIDIA_Nsight_Systems_Linux_2020.2.1.71.deb nsight_systems.deb
 RUN chmod +x nsight_compute.run
 RUN TERM=xterm ./nsight_compute.run --quiet -- -noprompt -targetpath=/usr/local/NVIDIA-Nsight-Compute
 ENV PATH=$PATH:/usr/local/NVIDIA-Nsight-Compute
+RUN rm nsight_compute.run
 
 # install Nsight Systems
 RUN dpkg -i nsight_systems.deb
+RUN rm nsight_systems.deb
@@ -1,29 +1,38 @@
 set -x
-
 set -e
 
-cd docker
-ls -halt
+source ci/env.sh
+
+if [[ $BUILD_DOCKER == "1" ]]; then
+    cd $TRAVIS_BUILD_DIR
 
-echo $DOCKER_PASS | docker login -u $DOCKER_USER --password-stdin
+    echo $DOCKER_PASS | docker login -u $DOCKER_USER --password-stdin
 
-TRAVIS_COMMIT=${TRAVIS_COMMIT:0:7}
-DOCKER_REPO=nvidia-performance-tools
-DOCKER_SLUG=$DOCKER_USER/$DOCKER_REPO
-DOCKER_TAG=$TRAVIS_CPU_ARCH-10.2-$TRAVIS_BRANCH-$TRAVIS_COMMIT
+    TRAVIS_COMMIT=${TRAVIS_COMMIT:0:7}
+    DOCKER_REPO=nvidia-performance-tools
+    DOCKER_SLUG=$DOCKER_USER/$DOCKER_REPO
+    DOCKER_TAG=${TRAVIS_CPU_ARCH}-10.1-$TRAVIS_BRANCH-$TRAVIS_COMMIT
 
 
-docker build -f $TRAVIS_CPU_ARCH.dockerfile -t $DOCKER_SLUG:$DOCKER_TAG .
-docker push $DOCKER_SLUG:$DOCKER_TAG
+    docker build -f ${TRAVIS_CPU_ARCH}_10-1.dockerfile -t $DOCKER_SLUG:$DOCKER_TAG .
+    docker push $DOCKER_SLUG:$DOCKER_TAG
 
 
-if [[ $TRAVIS_BRANCH == master ]]; then
-    docker tag $DOCKER_SLUG:$DOCKER_TAG $DOCKER_SLUG:latest-$TRAVIS_CPU_ARCH
-    docker push $DOCKER_SLUG:latest-$TRAVIS_CPU_ARCH
-else
-    docker tag $DOCKER_SLUG:$DOCKER_TAG $DOCKER_SLUG:$TRAVIS_BRANCH-$TRAVIS_CPU_ARCH
-    docker push $DOCKER_SLUG:$TRAVIS_BRANCH-$TRAVIS_CPU_ARCH
+    if [[ $TRAVIS_BRANCH == master ]]; then
+        docker tag $DOCKER_SLUG:$DOCKER_TAG $DOCKER_SLUG:latest-${TRAVIS_CPU_ARCH}
+        docker push $DOCKER_SLUG:latest-${TRAVIS_CPU_ARCH}
+    else
+        docker tag $DOCKER_SLUG:$DOCKER_TAG $DOCKER_SLUG:$TRAVIS_BRANCH-$TRAVIS_CPU_ARCH
+        docker push $DOCKER_SLUG:$TRAVIS_BRANCH-${TRAVIS_CPU_ARCH}
+    fi
 fi
 
-# remove the login key from the image
-rm -fv $HOME/.docker/config.json
+
+if [[ $BUILD_TYPE != '' ]]; then
+    cd $TRAVIS_BUILD_DIR
+    cd sgemm
+    mkdir -p build
+    cd build
+    cmake .. -DCMAKE_BUILD_TYPE=$BUILD_TYPE
+    make VERBOSE=1
+fi
@@ -0,0 +1,6 @@
+CMAKE_PREFIX=$HOME/cmake
+
+export PATH=/usr/local/cuda/bin:$PATH
+export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+
+export PATH=$CMAKE_PREFIX/bin:$PATH
@@ -1,13 +1,38 @@
 set -x
 set -e
 
-sudo apt-get update
-sudo apt-get install -q -y wget
-
-if [[ $TRAVIS_CPU_ARCH == ppc64le ]]; then
-  wget -qSL https://uofi.box.com/shared/static/vfxflckdjixxkc524qltme4sx8kt3w9d.deb -O docker/NVIDIA_Nsight_Systems_Power_CLI_Only_2020.2.1.71.deb;
-  wget -qSL https://uofi.box.com/shared/static/swjp2bjr7xj153vzw8mvutv2tqomypxu.run -O docker/nsight-compute-PPC64LE-2019.5.0.14-27346997.run;
-elif [[ $TRAVIS_CPU_ARCH == amd64 ]]; then 
-  wget -qSL https://uofi.box.com/shared/static/zjsv2rayiotyrdix6a6yd3w8cre56lo0.deb -O docker/NVIDIA_Nsight_Systems_Linux_2020.2.1.71.deb;
-  wget -qSL https://uofi.box.com/shared/static/4fuf3wws1uplhf29ndcq4s91kl3jyl7z.run -O docker/nsight-compute-linux-2019.5.0.14-27346997.run;
+source ci/env.sh
+
+# deps for building docker images
+if [[ $BUILD_DOCKER == "1" ]]; then
+  cd $TRAVIS_BUILD_DIR
+
+  if [[ $TRAVIS_CPU_ARCH == ppc64le ]]; then
+    wget -qSL https://uofi.box.com/shared/static/vfxflckdjixxkc524qltme4sx8kt3w9d.deb -O NVIDIA_Nsight_Systems_Power_CLI_Only_2020.2.1.71.deb;
+    wget -qSL https://uofi.box.com/shared/static/swjp2bjr7xj153vzw8mvutv2tqomypxu.run -O nsight-compute-PPC64LE-2019.5.0.14-27346997.run;
+  elif [[ $TRAVIS_CPU_ARCH == amd64 ]]; then 
+    wget -qSL https://uofi.box.com/shared/static/zjsv2rayiotyrdix6a6yd3w8cre56lo0.deb -O NVIDIA_Nsight_Systems_Linux_2020.2.1.71.deb;
+    wget -qSL https://uofi.box.com/shared/static/4fuf3wws1uplhf29ndcq4s91kl3jyl7z.run -O nsight-compute-linux-2019.5.0.14-27346997.run;
+  fi
+fi
+
+# deps for building code
+if [[ $BUILD_TYPE != '' ]]; then
+    cs $HOME
+
+    ## install CMake
+    wget -qSL https://github.com/Kitware/CMake/releases/download/v3.8.2/cmake-3.8.2-Linux-x86_64.tar.gz -O cmake.tar.gz
+    mkdir -p $CMAKE_PREFIX
+    tar -xf cmake.tar.gz --strip-components=1 -C $CMAKE_PREFIX
+    rm cmake.tar.gz
+
+    ## install CUDA
+    sudo apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub
+    CUDA102="http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-repo-ubuntu1804_10.2.89-1_amd64.deb"
+    wget -SL $CUDA102 -O cuda.deb
+    sudo dpkg -i cuda.deb
+    sudo apt-get update 
+    sudo apt-get install -y --no-install-recommends \
+        cuda-toolkit-10-2
+    rm cuda.deb
 fi
@@ -1,4 +1,4 @@
-FROM nvidia/cuda-ppc64le:10.2-devel-ubuntu18.04
+FROM nvidia/cuda-ppc64le:10.1-devel-ubuntu18.04
 
 # Set one or more individual labels
 LABEL maintainer="Carl Pearson"
@@ -20,6 +20,8 @@ COPY NVIDIA_Nsight_Systems_Power_CLI_Only_2020.2.1.71.deb nsight_systems.deb
 RUN chmod +x nsight_compute.run
 RUN TERM=xterm ./nsight_compute.run --quiet -- -noprompt -targetpath=/usr/local/NVIDIA-Nsight-Compute
 ENV PATH=$PATH:/usr/local/NVIDIA-Nsight-Compute
+RUN rm nsight_compute.run
 
 # install Nsight Systems
 RUN dpkg -i nsight_systems.deb
+RUN rm nsight_systems.deb
@@ -0,0 +1,22 @@
+project(sgemm LANGUAGES CXX CUDA)
+
+# 3.8+ for CUDA
+cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
+
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE "Release")
+    message(STATUS "Setting CMAKE_BUILD_TYPE=Release")
+endif()
+
+set(CMAKE_CUDA_STANDARD 11)
+set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
+
+include_directories(PUBLIC SYSTEM include)
+
+# Add line info to binaries to help with profiling
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo")
+
+add_executable(sgemm-cpu cpu.cpp)
+add_executable(sgemm-basic basic.cu)
+add_executable(sgemm-tiled tiled.cu)
+add_executable(sgemm-regtiled-coarsened regtiled_coarsened.cu)
@@ -0,0 +1,143 @@
+#include <algorithm>
+
+#include <argparse/argparse.hpp>
+
+#include "common.hpp"
+
+/* NOTE: A and C are column major, B is row major
+ */
+__global__ void mygemm(float *c,       //<! [out] and MxN matrix
+                       const float *a, //<! [in] an MxK matrix
+                       const float *b, //<! [in] an KxN matrix
+                       const int M, const int N, const int K) {
+
+#define A(_i, _j) a[(_i) + (_j)*M]
+#define B(_i, _j) b[(_i)*N + (_j)]
+#define C(_i, _j) c[(_i) + (_j)*M]
+
+  int gidx = blockDim.x * blockIdx.x + threadIdx.x;
+  int gidy = blockDim.y * blockIdx.y + threadIdx.y;
+
+  for (int i = gidy; i < M; i += gridDim.y * blockDim.y) {
+    for (int j = gidx; j < N; j += gridDim.x * blockDim.x) {
+      float acc = 0;
+      for (int k = 0; k < K; ++k) {
+        acc += A(i, k) * B(k, j);
+      }
+      C(i, j) = acc;
+    }
+  }
+
+#undef A
+#undef B
+#undef C
+}
+
+int main(int argc, char **argv) {
+
+  argparse::Parser parser;
+
+  // default matrix sizes:
+  // A: 307 x 313
+  // B: 313 x 311
+  // C: 307 x 311
+  int m = 307;
+  int n = 311;
+  int k = 313;
+
+  int nIters = 5;
+  int nWarmup = 5;
+  parser.add_positional(m);
+  parser.add_positional(n);
+  parser.add_positional(k);
+  parser.add_option(nIters, "--iters");
+  parser.add_option(nWarmup, "--warmup");
+
+  if (!parser.parse(argc, argv)) {
+    parser.help();
+    exit(EXIT_FAILURE);
+  }
+
+  const int64_t flop = m * n * k * 2;
+
+  // initialize host data
+  std::vector<float> aHost(m * k), bHost(k * n), cHost(m * n), cExpected(m * n);
+  std::generate(aHost.begin(), aHost.end(), random_int);
+  std::generate(bHost.begin(), bHost.end(), random_int);
+
+  // allocate device data
+  float *aDev, *bDev, *cDev;
+  CUDA_RUNTIME(cudaMalloc(&aDev, aHost.size() * sizeof(float)));
+  CUDA_RUNTIME(cudaMalloc(&bDev, bHost.size() * sizeof(float)));
+  CUDA_RUNTIME(cudaMalloc(&cDev, cHost.size() * sizeof(float)));
+
+  // copy data to device
+  CUDA_RUNTIME(cudaMemcpy(aDev, aHost.data(), aHost.size() * sizeof(float),
+                          cudaMemcpyDefault));
+  CUDA_RUNTIME(cudaMemcpy(bDev, bHost.data(), bHost.size() * sizeof(float),
+                          cudaMemcpyDefault));
+
+  // create events to time GPU kernel
+  cudaEvent_t start, stop;
+  CUDA_RUNTIME(cudaEventCreate(&start));
+  CUDA_RUNTIME(cudaEventCreate(&stop));
+
+  // GPU kernel launch parameters
+  dim3 dimBlock(32, 8);
+  dim3 dimGrid;
+  dimGrid.x = (n + dimBlock.x - 1) / dimBlock.x;
+  dimGrid.y = (m + dimBlock.y - 1) / dimBlock.y;
+
+  // total elapsed time
+  float elapsed = 0;
+
+  /* Launch the kernel nIters + nWarmup times
+     Check for correctness on the first time.
+     Record the time after nWarmup runs complete.
+  */
+  for (int i = 0; i < nIters + nWarmup; ++i) {
+    CUDA_RUNTIME(cudaEventRecord(start));
+    mygemm<<<dimGrid, dimBlock>>>(cDev, aDev, bDev, m, n, k);
+    CUDA_RUNTIME(cudaEventRecord(stop));
+    CUDA_RUNTIME(cudaEventSynchronize(stop));
+
+    // check result once
+    if (i == 0) {
+      // copy result to host
+      CUDA_RUNTIME(cudaMemcpy(cHost.data(), cDev, cHost.size() * sizeof(float),
+                              cudaMemcpyDefault));
+
+      // check result on host
+      cpu_gemm(cExpected.data(), aHost.data(), bHost.data(), m, n, k);
+
+      for (size_t i = 0; i < cExpected.size(); ++i) {
+        if (!equal(cExpected[i], cHost[i], 1e-6)) {
+          std::cerr << "Error!\n";
+          exit(EXIT_FAILURE);
+        }
+      }
+    }
+
+    float millis;
+    CUDA_RUNTIME(cudaEventElapsedTime(&millis, start, stop));
+    std::cerr << i << ": " << millis << (i >= nWarmup ? " *" : " ") << "\n";
+
+    // record time after warmup runs
+    if (i >= nWarmup) {
+      elapsed += millis;
+    }
+  }
+
+  // print results
+  double gflops = flop / ((elapsed / nIters) / 1000) / 1e9;
+  std::cerr << gflops << "GFLOPS (" << flop << " flop, "
+            << (elapsed / nIters) / 1000 << "s)\n";
+
+  // release resources
+  CUDA_RUNTIME(cudaEventDestroy(start));
+  CUDA_RUNTIME(cudaEventDestroy(stop));
+  CUDA_RUNTIME(cudaFree(aDev));
+  CUDA_RUNTIME(cudaFree(bDev));
+  CUDA_RUNTIME(cudaFree(cDev));
+  return 0;
+}