Skip to content

Commit 80718b9

Browse files
committed
Initial commit
1 parent aa05058 commit 80718b9

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+2830
-2
lines changed

Makefile

+103
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
# --- MACROS
MAIN = conv
COMBINER = combiner
SPLITTER = splitter

# define the C compiler to use
MPICC = mpicc
NVCC = nvcc
CC = gcc

SRCDIR = src
OBJDIR = build

# add -ltiff here if tiff images are to be used
LIBS = -lm -ltiff
CU_LIBS = -lcudart -lcudadevrt

# CUDA processing script; override with `make KERNEL_SCRIPT=path/to/script.cu`
ifndef KERNEL_SCRIPT
KERNEL_SCRIPT = processingScripts/processing.cu
endif

# path to config file
CONF_FLAGS = -Isrc/configs
# define any compile-time flags
CFLAGS = -Wall -pedantic -lpthread $(CONF_FLAGS) -fopenmp
NVCC_FLAGS = -lpthread -rdc=true $(CONF_FLAGS)

# debug build by default; `make RELEASE=1` produces an optimized build
ifndef RELEASE
CFLAGS += -pg -g
NVCC_FLAGS += -pg -g -G
else
CFLAGS += -O3
NVCC_FLAGS += -O3 -Xptxas -O3
endif


# main files to be compiled with gcc
# (note: no space after `=` in substitution refs, or it leaks into every word)

SPLITTER_MAIN_SRC = src/splitterMain.c
SPLITTER_MAIN_OBJ = $(SPLITTER_MAIN_SRC:%.c=$(OBJDIR)/%.o)

COMBINER_MAIN_SRC = src/combinerMain.c
COMBINER_MAIN_OBJ = $(COMBINER_MAIN_SRC:%.c=$(OBJDIR)/%.o)

CONV_MAIN_SRC = src/convMain.c
CONV_MAIN_OBJ = $(CONV_MAIN_SRC:%.c=$(OBJDIR)/%.o)

# c files to be compiled with gcc
C_SRCS_B = util/util.c imageFormats/pgmformat.c imageFormats/tiffformat.c imageFormats/kernelformat.c job/job.c
C_SRCS = $(addprefix src/, $(C_SRCS_B))
C_OBJS = $(C_SRCS:%.c=$(OBJDIR)/%.o)

# c files to be compiled with mpicc
MPI_SRCS_B = worker.c master.c jobExecution.c
MPI_SRCS = $(addprefix src/mpi/, $(MPI_SRCS_B))
MPI_OBJS = $(MPI_SRCS:%.c=$(OBJDIR)/%.o)

# cuda files to be compiled with nvcc
CU_SRCS_B = util/cudaUtils.cu cudaInvoker.cu $(KERNEL_SCRIPT) kernels/convKernels.cu io/fastpgm.cu io/fastBufferIO.cu cudaInvokerAsync.cu
CU_SRCS = $(addprefix src/cuda/, $(CU_SRCS_B))
CU_OBJS = $(CU_SRCS:%.cu=$(OBJDIR)/%.cu.o)
CU_LINK_OBJS = $(CU_SRCS:%.cu=$(OBJDIR)/%.cu.link.o)

# --- TARGETS

# `all` and `clean` are commands, not files: mark them phony
.PHONY: all clean

all: $(MAIN) $(COMBINER) $(SPLITTER)

$(SPLITTER): $(SPLITTER_MAIN_SRC) $(C_OBJS)
	@echo ""
	@echo "-- CREATING SPLITTER --"
	$(CC) $(CFLAGS) $(MAIN_FLAGS) -o $@ $^ $(LIBS) -fopenmp

$(COMBINER): $(COMBINER_MAIN_SRC) $(C_OBJS)
	@echo ""
	@echo "-- CREATING COMBINER --"
	$(CC) $(CFLAGS) -o $@ $^ $(LIBS) -fopenmp

# final link is done by mpicc; -lstdc++ pulls in the C++ runtime nvcc objects need
$(MAIN): $(CONV_MAIN_SRC) $(C_OBJS) $(MPI_OBJS) $(CU_OBJS) $(CU_LINK_OBJS)
	@echo ""
	@echo "-- CREATING CONVOLUTION PROGRAM --"
	$(MPICC) $(CFLAGS) -o $@ $^ $(LIBS) $(CU_LIBS) -lstdc++


# c, mpi and cuda objects

$(C_OBJS): $(OBJDIR)/%.o : %.c
	@mkdir -p $(@D)
	$(CC) -c $(CFLAGS) -o $@ $<

$(MPI_OBJS): $(OBJDIR)/%.o : %.c
	@mkdir -p $(@D)
	$(MPICC) -c $(CFLAGS) -o $@ $<

$(CU_OBJS): $(OBJDIR)/%.cu.o : %.cu
	@mkdir -p $(@D)
	$(NVCC) $(NVCC_FLAGS) -c -o $@ $<

# device-link objects get their own rule (instead of being a side effect of the
# .cu.o recipe) so the dependency is visible to make and parallel builds are safe
$(CU_LINK_OBJS): $(OBJDIR)/%.cu.link.o : $(OBJDIR)/%.cu.o
	$(NVCC) -dlink -o $@ $< -lcudart

clean:
	@echo ""
	@echo "-- CLEANING PROJECT FILES --"
	$(RM) -r $(OBJDIR)/$(SRCDIR) $(MAIN) $(COMBINER) $(SPLITTER)

README.md

+116-2
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,116 @@
1-
# distributed-gpu-convolution
2-
This repository contains a framework with a GPU implementation of generalized convolution operators. The framework is designed for large image data sets and can run in a distributed system.
1+
<br />
2+
<p align="center">
3+
<h1 align="center">Generalized Convolution Operators</h1>
4+
5+
<p align="center">
6+
A fast distributed GPU-based convolution algorithm using CUDA, MPI and pthreads.
7+
</p>
8+
</p>
9+
10+
## About The Project
11+
12+
Common image processing operators such as Gaussian blurring, certain edge detectors, dilations and erosions can all be expressed as convolutions. Performing convolutions on large image data sets takes a significant amount of time. To improve the performance of these operators, parallelization strategies can be employed. We propose GenConv: a framework that can run in a distributed setup and makes use of CUDA to perform convolution operators on the GPU. It provides the ability to do convolutions, dilations and erosions. The programmer can chain and customize these operations in any way they see fit.
13+
14+
## Getting Started
15+
16+
To get a local copy up and running follow these simple steps.
17+
18+
### Prerequisites
19+
20+
You need the following to be able to compile and run the project
21+
22+
* [Make](https://www.gnu.org/software/make/)
23+
* [CUDA](https://developer.nvidia.com/cuda-toolkit)
24+
* [MPI](https://www.open-mpi.org/)
25+
26+
### Setup
27+
28+
To set up the program, run the following commands:
29+
```sh
30+
git clone [email protected]:BugelNiels/distributed-gpu-convolution.git
31+
cd distributed-gpu-convolution
32+
```
33+
34+
### Compilation
35+
36+
The compilation is done via Make:
37+
```sh
38+
make
39+
```
40+
41+
The makefile takes two optional arguments:
42+
```sh
43+
make RELEASE=1
44+
```
45+
Will create a release build of the program. The default is a debug build.
46+
```sh
47+
make KERNEL_SCRIPT=processingScripts/processing.cu
48+
```
49+
This will set the processing script to use to be `src/cuda/processingScripts/processing.cu`. This is useful for when multiple scripts are present and you want to switch between them on consecutive runs. Note that only one processing script can be used at a time.
50+
51+
All of these optional arguments can be used at the same time.
52+
53+
### Running
54+
55+
#### SLURM
56+
57+
To run the program on a slurm cluster you can look at one of the `benchmark.sh` scripts for inspiration. Provided the configuration is correct, you can use:
58+
59+
```sh
60+
srun ./conv job.txt outputdir
61+
```
62+
63+
#### Single machine
64+
65+
You can run the project on a single machine as follows
66+
67+
```sh
68+
mpirun -np 1 ./conv job.txt outputDir
69+
```
70+
71+
Alternatively it can also be run without MPI:
72+
73+
```sh
74+
./conv job.txt outputDir
75+
```
76+
77+
# Job files
78+
79+
GenConv uses a custom-format job file that contains some basic information about the type of images that are received and which images to process. The file uses the following format:
80+
```
81+
3
82+
8
83+
256 256
84+
0 0
85+
inputImages/image1.pgm
86+
inputImages/image2.pgm
87+
inputImages/image3.pgm
88+
```
89+
The first line indicates the number of images to process. The second line states the number of bits used for each pixel (the dynamic range). The third line indicates the maximum dimensions any given image in the job can have. The fourth line indicates how many pixels are padded on the side of each dimension. Next are all the images that should be processed.
90+
91+
As of now, only `.pgm` images are supported. The implementation was done in such a way that the addition of additional image formats is very straightforward.
92+
93+
# Kernels
94+
95+
The application supports very simple convolution kernel formats. These follow the following format:
96+
```
97+
3 3
98+
0 1 0
99+
1 -4 1
100+
0 1 0
101+
```
102+
The first line indicates the `width`x`height` of the kernel. Next are `height` lines with on each line `width` elements of the kernel.
103+
104+
# Making changes to the Image Processing
105+
106+
The processing steps the programming does is defined in `src/cuda/processingScripts/processing.cu`. You can either alter this file or add a new file and pass this file as a make argument.
107+
108+
Image processing is often a matter of connecting small lego blocks in any way that your use case sees fit. This is impossible to do via a configuration system without significant performance penalties. As such, it is up to the programmer to define the sequence of operations they want to do. A very basic understanding of CUDA is required to achieve optimal performance here.
109+
110+
When making changes to the script, the `cudaConfig.h` in `src/configs` should be updated accordingly. In particular the maximum kernel dimensions. CUDA needs to know this, because constant memory cannot be dynamically allocated at runtime.
111+
112+
Ideally, the only two places the programmer ever needs to change things is in their processing script and a slight update to the `cudaConfig.h` to accommodate for any kernels they might use.
113+
114+
# Splitter & Combiner
115+
116+
The application also compiles two additional executables: `splitter` and `combiner`. The `splitter` can be used to split a single image into multiple smaller tiles (optionally with padding). The `combiner` can be used to combine tiles into a single image again. Note that the combiner requires the input tiles to follow the same naming convention as the tiles generated by the splitter.

benchmark.sh

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/bin/bash
#
#SBATCH --partition=gpu
#SBATCH --gres=gpu:v100
#SBATCH --nodes=4
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=4
#SBATCH --time=00:02:00
#SBATCH --job-name=dilation4
#SBATCH --output=result4_2.out
#

# Load the toolchain BEFORE compiling: make needs the compilers (mpicc, nvcc)
# these modules put on the PATH.
module load foss/2020a
module load CUDA/11.1.1

# Compile the program
make RELEASE=1

# Specify the number of threads that OpenMP applications can use.
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK

srun ./conv benchmark.txt /data/s3405583/outputTiles4

make clean

generateJob.sh

+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#!/bin/bash
# Interactively builds a job file for the convolution program from a directory
# of .pgm images (see the README "Job files" section for the format).

read -p 'Job file name: ' -r outputfile
read -p 'Input image directory: ' -r imDir

# collect the input images; bail out if the directory contains none
# NOTE(review): imDir is expected to end with a slash — confirm with users
filenames=$(ls "$imDir"*.pgm) || exit 1
# empty file and write length to the file
echo "$filenames" | wc -w > "$outputfile"

# scan every pgm header for the largest width/height in the set
maxWidth=0
maxHeight=0
for path in $filenames
do
    # pgm header: line 1 = magic, line 2 = "width height", line 3 = maxval
    # NOTE(review): assumes headers contain no comment lines — confirm for these inputs
    nums=$(head -3 "$path")
    stringArray=($nums)
    curWidth=${stringArray[1]}
    curHeight=${stringArray[2]}
    maxWidth=$(( curWidth > maxWidth ? curWidth : maxWidth ))
    maxHeight=$(( curHeight > maxHeight ? curHeight : maxHeight ))
done

read -p 'Dynamic range (in bits): ' numBits
echo "$numBits" >> "$outputfile"

echo "$maxWidth $maxHeight" >> "$outputfile"

read -p 'Number of pixels padded horizontally: ' padX
read -p 'Number of pixels padded vertically: ' padY
echo "$padX $padY" >> "$outputfile"

# write each file to the list
for eachfile in $filenames
do
    echo "$eachfile" >> "$outputfile"
done

kernels/average.kernel

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
3 3
2+
1 1 1
3+
1 1 1
4+
1 1 1

kernels/gaussianBlur.kernel

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
5 5
2+
1 4 6 4 1
3+
4 16 24 16 4
4+
6 24 36 24 6
5+
4 16 24 16 4
6+
1 4 6 4 1

kernels/identity.kernel

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
3 3
2+
0 0 0
3+
0 1 0
4+
0 0 0

kernels/square.kernel

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
11 11
2+
0 0 0 0 0 0 0 0 0 0 0
3+
0 0 0 0 0 0 0 0 0 0 0
4+
0 0 0 0 0 0 0 0 0 0 0
5+
0 0 0 0 0 0 0 0 0 0 0
6+
0 0 0 0 0 0 0 0 0 0 0
7+
0 0 0 0 0 0 0 0 0 0 0
8+
0 0 0 0 0 0 0 0 0 0 0
9+
0 0 0 0 0 0 0 0 0 0 0
10+
0 0 0 0 0 0 0 0 0 0 0
11+
0 0 0 0 0 0 0 0 0 0 0
12+
0 0 0 0 0 0 0 0 0 0 0

src/combinerMain.c

+69
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
2+
#include <omp.h>
3+
#include <stdio.h>
4+
#include <stdlib.h>
5+
6+
#include "imageFormats/pgmformat.h"
7+
#include "job/job.h"
8+
#include "util/util.h"
9+
10+
/*
 * Copies one tile's pixels into the full-size raster at the tile's grid
 * position (blockX, blockY). Destination pixels that fall outside the raster
 * bounds (partial tiles at the right/bottom edges) are skipped.
 */
void insertTileIntoRaster(int *raster, int rasterWidth, int rasterHeight, int *tilePixels, int blockWidth,
                          int blockHeight, int blockX, int blockY) {
    int originX = blockX * blockWidth;
    int originY = blockY * blockHeight;
    for (int row = 0; row < blockHeight; row++) {
        int destY = originY + row;
        // guard clause: skip rows that land outside the raster
        if (destY < 0 || destY >= rasterHeight) {
            continue;
        }
        for (int col = 0; col < blockWidth; col++) {
            int destX = originX + col;
            if (destX < 0 || destX >= rasterWidth) {
                continue;
            }
            raster[destY * rasterWidth + destX] = tilePixels[row * blockWidth + col];
        }
    }
}
24+
25+
/*
 * Stitches the tiles in tileDir back into a single width x height image.
 * Tiles must be named "block_x<X>_y<Y>.pgm" (the splitter's naming scheme);
 * they are loaded and placed in parallel with OpenMP.
 * Returns a heap-allocated raster of width * height ints; the caller frees it.
 * Aborts the program if memory for the raster or a tile buffer cannot be
 * allocated (the original code would have dereferenced NULL instead).
 */
int *combineTiles(int width, int height, int blockWidth, int blockHeight, const char *tileDir) {
    // ceiling division: partially-filled tiles at the right/bottom edges count too
    int nVertImages = (height + (blockHeight - 1)) / blockHeight;
    int nHorImages = (width + (blockWidth - 1)) / blockWidth;
    int *raster = calloc(width * height, sizeof(int));
    if (raster == NULL) {
        fprintf(stderr, "combineTiles: out of memory allocating %dx%d raster\n", width, height);
        exit(EXIT_FAILURE);
    }
#pragma omp parallel
    {
        char buffer[256];
        // per-thread scratch buffer holding one tile's pixels
        int *tilePixels = malloc(blockWidth * blockHeight * sizeof(int));
        if (tilePixels == NULL) {
            // a single thread may not skip the worksharing loop below, so abort outright
            fprintf(stderr, "combineTiles: out of memory allocating tile buffer\n");
            exit(EXIT_FAILURE);
        }
#pragma omp for
        for (int y = 0; y < nVertImages; y++) {
            for (int x = 0; x < nHorImages; x++) {
                // snprintf instead of sprintf: a long tileDir must not overflow buffer
                snprintf(buffer, sizeof(buffer), "%s/block_x%d_y%d.pgm", tileDir, x, y);
                loadPgmImageToMem(buffer, tilePixels);
                insertTileIntoRaster(raster, width, height, tilePixels, blockWidth, blockHeight, x, y);
            }
        }
        free(tilePixels);
    }

    return raster;
}
46+
47+
/*
 * Entry point of the combiner executable: parses the seven command-line
 * arguments, combines the tiles found in the input directory into one image
 * and saves it as a pgm file at the requested path.
 * Returns EXIT_SUCCESS on success, EXIT_FAILURE on bad arguments.
 */
int main(int argc, char *argv[]) {
    if (argc != 8) {
        printf(
            "Usage: ./combiner finalImgWidth finalImgHeight tileWidth tileHeight numBits outputFile "
            "inputTileDirectory\nExample:\n\t ./combiner 256 256 64 64 8 result.pgm outputTiles\n");
        // wrong usage is an error: report failure instead of the original `return 0`
        return EXIT_FAILURE;
    }

    int finalWidth = atoi(argv[1]);
    int finalHeight = atoi(argv[2]);
    int blockWidth = atoi(argv[3]);
    int blockHeight = atoi(argv[4]);
    int numBits = atoi(argv[5]);
    const char *path = argv[6];
    const char *tileDir = argv[7];

    // atoi() yields 0 on malformed input; every numeric argument must be positive
    if (finalWidth <= 0 || finalHeight <= 0 || blockWidth <= 0 || blockHeight <= 0 || numBits <= 0) {
        fprintf(stderr, "combiner: dimensions, tile sizes and numBits must be positive integers\n");
        return EXIT_FAILURE;
    }

    int *pixels = combineTiles(finalWidth, finalHeight, blockWidth, blockHeight, tileDir);
    saveAsPgmImage(path, pixels, finalWidth, finalHeight, numBits);
    printf("Result saved to %s\n", path);
    free(pixels);
    return EXIT_SUCCESS;
}

0 commit comments

Comments
 (0)