Skip to content

Commit 80718b9

Browse files
committed
Initial commit
1 parent aa05058 commit 80718b9

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+2830
-2
lines changed

Makefile

+103
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
# --- MACROS
MAIN = conv
COMBINER = combiner
SPLITTER = splitter

# define the C compiler to use
MPICC = mpicc
NVCC = nvcc
CC = gcc

SRCDIR = src
OBJDIR = build

# add -ltiff here if tiff images are to be used
LIBS = -lm -ltiff
CU_LIBS = -lcudart -lcudadevrt

# CUDA processing script; override with `make KERNEL_SCRIPT=path/to/script.cu`
ifndef KERNEL_SCRIPT
KERNEL_SCRIPT = processingScripts/processing.cu
endif

# path to config file
CONF_FLAGS = -Isrc/configs
# define any compile-time flags
CFLAGS = -Wall -pedantic -lpthread $(CONF_FLAGS) -fopenmp
NVCC_FLAGS = -lpthread -rdc=true $(CONF_FLAGS)

# debug build by default; `make RELEASE=1` produces an optimized build
ifndef RELEASE
CFLAGS += -pg -g
NVCC_FLAGS += -pg -g -G
else
CFLAGS += -O3
NVCC_FLAGS += -O3 -Xptxas -O3
endif


# main files to be compiled with gcc
# (note: no space after `=` in substitution refs, or it leaks into every word)

SPLITTER_MAIN_SRC = src/splitterMain.c
SPLITTER_MAIN_OBJ = $(SPLITTER_MAIN_SRC:%.c=$(OBJDIR)/%.o)

COMBINER_MAIN_SRC = src/combinerMain.c
COMBINER_MAIN_OBJ = $(COMBINER_MAIN_SRC:%.c=$(OBJDIR)/%.o)

CONV_MAIN_SRC = src/convMain.c
CONV_MAIN_OBJ = $(CONV_MAIN_SRC:%.c=$(OBJDIR)/%.o)

# c files to be compiled with gcc
C_SRCS_B = util/util.c imageFormats/pgmformat.c imageFormats/tiffformat.c imageFormats/kernelformat.c job/job.c
C_SRCS = $(addprefix src/, $(C_SRCS_B))
C_OBJS = $(C_SRCS:%.c=$(OBJDIR)/%.o)

# c files to be compiled with mpicc
MPI_SRCS_B = worker.c master.c jobExecution.c
MPI_SRCS = $(addprefix src/mpi/, $(MPI_SRCS_B))
MPI_OBJS = $(MPI_SRCS:%.c=$(OBJDIR)/%.o)

# cuda files to be compiled with nvcc
CU_SRCS_B = util/cudaUtils.cu cudaInvoker.cu $(KERNEL_SCRIPT) kernels/convKernels.cu io/fastpgm.cu io/fastBufferIO.cu cudaInvokerAsync.cu
CU_SRCS = $(addprefix src/cuda/, $(CU_SRCS_B))
CU_OBJS = $(CU_SRCS:%.cu=$(OBJDIR)/%.cu.o)
CU_LINK_OBJS = $(CU_SRCS:%.cu=$(OBJDIR)/%.cu.link.o)

# --- TARGETS

# `all` and `clean` are commands, not files: mark them phony
.PHONY: all clean

all: $(MAIN) $(COMBINER) $(SPLITTER)

$(SPLITTER): $(SPLITTER_MAIN_SRC) $(C_OBJS)
	@echo ""
	@echo "-- CREATING SPLITTER --"
	$(CC) $(CFLAGS) $(MAIN_FLAGS) -o $@ $^ $(LIBS) -fopenmp

$(COMBINER): $(COMBINER_MAIN_SRC) $(C_OBJS)
	@echo ""
	@echo "-- CREATING COMBINER --"
	$(CC) $(CFLAGS) -o $@ $^ $(LIBS) -fopenmp

# final link is done by mpicc; -lstdc++ pulls in the C++ runtime nvcc objects need
$(MAIN): $(CONV_MAIN_SRC) $(C_OBJS) $(MPI_OBJS) $(CU_OBJS) $(CU_LINK_OBJS)
	@echo ""
	@echo "-- CREATING CONVOLUTION PROGRAM --"
	$(MPICC) $(CFLAGS) -o $@ $^ $(LIBS) $(CU_LIBS) -lstdc++


# c, mpi and cuda objects

$(C_OBJS): $(OBJDIR)/%.o : %.c
	@mkdir -p $(@D)
	$(CC) -c $(CFLAGS) -o $@ $<

$(MPI_OBJS): $(OBJDIR)/%.o : %.c
	@mkdir -p $(@D)
	$(MPICC) -c $(CFLAGS) -o $@ $<

$(CU_OBJS): $(OBJDIR)/%.cu.o : %.cu
	@mkdir -p $(@D)
	$(NVCC) $(NVCC_FLAGS) -c -o $@ $<

# device-link objects get their own rule (instead of being a side effect of the
# .cu.o recipe) so the dependency is visible to make and parallel builds are safe
$(CU_LINK_OBJS): $(OBJDIR)/%.cu.link.o : $(OBJDIR)/%.cu.o
	$(NVCC) -dlink -o $@ $< -lcudart

clean:
	@echo ""
	@echo "-- CLEANING PROJECT FILES --"
	$(RM) -r $(OBJDIR)/$(SRCDIR) $(MAIN) $(COMBINER) $(SPLITTER)

README.md

+116-2
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,116 @@
1-
# distributed-gpu-convolution
2-
This repository contains a framework with a GPU implementation of generalized convolution operators. The framework is designed for large image data sets and can run in a distributed system.
1+
<br />
2+
<p align="center">
3+
<h1 align="center">Generalized Convolution Operators</h1>
4+
5+
<p align="center">
6+
A fast distributed GPU-based convolution algorithm using CUDA, MPI and pthreads.
7+
</p>
8+
</p>
9+
10+
## About The Project
11+
12+
Common image processing operators such as Gaussian blurring, certain edge detectors, dilations and erosions can all be expressed as convolutions. Performing convolutions on large image data sets takes a significant amount of time. To improve the performance of these operators, parallelization strategies can be employed. We propose GenConv: a framework that can run in a distributed setup and makes use of CUDA to perform convolution operators on the GPU. It provides the ability to do convolutions, dilations and erosions. The programmer can chain and customize these operations in any way they see fit.
13+
14+
## Getting Started
15+
16+
To get a local copy up and running follow these simple steps.
17+
18+
### Prerequisites
19+
20+
You need the following to be able to compile and run the project
21+
22+
* [Make](https://www.gnu.org/software/make/)
23+
* [CUDA](https://developer.nvidia.com/cuda-toolkit)
24+
* [MPI](https://www.open-mpi.org/)
25+
26+
### Setup
27+
28+
To set up the program, run the following commands:
29+
```sh
30+
git clone [email protected]:BugelNiels/distributed-gpu-convolution.git
31+
cd distributed-gpu-convolution
32+
```
33+
34+
### Compilation
35+
36+
The compilation is done via Make:
37+
```sh
38+
make
39+
```
40+
41+
The makefile takes two optional arguments:
42+
```sh
43+
make RELEASE=1
44+
```
45+
Will create a release build of the program. The default is a debug build.
46+
```sh
47+
make KERNEL_SCRIPT=processingScripts/processing.cu
48+
```
49+
This will set the processing script to use to be `src/cuda/processingScripts/processing.cu`. This is useful for when multiple scripts are present and you want to switch between them on consecutive runs. Note that only one processing script can be used at a time.
50+
51+
All of these optional arguments can be used at the same time.
52+
53+
### Running
54+
55+
#### SLURM
56+
57+
To run the program on a slurm cluster you can look at one of the `benchmark.sh` scripts for inspiration. Provided the configuration is correct, you can use:
58+
59+
```sh
60+
srun ./conv job.txt outputdir
61+
```
62+
63+
#### Single machine
64+
65+
You can run the project on a single machine as follows
66+
67+
```sh
68+
mpirun -np 1 ./conv job.txt outputDir
69+
```
70+
71+
Alternatively it can also be run without MPI:
72+
73+
```sh
74+
./conv job.txt outputDir
75+
```
76+
77+
# Job files
78+
79+
GenConv uses a custom-format job file that contains some basic information about the type of images that are received and which images to process. The file uses the following format:
80+
```
81+
3
82+
8
83+
256 256
84+
0 0
85+
inputImages/image1.pgm
86+
inputImages/image2.pgm
87+
inputImages/image3.pgm
88+
```
89+
The first line indicates the number of images to process. The second line states the number of bits used for each pixel (the dynamic range). The third line indicates the maximum dimensions any given image in the job can have. The fourth line indicates how many pixels are padded on the side of each dimension. Next are all the images that should be processed.
90+
91+
As of now, only `.pgm` images are supported. The implementation was done in such a way that the addition of additional image formats is very straightforward.
92+
93+
# Kernels
94+
95+
The application supports very simple convolution kernel formats. These follow the following format:
96+
```
97+
3 3
98+
0 1 0
99+
1 -4 1
100+
0 1 0
101+
```
102+
The first line indicates the `width`x`height` of the kernel. Next are `height` lines with on each line `width` elements of the kernel.
103+
104+
# Making changes to the Image Processing
105+
106+
The processing steps the programming does is defined in `src/cuda/processingScripts/processing.cu`. You can either alter this file or add a new file and pass this file as a make argument.
107+
108+
Image processing is often a matter of connecting small lego blocks in any way that your use case sees fit. This is impossible to do via a configuration system without significant performance penalties. As such, it is up to the programmer to define the sequence of operations they want to do. A very basic understanding of CUDA is required to achieve optimal performance here.
109+
110+
When making changes to the script, the `cudaConfig.h` in `src/configs` should be updated accordingly. In particular the maximum kernel dimensions. CUDA needs to know this, because constant memory cannot be dynamically allocated at runtime.
111+
112+
Ideally, the only two places the programmer ever needs to change things is in their processing script and a slight update to the `cudaConfig.h` to accommodate for any kernels they might use.
113+
114+
# Splitter & Combiner
115+
116+
The application also compiles two additional executables: `splitter` and `combiner`. The `splitter` can be used to split a single image into multiple smaller tiles (optionally with padding). The `combiner` can be used to combine tiles into a single image again. Note that the combiner requires the input tiles to follow the same naming convention as the tiles generated by the splitter.

benchmark.sh

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/bin/bash
#
#SBATCH --partition=gpu
#SBATCH --gres=gpu:v100
#SBATCH --nodes=4
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=4
#SBATCH --time=00:02:00
#SBATCH --job-name=dilation4
#SBATCH --output=result4_2.out
#

# Load the toolchain BEFORE compiling: make needs the compilers (mpicc, nvcc)
# these modules put on the PATH.
module load foss/2020a
module load CUDA/11.1.1

# Compile the program
make RELEASE=1

# Specify the number of threads that OpenMP applications can use.
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK

srun ./conv benchmark.txt /data/s3405583/outputTiles4

make clean

generateJob.sh

+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#!/bin/bash
# Interactively builds a job file for the convolution program from a directory
# of .pgm images (see the README "Job files" section for the format).

read -p 'Job file name: ' -r outputfile
read -p 'Input image directory: ' -r imDir

# collect the input images; bail out if the directory contains none
# NOTE(review): imDir is expected to end with a slash — confirm with users
filenames=$(ls "$imDir"*.pgm) || exit 1
# empty file and write length to the file
echo "$filenames" | wc -w > "$outputfile"

# scan every pgm header for the largest width/height in the set
maxWidth=0
maxHeight=0
for path in $filenames
do
    # pgm header: line 1 = magic, line 2 = "width height", line 3 = maxval
    # NOTE(review): assumes headers contain no comment lines — confirm for these inputs
    nums=$(head -3 "$path")
    stringArray=($nums)
    curWidth=${stringArray[1]}
    curHeight=${stringArray[2]}
    maxWidth=$(( curWidth > maxWidth ? curWidth : maxWidth ))
    maxHeight=$(( curHeight > maxHeight ? curHeight : maxHeight ))
done

read -p 'Dynamic range (in bits): ' numBits
echo "$numBits" >> "$outputfile"

echo "$maxWidth $maxHeight" >> "$outputfile"

read -p 'Number of pixels padded horizontally: ' padX
read -p 'Number of pixels padded vertically: ' padY
echo "$padX $padY" >> "$outputfile"

# write each file to the list
for eachfile in $filenames
do
    echo "$eachfile" >> "$outputfile"
done

kernels/average.kernel

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
3 3
2+
1 1 1
3+
1 1 1
4+
1 1 1

kernels/gaussianBlur.kernel

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
5 5
2+
1 4 6 4 1
3+
4 16 24 16 4
4+
6 24 36 24 6
5+
4 16 24 16 4
6+
1 4 6 4 1

kernels/identity.kernel

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
3 3
2+
0 0 0
3+
0 1 0
4+
0 0 0

kernels/square.kernel

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
11 11
2+
0 0 0 0 0 0 0 0 0 0 0
3+
0 0 0 0 0 0 0 0 0 0 0
4+
0 0 0 0 0 0 0 0 0 0 0
5+
0 0 0 0 0 0 0 0 0 0 0
6+
0 0 0 0 0 0 0 0 0 0 0
7+
0 0 0 0 0 0 0 0 0 0 0
8+
0 0 0 0 0 0 0 0 0 0 0
9+
0 0 0 0 0 0 0 0 0 0 0
10+
0 0 0 0 0 0 0 0 0 0 0
11+
0 0 0 0 0 0 0 0 0 0 0
12+
0 0 0 0 0 0 0 0 0 0 0

src/combinerMain.c

+69
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
2+
#include <omp.h>
3+
#include <stdio.h>
4+
#include <stdlib.h>
5+
6+
#include "imageFormats/pgmformat.h"
7+
#include "job/job.h"
8+
#include "util/util.h"
9+
10+
/*
 * Copies one tile's pixels into the full-size raster at the tile's grid
 * position (blockX, blockY). Destination pixels that fall outside the raster
 * bounds (partial tiles at the right/bottom edges) are skipped.
 */
void insertTileIntoRaster(int *raster, int rasterWidth, int rasterHeight, int *tilePixels, int blockWidth,
                          int blockHeight, int blockX, int blockY) {
    int originX = blockX * blockWidth;
    int originY = blockY * blockHeight;
    for (int row = 0; row < blockHeight; row++) {
        int destY = originY + row;
        // guard clause: skip rows that land outside the raster
        if (destY < 0 || destY >= rasterHeight) {
            continue;
        }
        for (int col = 0; col < blockWidth; col++) {
            int destX = originX + col;
            if (destX < 0 || destX >= rasterWidth) {
                continue;
            }
            raster[destY * rasterWidth + destX] = tilePixels[row * blockWidth + col];
        }
    }
}
24+
25+
/*
 * Stitches the tiles in tileDir back into a single width x height image.
 * Tiles must be named "block_x<X>_y<Y>.pgm" (the splitter's naming scheme);
 * they are loaded and placed in parallel with OpenMP.
 * Returns a heap-allocated raster of width * height ints; the caller frees it.
 * Aborts the program if memory for the raster or a tile buffer cannot be
 * allocated (the original code would have dereferenced NULL instead).
 */
int *combineTiles(int width, int height, int blockWidth, int blockHeight, const char *tileDir) {
    // ceiling division: partially-filled tiles at the right/bottom edges count too
    int nVertImages = (height + (blockHeight - 1)) / blockHeight;
    int nHorImages = (width + (blockWidth - 1)) / blockWidth;
    int *raster = calloc(width * height, sizeof(int));
    if (raster == NULL) {
        fprintf(stderr, "combineTiles: out of memory allocating %dx%d raster\n", width, height);
        exit(EXIT_FAILURE);
    }
#pragma omp parallel
    {
        char buffer[256];
        // per-thread scratch buffer holding one tile's pixels
        int *tilePixels = malloc(blockWidth * blockHeight * sizeof(int));
        if (tilePixels == NULL) {
            // a single thread may not skip the worksharing loop below, so abort outright
            fprintf(stderr, "combineTiles: out of memory allocating tile buffer\n");
            exit(EXIT_FAILURE);
        }
#pragma omp for
        for (int y = 0; y < nVertImages; y++) {
            for (int x = 0; x < nHorImages; x++) {
                // snprintf instead of sprintf: a long tileDir must not overflow buffer
                snprintf(buffer, sizeof(buffer), "%s/block_x%d_y%d.pgm", tileDir, x, y);
                loadPgmImageToMem(buffer, tilePixels);
                insertTileIntoRaster(raster, width, height, tilePixels, blockWidth, blockHeight, x, y);
            }
        }
        free(tilePixels);
    }

    return raster;
}
46+
47+
/*
 * Entry point of the combiner executable: parses the seven command-line
 * arguments, combines the tiles found in the input directory into one image
 * and saves it as a pgm file at the requested path.
 * Returns EXIT_SUCCESS on success, EXIT_FAILURE on bad arguments.
 */
int main(int argc, char *argv[]) {
    if (argc != 8) {
        printf(
            "Usage: ./combiner finalImgWidth finalImgHeight tileWidth tileHeight numBits outputFile "
            "inputTileDirectory\nExample:\n\t ./combiner 256 256 64 64 8 result.pgm outputTiles\n");
        // wrong usage is an error: report failure instead of the original `return 0`
        return EXIT_FAILURE;
    }

    int finalWidth = atoi(argv[1]);
    int finalHeight = atoi(argv[2]);
    int blockWidth = atoi(argv[3]);
    int blockHeight = atoi(argv[4]);
    int numBits = atoi(argv[5]);
    const char *path = argv[6];
    const char *tileDir = argv[7];

    // atoi() yields 0 on malformed input; every numeric argument must be positive
    if (finalWidth <= 0 || finalHeight <= 0 || blockWidth <= 0 || blockHeight <= 0 || numBits <= 0) {
        fprintf(stderr, "combiner: dimensions, tile sizes and numBits must be positive integers\n");
        return EXIT_FAILURE;
    }

    int *pixels = combineTiles(finalWidth, finalHeight, blockWidth, blockHeight, tileDir);
    saveAsPgmImage(path, pixels, finalWidth, finalHeight, numBits);
    printf("Result saved to %s\n", path);
    free(pixels);
    return EXIT_SUCCESS;
}

0 commit comments

Comments
 (0)