Skip to content

Commit

Permalink
Merge pull request #7 from wihobbs/mpi-test
Browse files Browse the repository at this point in the history
mpi: support for MPI testing on LC hardware
  • Loading branch information
wihobbs authored Jan 11, 2024
2 parents aa6cab4 + 984fe3b commit 7bd74ef
Show file tree
Hide file tree
Showing 7 changed files with 258 additions and 2 deletions.
20 changes: 20 additions & 0 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,22 @@ default:
debug: t
FLUX_TESTS_LOGFILE: t
script:
- lstopo --of xml >$(hostname).xml
- export FLUX_HWLOC_XMLFILE=$(pwd)/$(hostname).xml
- !reference ['.build-core', 'script']
- cd $FLUX_BUILD_DIR
- make -j 32 check

# Hidden job template for the MPI tests: builds flux-core via the shared
# '.build-core' script, then runs mpi/outer_script.sh under the freshly
# built `flux start`, launched across two nodes with `flux run -N2`.
.test-core-mpi:
extends: .lc-variables
variables:
PYTHON: "/usr/bin/python3"
script:
# Capture absolute paths before the build script runs.
# NOTE(review): '.build-core' appears to `cd flux-core`, so $(pwd) would
# differ afterwards -- confirm against .gitlab/builds.gitlab-ci.yml.
- export MPI_TESTS_DIRECTORY=$(pwd)/mpi
- export FTC_DIRECTORY=$(pwd)
- !reference ['.build-core', 'script']
# FLUX_BUILD_DIR is expected to be exported by the referenced build script.
- flux run -N2 $FLUX_BUILD_DIR/src/cmd/flux start $MPI_TESTS_DIRECTORY/outer_script.sh

## Job Specifications
corona-core-test:
extends:
Expand All @@ -60,3 +72,11 @@ quartz-core-test:
- .test-core
- .quartz
stage: test

# Concrete job: combines the MPI test template with the '.corona' settings.
corona-mpi-test:
extends:
- .test-core-mpi
- .corona
variables:
# NOTE(review): presumably asks the runner for a two-node allocation to
# match the `flux run -N2` in .test-core-mpi -- confirm with runner docs.
LLNL_FLUX_SCHEDULER_PARAMETERS: "-N2"
stage: test
2 changes: 0 additions & 2 deletions .gitlab/builds.gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@
- git clone https://github.com/flux-framework/flux-core
- cd flux-core
- export FLUX_BUILD_DIR=$(pwd)
- lstopo --of xml >$(hostname).xml
- export FLUX_HWLOC_XMLFILE=$(pwd)/$(hostname).xml
- ./autogen.sh
- ./configure
- make -j 32
Expand Down
44 changes: 44 additions & 0 deletions mpi/abort.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
/************************************************************\
* Copyright 2021 Lawrence Livermore National Security, LLC
* (c.f. AUTHORS, NOTICE.LLNS, COPYING)
*
* This file is part of the Flux resource manager framework.
* For details, see https://github.com/flux-framework.
*
* SPDX-License-Identifier: LGPL-3.0
\************************************************************/

#if HAVE_CONFIG_H
#include "config.h"
#endif
#include <stdlib.h>
#include <stdio.h>
#include <mpi.h>

/* MPI abort test.
 *
 * Usage: abort [RANK]
 *
 * Every rank prints a greeting.  If RANK is given, the task whose rank in
 * MPI_COMM_WORLD equals RANK calls MPI_Abort() with error code 42; all
 * other tasks proceed to a barrier and then finalize.  Without exactly one
 * argument the abort rank stays at -1 and no task requests an abort.
 */
int main (int argc, char *argv[])
{
    int rank;
    int size;
    int victim = (argc == 2) ? strtol (argv[1], NULL, 10) : -1;

    MPI_Init (&argc, &argv);
    MPI_Comm_rank (MPI_COMM_WORLD, &rank);
    MPI_Comm_size (MPI_COMM_WORLD, &size);

    printf ("Hello World from rank %d\n", rank);

    if (rank == victim) {
        fprintf (stderr, "Rank %d is going to MPI_Abort now\n", rank);
        MPI_Abort (MPI_COMM_WORLD, 42);
    }

    /* Remaining ranks wait here for each other. */
    MPI_Barrier (MPI_COMM_WORLD);
    MPI_Finalize ();

    return 0;
}

// vi: ts=4 sw=4 expandtab

96 changes: 96 additions & 0 deletions mpi/hello.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
/************************************************************\
* Copyright 2014 Lawrence Livermore National Security, LLC
* (c.f. AUTHORS, NOTICE.LLNS, COPYING)
*
* This file is part of the Flux resource manager framework.
* For details, see https://github.com/flux-framework.
*
* SPDX-License-Identifier: LGPL-3.0
\************************************************************/

#if HAVE_CONFIG_H
#include "config.h"
#endif
#include <mpi.h>
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <time.h>
#include <stdbool.h>

/* Return end - start as a timespec.
 * When the nanosecond difference is negative, one second is borrowed so
 * that tv_nsec of the result is shifted up by 1000000000.
 */
static struct timespec ts_diff (struct timespec start, struct timespec end)
{
    struct timespec delta;

    delta.tv_sec = end.tv_sec - start.tv_sec;
    delta.tv_nsec = end.tv_nsec - start.tv_nsec;
    if (delta.tv_nsec < 0) {
        /* borrow one second */
        delta.tv_sec -= 1;
        delta.tv_nsec += 1000000000;
    }
    return delta;
}

/* Return the time elapsed since t0, in milliseconds, measured against
 * the current CLOCK_MONOTONIC reading.
 */
double monotime_since (struct timespec t0)
{
    struct timespec now;
    struct timespec elapsed;
    double whole_ms;
    double frac_ms;

    clock_gettime (CLOCK_MONOTONIC, &now);
    elapsed = ts_diff (t0, now);

    whole_ms = (double) elapsed.tv_sec * 1000;      /* seconds -> ms */
    frac_ms = (double) elapsed.tv_nsec / 1000000;   /* nanoseconds -> ms */

    return (whole_ms + frac_ms);
}

/* Store the current CLOCK_MONOTONIC reading in *tp.
 *
 * If clock_gettime() fails, *tp is zeroed rather than left holding
 * whatever the caller's (possibly uninitialized) storage contained, so
 * that monotime_isset() reports false for it.
 */
void monotime (struct timespec *tp)
{
    if (clock_gettime (CLOCK_MONOTONIC, tp) < 0) {
        tp->tv_sec = 0;
        tp->tv_nsec = 0;
    }
}

/* Return true if either field of t is nonzero, i.e. t is not the
 * all-zero timespec.
 */
bool monotime_isset (struct timespec t)
{
    if (t.tv_sec != 0)
        return true;
    return t.tv_nsec != 0;
}

/* MPI hello test.
 *
 * Times MPI_Init(), the first MPI_Barrier() and MPI_Finalize(), and has
 * rank 0 print each duration in seconds.  Output lines are prefixed with
 * a label taken from FLUX_JOB_CC, else FLUX_JOB_ID, else "0".
 */
int main (int argc, char *argv[])
{
    int rank;
    int size;
    struct timespec t0;
    const char *label;

    /* Choose the label: FLUX_JOB_CC takes precedence over FLUX_JOB_ID. */
    label = getenv ("FLUX_JOB_CC");
    if (label == NULL)
        label = getenv ("FLUX_JOB_ID");
    if (label == NULL)
        label = "0";

    /* Phase 1: initialization */
    monotime (&t0);
    MPI_Init (&argc, &argv);
    MPI_Comm_rank (MPI_COMM_WORLD, &rank);
    MPI_Comm_size (MPI_COMM_WORLD, &size);
    if (rank == 0) {
        /* monotime_since() is in milliseconds; divide to print seconds */
        printf ("%s: completed MPI_Init in %0.3fs. There are %d tasks\n",
                label,
                monotime_since (t0) / 1000, size);
        fflush (stdout);
    }

    /* Phase 2: first barrier */
    monotime (&t0);
    MPI_Barrier (MPI_COMM_WORLD);
    if (rank == 0) {
        printf ("%s: completed first barrier in %0.3fs\n",
                label,
                monotime_since (t0) / 1000);
        fflush (stdout);
    }

    /* Phase 3: finalize */
    monotime (&t0);
    MPI_Finalize ();
    if (rank == 0) {
        printf ("%s: completed MPI_Finalize in %0.3fs\n",
                label,
                monotime_since (t0) / 1000);
        fflush (stdout);
    }

    return 0;
}

// vi: ts=4 sw=4 expandtab

28 changes: 28 additions & 0 deletions mpi/inner_script.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#! /bin/bash

# die MESSAGE...
#
# Remove this job's scratch directory ($FTC_DIRECTORY/$NAME), print
# MESSAGE to stderr and exit with status 1.
#
# The removal is skipped when either variable is empty so that a missing
# FTC_DIRECTORY can never turn into an `rm -rf` of a path under `/`, and
# the path is quoted so directory names containing whitespace are removed
# as a single path instead of being word-split.
die () {
    if [ -n "${FTC_DIRECTORY:-}" ] && [ -n "${NAME:-}" ]; then
        rm -rf -- "$FTC_DIRECTORY/$NAME"
    fi
    echo "$@" >&2
    exit 1
}

# Usage: inner_script.sh COMPILER MPI
#
# Runs inside a Flux batch job.  Loads the requested compiler and MPI
# modules, copies the MPI test sources into a private scratch directory
# ($FTC_DIRECTORY/<COMPILER>_<MPI>), compiles every test named in $TESTS
# with mpicc, runs them across the whole allocation, and exits with the
# status of the run step.  The scratch directory is removed on exit.
#
# Environment: FTC_DIRECTORY, MPI_TESTS_DIRECTORY, TESTS, LCSCHEDCLUSTER.

# Size of this batch allocation; the tests are run across all of it.
BATCH_NNODES=$(flux resource list -n -o {nnodes})
BATCH_NCORES=$(flux resource list -n -o {ncores})
COMPILER=$1
MPI=$2
# NAME is set before any die() call because die() uses it for cleanup.
export NAME="${COMPILER}_${MPI}"

# The expansions must be quoted: an unquoted empty value collapses to a
# bare `test -n`, which is always true, so the check would never fire.
test -n "$COMPILER" || die "COMPILER (argument 1) not set"
test -n "$MPI" || die "MPI (argument 2) not set"
module load "$COMPILER" || die "Compiler $COMPILER is unavailable on $LCSCHEDCLUSTER"
module load "$MPI" || die "MPI implementation $MPI is unavailable on $LCSCHEDCLUSTER"
test -n "$FTC_DIRECTORY" || die "FTC_DIRECTORY not set"
# Without this guard an empty value would make the copy below read `/*`.
test -n "$MPI_TESTS_DIRECTORY" || die "MPI_TESTS_DIRECTORY not set"
mkdir "$FTC_DIRECTORY/$NAME" || die "Unable to create directory for $FTC_DIRECTORY/$NAME"
cp -r "$MPI_TESTS_DIRECTORY"/* "$FTC_DIRECTORY/$NAME" \
    || die "Unable to copy tests from $MPI_TESTS_DIRECTORY"
cd "$FTC_DIRECTORY/$NAME" || die "Could not find $FTC_DIRECTORY/$NAME"
echo "Running with $COMPILER compiler and $MPI MPI"
# $TESTS is deliberately unquoted: it is a whitespace-separated list and
# must be split into one argument per test name.
flux bulksubmit -n1 --watch mpicc -o {} {}.c ::: $TESTS || die "Compilation failure in tests"
flux bulksubmit --watch -N "$BATCH_NNODES" -n "$BATCH_NCORES" --output=kvs ./{} ::: $TESTS
# Save the run status before cleanup overwrites $?.
RC=$?
rm -rf -- "$FTC_DIRECTORY/$NAME"
exit $RC
35 changes: 35 additions & 0 deletions mpi/outer_script.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/bin/bash

## REQUIRES: $MPI_TESTS_DIRECTORY $FTC_DIRECTORY $LCSCHEDCLUSTER

# Per-cluster test matrix.  Each list is a whitespace-separated set of
# names; the variable name is prefixed with the cluster name so it can be
# selected by indirect expansion via $LCSCHEDCLUSTER (see MPIS/COMPILERS).
corona_COMPILERS="
gcc
clang
intel-classic
"

corona_MPIS="
mvapich2
"

# Names of the MPI test programs.  Exported so that the batch jobs
# submitted by this script can read it.
export TESTS="hello
abort
version
"

# MPIS and COMPILERS hold the *names* of the list variables for the current
# cluster (e.g. "corona_MPIS"); dereference them with ${!MPIS} and
# ${!COMPILERS}.
MPIS="${LCSCHEDCLUSTER}_MPIS"
COMPILERS="${LCSCHEDCLUSTER}_COMPILERS"

# Submit one waitable batch job per (MPI, compiler) combination for this
# cluster, wait for all of them, replay each job's output, and exit with
# the status of `flux job wait --all`.

# With no matrix defined for $LCSCHEDCLUSTER both loops below would run
# zero times and nothing would be tested; report that as a failure.
if [ -z "${!MPIS}" ] || [ -z "${!COMPILERS}" ]; then
    echo "No MPI/compiler test matrix defined for cluster '$LCSCHEDCLUSTER'" >&2
    exit 1
fi

for mpi in ${!MPIS}; do
    for compiler in ${!COMPILERS}; do
        # inner_script.sh reads the compiler as $1 and the MPI as $2.
        flux batch -N2 -n4 --flags=waitable --output=kvs \
            "$MPI_TESTS_DIRECTORY/inner_script.sh" "$compiler" "$mpi"
    done
done
flux job wait --all
# Save the wait status before the reporting loop overwrites $?.
RC=$?
for id in $(flux jobs -a -no {id}); do
    # Pass the id as an argument so it is never parsed as a format string.
    printf "\033[31mjob %s completed:\033[0m\n" "$id"
    flux job attach "$id"
done

exit $RC
35 changes: 35 additions & 0 deletions mpi/version.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/************************************************************\
* Copyright 2021 Lawrence Livermore National Security, LLC
* (c.f. AUTHORS, NOTICE.LLNS, COPYING)
*
* This file is part of the Flux resource manager framework.
* For details, see https://github.com/flux-framework.
*
* SPDX-License-Identifier: LGPL-3.0
\************************************************************/

#if HAVE_CONFIG_H
#include "config.h"
#endif
#include <mpi.h>
#include <stdio.h>

int main (int argc, char *argv[])
{
char version[MPI_MAX_LIBRARY_VERSION_STRING];
int len;
int exit_rc = -1;

MPI_Get_library_version (version, &len);
if (len < 0) {
fprintf (stderr, "MPI_Get_library_version failed\n");
goto done;
}
printf ("%s\n", version);
exit_rc = 0;
done:
return exit_rc;
}

// vi: ts=4 sw=4 expandtab

0 comments on commit 7bd74ef

Please sign in to comment.