src/test/mpi-laplacian.c
Parallel scalability
This code can be used to test the scalability of common operations (traversal, restriction, boundary conditions, etc.) and of their combinations, in particular the multigrid Poisson solver.
#include "poisson.h"
#include "utils.h"
scalar a[], b[];
static void mpi_print (timer t, int i, long tnc,
const char * name)
{
double mpi[npe()];
timing s = timer_timing (t, i, tnc, mpi);
#if 0
scalar wt[];
foreach()
wt[] = mpi[pid()];
char fname[80];
sprintf (fname, "%s-%d.ppm", name, npe());
FILE * fp = pid() ? NULL : fopen (fname, "w");
output_ppm (wt, fp, n = 512);
#endif
if (pid() == 0) {
s.min/i, s.avg/i, s.max/i are the minimum, average and maximum *times spent in communication routines*.
printf ("%d %g %g %g %s %g %g %g %.2g%% %ld %ld",
npe(), s.cpu/i, s.real/i, s.speed, name, s.min/i, s.avg/i, s.max/i,
100.*s.avg/s.real, s.mem, s.tnc/i);
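/* Our reading of the format above: number of processes, CPU and
   wall-clock time per iteration, speed (points/sec), operation name,
   min/avg/max communication times, percentage of wall-clock time spent
   in communication, memory and number of grid points per iteration. */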
We also output the times spent in communication routines for each process.
#if 0
for (int j = 0; j < npe(); j++)
printf (" %g", mpi[j]/i);
#endif
If memory profiling is enabled, we output the maximum allocated memory (in kilobytes).
@if MTRACE
printf (" %ld", pmtrace.max/1024);
@endif
printf ("\n");
}
MPI_Barrier (MPI_COMM_WORLD);
}
int main (int argc, char * argv[])
{
int maxlevel = argc > 1 ? atoi(argv[1]) : (dimension == 2 ? 8 : 5);
timer t;
double speed = 1e6; // approx speed in points/sec/core
long tnc = ((long)1) << (dimension*maxlevel);
int i, nloops = max(0.1*speed*npe()/(double)tnc, 1);
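/* nloops is chosen so that each timed loop traverses roughly
   tnc*nloops ~ 0.1*speed*npe() points, i.e. takes on the order of
   0.1 seconds at the assumed speed (with at least one iteration) */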
if (!pid())
fprintf (stderr, "grid: %s\nnloops = %d\n", GRIDNAME, nloops);
if (tnc*nloops/(speed*npe()) > 100.) {
fprintf (stderr, "this run would probably take more than 100 seconds."
" Aborting ...\n");
exit(1);
}
size (1[0]);
init_grid (N);
foreach()
a[] = 0., b[] = 0.;
poisson (a, b); // to force allocation of extra fields
MPI_Barrier (MPI_COMM_WORLD);
t = timer_start();
init_grid (1 << maxlevel);
tnc = 0;
foreach(reduction(+:tnc))
tnc++;
mpi_print (t, 1, tnc, "refine");
#if TREE
assert (tree_is_full()); // the tree must be full i.e. the grid uniformly refined
#endif
We fill *a* with a simple function.
MPI_Barrier (MPI_COMM_WORLD);
i = 0;
t = timer_start();
while (i < nloops) {
foreach()
a[] = cos(pi*x)*cos(pi*y)*cos(pi*z);
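/* a barrier (rather than boundary conditions) is used here, presumably
   to time the traversal alone; the boundary conditions are applied
   once below, outside the timed loop */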
#if 0
boundary ({a});
#else
prof_start ("barrier");
MPI_Barrier (MPI_COMM_WORLD);
prof_stop();
#endif
i++;
}
mpi_print (t, i, tnc*i, "cos");
boundary ({a});
Here we compute $$b = \nabla^2 a$$ using the standard 5-point (in 2D) or 7-point (in 3D) Laplacian stencil.
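Written out in 2D, the stencil used in the loop below is
$$
b_{i,j} = \frac{a_{i+1,j} + a_{i-1,j} + a_{i,j+1} + a_{i,j-1} - 4\,a_{i,j}}{\Delta^2}
$$
and in 3D the two neighbours in $z$ are added and $6\,a_{i,j,k}$ is subtracted.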
MPI_Barrier (MPI_COMM_WORLD);
i = 0;
t = timer_start();
while (i < nloops) {
foreach() {
b[] = 0.;
foreach_dimension()
b[] += a[1] + a[-1];
b[] = (b[] - 2.*dimension*a[])/sq(Delta);
}
boundary ({b});
i++;
}
mpi_print (t, i, tnc*i, "laplacian");
Something simpler: the sum of *b* over the entire mesh.
MPI_Barrier (MPI_COMM_WORLD);
i = 0;
t = timer_start();
double sum = 0.;
while (i < nloops) {
sum = 0.;
foreach(reduction(+:sum))
sum += b[];
i++;
}
mpi_print (t, i, tnc*i, "sum");
if (pid() == 0)
fprintf (stderr, "sum: %g\n", sum);
The restriction operator.
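For a plain scalar field this presumably reduces to the default averaging restriction, where each parent cell gets the average of its $2^d$ children,
$$
b_{\mathrm{parent}} = \frac{1}{2^d} \sum_{\mathrm{children}} b_{\mathrm{child}}
$$
applied level by level from the leaves down to the root cell.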
MPI_Barrier (MPI_COMM_WORLD);
i = 0;
t = timer_start();
while (i < nloops) {
boundary ({b});
restriction ({b});
i++;
}
mpi_print (t, i, tnc*i, "restriction");
And finally the Poisson solver.
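The solver computes an approximate solution of
$$
\nabla^2 a = b
$$
Since *b* was obtained above by applying the discrete Laplacian to the field already stored in *a*, the initial guess is essentially the solution of the discrete problem, and the error check further down verifies that the solver does not degrade it. With TOLERANCE set to HUGE, convergence is never the limiting factor, so the timings presumably measure the fixed cost of the minimum number of multigrid cycles per call.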
MPI_Barrier (MPI_COMM_WORLD);
i = 0;
TOLERANCE = HUGE;
t = timer_start();
while (i < nloops) {
poisson (a, b);
i++;
}
mpi_print (t, i, tnc*i, "poisson");
scalar e[];
foreach()
e[] = a[] - cos(pi*x)*cos(pi*y)*cos(pi*z);
double max = normf(e).max;
if (pid() == 0)
fprintf (stderr, "error: %g\n", max);
assert (normf(e).max < 1e-10);
// output_ppm (e, file = "error.png");
As a final diagnostic we map how cells are distributed in memory: *e* stores the memory address of each cell value of *a* (shifted below by its average) and *n* counts the cells local to each process.
sum = 0.;
int n = 0;
foreach (serial) {
e[] = (long) &(a[]); // memory address of this cell value of a
sum += e[];
n++;
}
foreach()
e[] -= sum/n;
#if 0
FILE * fp = pid() ? NULL : fopen ("map.ppm", "w");
output_ppm (e, fp, n = 512);
#endif
int nmin = n, nmax = n; // min/max number of cells per process i.e. the load balance
mpi_all_reduce (nmin, MPI_INT, MPI_MIN);
mpi_all_reduce (nmax, MPI_INT, MPI_MAX);
if (pid() == 0)
fprintf (stderr, "balance %d %d\n", nmin, nmax);
}
How to run on Occigen
This test is run on Occigen. The C99 source code is generated on a system with qcc installed and then copied to Occigen using something like
qcc -grid=octree -source mpi-laplacian.c
scp _mpi-laplacian.c popinet@occigen.cines.fr:
On Occigen one can then compile the code using
module load bullxmpi
module load intel
mpicc -Wall -std=c99 -O2 -D_MPI=1 -D_GNU_SOURCE=1 _mpi-laplacian.c \
-o mpi-laplacian -lm
The following script (run.sh) can then be used to run the code
#!/bin/bash
#SBATCH -J basilisk
#SBATCH --nodes=1
#SBATCH --ntasks=2
#SBATCH --ntasks-per-node=2
#SBATCH --threads-per-core=1
#SBATCH --time=00:10:00
#SBATCH --output basilisk.output
#SBATCH --exclusive
LEVEL=11
module purge
module load bullxmpi
module load intel
srun --mpi=pmi2 -K1 --resv-ports -n $SLURM_NTASKS ./mpi-laplacian $LEVEL \
2> log-$LEVEL-$SLURM_NTASKS > out-$LEVEL-$SLURM_NTASKS
LEVEL is varied from 9 (~134 million grid points) to 11 (~8.6 billion grid points) and the number of processes up to 16384 using something like
sbatch --ntasks=1 --ntasks-per-node=1 run.sh
sbatch --ntasks=2 --ntasks-per-node=2 run.sh
sbatch --ntasks=4 --ntasks-per-node=4 run.sh
sbatch --ntasks=8 --ntasks-per-node=8 run.sh
sbatch --ntasks=16 --ntasks-per-node=16 run.sh
sbatch --ntasks=32 --ntasks-per-node=16 --nodes=2 run.sh
sbatch --ntasks=64 --ntasks-per-node=16 --nodes=4 run.sh
sbatch --ntasks=128 --ntasks-per-node=16 --nodes=8 run.sh
sbatch --ntasks=256 --ntasks-per-node=16 --nodes=16 run.sh
sbatch --ntasks=512 --ntasks-per-node=16 --nodes=32 run.sh
sbatch --ntasks=1024 --ntasks-per-node=16 --nodes=64 run.sh
sbatch --ntasks=2048 --ntasks-per-node=16 --nodes=128 run.sh
sbatch --ntasks=4096 --ntasks-per-node=16 --nodes=256 run.sh
sbatch --ntasks=8192 --ntasks-per-node=16 --nodes=512 run.sh
sbatch --ntasks=16384 --ntasks-per-node=24 --nodes=683 run.sh
The results can then be collected using
tar czvf occigen.tgz out-*-*
Results on Occigen
The memory usage per core is given below. The increase for large numbers of cores corresponds to the memory overhead of communication buffers, etc.
# generate results for Occigen
cd 'occigen/3D'
# generate weak scaling curves
! bash weak.sh > weak
set logscale
set logscale x 2
set grid
set xrange [2:32768]
set format x '2^{%L}'
set xtics 2
set xlabel "# of cores"
set ylabel "Memory/core (GB)"
minlevel=9
maxlevel=11
plot [][0.1:] for [i=minlevel:maxlevel] \
'< sh table.sh poisson '.i u 1:($2/$1) t ''.i.' levels' w lp, \
18/x**0.9 t '18/x^{0.9}'
cd '../..'
The wall-clock time for one iteration of the multigrid Poisson solver is given below.
The pink lines connect points corresponding to weak (or iso-granular) scaling, i.e. multiplying both the problem size and the number of cores by eight. Ideal weak scaling would give horizontal lines (i.e. a constant computation time for proportionally increasing problem sizes and numbers of cores).
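For example, level 9 on 64 cores, level 10 on 512 cores and level 11 on 4096 cores all correspond to $2^{27}/64 = 2^{30}/512 = 2^{33}/4096 = 2^{21} \approx 2.1$ million grid points per core.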
cd 'occigen/3D'
set ylabel 'Time (sec)'
plot [][0.01:100] for [i=minlevel:maxlevel] \
'< sh time.sh poisson '.i u 1:2 t ''.i.' levels' w lp, \
'weak' u 1:2 w lp t 'weak scaling', \
600/x**0.95 t '600/x^{0.95}'
cd '../..'
The time spent in communication routines is illustrated below.
cd 'occigen/3D'
plot [][1e-2:10] for [i=minlevel:maxlevel] \
'< sh time.sh poisson '.i u 1:3 w lp t ''.i.' levels', \
4.5/x**0.65 t '4.5/x^{0.65}'
cd '../..'
Similar results are obtained for a pure Laplacian.
cd 'occigen/3D'
plot [][1e-4:10] for [i=minlevel:maxlevel] \
'< sh time.sh laplacian '.i u 1:2 w lp t ''.i.' levels', \
50/x**0.93 t '50/x^{0.93}'
cd '../..'
cd 'occigen/3D'
plot [][1e-4:1] for [i=minlevel:maxlevel] \
'< sh time.sh laplacian '.i u 1:3 w lp t ''.i.' levels', \
2./x**0.7 t '2/x^{0.7}'
cd '../..'
And for the restriction operator.
cd 'occigen/3D'
plot [][:1] for [i=minlevel:maxlevel] \
'< sh time.sh restriction '.i u 1:2 w lp t ''.i.' levels', \
18/x**0.85 t '18/x^{0.85}'
cd '../..'
cd 'occigen/3D'
plot [][1e-3:1] for [i=minlevel:maxlevel] \
'< sh time.sh restriction '.i u 1:3 w lp t ''.i.' levels', \
2.8/x**0.66 t '2.8/x^{0.66}'
cd '../..'