Mandelbrot set by MPI/OpenMP/OpenACC.
requirements: python (used by the plotting scripts draw.py / draw_mpi.py)
serial / OpenMP version
$ export FC=ifort
$ make a.out
$ vi fort.11 # adjust the parameters
$ ./a.out
maximum iteration: 200
imax: 301 jmax: 251
time[s]: 1.210000000000000E-002
$ ./draw.py
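(a.out is also the OpenMP build; pick the thread count with OMP_NUM_THREADS, as in the benchmark commands further down.)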
MPI version
$ export MPIFC=mpiifort
$ make a.out.mpi
$ vi fort.11 # adjust the parameters
$ mpirun -np $NP ./a.out.mpi # $NP must equal np_i*np_j in fort.11 (see the decomposition sketch after this block)
maximum iteration: 200
imax: 301 jmax: 251
time[s]: 7.543087005615234E-003
$ ./draw_mpi.py
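For orientation, a minimal Python sketch of how an np_i x np_j block decomposition could assign a sub-rectangle of the imax x jmax grid to each of the $NP = np_i*np_j ranks; the names np_i/np_j come from fort.11, but the actual splitting inside the Fortran code may differ:

def block_range(n_global, n_blocks, rank_1d):
    # Hypothetical 1-D block split (1-based, inclusive), applied once per direction.
    base, rem = divmod(n_global, n_blocks)
    start = rank_1d * base + min(rank_1d, rem) + 1
    end = start + base - 1 + (1 if rank_1d < rem else 0)
    return start, end

# Example: imax = 15001 over np_i = 4 -> (1, 3751), (3752, 7501), (7502, 11251), (11252, 15001)
for ri in range(4):
    print(ri, block_range(15001, 4, ri))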
MPI + OpenACC version
$ export MPIFC=mpif90
$ make a.out.mpi.acc
$ vi fort.11 # adjust the parameters
$ mpirun -np $NP ./a.out.mpi.acc # $NP must equal np_i*np_j in fort.11
maximum iteration: 200
imax: 301 jmax: 251
time[s]: 7.543087005615234E-003
$ ./draw_mpi.py
$ display mandelbrot.png
or
$ display mandelbrot_mpi.png
or
$ gnuplot
gnuplot> set pm3d map
gnuplot> splot "fort.100"
The gnuplot route does not work for the MPI version because its output file is binary.
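If gnuplot is unavailable, a rough matplotlib substitute for the serial/OpenMP case might look like the sketch below; it assumes fort.100 holds whitespace-separated x, y, iteration-count columns (which is what the splot usage above suggests), and the bundled draw.py may well do something different:

#!/usr/bin/env python3
import numpy as np
import matplotlib.pyplot as plt

data = np.loadtxt("fort.100")              # blank separator lines, if any, are skipped
x, y, n = data[:, 0], data[:, 1], data[:, 2]
plt.scatter(x, y, c=n, s=1, marker=".")    # color by iteration count, like pm3d map
plt.colorbar(label="iterations")
plt.gca().set_aspect("equal")
plt.savefig("mandelbrot_check.png", dpi=150)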
example fort.11 (Fortran namelist)
&params
iter_max = 200
dx = 5.0d-4
dy = 5.0d-4
x_min = -1.2d0
x_max = -1.1d0
y_min = 0.2d0
y_max = 0.3d0
tol = 1.0d2
/
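For orientation, a minimal Python sketch of the escape-time iteration these parameters drive; it assumes the grid includes both endpoints and that tol is the escape threshold on |z|**2 (both are assumptions, and the Fortran source is the authority):

import numpy as np

def mandelbrot(x_min, x_max, y_min, y_max, dx, dy, iter_max, tol):
    x = np.arange(x_min, x_max + dx / 2, dx)       # i grid, both endpoints included
    y = np.arange(y_min, y_max + dy / 2, dy)       # j grid
    c = x[None, :] + 1j * y[:, None]
    z = np.zeros_like(c)
    count = np.full(c.shape, iter_max, dtype=np.int32)
    for n in range(iter_max):
        active = count == iter_max                 # points that have not escaped yet
        z[active] = z[active] ** 2 + c[active]
        escaped = active & (np.abs(z) ** 2 > tol)  # assumption: tol bounds |z|**2
        count[escaped] = n
    return count

count = mandelbrot(-1.2, -1.1, 0.2, 0.3, 5.0e-4, 5.0e-4, 200, 1.0e2)   # the &params values above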
benchmark environment
CPU: Intel(R) Xeon(R) CPU E5-2450 0 @ 2.10GHz, 8 cores/socket, 2 sockets, 8 nodes
interconnect: 4x FDR InfiniBand, fat tree
compiler: Intel compiler 2018u0
MPI: Intel MPI 2018u0
fort.11 for the benchmark runs below
&params
iter_max = 200
dx = 2.0d-4
dy = 2.0d-4
x_min = -2.25d0
x_max = 0.75d0
y_min = -1.25d0
y_max = 1.25d0
tol = 1.0d2
/
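With these values the global grid is (0.75 - (-2.25))/2.0d-4 + 1 = 15001 by (1.25 - (-1.25))/2.0d-4 + 1 = 12501 points (assuming both endpoints are included), which matches the imax/jmax printed in every run below.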
serial (1 node, 1 core)
$ OMP_NUM_THREADS=1 KMP_AFFINITY=compact srun --mpi=pmi2 -N1 -n1 -c1 --cpu_bind=cores -m block:block ./a.out
maximum iteration: 200
imax: 15001 jmax: 12501
time[s]: 104.264600000000
OpenMP (1 node, 16 cores, 16 threads)
$ OMP_NUM_THREADS=16 KMP_AFFINITY=compact srun -n1 -c16 --cpu_bind=cores -m block:block ./a.out
srun: Warning: can't run 1 processes on 8 nodes, setting nnodes to 1
maximum iteration: 200
imax: 15001 jmax: 12501
time[s]: 15.1660000000000
flat MPI (1 node, 16 cores, 16 processes)
$ I_MPI_EXTRA_FILESYSTEM=1 I_MPI_EXTRA_FILESYSTEM_LIST=lustre OMP_NUM_THREADS=1 KMP_AFFINITY=compact srun --mpi=pmi2 -N1 -n16 -c1 --cpu_bind=cores -m block:block ./a.out.mpi
maximum iteration: 200
imax: 15001 jmax: 12501
time[s]: 23.8055260181427
hybrid (1 node, 16 cores, 2 processes, 8 threads/process)
$ I_MPI_EXTRA_FILESYSTEM=1 I_MPI_EXTRA_FILESYSTEM_LIST=lustre OMP_NUM_THREADS=8 KMP_AFFINITY=compact srun --mpi=pmi2 -N1 -n2 -c8 --cpu_bind=cores -m block:block ./a.out.mpi
maximum iteration: 200
imax: 15001 jmax: 12501
time[s]: 15.0692129135132
flat MPI (8 nodes, 128 cores, 128 processes)
$ I_MPI_EXTRA_FILESYSTEM=1 I_MPI_EXTRA_FILESYSTEM_LIST=lustre OMP_NUM_THREADS=1 KMP_AFFINITY=compact srun --mpi=pmi2 -N8 -n128 -c1 --cpu_bind=cores -m block:block ./a.out.mpi
maximum iteration: 200
imax: 15001 jmax: 12501
time[s]: 3.35678577423096
hybrid (8 nodes, 128 cores, np_i=4, np_j=4, 8 threads/process)
$ I_MPI_EXTRA_FILESYSTEM=1 I_MPI_EXTRA_FILESYSTEM_LIST=lustre OMP_NUM_THREADS=8 KMP_AFFINITY=compact srun --mpi=pmi2 -N8 -n16 -c8 --cpu_bind=cores -m block:block ./a.out.mpi
maximum iteration: 200
imax: 15001 jmax: 12501
time[s]: 3.36353015899658
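For a rough comparison on this 15001 x 12501 grid: 16 OpenMP threads are about 6.9x faster than serial (104.3 s vs 15.2 s), 16 flat-MPI ranks on one node about 4.4x (23.8 s), the 2x8 hybrid run about 6.9x (15.1 s), and at 128 cores flat MPI and hybrid are essentially tied at about 3.36 s, i.e. roughly 31x over serial.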
OpenACC (4 nodes, 16 processes, np_i=4, np_j=4, 1 GPU/process); GPU: Tesla P100 x4/node
$ mpirun -x PATH -x LD_LIBRARY_PATH -np 16 -npernode 4 ./a.out.mpi.acc
maximum iteration: 200
imax: 15001 jmax: 12501
time[s]: 9.6851900219917297E-002
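At about 0.097 s, the 16-GPU OpenACC run is roughly 35x faster than the 128-core CPU results above on the same grid (and roughly 1000x the serial time).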