Commit 17f2de8f authored by wadaniel

added hw3

parent 22ce7da1
CXX=h5pcc
CXXFLAGS = -Wpedantic -Wall -Wextra -std=c++11 -lstdc++ -g -O3 -fopenmp
all: main

%.o: %.cpp
	$(CXX) -c -o $@ $< $(CXXFLAGS)

clean:
	rm -rf *.o *.xmf *.h5 *.txt main

main: main.o auxiliary.o wave.o
	$(CXX) $(CXXFLAGS) -I. -o main $^

run_mpi:
	make clean ; make ; mpirun -n 64 ./main 512 4 0.25

run_hybrid_v1:
	export OMP_NUM_THREADS=8 ; make clean ; make ; mpirun -n 8 --map-by ppr:3:node ./main 512 2 0.25

run_hybrid_v2:
	export OMP_NUM_THREADS=8 ; make clean ; make ; mpirun -n 8 --map-by ppr:2:node ./main 512 2 0.25

develop:
	export OMP_NUM_THREADS=8 ; make clean ; make ; mpirun -n 8 ./main 256 2 0.1

.PHONY: all clean run_mpi run_hybrid_v1 run_hybrid_v2 develop
HOW TO RUN
----------
To run the code on Euler, remove all previously loaded modules (module purge) and load the following modules:
module purge;
module load new;
module load gcc;
module load open_mpi/3.0.0;
module load hdf5;
or run `source hw3.src`
CORES NEEDED
------------
You will need a total of 64 cores. You can request them interactively by using the command
bsub -n 72 -R "select[model==XeonE5_2680v3]fullnode" -W 04:00 -Is bash
which will reserve three EulerII nodes (2 sockets each, 12 cores per socket, i.e. 72 cores in total).
Then, you can type `make run_hybrid_v1` to execute the following commands
export OMP_NUM_THREADS=8 ;
make clean ;
make ;
mpirun -n 8 --map-by ppr:3:node ./main 512 2 0.25
This will map 3 MPI processes per node and make each process use 8 OpenMP threads, so all 24 cores of each node are used.
To improve performance, you can instead map 2 MPI processes per node with `make run_hybrid_v2`, which executes:
export OMP_NUM_THREADS=8 ;
make clean ;
make ;
mpirun -n 8 --map-by ppr:2:node ./main 512 2 0.25
Having two processes per node maps each process to one of the two sockets of that node;
with three processes per node, one process ends up with 4 cores on one socket and 4 cores on the other, which is slower.
With 2 processes per node (and 8 ranks in total), you will need to ask for 4 interactive nodes:
bsub -n 96 -R "select[model==XeonE5_2680v3]fullnode" -W 04:00 -Is bash
Asking for too many cores interactively may require some waiting time.
Alternatively, you may submit your jobs in batch mode with bsub instead of running interactively, or you may ask for only one full node
bsub -n 24 -R "select[model==XeonE5_2680v3]fullnode" -W 04:00 -Is bash
and run your code by typing `make develop` (this will be slow, but you may use it to test the correctness of your results).
*** IMPORTANT ***
You should type the command
unset LSB_AFFINITY_HOSTFILE
before running a hybrid code on Euler; this allows you to control process and thread affinity with options such as --map-by.
Otherwise, those options are ignored.
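For example, a complete hybrid launch in a fresh shell could look like the following sketch (it only combines the commands above; --report-bindings is an optional Open MPI flag that prints where each rank ends up, useful to double-check the mapping):
unset LSB_AFFINITY_HOSTFILE
export OMP_NUM_THREADS=8
mpirun --report-bindings -n 8 --map-by ppr:2:node ./main 512 2 0.25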
WHAT TO MODIFY
--------------
You only need to change the code in wave.cpp.
When the code runs, it prints a checksum. This number must stay the same for the solution you provide.
CODE OUTPUT
-----------
You are not asked to visualize any results in this question, but we thought it would be nice if you could see what you are actually solving.
The code also writes .h5 files which can be used to visualize the solution. Each .h5 file corresponds to an .xmf file.
You can use Paraview to open the .xmf files and visualize the results.
Alternatively, you may comment out the line
#define USE_HDF5
with '//' in wave.h. In this case, the code will output simple .txt files with the solution, saved as
x y z u
...
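If you use the .txt output, any tool can post-process it. As a minimal sketch (this helper and its file name are made up for illustration and are not part of the homework), a serial C++ reader that scans one output file and reports the maximum |u| could look like:
#include <algorithm>
#include <cmath>
#include <fstream>
#include <iostream>

// read_wave_txt.cpp (hypothetical helper): reads rows of "x y z u" from one
// Wave__<k>.txt file and prints the largest amplitude found.
int main(int argc, char **argv) {
  if (argc != 2) {
    std::cerr << "usage: " << argv[0] << " Wave__<k>.txt\n";
    return 1;
  }
  std::ifstream in(argv[1]);
  double x, y, z, u, umax = 0.0;
  while (in >> x >> y >> z >> u)         // one grid point per line
    umax = std::max(umax, std::fabs(u)); // track the largest |u|
  std::cout << "max |u| = " << umax << "\n";
  return 0;
}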
#include "wave.h"
//************************
// DO NOT MODIFY THIS FILE
//************************
WaveEquation::WaveEquation(int a_N, int a_procs_per_dim) {
MPI_Comm_size(MPI_COMM_WORLD, &size);
procs_per_dim = a_procs_per_dim;
Ntot = a_N;
h = L / a_N;
N = a_N / procs_per_dim;
// the chosen numerical method is stable if dt <= h/sqrt(3)
dt = h / sqrt(3.0);
aux = dt * dt / h / h;
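// aux = (dt/h)^2 is the prefactor of the 7-point Laplacian stencil used in
// UpdateGridPoint; with dt = h/sqrt(3) it equals 1/3.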
nthreads = omp_get_max_threads();
// round the cube root to avoid floating-point truncation (pow may return e.g. 2.999... for 27 threads)
threads_per_dim = (int)lround(pow(nthreads, 1. / 3.));
assert(N % threads_per_dim == 0);
FindCoordinates();
origin[0] = coords[0] * N * h;
origin[1] = coords[1] * N * h;
origin[2] = coords[2] * N * h;
u = new double[(N + 2) * (N + 2) * (N + 2)];
u_old = new double[(N + 2) * (N + 2) * (N + 2)];
u_new = new double[(N + 2) * (N + 2) * (N + 2)];
pack = new double *[6 * threads_per_dim * threads_per_dim];
unpack = new double *[6 * threads_per_dim * threads_per_dim];
// ********************************************************************************
// Question 1: Follow this example to parallelize the loop that updates grid
// points
// ********************************************************************************
#pragma omp parallel
{
int tid = omp_get_thread_num();
int ti0, ti1, ti2;
thread_coordinates(tid, threads_per_dim, ti0, ti1, ti2);
int nloc = N / threads_per_dim;
int i0_min = ti0 * nloc;
int i1_min = ti1 * nloc;
int i2_min = ti2 * nloc;
for (int i0 = i0_min; i0 < i0_min + nloc; i0++) {
double x0 = origin[0] + i0 * h + 0.5 * h;
for (int i1 = i1_min; i1 < i1_min + nloc; i1++) {
double x1 = origin[1] + i1 * h + 0.5 * h;
for (int i2 = i2_min; i2 < i2_min + nloc; i2++) {
double x2 = origin[2] + i2 * h + 0.5 * h;
int m = (i0 + 1) * (N + 2) * (N + 2) + (i1 + 1) * (N + 2) + (i2 + 1);
u[m] = Initial_Condition(x0, x1, x2);
u_new[m] = u[m];
u_old[m] = u[m];
// assuming that u_old = u is equivalent to du/dt(t=0) = 0
}
}
}
}
}
WaveEquation::~WaveEquation() {
delete[] u;
delete[] u_old;
delete[] u_new;
}
void WaveEquation::FindCoordinates() {
int p = procs_per_dim;
cart_comm = MPI_COMM_WORLD;
MPI_Comm_rank(cart_comm, &rank);
coords[0] = rank / (p * p);
coords[1] = (rank - coords[0] * (p * p)) / p;
coords[2] = (rank - coords[0] * (p * p) - coords[1] * p) % p;
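// Ranks are laid out in row-major order on a p x p x p process grid, i.e.
// rank = coords[0]*p*p + coords[1]*p + coords[2]; the (+/- 1 + p) % p below
// makes the neighbour lookup periodic in every direction.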
int coor_0_plus = (coords[0] + 1 + p) % p;
int coor_1_plus = (coords[1] + 1 + p) % p;
int coor_2_plus = (coords[2] + 1 + p) % p;
int coor_0_minus = (coords[0] - 1 + p) % p;
int coor_1_minus = (coords[1] - 1 + p) % p;
int coor_2_minus = (coords[2] - 1 + p) % p;
rank_plus[0] = (p * p) * coor_0_plus + coords[1] * p + coords[2];
rank_plus[1] = coords[0] * p * p + p * coor_1_plus + coords[2];
rank_plus[2] = coords[0] * p * p + coords[1] * p + coor_2_plus;
rank_minus[0] = (p * p) * coor_0_minus + coords[1] * p + coords[2];
rank_minus[1] = coords[0] * p * p + p * coor_1_minus + coords[2];
rank_minus[2] = coords[0] * p * p + coords[1] * p + coor_2_minus;
}
void WaveEquation::pack_face(double *pack, int array_of_sizes[3],
int array_of_subsizes[3], int array_of_starts[3]) {
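// Copy the subarray of u described by array_of_starts/array_of_subsizes
// (indices in the (N+2)^3 halo-padded array, i2 running fastest) into the
// contiguous buffer 'pack'; unpack_face below performs the inverse copy.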
int p0 = array_of_subsizes[0];
int p1 = array_of_subsizes[1];
int p2 = array_of_subsizes[2];
int n1 = array_of_sizes[1];
int n2 = array_of_sizes[2];
for (int i0 = array_of_starts[0]; i0 < array_of_starts[0] + p0; i0++)
for (int i1 = array_of_starts[1]; i1 < array_of_starts[1] + p1; i1++)
for (int i2 = array_of_starts[2]; i2 < array_of_starts[2] + p2; i2++) {
int i = (i0 - array_of_starts[0]) * p1 * p2 +
(i1 - array_of_starts[1]) * p2 + (i2 - array_of_starts[2]);
pack[i] = *(u + i0 * n1 * n2 + i1 * n2 + i2);
}
}
void WaveEquation::unpack_face(double *pack, int array_of_sizes[3],
int array_of_subsizes[3],
int array_of_starts[3]) {
int p0 = array_of_subsizes[0];
int p1 = array_of_subsizes[1];
int p2 = array_of_subsizes[2];
int n1 = array_of_sizes[1];
int n2 = array_of_sizes[2];
for (int i0 = array_of_starts[0]; i0 < array_of_starts[0] + p0; i0++)
for (int i1 = array_of_starts[1]; i1 < array_of_starts[1] + p1; i1++)
for (int i2 = array_of_starts[2]; i2 < array_of_starts[2] + p2; i2++) {
int i = (i0 - array_of_starts[0]) * p1 * p2 +
(i1 - array_of_starts[1]) * p2 + (i2 - array_of_starts[2]);
*(u + i0 * n1 * n2 + i1 * n2 + i2) = pack[i];
}
}
void WaveEquation::thread_coordinates(int tid, int threads_per_dim, int &ti0,
int &ti1, int &ti2) {
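// Invert the row-major thread numbering, tid = ti0*tpd^2 + ti1*tpd + ti2,
// mirroring how FindCoordinates lays out the MPI ranks.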
ti0 = tid / (threads_per_dim * threads_per_dim);
ti1 = (tid - ti0 * threads_per_dim * threads_per_dim) / threads_per_dim;
ti2 = tid - ti0 * threads_per_dim * threads_per_dim - ti1 * threads_per_dim;
assert(tid ==
ti0 * threads_per_dim * threads_per_dim + ti1 * threads_per_dim + ti2);
}
double WaveEquation::Initial_Condition(double x0, double x1, double x2) {
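// Gaussian bump centred at (0.5, 0.5, 0.5); r below is the squared distance from the centre.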
double r = (x0 - 0.5) * (x0 - 0.5) + (x1 - 0.5) * (x1 - 0.5) +
(x2 - 0.5) * (x2 - 0.5);
return exp(-r / 0.1);
}
void WaveEquation::Print(int kt = 0) {
#ifdef USE_HDF5
std::string name = "Wave__" + std::to_string(kt);
if (rank == 0) {
FILE *xmf = 0;
xmf = fopen((name + ".xmf").c_str(), "w");
std::stringstream s_head;
s_head << "<?xml version=\"1.0\" ?>\n";
s_head << "<!DOCTYPE Xdmf SYSTEM \"Xdmf.dtd\" []>\n";
s_head << "<Xdmf Version=\"2.0\">\n";
s_head << " <Domain>\n";
s_head << " <Time Value=\"" << std::setprecision(10) << std::setw(10)
<< t << "\"/>\n\n";
fprintf(xmf, "%s", s_head.str().c_str());
std::stringstream s;
s << " <Grid GridType=\"Uniform\">\n";
s << " <Topology TopologyType=\"3DCoRectMesh\" Dimensions=\" "
<< Ntot + 1 << " " << Ntot + 1 << " " << Ntot + 1 << "\"/>\n\n";
s << " <Geometry GeometryType=\"ORIGIN_DXDYDZ\">\n";
s << " <DataItem Dimensions=\"3\" NumberType=\"Double\" "
"Precision=\"8\" Format=\"XML\">\n";
s << " " << 0 << " " << 0 << " " << 0 << "\n";
s << " </DataItem>\n";
s << " <DataItem Dimensions=\"3\" NumberType=\"Double\" "
"Precision=\"8\" Format=\"XML\">\n";
s << " " << std::setprecision(10) << std::setw(15) << h
<< " " << std::setw(15) << h << " " << std::setw(15) << h << "\n";
s << " </DataItem>\n";
s << " </Geometry>\n\n";
s << " <Attribute Name=\"data\" AttributeType=\" "
<< "Scalar"
<< "\" Center=\"Cell\">\n";
s << " <DataItem Dimensions=\" " << Ntot << " " << Ntot << " " << Ntot
<< " " << std::setw(10) << 1 << "\" NumberType=\"Float\" Precision=\" "
<< sizeof(double) << "\" Format=\"HDF\">\n";
std::stringstream name_ss;
name_ss << "data"; // << std::setfill('0') << std::setw(10) << mm ;
std::string tmp = name_ss.str();
s << " " << (name + ".h5").c_str() << ":/" << tmp << "\n";
s << " </DataItem>\n";
s << " </Attribute>\n";
s << " </Grid>\n";
std::string st = s.str();
fprintf(xmf, "%s", st.c_str());
std::stringstream s_tail;
s_tail << " </Domain>\n";
s_tail << "</Xdmf>\n";
fprintf(xmf, "%s", s_tail.str().c_str());
fclose(xmf);
}
hid_t file_id, dataset_id, fspace_id, fapl_id, mspace_id;
H5open();
fapl_id = H5Pcreate(H5P_FILE_ACCESS);
H5Pset_fapl_mpio(fapl_id, cart_comm, MPI_INFO_NULL);
file_id =
H5Fcreate((name + ".h5").c_str(), H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);
H5Pclose(fapl_id);
double *array_all = new double[N * N * N];
hsize_t count[4] = {static_cast<unsigned int>(N),
static_cast<unsigned int>(N),
static_cast<unsigned int>(N), 1};
hsize_t dims[4] = {static_cast<unsigned int>(Ntot),
static_cast<unsigned int>(Ntot),
static_cast<unsigned int>(Ntot), 1};
hsize_t offset[4] = {static_cast<unsigned int>(coords[0] * N),
static_cast<unsigned int>(coords[1] * N),
static_cast<unsigned int>(coords[2] * N), 0};
for (int i0 = 1; i0 <= N; i0++) {
for (int i1 = 1; i1 <= N; i1++) {
for (int i2 = 1; i2 <= N; i2++) {
array_all[(i2 - 1) + (i1 - 1) * N + (i0 - 1) * N * N] =
u[i2 + i1 * (N + 2) + i0 * (N + 2) * (N + 2)];
}
}
}
fapl_id = H5Pcreate(H5P_DATASET_XFER);
H5Pset_dxpl_mpio(fapl_id, H5FD_MPIO_COLLECTIVE);
fspace_id = H5Screate_simple(4, dims, NULL);
dataset_id = H5Dcreate(file_id, "data", H5T_NATIVE_DOUBLE, fspace_id,
H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
fspace_id = H5Dget_space(dataset_id);
H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, offset, NULL, count, NULL);
mspace_id = H5Screate_simple(4, count, NULL);
H5Dwrite(dataset_id, H5T_NATIVE_DOUBLE, mspace_id, fspace_id, fapl_id,
array_all);
H5Sclose(mspace_id);
H5Sclose(fspace_id);
H5Dclose(dataset_id);
H5Pclose(fapl_id);
H5Fclose(file_id);
H5close();
delete[] array_all;
#else
std::string name = "Wave__" + std::to_string(kt) + ".txt";
MPI_File file;
MPI_File_open(cart_comm, name.c_str(), MPI_MODE_RDWR | MPI_MODE_CREATE,
MPI_INFO_NULL, &file);
std::stringstream ss;
for (int i0 = 0; i0 < N; i0++) {
double x0 = origin[0] + i0 * h + 0.5 * h;
for (int i1 = 0; i1 < N; i1++) {
double x1 = origin[1] + i1 * h + 0.5 * h;
for (int i2 = 0; i2 < N; i2++) {
double x2 = origin[2] + i2 * h + 0.5 * h;
ss << x0 << " " << x1 << " " << x2 << " "
<< u[(i2 + 1) + (i1 + 1) * (N + 2) + (i0 + 1) * (N + 2) * (N + 2)]
<< "\n";
}
}
}
std::string asciiData = ss.str();
MPI_Offset my_length = asciiData.size() * sizeof(char);
MPI_Offset offset = 0; // MPI_Exscan leaves the result on rank 0 undefined, so initialize it
MPI_Exscan(&my_length, &offset, 1, MPI_OFFSET, MPI_SUM, cart_comm);
MPI_File_write_at_all(file, offset, asciiData.data(), asciiData.size(),
MPI_CHAR, MPI_STATUS_IGNORE);
MPI_File_close(&file);
#endif
}
void WaveEquation::UpdateGridPoint(int i0, int i1, int i2) {
int m = i0 * (N + 2) * (N + 2) + i1 * (N + 2) + i2;
int ip1 = (i0 + 1) * (N + 2) * (N + 2) + (i1) * (N + 2) + (i2);
int im1 = (i0 - 1) * (N + 2) * (N + 2) + (i1) * (N + 2) + (i2);
int jp1 = (i0) * (N + 2) * (N + 2) + (i1 + 1) * (N + 2) + (i2);
int jm1 = (i0) * (N + 2) * (N + 2) + (i1 - 1) * (N + 2) + (i2);
int kp1 = (i0) * (N + 2) * (N + 2) + (i1) * (N + 2) + (i2 + 1);
int km1 = (i0) * (N + 2) * (N + 2) + (i1) * (N + 2) + (i2 - 1);
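// Second-order centred update of the wave equation:
// u_new = 2*u - u_old + (dt/h)^2 * (sum of the six neighbours - 6*u)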
u_new[m] =
2.0 * u[m] - u_old[m] +
aux * (u[ip1] + u[im1] + u[jp1] + u[jm1] + u[kp1] + u[km1] - 6.0 * u[m]);
}
module purge;
module load new;
module load gcc;
module load open_mpi/3.0.0;
module load hdf5;
#include "wave.h"
int main(int argc, char **argv) {
int provided;
MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
int rank, size;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
if (argc != 4) {
if (rank == 0) {
std::cout << " Incorrect number of inputs. Run as:\n";
std::cout << " mpirun -n x ./main N y t \n";
std::cout << " where \n";
std::cout << " N= number of grid points per direction\n";
std::cout << " y= number of ranks per direction (=(x)^(1/3))\n";
std::cout << " t= final time of simulation \n";
std::cout << " Aborting...\n";
}
int err = 1;
MPI_Abort(MPI_COMM_WORLD, err);
}
int points = std::stoi(argv[1]);
int procs_per_dim = std::stoi(argv[2]);
double t_end = std::stod(argv[3]);
if (size != procs_per_dim * procs_per_dim * procs_per_dim) {
if (rank == 0)
std::cout << " Incorrect number of ranks per direction. Aborting... \n";
int err = 2;
MPI_Abort(MPI_COMM_WORLD, err);
}
if (points % procs_per_dim != 0) {
if (rank == 0)
std::cout << " Grid points per direction must be divisible by the number "
"of ranks per direction. Aborting... \n";
int err = 3;
MPI_Abort(MPI_COMM_WORLD, err);
}
int nthreads = omp_get_max_threads();
// round the cube root to avoid floating-point truncation
int threads_per_dim = (int)lround(pow(nthreads, 1.0 / 3.0));
if (nthreads != threads_per_dim * threads_per_dim * threads_per_dim) {
if (rank == 0)
std::cout
<< " Number of OPENMP threads must be a cubic number. Aborting... \n";
int err = 4;
MPI_Abort(MPI_COMM_WORLD, err);
}
WaveEquation Simulation(points, procs_per_dim);
Simulation.run(t_end);
MPI_Finalize();
return 0;
}
#include "wave.h"
//********************************************************
//YOU ONLY NEED TO MODIFY THIS FILE TO COMPLETE HOMEWORK 3
//********************************************************
void WaveEquation::pack_all() {
// ********************************************************************************
// Question 2: the number of allocated packs is 6 * threads_per_dim ^ 2
// as there are six faces per process and each face corresponds to
// threads_per_dim ^ 2 threads. Each pack is associated (numbered)
// with a unique ID (variable pid), that is a function of the face
// and the thread id (variable tid). Those id's are provided.
// Create a parallel region in this function and modify the inputs
// of pack_face accordingly, so that each thread packs the data in
// its own subdomain. Note that if threads_per_dim = 1, the id's
// of each pack reduce to the numbers 0,1,2,3,4,5.
// ********************************************************************************
int array_of_sizes[3] = {N + 2, N + 2, N + 2};
int nloc = N / threads_per_dim;
int p = threads_per_dim;
int tid = omp_get_thread_num();
int t0, t1, t2;
thread_coordinates(tid, threads_per_dim, t0, t1, t2);
if (t0 == p - 1) {
int pid = 0 * threads_per_dim * threads_per_dim + t1 * threads_per_dim + t2;
}
if (t0 == 0) {
int pid = 1 * threads_per_dim * threads_per_dim + t1 * threads_per_dim + t2;
}
if (t1 == p - 1) {
int pid = 2 * threads_per_dim * threads_per_dim + t0 * threads_per_dim + t2;
}
if (t1 == 0) {
int pid = 3 * threads_per_dim * threads_per_dim + t0 * threads_per_dim + t2;
}
if (t2 == p - 1) {
int pid = 4 * threads_per_dim * threads_per_dim + t0 * threads_per_dim + t1;
}
if (t2 == 0) {
int pid = 5 * threads_per_dim * threads_per_dim + t0 * threads_per_dim + t1;
}
// You will need to remove this part when you fill in the if statements above
{
int array_of_subsizes[3] = {1, N, N};
int array_of_starts[3] = {N, 1, 1};
pack_face(pack[0], array_of_sizes, array_of_subsizes, array_of_starts);
}
{
int array_of_subsizes[3] = {1, N, N};
int array_of_starts[3] = {1, 1, 1};
pack_face(pack[1], array_of_sizes, array_of_subsizes, array_of_starts);
}
{
int array_of_subsizes[3] = {N, 1, N};
int array_of_starts[3] = {1, N, 1};
pack_face(pack[2], array_of_sizes, array_of_subsizes, array_of_starts);
}
{
int array_of_subsizes[3] = {N, 1, N};
int array_of_starts[3] = {1, 1, 1};
pack_face(pack[3], array_of_sizes, array_of_subsizes, array_of_starts);
}
{
int array_of_subsizes[3] = {N, N, 1};
int array_of_starts[3] = {1, 1, N};
pack_face(pack[4], array_of_sizes, array_of_subsizes, array_of_starts);
}
{
int array_of_subsizes[3] = {N, N, 1};
int array_of_starts[3] = {1, 1, 1};
pack_face(pack[5], array_of_sizes, array_of_subsizes, array_of_starts);
}
}
void WaveEquation::unpack_all() {
// ********************************************************************************
// Question 2: the number of allocated unpacks is 6 * threads_per_dim ^ 2
// as there are six faces per process and each face corresponds to
// threads_per_dim ^ 2 threads. Each unpack is associated
// (numbered) with a unique ID (variable pid), that is a function
// of the face and the thread id (variable tid). Those id's are
// provided.
//
// Each unpack should correspond to the appropriate pack (they
// must have the same id).
//
// Create a parallel region in this function and modify the inputs
// of unpack_face accordingly, so that each thread unpacks the
// data in its own subdomain. Note that if threads_per_dim = 1,
// the id's of each unpack reduce to the numbers 0,1,2,3,4,5.
// ********************************************************************************
int array_of_sizes[3] = {N + 2, N + 2, N + 2};
int nloc = N / threads_per_dim;
int p = threads_per_dim;
int tid = omp_get_thread_num();
int t0, t1, t2;
thread_coordinates(tid, threads_per_dim, t0, t1, t2);
if (t0 == 0) {
int pid = 0 * threads_per_dim * threads_per_dim + t1 * threads_per_dim + t2;
}
if (t0 == p - 1) {
int pid = 1 * threads_per_dim * threads_per_dim + t1 * threads_per_dim + t2;
}
if (t1 == 0) {
int pid = 2 * threads_per_dim * threads_per_dim + t0 * threads_per_dim + t2;
}
if (t1 == p - 1) {
int pid = 3 * threads_per_dim * threads_per_dim + t0 * threads_per_dim + t2;
}
if (t2 == 0) {
int pid = 4 * threads_per_dim * threads_per_dim + t0 *