diff --git a/hw3/grade.py b/hw3/grade.py
index 6eab42f01b87e3a141ae23fbc585897ee9ae7cab..264d4f7ff109e1642cbfc462716a0ded548b0c71 100644
--- a/hw3/grade.py
+++ b/hw3/grade.py
@@ -14,8 +14,9 @@ exercise_conf = {
'Name' : 'Homework 3',
'Questions' : {
'Question 1': {'Total Points': 20},
- 'Question 2': {'Total Points': 20},
- 'Question 3': {'Total Points': 20}
+ 'Question 2': {'Total Points': 40},
+ 'Question 3': {'Total Points': 20},
+ 'Question 4': {'Total Points': 20}
}
}
diff --git a/hw3/solution/Makefile b/hw3/solution/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..3e68d0337b606d2082eb8cedfe992d317b7d8ead
--- /dev/null
+++ b/hw3/solution/Makefile
@@ -0,0 +1,25 @@
+CXX=mpicxx #h5pcc
+CXXFLAGS = -Wpedantic -Wall -Wextra -std=c++11 -lstdc++ -g -O3 -fopenmp
+
+all: main
+%.o: %.cpp
+ $(CXX) -c -o $@ $< $(CXXFLAGS)
+clean:
+ rm -rf *.o *.xmf *.h5 *.txt main
+main: main.o auxiliary.o wave.o
+ $(CXX) $(CXXFLAGS) -I. -o main $^
+
+run_mpi:
+ make clean ; make ; mpirun -n 64 ./main 512 4 0.25
+
+run_hybrid_v1:
+ export OMP_NUM_THREADS=8 ; make clean ; make ; mpirun -n 8 --map-by ppr:3:node ./main 512 2 0.25
+
+run_hybrid_v2:
+ export OMP_NUM_THREADS=8 ; make clean ; make ; mpirun -n 8 --map-by ppr:2:node ./main 512 2 0.25
+
+develop:
+ export OMP_NUM_THREADS=8 ; make clean ; make ; mpirun -n 8 ./main 256 2 0.1
+
+
+.PHONY: all clean run_mpi run_hybrid_v1 run_hybrid_v2 develop
diff --git a/hw3/solution/auxiliary.cpp b/hw3/solution/auxiliary.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2bc90da9f5026cdddbd12bc9c44fb8be1630798a
--- /dev/null
+++ b/hw3/solution/auxiliary.cpp
@@ -0,0 +1,317 @@
+#include "wave.h"
+
+WaveEquation::WaveEquation(int a_N, int a_procs_per_dim) {
+ MPI_Comm_size(MPI_COMM_WORLD, &size);
+ procs_per_dim = a_procs_per_dim;
+ Ntot = a_N;
+ h = L / a_N;
+ N = a_N / procs_per_dim;
+
+ // the chosen numerical method is stable if dt <= h/sqrt(3)
+ dt = h / sqrt(3.0);
+ aux = dt * dt / h / h;
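+ // aux = (dt/h)^2 multiplies the discrete Laplacian in UpdateGridPoint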
+
+ nthreads = omp_get_max_threads();
+ // round the cube root to the nearest integer to avoid truncation (e.g. 7.999... -> 8)
+ threads_per_dim = static_cast<int>(pow(nthreads, 1. / 3.) + 0.5);
+
+ assert(N % threads_per_dim == 0);
+
+ FindCoordinates();
+
+ origin[0] = coords[0] * N * h;
+ origin[1] = coords[1] * N * h;
+ origin[2] = coords[2] * N * h;
+ u = new double[(N + 2) * (N + 2) * (N + 2)];
+
+ u_old = new double[(N + 2) * (N + 2) * (N + 2)];
+ u_new = new double[(N + 2) * (N + 2) * (N + 2)];
+
+ pack = new double *[6 * threads_per_dim * threads_per_dim];
+ unpack = new double *[6 * threads_per_dim * threads_per_dim];
+
+#pragma omp parallel
+ {
+ int tid = omp_get_thread_num();
+ int ti0, ti1, ti2;
+ thread_coordinates(tid, threads_per_dim, ti0, ti1, ti2);
+
+ int nloc = N / threads_per_dim;
+
+ int i0_min = ti0 * nloc;
+ int i1_min = ti1 * nloc;
+ int i2_min = ti2 * nloc;
+
+ for (int i0 = i0_min; i0 < i0_min + nloc; i0++) {
+ double x0 = origin[0] + i0 * h + 0.5 * h;
+ for (int i1 = i1_min; i1 < i1_min + nloc; i1++) {
+ double x1 = origin[1] + i1 * h + 0.5 * h;
+ for (int i2 = i2_min; i2 < i2_min + nloc; i2++) {
+ double x2 = origin[2] + i2 * h + 0.5 * h;
+ int m = (i0 + 1) * (N + 2) * (N + 2) + (i1 + 1) * (N + 2) + (i2 + 1);
+ u[m] = Initial_Condition(x0, x1, x2);
+ u_new[m] = u[m];
+ u_old[m] = u[m];
+ // assuming that u_old = u is equivalent to du/dt(t=0) = 0
+ }
+ }
+ }
+ }
+}
+
+WaveEquation::~WaveEquation() {
+ delete[] u;
+ delete[] u_old;
+ delete[] u_new;
+}
+
+void WaveEquation::FindCoordinates() {
+ int p = procs_per_dim;
+
+ cart_comm = MPI_COMM_WORLD;
+ MPI_Comm_rank(cart_comm, &rank);
+
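+ // ranks are laid out so that rank = coords[0]*p*p + coords[1]*p + coords[2];
+ // the divisions/modulo below invert this mapping, and the neighbours wrap
+ // around periodically in every direction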
+ coords[0] = rank / (p * p);
+
+ coords[1] = (rank - coords[0] * (p * p)) / p;
+
+ coords[2] = (rank - coords[0] * (p * p) - coords[1] * p) % p;
+
+ int coor_0_plus = (coords[0] + 1 + p) % p;
+ int coor_1_plus = (coords[1] + 1 + p) % p;
+ int coor_2_plus = (coords[2] + 1 + p) % p;
+
+ int coor_0_minus = (coords[0] - 1 + p) % p;
+ int coor_1_minus = (coords[1] - 1 + p) % p;
+ int coor_2_minus = (coords[2] - 1 + p) % p;
+
+ rank_plus[0] = (p * p) * coor_0_plus + coords[1] * p + coords[2];
+ rank_plus[1] = coords[0] * p * p + p * coor_1_plus + coords[2];
+ rank_plus[2] = coords[0] * p * p + coords[1] * p + coor_2_plus;
+
+ rank_minus[0] = (p * p) * coor_0_minus + coords[1] * p + coords[2];
+ rank_minus[1] = coords[0] * p * p + p * coor_1_minus + coords[2];
+ rank_minus[2] = coords[0] * p * p + coords[1] * p + coor_2_minus;
+}
+
+void WaveEquation::pack_face(double *pack, int array_of_sizes[3],
+ int array_of_subsizes[3], int array_of_starts[3]) {
+ int p0 = array_of_subsizes[0];
+ int p1 = array_of_subsizes[1];
+ int p2 = array_of_subsizes[2];
+
+ int n1 = array_of_sizes[1];
+ int n2 = array_of_sizes[2];
+
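+ // copy the sub-box given by array_of_starts/array_of_subsizes out of the
+ // (N+2)^3 halo array u into the contiguous buffer 'pack'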
+ for (int i0 = array_of_starts[0]; i0 < array_of_starts[0] + p0; i0++)
+ for (int i1 = array_of_starts[1]; i1 < array_of_starts[1] + p1; i1++)
+ for (int i2 = array_of_starts[2]; i2 < array_of_starts[2] + p2; i2++) {
+ int i = (i0 - array_of_starts[0]) * p1 * p2 +
+ (i1 - array_of_starts[1]) * p2 + (i2 - array_of_starts[2]);
+
+ pack[i] = *(u + i0 * n1 * n2 + i1 * n2 + i2);
+ }
+}
+
+void WaveEquation::unpack_face(double *pack, int array_of_sizes[3],
+ int array_of_subsizes[3],
+ int array_of_starts[3]) {
+ int p0 = array_of_subsizes[0];
+ int p1 = array_of_subsizes[1];
+ int p2 = array_of_subsizes[2];
+
+ int n1 = array_of_sizes[1];
+ int n2 = array_of_sizes[2];
+
+ for (int i0 = array_of_starts[0]; i0 < array_of_starts[0] + p0; i0++)
+ for (int i1 = array_of_starts[1]; i1 < array_of_starts[1] + p1; i1++)
+ for (int i2 = array_of_starts[2]; i2 < array_of_starts[2] + p2; i2++) {
+ int i = (i0 - array_of_starts[0]) * p1 * p2 +
+ (i1 - array_of_starts[1]) * p2 + (i2 - array_of_starts[2]);
+
+ *(u + i0 * n1 * n2 + i1 * n2 + i2) = pack[i];
+ }
+}
+
+void WaveEquation::thread_coordinates(int tid, int threads_per_dim, int &ti0,
+ int &ti1, int &ti2) {
+ ti0 = tid / (threads_per_dim * threads_per_dim);
+ ti1 = (tid - ti0 * threads_per_dim * threads_per_dim) / threads_per_dim;
+ ti2 = tid - ti0 * threads_per_dim * threads_per_dim - ti1 * threads_per_dim;
+ assert(tid ==
+ ti0 * threads_per_dim * threads_per_dim + ti1 * threads_per_dim + ti2);
+}
+
+double WaveEquation::Initial_Condition(double x0, double x1, double x2) {
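+ // Gaussian bump centred at (0.5, 0.5, 0.5); r is the squared distance from the centre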
+ double r = (x0 - 0.5) * (x0 - 0.5) + (x1 - 0.5) * (x1 - 0.5) +
+ (x2 - 0.5) * (x2 - 0.5);
+ return exp(-r / 0.1);
+}
+
+// do not change this function
+void WaveEquation::Print(int kt = 0) {
+#ifdef USE_HDF5
+
+ std::string name = "Wave__" + std::to_string(kt);
+ if (rank == 0) {
+ FILE *xmf = 0;
+ xmf = fopen((name + ".xmf").c_str(), "w");
+
+ std::stringstream s_head;
+
+ s_head << "<?xml version=\"1.0\" ?>\n";
+ s_head << "<!DOCTYPE Xdmf SYSTEM \"Xdmf.dtd\" []>\n";
+ s_head << "<Xdmf Version=\"2.0\">\n";
+ s_head << "<Domain>\n";
+
+ s_head << "<Grid Name=\"mesh\" GridType=\"Uniform\">\n\n";
+
+ fprintf(xmf, "%s", (s_head.str()).c_str());
+
+ std::stringstream s;
+ s << " \n";
+ s << " \n\n";
+ s << " \n";
+ s << " \n";
+ s << " " << 0 << " " << 0 << " " << 0 << "\n";
+ s << " \n";
+ s << " \n";
+ s << " " << std::setprecision(10) << std::setw(15) << h
+ << " " << std::setw(15) << h << " " << std::setw(15) << h << "\n";
+ s << " \n";
+ s << " \n\n";
+
+ s << " \n";
+ s << " \n";
+ std::stringstream name_ss;
+ name_ss << "data"; // << std::setfill('0') << std::setw(10) << mm ;
+ std::string tmp = name_ss.str();
+ s << " " << (name + ".h5").c_str() << ":/" << tmp << "\n";
+ s << " \n";
+ s << " \n";
+
+ s << " \n";
+
+ std::string st = s.str();
+
+ fprintf(xmf, "%s", st.c_str());
+
+ std::stringstream s_tail;
+ s_tail << "</Domain>\n";
+ s_tail << "</Xdmf>\n";
+
+ fprintf(xmf, "%s", (s_tail.str()).c_str());
+ fclose(xmf);
+ }
+
+ hid_t file_id, dataset_id, fspace_id, fapl_id, mspace_id;
+
+ H5open();
+ fapl_id = H5Pcreate(H5P_FILE_ACCESS);
+
+ H5Pset_fapl_mpio(fapl_id, cart_comm, MPI_INFO_NULL);
+
+ file_id =
+ H5Fcreate((name + ".h5").c_str(), H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);
+
+ H5Pclose(fapl_id);
+
+ double *array_all = new double[N * N * N];
+
+ hsize_t count[4] = {static_cast<hsize_t>(N),
+ static_cast<hsize_t>(N),
+ static_cast<hsize_t>(N), 1};
+ hsize_t dims[4] = {static_cast<hsize_t>(Ntot),
+ static_cast<hsize_t>(Ntot),
+ static_cast<hsize_t>(Ntot), 1};
+ hsize_t offset[4] = {static_cast<hsize_t>(coords[0] * N),
+ static_cast<hsize_t>(coords[1] * N),
+ static_cast<hsize_t>(coords[2] * N), 0};
+
+ for (int i0 = 1; i0 <= N; i0++) {
+ for (int i1 = 1; i1 <= N; i1++) {
+ for (int i2 = 1; i2 <= N; i2++) {
+ array_all[(i2 - 1) + (i1 - 1) * N + (i0 - 1) * N * N] =
+ u[i2 + i1 * (N + 2) + i0 * (N + 2) * (N + 2)];
+ }
+ }
+ }
+ fapl_id = H5Pcreate(H5P_DATASET_XFER);
+ H5Pset_dxpl_mpio(fapl_id, H5FD_MPIO_COLLECTIVE);
+ fspace_id = H5Screate_simple(4, dims, NULL);
+
+ dataset_id = H5Dcreate(file_id, "data", H5T_NATIVE_DOUBLE, fspace_id,
+ H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
+ fspace_id = H5Dget_space(dataset_id);
+
+ H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, offset, NULL, count, NULL);
+
+ mspace_id = H5Screate_simple(4, count, NULL);
+
+ H5Dwrite(dataset_id, H5T_NATIVE_DOUBLE, mspace_id, fspace_id, fapl_id,
+ array_all);
+
+ H5Sclose(mspace_id);
+ H5Sclose(fspace_id);
+ H5Dclose(dataset_id);
+ H5Pclose(fapl_id);
+ H5Fclose(file_id);
+ H5close();
+
+ delete[] array_all;
+#else
+
+ std::string name = "Wave__" + std::to_string(kt) + ".txt";
+ MPI_File file;
+ MPI_File_open(cart_comm, name.c_str(), MPI_MODE_RDWR | MPI_MODE_CREATE,
+ MPI_INFO_NULL, &file);
+
+ std::stringstream ss;
+
+ for (int i0 = 0; i0 < N; i0++) {
+ double x0 = origin[0] + i0 * h + 0.5 * h;
+ for (int i1 = 0; i1 < N; i1++) {
+ double x1 = origin[1] + i1 * h + 0.5 * h;
+ for (int i2 = 0; i2 < N; i2++) {
+ double x2 = origin[2] + i2 * h + 0.5 * h;
+ ss << x0 << " " << x1 << " " << x2 << " "
+ << u[(i2 + 1) + (i1 + 1) * (N + 2) + (i0 + 1) * (N + 2) * (N + 2)]
+ << "\n";
+ }
+ }
+ }
+
+ std::string asciiData = ss.str();
+
+ MPI_Offset my_length = asciiData.size() * sizeof(char);
+
+ MPI_Offset offset;
+ MPI_Exscan(&my_length, &offset, 1, MPI_OFFSET, MPI_SUM, cart_comm);
+
+ MPI_File_write_at_all(file, offset, asciiData.data(), asciiData.size(),
+ MPI_CHAR, MPI_STATUS_IGNORE);
+ MPI_File_close(&file);
+
+#endif
+}
+
+// do not change this function
+void WaveEquation::UpdateGridPoint(int i0, int i1, int i2) {
+ int m = i0 * (N + 2) * (N + 2) + i1 * (N + 2) + i2;
+ int ip1 = (i0 + 1) * (N + 2) * (N + 2) + (i1) * (N + 2) + (i2);
+ int im1 = (i0 - 1) * (N + 2) * (N + 2) + (i1) * (N + 2) + (i2);
+ int jp1 = (i0) * (N + 2) * (N + 2) + (i1 + 1) * (N + 2) + (i2);
+ int jm1 = (i0) * (N + 2) * (N + 2) + (i1 - 1) * (N + 2) + (i2);
+ int kp1 = (i0) * (N + 2) * (N + 2) + (i1) * (N + 2) + (i2 + 1);
+ int km1 = (i0) * (N + 2) * (N + 2) + (i1) * (N + 2) + (i2 - 1);
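+ // second-order central differences in time and space:
+ // u_new = 2*u - u_old + (dt*dt/h/h) * (sum of the six neighbours - 6*u)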
+ u_new[m] =
+ 2.0 * u[m] - u_old[m] +
+ aux * (u[ip1] + u[im1] + u[jp1] + u[jm1] + u[kp1] + u[km1] - 6.0 * u[m]);
+}
\ No newline at end of file
diff --git a/hw3/solution/main.cpp b/hw3/solution/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d43863f2ca89e50b4b7c0c2084fa3392460a0518
--- /dev/null
+++ b/hw3/solution/main.cpp
@@ -0,0 +1,60 @@
+#include "wave.h"
+
+int main(int argc, char **argv) {
+ int provided;
+ MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
+
+ int rank, size;
+ MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+ MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+ if (argc != 4) {
+ if (rank == 0) {
+ std::cout << " Incorrect number of inputs. Run as:\n";
+ std::cout << " mpirun -n x ./main N y t \n";
+ std::cout << " where \n";
+ std::cout << " N= number of grid points per direction\n";
+ std::cout << " y= number of ranks per direction (=(x)^(1/3))\n";
+ std::cout << " t= final time of simulation \n";
+ std::cout << " Aborting...\n";
+ }
+ int err = 1;
+ MPI_Abort(MPI_COMM_WORLD, err);
+ }
+
+ int points = std::stoi(argv[1]);
+ int procs_per_dim = std::stoi(argv[2]);
+ double t_end = std::stod(argv[3]);
+
+ if (size != procs_per_dim * procs_per_dim * procs_per_dim) {
+ if (rank == 0)
+ std::cout << " Incorrect number of ranks per direction. Aborting... \n";
+ int err = 2;
+ MPI_Abort(MPI_COMM_WORLD, err);
+ }
+
+ if (points % procs_per_dim != 0) {
+ if (rank == 0)
+ std::cout << " Grid points per direction must be divisible by the number "
+ "of ranks per direction. Aborting... \n";
+ int err = 3;
+ MPI_Abort(MPI_COMM_WORLD, err);
+ }
+
+ int nthreads = omp_get_max_threads();
+ int threads_per_dim = pow(nthreads, 1.0 / 3.0) + 0.5;
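+ // adding 0.5 rounds the cube root to the nearest integer before the truncation to int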
+ if (nthreads != threads_per_dim * threads_per_dim * threads_per_dim) {
+ if (rank == 0)
+ std::cout
+ << " Number of OPENMP threads must be a cubic number. Aborting... \n";
+ int err = 4;
+ MPI_Abort(MPI_COMM_WORLD, err);
+ }
+
+ WaveEquation Simulation(points, procs_per_dim);
+
+ Simulation.run(t_end);
+
+ MPI_Finalize();
+ return 0;
+}
diff --git a/hw3/solution/wave.cpp b/hw3/solution/wave.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6a3684ec58dce2be7ef9735b5048f1295d23c48b
--- /dev/null
+++ b/hw3/solution/wave.cpp
@@ -0,0 +1,360 @@
+#include "wave.h"
+
+/*
+Solution notes
+
+Each thread is associated with a set of thread coordinates (t0,t1,t2) and the number of threads
+per direction is p (so the total number of threads is p^3).
+
+A thread lies at the boundary of a process if ti=0 and/or if ti=p-1, for i=0,1,2.
+
+Threads at the boundary are associated with a unique send/pack id and a unique receive/unpack id.
+Those ids are used to properly pack/unpack data and to match message tags when sending/receiving.
+
+
+
+Each MPI process is assigned a cubic part of the domain. Thus, there are 6 faces where data needs
+to be exchanged. Each face has p^2 threads. This means that there are 6p^2 send/pack ids and
+6p^2 unpack/receive ids.
+
+Logically, those ids correspond to a 3-dimensional array, with sizes [6][p][p]. They are stored in
+1-dimensional arrays, so that
+[i][j][k] = i * p^2 + j * p + k
+i=0,1,...,5
+j=0,1,...,p-1
+k=0,1,...,p-1
+
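+For example, with p = 2, the face i = 3 and thread-face coordinates (j,k) = (1,0)
+map to the id 3*4 + 1*2 + 0 = 14.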
+
+In the present solution, send ids correspond to receive ids as follows
+
+send_id[0][j][k] = receive_id[1][j][k]
+send_id[1][j][k] = receive_id[0][j][k]
+send_id[2][j][k] = receive_id[3][j][k]
+send_id[3][j][k] = receive_id[2][j][k]
+send_id[4][j][k] = receive_id[5][j][k]
+send_id[5][j][k] = receive_id[4][j][k]
+for all j,k=0,...,p-1
+
+*/
+
+
+
+void WaveEquation::pack_all() {
+ int array_of_sizes[3] = {N + 2, N + 2, N + 2};
+ int nloc = N / threads_per_dim;
+ int p = threads_per_dim;
+ int tid = omp_get_thread_num();
+ int t0, t1, t2;
+ thread_coordinates(tid, threads_per_dim, t0, t1, t2);
+
+ if (t0 == p - 1) {
+ int pid = 0 * p * p + t1 * p + t2;
+ int array_of_subsizes[3] = {1, nloc, nloc};
+ int array_of_starts[3] = {N, 1 + t1 * nloc, 1 + t2 * nloc};
+ pack_face(pack[pid], array_of_sizes, array_of_subsizes, array_of_starts);
+ }
+ if (t0 == 0) {
+ int pid = 1 * p * p + t1 * p + t2;
+ int array_of_subsizes[3] = {1, nloc, nloc};
+ int array_of_starts[3] = {1, 1 + t1 * nloc, 1 + t2 * nloc};
+ pack_face(pack[pid], array_of_sizes, array_of_subsizes, array_of_starts);
+ }
+
+ if (t1 == p - 1) {
+ int pid = 2 * p * p + t0 * p + t2;
+ int array_of_subsizes[3] = {nloc, 1, nloc};
+ int array_of_starts[3] = {1 + t0 * nloc, N, 1 + t2 * nloc};
+ pack_face(pack[pid], array_of_sizes, array_of_subsizes, array_of_starts);
+ }
+ if (t1 == 0) {
+ int pid = 3 * p * p + t0 * p + t2;
+ int array_of_subsizes[3] = {nloc, 1, nloc};
+ int array_of_starts[3] = {1 + t0 * nloc, 1, 1 + t2 * nloc};
+ pack_face(pack[pid], array_of_sizes, array_of_subsizes, array_of_starts);
+ }
+ if (t2 == p - 1) {
+ int pid = 4 * p * p + t0 * p + t1;
+ int array_of_subsizes[3] = {nloc, nloc, 1};
+ int array_of_starts[3] = {1 + t0 * nloc, 1 + t1 * nloc, N};
+ pack_face(pack[pid], array_of_sizes, array_of_subsizes, array_of_starts);
+ }
+ if (t2 == 0) {
+ int pid = 5 * p * p + t0 * p + t1;
+ int array_of_subsizes[3] = {nloc, nloc, 1};
+ int array_of_starts[3] = {1 + t0 * nloc, 1 + t1 * nloc, 1};
+ pack_face(pack[pid], array_of_sizes, array_of_subsizes, array_of_starts);
+ }
+}
+
+void WaveEquation::unpack_all() {
+ int array_of_sizes[3] = {N + 2, N + 2, N + 2};
+ int nloc = N / threads_per_dim;
+ int p = threads_per_dim;
+ int tid = omp_get_thread_num();
+ int t0, t1, t2;
+ thread_coordinates(tid, threads_per_dim, t0, t1, t2);
+
+ if (t0 == 0) {
+ int pid = 0 * p * p + t1 * p + t2;
+ int array_of_subsizes[3] = {1, nloc, nloc};
+ int array_of_starts[3] = {0, 1 + t1 * nloc, 1 + t2 * nloc};
+ unpack_face(unpack[pid], array_of_sizes, array_of_subsizes,array_of_starts);
+ }
+ if (t0 == p - 1) {
+ int pid = 1 * p * p + t1 * p + t2;
+ int array_of_subsizes[3] = {1, nloc, nloc};
+ int array_of_starts[3] = {N + 1, 1 + t1 * nloc, 1 + t2 * nloc};
+ unpack_face(unpack[pid], array_of_sizes, array_of_subsizes,array_of_starts);
+ }
+ if (t1 == 0) {
+ int pid = 2 * p * p + t0 * p + t2;
+ int array_of_subsizes[3] = {nloc, 1, nloc};
+ int array_of_starts[3] = {1 + t0 * nloc, 0, 1 + t2 * nloc};
+ unpack_face(unpack[pid], array_of_sizes, array_of_subsizes,array_of_starts);
+ }
+ if (t1 == p - 1) {
+ int pid = 3 * p * p + t0 * p + t2;
+ int array_of_subsizes[3] = {nloc, 1, nloc};
+ int array_of_starts[3] = {1 + t0 * nloc, N + 1, 1 + t2 * nloc};
+ unpack_face(unpack[pid], array_of_sizes, array_of_subsizes,array_of_starts);
+ }
+ if (t2 == 0) {
+ int pid = 4 * p * p + t0 * p + t1;
+ int array_of_subsizes[3] = {nloc, nloc, 1};
+ int array_of_starts[3] = {1 + t0 * nloc, 1 + t1 * nloc, 0};
+ unpack_face(unpack[pid], array_of_sizes, array_of_subsizes,array_of_starts);
+ }
+ if (t2 == p - 1) {
+ int pid = 5 * p * p + t0 * p + t1;
+ int array_of_subsizes[3] = {nloc, nloc, 1};
+ int array_of_starts[3] = {1 + t0 * nloc, 1 + t1 * nloc, N + 1};
+ unpack_face(unpack[pid], array_of_sizes, array_of_subsizes,array_of_starts);
+ }
+}
+
+void WaveEquation::run(double t_end) {
+ t = 0;
+
+ //as described above, we need 6p^2 packs and 6p^2 unpacks
+ int n = N * N / threads_per_dim / threads_per_dim;
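+ // n = nloc*nloc, the number of grid points on one thread face (nloc = N / threads_per_dim)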
+ for (int i = 0; i < 6 * threads_per_dim * threads_per_dim; i++) {
+ pack[i] = new double[n];
+ unpack[i] = new double[n];
+ }
+
+ double total_time = 0;
+
+ int count = 0;
+ MPI_Barrier(cart_comm);
+ double t0 = MPI_Wtime();
+
+
+//the only parallel region used starts here
+#pragma omp parallel
+ {
+
+ int nloc = N / threads_per_dim;
+ int p = threads_per_dim;
+ int tid = omp_get_thread_num();
+ int t0, t1, t2;
+ thread_coordinates(tid, threads_per_dim, t0, t1, t2);
+ int pid;
+
+ //each thread finds the loop ranges it needs only once, before starting the time iterations
+ int i0_min = t0 * nloc;
+ int i1_min = t1 * nloc;
+ int i2_min = t2 * nloc;
+ int i0_max = i0_min + nloc;
+ int i1_max = i1_min + nloc;
+ int i2_max = i2_min + nloc;
+
+ do {
+
+ //have only the master thread print screen output
+ #pragma omp master
+ {
+ if (count % 10 == 0) {
+ if (rank == 0)
+ std::cout << count << " t=" << t << "\n";
+ //Print(count);
+ }
+ }
+
+
+ //Calling a function inside the parallel region means that each thread will enter the
+ //function
+ pack_all();
+
+
+ //Here we declare a vector of requests that is local to each thread.
+ //This allows threads to send/receive messages independently of one another.
+ std::vector<MPI_Request> local_request;
+
+
+ if (t0 == 0) {
+ local_request.resize(local_request.size() + 2);
+ pid = 0 * p * p + t1 * p + t2;
+ MPI_Irecv(unpack[pid], nloc * nloc, MPI_DOUBLE, rank_minus[0], pid,
+ cart_comm, &local_request[local_request.size() - 2]);
+
+ pid = 1 * p * p + t1 * p + t2;
+ MPI_Isend(pack[pid], nloc * nloc, MPI_DOUBLE, rank_minus[0], pid,
+ cart_comm, &local_request[local_request.size() - 1]);
+ }
+ if (t0 == p - 1) {
+ local_request.resize(local_request.size() + 2);
+
+ pid = 1 * p * p + t1 * p + t2;
+ MPI_Irecv(unpack[pid], nloc * nloc, MPI_DOUBLE, rank_plus[0], pid,
+ cart_comm, &local_request[local_request.size() - 2]);
+
+ pid = 0 * p * p + t1 * p + t2;
+ MPI_Isend(pack[pid], nloc * nloc, MPI_DOUBLE, rank_plus[0], pid,
+ cart_comm, &local_request[local_request.size() - 1]);
+ }
+ if (t1 == 0) {
+ local_request.resize(local_request.size() + 2);
+
+ pid = 2 * p * p + t0 * p + t2;
+ MPI_Irecv(unpack[pid], nloc * nloc, MPI_DOUBLE, rank_minus[1], pid,
+ cart_comm, &local_request[local_request.size() - 2]);
+
+ pid = 3 * p * p + t0 * p + t2;
+ MPI_Isend(pack[pid], nloc * nloc, MPI_DOUBLE, rank_minus[1], pid,
+ cart_comm, &local_request[local_request.size() - 1]);
+ }
+ if (t1 == p - 1) {
+ local_request.resize(local_request.size() + 2);
+
+ pid = 3 * p * p + t0 * p + t2;
+ MPI_Irecv(unpack[pid], nloc * nloc, MPI_DOUBLE, rank_plus[1], pid,
+ cart_comm, &local_request[local_request.size() - 2]);
+
+ pid = 2 * p * p + t0 * p + t2;
+ MPI_Isend(pack[pid], nloc * nloc, MPI_DOUBLE, rank_plus[1], pid,
+ cart_comm, &local_request[local_request.size() - 1]);
+ }
+ if (t2 == 0) {
+ local_request.resize(local_request.size() + 2);
+
+ pid = 4 * p * p + t0 * p + t1;
+ MPI_Irecv(unpack[pid], nloc * nloc, MPI_DOUBLE, rank_minus[2], pid,
+ cart_comm, &local_request[local_request.size() - 2]);
+
+ pid = 5 * p * p + t0 * p + t1;
+ MPI_Isend(pack[pid], nloc * nloc, MPI_DOUBLE, rank_minus[2], pid,
+ cart_comm, &local_request[local_request.size() - 1]);
+ }
+ if (t2 == p - 1) {
+ local_request.resize(local_request.size() + 2);
+
+ pid = 5 * p * p + t0 * p + t1;
+ MPI_Irecv(unpack[pid], nloc * nloc, MPI_DOUBLE, rank_plus[2], pid,
+ cart_comm, &local_request[local_request.size() - 2]);
+
+ pid = 4 * p * p + t0 * p + t1;
+ MPI_Isend(pack[pid], nloc * nloc, MPI_DOUBLE, rank_plus[2], pid,
+ cart_comm, &local_request[local_request.size() - 1]);
+ }
+
+
+ //Now that all the send/receive requests are posted, we proceed with the computation of
+ //inner points, for all threads
+ for (int i0 = 2 + i0_min; i0 < i0_max; i0++)
+ for (int i1 = 2 + i1_min; i1 < i1_max; i1++)
+ for (int i2 = 2 + i2_min; i2 < i2_max; i2++)
+ UpdateGridPoint(i0, i1, i2);
+
+
+ //Threads at the boundary will wait here for communication to complete, others will proceed
+ //because their local copy of local_requests will be empty
+ MPI_Waitall(local_request.size(), local_request.data(),
+ MPI_STATUSES_IGNORE);
+
+ //Now that communication is complete, boundary threads can unpack data
+ unpack_all();
+
+
+ //The following commented out loop would not overlap communication and computation.
+ //It is replaced by the for loop before the MPI_Wait call and by the for loops that follow.
+ //for (int i0 = 1 + i0_min; i0 < i0_max +1 ; i0++)
+ //for (int i1 = 1 + i1_min; i1 < i1_max +1 ; i1++)
+ //for (int i2 = 1 + i2_min; i2 < i2_max +1 ; i2++)
+ // UpdateGridPoint(i0, i1, i2);
+
+
+
+ //Update all boundary points on the planes i0=i0_min+1 and i0=i0_max
+ for (int i1 = 1 + i1_min; i1 < i1_max + 1; i1++)
+ for (int i2 = 1 + i2_min; i2 < i2_max + 1; i2++)
+ {
+ UpdateGridPoint(1 + i0_min, i1, i2);
+ UpdateGridPoint( i0_max, i1, i2);
+ }
+
+ //Update all boundary points on the planes i1=i1_min+1 and i1=i1_max
+ //Note that the i0 range starts at 2+i0_min and ends at i0_max, because the points 1+i0_min
+ //and i0_max were updated in the previous loop
+ for (int i0 = 2 + i0_min; i0 < i0_max ; i0++)
+ for (int i2 = 1 + i2_min; i2 < i2_max + 1 ; i2++)
+ {
+ UpdateGridPoint(i0, 1 + i1_min, i2);
+ UpdateGridPoint(i0, i1_max, i2);
+ }
+
+
+ //Update all boundary points on the planes i2=i2_min+1 and i2=i2_max
+ //Note that the i0 range starts at 2+i0_min and ends at i0_max, because the points 1+i0_min
+ //and i0_max were updated in the previous loop. The same is now true for the range of i1
+ for (int i0 = 2 + i0_min; i0 < i0_max ; i0++)
+ for (int i1 = 2 + i1_min; i1 < i1_max ; i1++)
+ {
+ UpdateGridPoint(i0, i1, 1 + i2_min);
+ UpdateGridPoint(i0, i1, i2_max);
+ }
+
+
+ //Now we need to swap pointers. Before this is done, we make sure that all threads are done
+ //with grid point updates (remember that the threads share memory, so if one of them swaps
+ //the pointers, the others will update the wrong grid points, if they are still updating).
+#pragma omp barrier
+
+ //Here we make sure that only one thread swaps the pointers and updates the time
+ //pragma omp master is also fine
+#pragma omp single
+ {
+ std::swap(u_new, u);
+ std::swap(u_new, u_old);
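+ // after the two swaps u holds the newly computed solution, u_old holds the
+ // previous u, and u_new is free to be overwritten in the next iteration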
+ t += dt;
+ count++;
+ }
+ } while (t < t_end);
+ }
+
+ MPI_Barrier(cart_comm);
+ total_time = MPI_Wtime() - t0;
+
+ double total_time_max;
+ MPI_Reduce(&total_time, &total_time_max, 1, MPI_DOUBLE, MPI_MAX, 0,
+ cart_comm);
+ if (rank == 0) {
+ std::cout << "Total time = " << total_time_max << "\n";
+ }
+ double s = 0;
+ double Checksum = 0;
+ for (int k = 1; k <= N; k++)
+ for (int j = 1; j <= N; j++)
+ for (int i = 1; i <= N; i++) {
+ int m = k + j * (N + 2) + i * (N + 2) * (N + 2);
+ s += u[m] * u[m];
+ }
+
+ MPI_Reduce(&s, &Checksum, 1, MPI_DOUBLE, MPI_SUM, 0, cart_comm);
+ if (rank == 0)
+ std::cout << "Checksum = " << Checksum << "\n";
+
+ for (int i = 6 * threads_per_dim * threads_per_dim - 1; i >= 0; i--) {
+ delete[] pack[i];
+ delete[] unpack[i];
+ }
+}
\ No newline at end of file
diff --git a/hw3/solution/wave.h b/hw3/solution/wave.h
new file mode 100644
index 0000000000000000000000000000000000000000..8e1e5181df65e0a846f2a0b028b209962752cd5b
--- /dev/null
+++ b/hw3/solution/wave.h
@@ -0,0 +1,71 @@
+#pragma once
+
+#include <mpi.h>
+#include <omp.h>
+#include <cassert>
+#include <cmath>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+//#define USE_HDF5 // uncomment this line if you want hdf5 file output
+ // (suitable for visualization with Paraview)
+
+#ifdef USE_HDF5
+#include <hdf5.h>
+#endif
+
+#define L 1.0 // domain size (unit cube) with Ntot x Ntot x Ntot grid points
+
+struct WaveEquation {
+ int N; // grid points per direction for this rank
+ double h; // grid spacing (dx = dy = dz = h)
+ double dt; // timestep
+ double t; // current time
+ double *u; // solution vector
+ double *u_old;
+ double *u_new;
+
+ int Ntot; // total grid points per direction
+ int procs_per_dim; // ranks per direction
+
+ int nthreads;
+ int threads_per_dim;
+
+ int size; // =total ranks = procs_per_dim*procs_per_dim*procs_per_dim
+ int rank;
+
+ int rank_plus[3]; // neighboring ranks
+ int rank_minus[3]; //
+
+ MPI_Comm cart_comm;
+
+ double origin[3]; // physical space (x0,x1,x2) coordinates of 1st grid point
+ // of this rank
+ int coords[3]; // index space coordinates of this rank
+
+ double aux;
+
+ ~WaveEquation();
+ WaveEquation(int a_N, int a_procs_per_dim);
+ void FindCoordinates();
+ double Initial_Condition(double x0, double x1, double x2);
+ void UpdateGridPoint(int i0, int i1, int i2);
+ void Print(int kt);
+ void pack_face(double *pack, int array_of_sizes[3], int array_of_subsizes[3],
+ int array_of_starts[3]);
+ void unpack_face(double *pack, int array_of_sizes[3],
+ int array_of_subsizes[3], int array_of_starts[3]);
+ void run(double t_end);
+
+ double **pack;
+ double **unpack;
+
+ void pack_all();
+ void unpack_all();
+
+ void thread_coordinates(int tid, int threads_per_dim, int &ti0, int &ti1,
+ int &ti2);
+};
diff --git a/hw4/grade.py b/hw4/grade.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b0a7439db2814a2a07e0a01c83c2df99c5d5591
--- /dev/null
+++ b/hw4/grade.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python
+# File : grade.py
+# Description: Generate grading submission file
+# Copyright 2020 ETH Zurich. All Rights Reserved.
+
+'''
+Example:
+
+python grade.py -q1 7 -c1 "This is why I scored 7 points" -q2 20 -q3 15 -c3 "Second comment here" -c3 "More comments"
+'''
+
+#Modify this, according to a given homework
+exercise_conf = {
+ 'Name' : 'Homework 4',
+ 'Questions' : {
+ 'Question 1': {'Total Points': 25},
+ 'Question 2': {'Total Points': 25},
+ 'Question 3': {'Total Points': 20}
+ }
+ }
+
+
+
+
+'''
+==========================================
+Do not modify anything below this comment
+==========================================
+'''
+
+
+import argparse
+import datetime
+import sys
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+
+ for i in range(1, len(exercise_conf['Questions'])+1, 1):
+ parser.add_argument('-q{:d}'.format(i),'--question{:d}'.format(i),
+ type=int, default=0,
+ help='Scored points for Question {:d}'.format(i))
+ parser.add_argument('-c{:d}'.format(i),'--comment{:d}'.format(i),
+ type=str, action='append', nargs='*',
+ help='Comments for Question {:d} (you can add multiple comments)'.format(i))
+
+ return vars(parser.parse_args())
+
+if __name__ == "__main__":
+ args = parse_args()
+
+ grade = lambda s,m: 2.0 + (6.0-2.0) * float(s)/m
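+ # linear map from s scored points out of m to the ETH grade scale: 0 points -> 2.0, full points -> 6.0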
+ summary = {}
+ score = 0
+ maxpoints = 0
+ header = '{name:s}: {date:s}\n'.format(
+ name = exercise_conf['Name'], date = str(datetime.datetime.now()))
+ width = len(header.rstrip())
+ summary[0] = [header]
+ for i in range(1, len(exercise_conf['Questions'])+1, 1):
+ content = []
+ qscore = args['question{:d}'.format(i)]
+ qmax = exercise_conf['Questions']['Question {:d}'.format(i)]['Total Points']
+ qscore = max(0 , min(qscore, qmax))
+ content.append( 'Question {id:d}: {score:d}/{max:d}\n'.format(
+ id = i, score = qscore, max = qmax)
+ )
+ comments = args['comment{:d}'.format(i)]
+ if comments is not None:
+ for j,comment in enumerate([s for x in comments for s in x]):
+ content.append( ' -Comment {id:d}: {issue:s}\n'.format(
+ id = j+1, issue = comment.strip())
+ )
+ for line in content:
+ width = width if len(line.rstrip()) < width else len(line.rstrip())
+ summary[i] = content
+ score += qscore
+ maxpoints += qmax
+
+ assert maxpoints > 0
+ with open('grade.txt', 'w') as out:
+ out.write(width*'*'+'\n')
+ for lines in summary.values():
+ for line in lines:
+ out.write(line)
+ out.write(width*'*'+'\n')
+ out.write('Grade: {:.2f}'.format(grade(score, maxpoints)))
diff --git a/hw4/skeleton/.gitignore b/hw4/skeleton/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..72fbd9588167063772fae728395b8c0f0be1e188
--- /dev/null
+++ b/hw4/skeleton/.gitignore
@@ -0,0 +1,8 @@
+ssa
+cmaes
+*.o
+*.txt
+*.swp
+*.orig
+
+_korali_result
diff --git a/hw4/skeleton/p1_p2/Makefile b/hw4/skeleton/p1_p2/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..af58a73170ed8aec6d6329100f1db425cd498629
--- /dev/null
+++ b/hw4/skeleton/p1_p2/Makefile
@@ -0,0 +1,33 @@
+CC=g++
+LD=$(CC)
+
+KORALICFLAGS=`python3 -m korali.cxx --cflags`
+KORALILIBS=`python3 -m korali.cxx --libs`
+
+CFLAGS = -Wall -Wfatal-errors -std=c++14 -I ./includes/
+CFLAGS += -O3
+CFLAGS += -fopenmp -D _OPENMP
+
+OBJECTS = main_ssa.o SSA_CPU.o
+
+.DEFAULT: all
+
+all: ssa cmaes
+
+ssa: $(OBJECTS) SSA_CPU.cpp ./includes/SSA_CPU.hpp
+ $(CC) $(CFLAGS) -c SSA_CPU.cpp -o SSA_CPU.o
+ $(LD) $(OBJECTS) -o ssa $(CFLAGS)
+
+cmaes: $(OBJECTS) main_cmaes.cpp ./includes/objective.hpp
+ $(CC) -c main_cmaes.cpp -o main_cmaes.o $(KORALICFLAGS) $(KORALILIBS) $(CFLAGS)
+ $(LD) main_cmaes.o SSA_CPU.o -o cmaes $(KORALICFLAGS) $(KORALILIBS) $(CFLAGS)
+
+%.o: %.cpp
+ $(CC) $(CFLAGS) -c $^ -o $@
+
+clean:
+ rm -f *.o
+ rm -f ssa
+ rm -f cmaes
+
+
diff --git a/hw4/skeleton/p1_p2/README b/hw4/skeleton/p1_p2/README
new file mode 100644
index 0000000000000000000000000000000000000000..4a7bc2844493889f9be664e60443b22416aefb7c
--- /dev/null
+++ b/hw4/skeleton/p1_p2/README
@@ -0,0 +1,52 @@
+HOW TO RUN
+----------
+To compile and run the code on Euler, remove all previously loaded modules (purge) and load the following modules:
+
+module purge
+module load new
+module load gcc/6.3.0
+module load python/3.7.1
+
+or run `source modules.src`
+Make sure that you had these modules loaded when you compiled Korali.
+
+
+Request interactive shell
+-------------------------
+
+bsub -n 4 -R -W 04:00 -Is bash
+
+and compile with `make`, or with `make ssa` / `make cmaes` for partial builds.
+You may also request more or fewer cores.
+
+Run
+-------------------------
+
+export OMP_NUM_THREADS=4
+./ssa
+./cmaes
+
+You may want to set the OMP_NUM_THREADS variable to different values.
+
+If interested, feel free to pass input arguments and study the behaviour of the system for different inputs, e.g.
+./ssa -omega 1 -samples 5000
+Smaller values usually increase the stochastic effects and require more simulations/samples.
+
+
+WHAT TO MODIFY
+--------------
+You only need to change the indicated sections in
+- SSA_CPU.cpp
+- includes/objective.hpp
+
+and
+- main_cmaes.cpp
+if you wish to modify the population size of CMA-ES.
+
+
+CODE OUTPUT
+-----------
+When the ./ssa code runs, it prints the average time for one SSA simulation, the FLOPs, the bytes transferred, and the resulting performance.
+Note that the latter values depend on your implementation of SSA::getTransfers() and SSA::getFlops() and are wrong estimates if these are not (correctly) implemented.
diff --git a/hw4/skeleton/p1_p2/SSA_CPU.cpp b/hw4/skeleton/p1_p2/SSA_CPU.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..69940c15666bbc35c3057e7af19f083a43036781
--- /dev/null
+++ b/hw4/skeleton/p1_p2/SSA_CPU.cpp
@@ -0,0 +1,213 @@
+#include "SSA_CPU.hpp"
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+#include <cassert>
+
+void SSA_CPU::operator()()
+{
+ // number of reactions
+ const int m = 4;
+ // number of species
+ const int n = 2;
+ // initial conditions
+ const int S0[n] = {4*omega,0};
+
+ const int niters = static_cast<int>(tend*1000);
+
+ double * const r48 = new double[2*niters*numSamples];
+ double * const curT = new double[numSamples];
+ double * const x0 = new double[numSamples];
+ double * const x1 = new double[numSamples];
+
+ // NUMA aware initialization (first touch)
+#ifdef _OPENMP
+ #pragma omp parallel for
+#endif
+ for (int s=0; s<numSamples; ++s)
+ {
+ double time, Sa, Sb;
+ if (pass>0 && bNotDone)
+ {
+ time = curT[s];
+ Sa = x0[s];
+ Sb = x1[s];
+ }
+ else
+ {
+ time = 0.0;
+ Sa = S0[0];
+ Sb = S0[1];
+ }
+ // propensities
+ double a[m];
+
+ // time stepping
+ int iter = 0;
+ while (time <= tend && iter < niters)
+ {
+ const int ib = static_cast<int>(time / bin_dt); // 1 FLOP
+ trajS1L[ib+thread_no*nbins] += Sa;
+ trajS2L[ib+thread_no*nbins] += Sb; // 2 FLOP, 2 WRITE
+ ++ntrajL[ib+thread_no*nbins]; // 1 WRITE
+
+ // TODO: Task 1a) (STEP 0)
+ // - compute propensities a[0], a[1], .., a[3] and a0
+ // - use values Sa and Sb, and values stored in k[4], check initialization in SSA_CPU.hpp
+
+ a[0] = 0.0;
+ a[1] = 0.0;
+ a[2] = 0.0;
+ a[3] = 0.0;
+
+ double a0 = 0.0;
+
+ // TODO: Task 1a) (STEP 1)
+ // - sample tau using the inverse sampling method and increment time, use uniform random numbers initialized in r48
+
+ time += 0.1; // 0.1 is a dummy
+
+ // TODO: Task 1a) (STEP 2)
+ // - sample a reaction, use uniform random numbers initialized in r48
+
+ // TODO: Task 1a) (STEP 3)
+ // - increment Sa, Sb
+
+ Sa += 0;
+ Sb += 0;
+
+ iter++;
+ }
+
+ curT[s] = time;
+ x0[s] = Sa;
+ x1[s] = Sb;
+
+ bNotDone = time <= tend && Sa!=0 && Sb!=0;
+ }
+
+ for(int t = 0; t < num_threads; ++t)
+ {
+ for (int i = 0; i < nbins; ++i) {
+ trajS1[i] += trajS1L[i+t*nbins];
+ trajS2[i] += trajS2L[i+t*nbins];
+ ntraj[i] += ntrajL[i+t*nbins]; // bins * (3 FLOP, 3 READ, 3 WRITE) (assuming trajS1L, trajS2L, ntrajL) in cache
+ }
+ }
+
+ delete[] ntrajL;
+ delete[] trajS2L;
+ delete[] trajS1L;
+ stopTiming();
+
+ pass++;
+ }
+
+ delete[] x1;
+ delete[] x0;
+ delete[] curT;
+ delete[] r48;
+
+ normalize_bins();
+}
+
+void SSA_CPU::normalize_bins()
+{
+ assert( trajS2.size() == trajS1.size() );
+ assert( ntraj.size() == trajS1.size() );
+ const int nbins = trajS1.size();
+
+#ifdef _OPENMP
+ #pragma omp parallel for
+#endif
+ for(int i=0; i < nbins; ++i)
+ {
+ trajS1[i]/=ntraj[i];
+ trajS2[i]/=ntraj[i]; // 2 FLOP, 3 READ, 2 WRITE
+ }
+}
+
+double SSA_CPU::getTransfers() const
+{
+ // TODO: (Optional) Task 1c)
+ // - return number of read writes in [BYTES]
+
+ return 1.0;
+}
+
+double SSA_CPU::getFlops() const
+{
+ // TODO: (Optional) Task 1c)
+ // - return number of floating point operations
+ return 1.0;
+}
diff --git a/hw4/skeleton/p1_p2/includes/ArgumentParser.hpp b/hw4/skeleton/p1_p2/includes/ArgumentParser.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f16f84f23951b30d7622f1a3a03b3bcbfbe0e673
--- /dev/null
+++ b/hw4/skeleton/p1_p2/includes/ArgumentParser.hpp
@@ -0,0 +1,140 @@
+#pragma once
+
+#include
+#include