diff --git a/hw3/grade.py b/hw3/grade.py index 6eab42f01b87e3a141ae23fbc585897ee9ae7cab..264d4f7ff109e1642cbfc462716a0ded548b0c71 100644 --- a/hw3/grade.py +++ b/hw3/grade.py @@ -14,8 +14,9 @@ exercise_conf = { 'Name' : 'Homework 3', 'Questions' : { 'Question 1': {'Total Points': 20}, - 'Question 2': {'Total Points': 20}, - 'Question 3': {'Total Points': 20} + 'Question 2': {'Total Points': 40}, + 'Question 3': {'Total Points': 20}, + 'Question 4': {'Total Points': 20} } } diff --git a/hw3/solution/Makefile b/hw3/solution/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..3e68d0337b606d2082eb8cedfe992d317b7d8ead --- /dev/null +++ b/hw3/solution/Makefile @@ -0,0 +1,25 @@ +CXX=mpicxx #h5pcc +CXXFLAGS = -Wpedantic -Wall -Wextra -std=c++11 -lstdc++ -g -O3 -fopenmp + +all: main +%.o: %.cpp + $(CXX) -c -o $@ $< $(CXXFLAGS) +clean: + rm -rf *.o *.xmf *h5 *txt main +main: main.o auxiliary.o wave.o + $(CXX) $(CXXFLAGS)-I. -o main $^ + +run_mpi: + make clean ; make ; mpirun -n 64 ./main 512 4 0.25 + +run_hybrid_v1: + export OMP_NUM_THREADS=8 ; make clean ; make ; mpirun -n 8 --map-by ppr:3:node ./main 512 2 0.25 + +run_hybrid_v2: + export OMP_NUM_THREADS=8 ; make clean ; make ; mpirun -n 8 --map-by ppr:2:node ./main 512 2 0.25 + +develop: + export OMP_NUM_THREADS=8 ; make clean ; make ; mpirun -n 8 ./main 256 2 0.1 + + +.PHONY: all clean run_mpi run_hybrid_v1 run_hybrid_v2 develop diff --git a/hw3/solution/auxiliary.cpp b/hw3/solution/auxiliary.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2bc90da9f5026cdddbd12bc9c44fb8be1630798a --- /dev/null +++ b/hw3/solution/auxiliary.cpp @@ -0,0 +1,317 @@ +#include "wave.h" + +WaveEquation::WaveEquation(int a_N, int a_procs_per_dim) { + MPI_Comm_size(MPI_COMM_WORLD, &size); + procs_per_dim = a_procs_per_dim; + Ntot = a_N; + h = L / a_N; + N = a_N / procs_per_dim; + + // the chosen numerical method is stable if dt <= h/sqrt(3) + dt = h / sqrt(3.0); + aux = dt * dt / h / h; + + nthreads = omp_get_max_threads(); + threads_per_dim = pow(nthreads, 1. / 3.); + + assert(N % threads_per_dim == 0); + + FindCoordinates(); + + origin[0] = coords[0] * N * h; + origin[1] = coords[1] * N * h; + origin[2] = coords[2] * N * h; + u = new double[(N + 2) * (N + 2) * (N + 2)]; + + u_old = new double[(N + 2) * (N + 2) * (N + 2)]; + u_new = new double[(N + 2) * (N + 2) * (N + 2)]; + + pack = new double *[6 * threads_per_dim * threads_per_dim]; + unpack = new double *[6 * threads_per_dim * threads_per_dim]; + +#pragma omp parallel + { + int tid = omp_get_thread_num(); + int ti0, ti1, ti2; + thread_coordinates(tid, threads_per_dim, ti0, ti1, ti2); + + int nloc = N / threads_per_dim; + + int i0_min = ti0 * nloc; + int i1_min = ti1 * nloc; + int i2_min = ti2 * nloc; + + for (int i0 = i0_min; i0 < i0_min + nloc; i0++) { + double x0 = origin[0] + i0 * h + 0.5 * h; + for (int i1 = i1_min; i1 < i1_min + nloc; i1++) { + double x1 = origin[1] + i1 * h + 0.5 * h; + for (int i2 = i2_min; i2 < i2_min + nloc; i2++) { + double x2 = origin[2] + i2 * h + 0.5 * h; + int m = (i0 + 1) * (N + 2) * (N + 2) + (i1 + 1) * (N + 2) + (i2 + 1); + u[m] = Initial_Condition(x0, x1, x2); + u_new[m] = u[m]; + u_old[m] = u[m]; + // assuming that u_old = u is equivalent to du/dt(t=0) = 0 + } + } + } + } +} + +WaveEquation::~WaveEquation() { + delete[] u; + delete[] u_old; + delete[] u_new; +} + +void WaveEquation::FindCoordinates() { + int p = procs_per_dim; + + cart_comm = MPI_COMM_WORLD; + MPI_Comm_rank(cart_comm, &rank); + + coords[0] = rank / (p * p); + + coords[1] = (rank - coords[0] * (p * p)) / p; + + coords[2] = (rank - coords[0] * (p * p) - coords[1] * p) % p; + + int coor_0_plus = (coords[0] + 1 + p) % p; + int coor_1_plus = (coords[1] + 1 + p) % p; + int coor_2_plus = (coords[2] + 1 + p) % p; + + int coor_0_minus = (coords[0] - 1 + p) % p; + int coor_1_minus = (coords[1] - 1 + p) % p; + int coor_2_minus = (coords[2] - 1 + p) % p; + + rank_plus[0] = (p * p) * coor_0_plus + coords[1] * p + coords[2]; + rank_plus[1] = coords[0] * p * p + p * coor_1_plus + coords[2]; + rank_plus[2] = coords[0] * p * p + coords[1] * p + coor_2_plus; + + rank_minus[0] = (p * p) * coor_0_minus + coords[1] * p + coords[2]; + rank_minus[1] = coords[0] * p * p + p * coor_1_minus + coords[2]; + rank_minus[2] = coords[0] * p * p + coords[1] * p + coor_2_minus; +} + +void WaveEquation::pack_face(double *pack, int array_of_sizes[3], + int array_of_subsizes[3], int array_of_starts[3]) { + int p0 = array_of_subsizes[0]; + int p1 = array_of_subsizes[1]; + int p2 = array_of_subsizes[2]; + + int n1 = array_of_sizes[1]; + int n2 = array_of_sizes[2]; + + for (int i0 = array_of_starts[0]; i0 < array_of_starts[0] + p0; i0++) + for (int i1 = array_of_starts[1]; i1 < array_of_starts[1] + p1; i1++) + for (int i2 = array_of_starts[2]; i2 < array_of_starts[2] + p2; i2++) { + int i = (i0 - array_of_starts[0]) * p1 * p2 + + (i1 - array_of_starts[1]) * p2 + (i2 - array_of_starts[2]); + + pack[i] = *(u + i0 * n1 * n2 + i1 * n2 + i2); + } +} + +void WaveEquation::unpack_face(double *pack, int array_of_sizes[3], + int array_of_subsizes[3], + int array_of_starts[3]) { + int p0 = array_of_subsizes[0]; + int p1 = array_of_subsizes[1]; + int p2 = array_of_subsizes[2]; + + int n1 = array_of_sizes[1]; + int n2 = array_of_sizes[2]; + + for (int i0 = array_of_starts[0]; i0 < array_of_starts[0] + p0; i0++) + for (int i1 = array_of_starts[1]; i1 < array_of_starts[1] + p1; i1++) + for (int i2 = array_of_starts[2]; i2 < array_of_starts[2] + p2; i2++) { + int i = (i0 - array_of_starts[0]) * p1 * p2 + + (i1 - array_of_starts[1]) * p2 + (i2 - array_of_starts[2]); + + *(u + i0 * n1 * n2 + i1 * n2 + i2) = pack[i]; + } +} + +void WaveEquation::thread_coordinates(int tid, int threads_per_dim, int &ti0, + int &ti1, int &ti2) { + ti0 = tid / (threads_per_dim * threads_per_dim); + ti1 = (tid - ti0 * threads_per_dim * threads_per_dim) / threads_per_dim; + ti2 = tid - ti0 * threads_per_dim * threads_per_dim - ti1 * threads_per_dim; + assert(tid == + ti0 * threads_per_dim * threads_per_dim + ti1 * threads_per_dim + ti2); +} + +double WaveEquation::Initial_Condition(double x0, double x1, double x2) { + double r = (x0 - 0.5) * (x0 - 0.5) + (x1 - 0.5) * (x1 - 0.5) + + (x2 - 0.5) * (x2 - 0.5); + return exp(-r / 0.1); +} + +// do not change this function +void WaveEquation::Print(int kt = 0) { +#ifdef USE_HDF5 + + std::string name = "Wave__" + std::to_string(kt); + if (rank == 0) { + FILE *xmf = 0; + xmf = fopen((name + ".xmf").c_str(), "w"); + + std::stringstream s_head; + + s_head << "\n"; + s_head << "\n"; + s_head << "\n"; + s_head << " \n"; + + s_head << " \n"; + s_tail << "\n"; + + fprintf(xmf, (s_tail.str()).c_str()); + fclose(xmf); + } + + hid_t file_id, dataset_id, fspace_id, fapl_id, mspace_id; + + H5open(); + fapl_id = H5Pcreate(H5P_FILE_ACCESS); + + H5Pset_fapl_mpio(fapl_id, cart_comm, MPI_INFO_NULL); + + file_id = + H5Fcreate((name + ".h5").c_str(), H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id); + + H5Pclose(fapl_id); + + double *array_all = new double[N * N * N]; + + hsize_t count[4] = {static_cast(N), + static_cast(N), + static_cast(N), 1}; + hsize_t dims[4] = {static_cast(Ntot), + static_cast(Ntot), + static_cast(Ntot), 1}; + hsize_t offset[4] = {static_cast(coords[0] * N), + static_cast(coords[1] * N), + static_cast(coords[2] * N), 0}; + + for (int i0 = 1; i0 <= N; i0++) { + for (int i1 = 1; i1 <= N; i1++) { + for (int i2 = 1; i2 <= N; i2++) { + array_all[(i2 - 1) + (i1 - 1) * N + (i0 - 1) * N * N] = + u[i2 + i1 * (N + 2) + i0 * (N + 2) * (N + 2)]; + } + } + } + fapl_id = H5Pcreate(H5P_DATASET_XFER); + H5Pset_dxpl_mpio(fapl_id, H5FD_MPIO_COLLECTIVE); + fspace_id = H5Screate_simple(4, dims, NULL); + + dataset_id = H5Dcreate(file_id, "data", H5T_NATIVE_DOUBLE, fspace_id, + H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + fspace_id = H5Dget_space(dataset_id); + + H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, offset, NULL, count, NULL); + + mspace_id = H5Screate_simple(4, count, NULL); + + H5Dwrite(dataset_id, H5T_NATIVE_DOUBLE, mspace_id, fspace_id, fapl_id, + array_all); + + H5Sclose(mspace_id); + H5Sclose(fspace_id); + H5Dclose(dataset_id); + H5Pclose(fapl_id); + H5Fclose(file_id); + H5close(); + + delete[] array_all; +#else + + std::string name = "Wave__" + std::to_string(kt) + ".txt"; + MPI_File file; + MPI_File_open(cart_comm, name.c_str(), MPI_MODE_RDWR | MPI_MODE_CREATE, + MPI_INFO_NULL, &file); + + std::stringstream ss; + + for (int i0 = 0; i0 < N; i0++) { + double x0 = origin[0] + i0 * h + 0.5 * h; + for (int i1 = 0; i1 < N; i1++) { + double x1 = origin[1] + i1 * h + 0.5 * h; + for (int i2 = 0; i2 < N; i2++) { + double x2 = origin[2] + i2 * h + 0.5 * h; + ss << x0 << " " << x1 << " " << x2 << " " + << u[(i2 + 1) + (i1 + 1) * (N + 2) + (i0 + 1) * (N + 2) * (N + 2)] + << "\n"; + } + } + } + + std::string asciiData = ss.str(); + + MPI_Offset my_length = asciiData.size() * sizeof(char); + + MPI_Offset offset; + MPI_Exscan(&my_length, &offset, 1, MPI_OFFSET, MPI_SUM, cart_comm); + + MPI_File_write_at_all(file, offset, asciiData.data(), asciiData.size(), + MPI_CHAR, MPI_STATUS_IGNORE); + MPI_File_close(&file); + +#endif +} + +// do not change this function +void WaveEquation::UpdateGridPoint(int i0, int i1, int i2) { + int m = i0 * (N + 2) * (N + 2) + i1 * (N + 2) + i2; + int ip1 = (i0 + 1) * (N + 2) * (N + 2) + (i1) * (N + 2) + (i2); + int im1 = (i0 - 1) * (N + 2) * (N + 2) + (i1) * (N + 2) + (i2); + int jp1 = (i0) * (N + 2) * (N + 2) + (i1 + 1) * (N + 2) + (i2); + int jm1 = (i0) * (N + 2) * (N + 2) + (i1 - 1) * (N + 2) + (i2); + int kp1 = (i0) * (N + 2) * (N + 2) + (i1) * (N + 2) + (i2 + 1); + int km1 = (i0) * (N + 2) * (N + 2) + (i1) * (N + 2) + (i2 - 1); + u_new[m] = + 2.0 * u[m] - u_old[m] + + aux * (u[ip1] + u[im1] + u[jp1] + u[jm1] + u[kp1] + u[km1] - 6.0 * u[m]); +} \ No newline at end of file diff --git a/hw3/solution/main.cpp b/hw3/solution/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d43863f2ca89e50b4b7c0c2084fa3392460a0518 --- /dev/null +++ b/hw3/solution/main.cpp @@ -0,0 +1,60 @@ +#include "wave.h" + +int main(int argc, char **argv) { + int provided; + MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided); + + int rank, size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + if (argc != 4) { + if (rank == 0) { + std::cout << " Incorrect number of inputs. Run as:\n"; + std::cout << " mpirun -n x ./main N y t \n"; + std::cout << " where \n"; + std::cout << " N= number of grid points per direction\n"; + std::cout << " y= number of ranks per direction (=(x)^(1/3))\n"; + std::cout << " t= final time of simulation \n"; + std::cout << " Aborting...\n"; + } + int err = 1; + MPI_Abort(MPI_COMM_WORLD, err); + } + + int points = std::stoi(argv[1]); + int procs_per_dim = std::stoi(argv[2]); + double t_end = std::stof(argv[3]); + + if (size != procs_per_dim * procs_per_dim * procs_per_dim) { + if (rank == 0) + std::cout << " Incorrect number of ranks per direction. Aborting... \n"; + int err = 2; + MPI_Abort(MPI_COMM_WORLD, err); + } + + if (points % procs_per_dim != 0) { + if (rank == 0) + std::cout << " Grid points per direction must be divisible by the number " + "of ranks per direction. Aborting... \n"; + int err = 3; + MPI_Abort(MPI_COMM_WORLD, err); + } + + int nthreads = omp_get_max_threads(); + int threads_per_dim = pow(nthreads, 1.0 / 3.0) + 0.5; + if (nthreads != threads_per_dim * threads_per_dim * threads_per_dim) { + if (rank == 0) + std::cout + << " Number of OPENMP threads must be a cubic number. Aborting... \n"; + int err = 4; + MPI_Abort(MPI_COMM_WORLD, err); + } + + WaveEquation Simulation(points, procs_per_dim); + + Simulation.run(t_end); + + MPI_Finalize(); + return 0; +} diff --git a/hw3/solution/wave.cpp b/hw3/solution/wave.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6a3684ec58dce2be7ef9735b5048f1295d23c48b --- /dev/null +++ b/hw3/solution/wave.cpp @@ -0,0 +1,360 @@ +#include "wave.h" + +/* +Solution notes + +Each thread is associated with a set of thread coordinates (t0,t1,t2) and the number of threads +per direction is p (so the total number of threads is p^3). + +A thread lies at the boundary of a process if ti=0 and/or if ti=p-1, for i=0,1,2. + +Threads at the boundary are associated with a unique send/pack id and a unique receive/unpack id. +Those ids are used to properly pack/unpack data and to match message tags when sending/receiving. + + + +Each MPI process is assigned a cubic part of the domain. Thus, there are 6 faces where data needs +to be exchanged. Each face has p^2 threads. This means that there are 6p^2 send/pack ids and +6p^2 unpack/receive ids. + +Logically, those ids correspond to a 3-dimensional array, with sizes [6][p][p]. They are stored in +1-dimensional arrays, so that +[i][j][k] = i * p^2 + j * p + k +i=0,1,...,5 +j=0,1,...,p-1 +k=0,1,...,p-1 + + +In the present solution, send ids correspond to receive ids as follows + +send_id[0][j][k] = receive_id[1][j][k] +send_id[1][j][k] = receive_id[0][j][k] +send_id[2][j][k] = receive_id[3][j][k] +send_id[3][j][k] = receive_id[2][j][k] +send_id[4][j][k] = receive_id[5][j][k] +send_id[5][j][k] = receive_id[4][j][k] +for all j,k=0,...,p-1 + +*/ + + + +void WaveEquation::pack_all() { + int array_of_sizes[3] = {N + 2, N + 2, N + 2}; + int nloc = N / threads_per_dim; + int p = threads_per_dim; + int tid = omp_get_thread_num(); + int t0, t1, t2; + thread_coordinates(tid, threads_per_dim, t0, t1, t2); + + if (t0 == p - 1) { + int pid = 0 * p * p + t1 * p + t2; + int array_of_subsizes[3] = {1, nloc, nloc}; + int array_of_starts[3] = {N, 1 + t1 * nloc, 1 + t2 * nloc}; + pack_face(pack[pid], array_of_sizes, array_of_subsizes, array_of_starts); + } + if (t0 == 0) { + int pid = 1 * p * p + t1 * p + t2; + int array_of_subsizes[3] = {1, nloc, nloc}; + int array_of_starts[3] = {1, 1 + t1 * nloc, 1 + t2 * nloc}; + pack_face(pack[pid], array_of_sizes, array_of_subsizes, array_of_starts); + } + + if (t1 == p - 1) { + int pid = 2 * p * p + t0 * p + t2; + int array_of_subsizes[3] = {nloc, 1, nloc}; + int array_of_starts[3] = {1 + t0 * nloc, N, 1 + t2 * nloc}; + pack_face(pack[pid], array_of_sizes, array_of_subsizes, array_of_starts); + } + if (t1 == 0) { + int pid = 3 * p * p + t0 * p + t2; + int array_of_subsizes[3] = {nloc, 1, nloc}; + int array_of_starts[3] = {1 + t0 * nloc, 1, 1 + t2 * nloc}; + pack_face(pack[pid], array_of_sizes, array_of_subsizes, array_of_starts); + } + if (t2 == p - 1) { + int pid = 4 * p * p + t0 * p + t1; + int array_of_subsizes[3] = {nloc, nloc, 1}; + int array_of_starts[3] = {1 + t0 * nloc, 1 + t1 * nloc, N}; + pack_face(pack[pid], array_of_sizes, array_of_subsizes, array_of_starts); + } + if (t2 == 0) { + int pid = 5 * p * p + t0 * p + t1; + int array_of_subsizes[3] = {nloc, nloc, 1}; + int array_of_starts[3] = {1 + t0 * nloc, 1 + t1 * nloc, 1}; + pack_face(pack[pid], array_of_sizes, array_of_subsizes, array_of_starts); + } +} + +void WaveEquation::unpack_all() { + int array_of_sizes[3] = {N + 2, N + 2, N + 2}; + int nloc = N / threads_per_dim; + int p = threads_per_dim; + int tid = omp_get_thread_num(); + int t0, t1, t2; + thread_coordinates(tid, threads_per_dim, t0, t1, t2); + + if (t0 == 0) { + int pid = 0 * p * p + t1 * p + t2; + int array_of_subsizes[3] = {1, nloc, nloc}; + int array_of_starts[3] = {0, 1 + t1 * nloc, 1 + t2 * nloc}; + unpack_face(unpack[pid], array_of_sizes, array_of_subsizes,array_of_starts); + } + if (t0 == p - 1) { + int pid = 1 * p * p + t1 * p + t2; + int array_of_subsizes[3] = {1, nloc, nloc}; + int array_of_starts[3] = {N + 1, 1 + t1 * nloc, 1 + t2 * nloc}; + unpack_face(unpack[pid], array_of_sizes, array_of_subsizes,array_of_starts); + } + if (t1 == 0) { + int pid = 2 * p * p + t0 * p + t2; + int array_of_subsizes[3] = {nloc, 1, nloc}; + int array_of_starts[3] = {1 + t0 * nloc, 0, 1 + t2 * nloc}; + unpack_face(unpack[pid], array_of_sizes, array_of_subsizes,array_of_starts); + } + if (t1 == p - 1) { + int pid = 3 * p * p + t0 * p + t2; + int array_of_subsizes[3] = {nloc, 1, nloc}; + int array_of_starts[3] = {1 + t0 * nloc, N + 1, 1 + t2 * nloc}; + unpack_face(unpack[pid], array_of_sizes, array_of_subsizes,array_of_starts); + } + if (t2 == 0) { + int pid = 4 * p * p + t0 * p + t1; + int array_of_subsizes[3] = {nloc, nloc, 1}; + int array_of_starts[3] = {1 + t0 * nloc, 1 + t1 * nloc, 0}; + unpack_face(unpack[pid], array_of_sizes, array_of_subsizes,array_of_starts); + } + if (t2 == p - 1) { + int pid = 5 * p * p + t0 * p + t1; + int array_of_subsizes[3] = {nloc, nloc, 1}; + int array_of_starts[3] = {1 + t0 * nloc, 1 + t1 * nloc, N + 1}; + unpack_face(unpack[pid], array_of_sizes, array_of_subsizes,array_of_starts); + } +} + +void WaveEquation::run(double t_end) { + t = 0; + + //as described above, we need 6p^2 packs and 6p^2 unpacks + int n = N * N / threads_per_dim / threads_per_dim; + for (int i = 0; i < 6 * threads_per_dim * threads_per_dim; i++) { + pack[i] = new double[n]; + unpack[i] = new double[n]; + } + + double total_time = 0; + + int count = 0; + MPI_Barrier(cart_comm); + double t0 = MPI_Wtime(); + + +//the only parallel region used startas here +#pragma omp parallel + { + + int nloc = N / threads_per_dim; + int p = threads_per_dim; + int tid = omp_get_thread_num(); + int t0, t1, t2; + thread_coordinates(tid, threads_per_dim, t0, t1, t2); + int pid; + + //each thread finds the loop ranges it need only once, before starting the time iterations + int i0_min = t0 * nloc; + int i1_min = t1 * nloc; + int i2_min = t2 * nloc; + int i0_max = i0_min + nloc; + int i1_max = i1_min + nloc; + int i2_max = i2_min + nloc; + + do { + + //have only the master thread print screen output + #pragma omp master + { + if (count % 10 == 0) { + if (rank == 0) + std::cout << count << " t=" << t << "\n"; + //Print(count); + } + } + + + //Calling a function inside the parallel region means that each thread will enter the + //function + pack_all(); + + + //Here we declare an array of requests that is local to each thread. + //This allows threads to send/receive messages independant of one another. + std::vector local_request; + + + if (t0 == 0) { + local_request.resize(local_request.size() + 2); + pid = 0 * p * p + t1 * p + t2; + MPI_Irecv(unpack[pid], nloc * nloc, MPI_DOUBLE, rank_minus[0], pid, + cart_comm, &local_request[local_request.size() - 2]); + + pid = 1 * p * p + t1 * p + t2; + MPI_Isend(pack[pid], nloc * nloc, MPI_DOUBLE, rank_minus[0], pid, + cart_comm, &local_request[local_request.size() - 1]); + } + if (t0 == p - 1) { + local_request.resize(local_request.size() + 2); + + pid = 1 * p * p + t1 * p + t2; + MPI_Irecv(unpack[pid], nloc * nloc, MPI_DOUBLE, rank_plus[0], pid, + cart_comm, &local_request[local_request.size() - 2]); + + pid = 0 * p * p + t1 * p + t2; + MPI_Isend(pack[pid], nloc * nloc, MPI_DOUBLE, rank_plus[0], pid, + cart_comm, &local_request[local_request.size() - 1]); + } + if (t1 == 0) { + local_request.resize(local_request.size() + 2); + + pid = 2 * p * p + t0 * p + t2; + MPI_Irecv(unpack[pid], nloc * nloc, MPI_DOUBLE, rank_minus[1], pid, + cart_comm, &local_request[local_request.size() - 2]); + + pid = 3 * p * p + t0 * p + t2; + MPI_Isend(pack[pid], nloc * nloc, MPI_DOUBLE, rank_minus[1], pid, + cart_comm, &local_request[local_request.size() - 1]); + } + if (t1 == p - 1) { + local_request.resize(local_request.size() + 2); + + pid = 3 * p * p + t0 * p + t2; + MPI_Irecv(unpack[pid], nloc * nloc, MPI_DOUBLE, rank_plus[1], pid, + cart_comm, &local_request[local_request.size() - 2]); + + pid = 2 * p * p + t0 * p + t2; + MPI_Isend(pack[pid], nloc * nloc, MPI_DOUBLE, rank_plus[1], pid, + cart_comm, &local_request[local_request.size() - 1]); + } + if (t2 == 0) { + local_request.resize(local_request.size() + 2); + + pid = 4 * p * p + t0 * p + t1; + MPI_Irecv(unpack[pid], nloc * nloc, MPI_DOUBLE, rank_minus[2], pid, + cart_comm, &local_request[local_request.size() - 2]); + + pid = 5 * p * p + t0 * p + t1; + MPI_Isend(pack[pid], nloc * nloc, MPI_DOUBLE, rank_minus[2], pid, + cart_comm, &local_request[local_request.size() - 1]); + } + if (t2 == p - 1) { + local_request.resize(local_request.size() + 2); + + pid = 5 * p * p + t0 * p + t1; + MPI_Irecv(unpack[pid], nloc * nloc, MPI_DOUBLE, rank_plus[2], pid, + cart_comm, &local_request[local_request.size() - 2]); + + pid = 4 * p * p + t0 * p + t1; + MPI_Isend(pack[pid], nloc * nloc, MPI_DOUBLE, rank_plus[2], pid, + cart_comm, &local_request[local_request.size() - 1]); + } + + + //Now that all the send/receive requests are posted, we proceed with the computation of + //inner points, for all threads + for (int i0 = 2 + i0_min; i0 < i0_max; i0++) + for (int i1 = 2 + i1_min; i1 < i1_max; i1++) + for (int i2 = 2 + i2_min; i2 < i2_max; i2++) + UpdateGridPoint(i0, i1, i2); + + + //Threads at the boundary will wait here for communication to complete, others will proceed + //because their local copy of local_requests will be empty + MPI_Waitall(local_request.size(), local_request.data(), + MPI_STATUSES_IGNORE); + + //Now that communication is complete, boundary threads can unpack data + unpack_all(); + + + //The following commented out loop would not overlap communication and computation. + //It is replaced by the for loop before the MPI_Wait call and by the for loops that follow. + //for (int i0 = 1 + i0_min; i0 < i0_max +1 ; i0++) + //for (int i1 = 1 + i1_min; i1 < i1_max +1 ; i1++) + //for (int i2 = 1 + i2_min; i2 < i2_max +1 ; i2++) + // UpdateGridPoint(i0, i1, i2); + + + + //Update all boundary points on the planes i0=i0_min+1 and i0=i0_max + for (int i1 = 1 + i1_min; i1 < i1_max + 1; i1++) + for (int i2 = 1 + i2_min; i2 < i2_max + 1; i2++) + { + UpdateGridPoint(1 + i0_min, i1, i2); + UpdateGridPoint( i0_max, i1, i2); + } + + //Update all boundary points on the planes i1=i1_min+1 and i1=i1_max + //Note that the i0 range starts at 2+i0_min and ends at i0_max, because the points 1+i0_min + //and i0_max were updated in the previous loop + for (int i0 = 2 + i0_min; i0 < i0_max ; i0++) + for (int i2 = 1 + i2_min; i2 < i2_max + 1 ; i2++) + { + UpdateGridPoint(i0, 1 + i1_min, i2); + UpdateGridPoint(i0, i1_max, i2); + } + + + //Update all boundary points on the planes i2=i2_min+1 and i2=i2_max + //Note that the i0 range starts at 2+i0_min and ends at i0_max, because the points 1+i0_min + //and i0_max were updated in the previous loop. The same is now true for the range of i1 + for (int i0 = 2 + i0_min; i0 < i0_max ; i0++) + for (int i1 = 2 + i1_min; i1 < i1_max ; i1++) + { + UpdateGridPoint(i0, i1, 1 + i2_min); + UpdateGridPoint(i0, i1, i2_max); + } + + + //Now we need to swap pointers. Before this is done, we make sure that all threads are done + //with grid point updates (remember that the threads share memory, so if one of them swaps + //the pointers, the others will update the wrong grid points, if they are still updating). +#pragma omp barrier + + //Here we make sure that only one thread swaps the pointers and updates the time + //pragma omp master is also fine +#pragma omp single + { + std::swap(u_new, u); + std::swap(u_new, u_old); + t += dt; + count++; + } + } while (t < t_end); + } + + MPI_Barrier(cart_comm); + total_time = MPI_Wtime() - t0; + + double total_time_max; + MPI_Reduce(&total_time, &total_time_max, 1, MPI_DOUBLE, MPI_MAX, 0, + cart_comm); + if (rank == 0) { + std::cout << "Total time = " << total_time_max << "\n"; + } + double s = 0; + double Checksum = 0; + for (int k = 1; k <= N; k++) + for (int j = 1; j <= N; j++) + for (int i = 1; i <= N; i++) { + int m = k + j * (N + 2) + i * (N + 2) * (N + 2); + s += u[m] * u[m]; + } + + MPI_Reduce(&s, &Checksum, 1, MPI_DOUBLE, MPI_SUM, 0, cart_comm); + if (rank == 0) + std::cout << "Checksum = " << Checksum << "\n"; + + for (int i = 6 * threads_per_dim * threads_per_dim - 1; i >= 0; i--) { + delete[] pack[i]; + delete[] unpack[i]; + } +} \ No newline at end of file diff --git a/hw3/solution/wave.h b/hw3/solution/wave.h new file mode 100644 index 0000000000000000000000000000000000000000..8e1e5181df65e0a846f2a0b028b209962752cd5b --- /dev/null +++ b/hw3/solution/wave.h @@ -0,0 +1,71 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +//#define USE_HDF5 // comment this line if you do not want hdf5 file output + // (suitable for visualization with Paraview) + +#ifdef USE_HDF5 +#include +#endif + +#define L 1.0 // domain size (unit cube) with Ntot x Ntot x Ntot grid points + +struct WaveEquation { + int N; // grid points per direction for this rank + double h; // grid spacing (dx = dy = dz = h) + double dt; // timestep + double t; // current time + double *u; // solution vector + double *u_old; + double *u_new; + + int Ntot; // total grid points per direction + int procs_per_dim; // ranks per direction + + int nthreads; + int threads_per_dim; + + int size; // =total ranks = procs_per_dim*procs_per_dim*procs_per_dim + int rank; + + int rank_plus[3]; // neighboring ranks + int rank_minus[3]; // + + MPI_Comm cart_comm; + + double origin[3]; // physical space (x0,x1,x2) coordinates of 1st grid point + // of this rank + int coords[3]; // index space coordinates of this rank + + double aux; + + ~WaveEquation(); + WaveEquation(int a_N, int a_procs_per_dim); + void FindCoordinates(); + double Initial_Condition(double x0, double x1, double x2); + void UpdateGridPoint(int i0, int i1, int i2); + void Print(int kt); + void pack_face(double *pack, int array_of_sizes[3], int array_of_subsizes[3], + int array_of_starts[3]); + void unpack_face(double *pack, int array_of_sizes[3], + int array_of_subsizes[3], int array_of_starts[3]); + void run(double t_end); + + double **pack; + double **unpack; + + void pack_all(); + void unpack_all(); + + void thread_coordinates(int tid, int threads_per_dim, int &ti0, int &ti1, + int &ti2); +}; diff --git a/hw4/grade.py b/hw4/grade.py new file mode 100644 index 0000000000000000000000000000000000000000..5b0a7439db2814a2a07e0a01c83c2df99c5d5591 --- /dev/null +++ b/hw4/grade.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python +# File : grade.py +# Description: Generate grading submission file +# Copyright 2020 ETH Zurich. All Rights Reserved. + +''' +Example: + +python grade.py -q1 7 -c1 "This is why I scored 7 points" -q2 20 -q3 15 -c3 "Second comment here" -c3 "More comments" +''' + +#Modify this, according to a given homework +exercise_conf = { + 'Name' : 'Homework 4', + 'Questions' : { + 'Question 1': {'Total Points': 25}, + 'Question 2': {'Total Points': 25}, + 'Question 3': {'Total Points': 20} + } + } + + + + +''' +========================================== +Do not modify anything below this comment +========================================== +''' + + +import argparse +import datetime +import sys + +def parse_args(): + parser = argparse.ArgumentParser() + + for i in range(1, len(exercise_conf['Questions'])+1, 1): + parser.add_argument('-q{:d}'.format(i),'--question{:d}'.format(i), + type=int, default=0, + help='Scored points for Question {:d}'.format(i)) + parser.add_argument('-c{:d}'.format(i),'--comment{:d}'.format(i), + type=str, action='append', nargs='*', + help='Comments for Question {:d} (you can add multiple comments)'.format(i)) + + return vars(parser.parse_args()) + +if __name__ == "__main__": + args = parse_args() + + grade = lambda s,m: 2.0 + (6.0-2.0) * float(s)/m + summary = {} + score = 0 + maxpoints = 0 + header = '{name:s}: {date:s}\n'.format( + name = exercise_conf['Name'], date = str(datetime.datetime.now())) + width = len(header.rstrip()) + summary[0] = [header] + for i in range(1, len(exercise_conf['Questions'])+1, 1): + content = [] + qscore = args['question{:d}'.format(i)] + qmax = exercise_conf['Questions']['Question {:d}'.format(i)]['Total Points'] + qscore = max(0 , min(qscore, qmax)) + content.append( 'Question {id:d}: {score:d}/{max:d}\n'.format( + id = i, score = qscore, max = qmax) + ) + comments = args['comment{:d}'.format(i)] + if comments is not None: + for j,comment in enumerate([s for x in comments for s in x]): + content.append( ' -Comment {id:d}: {issue:s}\n'.format( + id = j+1, issue = comment.strip()) + ) + for line in content: + width = width if len(line.rstrip()) 0 + with open('grade.txt', 'w') as out: + out.write(width*'*'+'\n') + for lines in summary.values(): + for line in lines: + out.write(line) + out.write(width*'*'+'\n') + out.write('Grade: {:.2f}'.format(grade(score, maxpoints))) diff --git a/hw4/skeleton/.gitignore b/hw4/skeleton/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..72fbd9588167063772fae728395b8c0f0be1e188 --- /dev/null +++ b/hw4/skeleton/.gitignore @@ -0,0 +1,8 @@ +ssa +cmaes +*.o +*.txt +*.swp +*.orig + +_korali_result diff --git a/hw4/skeleton/p1_p2/Makefile b/hw4/skeleton/p1_p2/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..af58a73170ed8aec6d6329100f1db425cd498629 --- /dev/null +++ b/hw4/skeleton/p1_p2/Makefile @@ -0,0 +1,33 @@ +CC=g++ +LD=$(CC) + +KORALICFLAGS=`python3 -m korali.cxx --cflags` +KORALILIBS=`python3 -m korali.cxx --libs` + +CFLAGS = -Wall -Wfatal-errors -std=c++14 -I ./includes/ +CFLAGS += -O3 +CFLAGS += -fopenmp -D _OPENMP + +OBJECTS = main_ssa.o SSA_CPU.o + +.DEFAULT: all + +all: ssa cmaes + +ssa: $(OBJECTS) SSA_CPU.cpp ./includes/SSA_CPU.hpp + $(CC) $(CFLAGS) -c SSA_CPU.cpp -o SSA_CPU.o + $(LD) $(OBJECTS) -o ssa $(CFLAGS) + +cmaes: $(OBJECTS) main_cmaes.cpp ./includes/objective.hpp + $(CC) -c main_cmaes.cpp -o main_cmaes.o $(KORALICFLAGS) $(KORALILIBS) $(CFLAGS) + $(LD) main_cmaes.o SSA_CPU.o -o cmaes $(KORALICFLAGS) $(KORALILIBS) $(CFLAGS) + +%.o: %.cpp + $(CC) $(CFLAGS) -c $^ -o $@ + +clean: + rm -f *.o + rm -f ssa + rm -f cmaes + + diff --git a/hw4/skeleton/p1_p2/README b/hw4/skeleton/p1_p2/README new file mode 100644 index 0000000000000000000000000000000000000000..4a7bc2844493889f9be664e60443b22416aefb7c --- /dev/null +++ b/hw4/skeleton/p1_p2/README @@ -0,0 +1,52 @@ +HOW TO RUN +---------- +To compile and run the code on Euler remove all previously loaded nodules (purge) and load the following modules: + +module purge +module load new +module load gcc/6.3.0 +module load python/3.7.1 + +or run `source modules.src` +Make sure that you loaded this files during compilation of Korali. + + +Request interactive shell +------------------------- + +bsub -n 4 -R -W 04:00 -Is bash + +and compile with `make`, respectively `make ssa` or `make cmaes` for partial builds. +You may also run with more or less nodes. + +Run +------------------------- + +export OMP_NUM_THREADS=4 +./ssa +./cmaes + +You may want to set the OMP_NUM_THREADS variables to different values. + +If interested feel free to use input arguments and study the behaviour of the system for different inputs, e.g. +./ssa -omega 1 -samples 5000 +Lower quantities usually increase the stochastic effects and require more simulations/samples. + + +WHAT TO MODIFY +-------------- +You only need to change the indicated sections in +- SSA_CPU.cpp +- includes/objective.hpp + +and +- main_cmaes.cpp +if you wish to modify the population size of CMA-ES. + + +CODE OUTPUT +----------- +When the ./ssa code runs, it prints + +the averaged time for 1 SSA simulation, the FLOPs, Byte Transfers, and the performance. +Note that the latter values depend on your implementation of SSA::getTransfers() and SSA::getFlops() and are wrong estimates if not (correctly) implemented. diff --git a/hw4/skeleton/p1_p2/SSA_CPU.cpp b/hw4/skeleton/p1_p2/SSA_CPU.cpp new file mode 100644 index 0000000000000000000000000000000000000000..69940c15666bbc35c3057e7af19f083a43036781 --- /dev/null +++ b/hw4/skeleton/p1_p2/SSA_CPU.cpp @@ -0,0 +1,213 @@ +#include "SSA_CPU.hpp" +#ifdef _OPENMP +#include +#endif +#include + +void SSA_CPU::operator()() +{ + // number of reactions + const int m = 4; + // number of species + const int n = 2; + // initial conditions + const int S0[n] = {4*omega,0}; + + const int niters = static_cast(tend*1000); + + double * const r48 = new double[2*niters*numSamples]; + double * const curT = new double[numSamples]; + double * const x0 = new double[numSamples]; + double * const x1 = new double[numSamples]; + + // NUMA aware initialization (first touch) +#ifdef _OPENMP + #pragma omp parallel for +#endif + for (int s=0; s0 && bNotDone) + { + time = curT[s]; + Sa = x0[s]; + Sb = x1[s]; + } + else + { + time = 0.0; + Sa = S0[0]; + Sb = S0[1]; + } + // propensities + double a[m]; + + // time stepping + int iter = 0; + while (time <= tend && iter(time / bin_dt); // 1 FLOP + trajS1L[ib+thread_no*nbins] += Sa; + trajS2L[ib+thread_no*nbins] += Sb; // 2 FLOP, 2 WRITE + ++ntrajL[ib+thread_no*nbins]; // 1 WRITE + + // TODO: Task 1a) (STEP 0) + // - compute propensities a[0], a[1], .., a[3] and a0 + // - use values Sa and Sb, and values stored in k[4], check initialization in SSA_CPU.hpp + + a[0] = 0.0; + a[1] = 0.0; + a[2] = 0.0; + a[3] = 0.0; + + double a0 = 0.0; + + // TODO: Task 1a) (STEP 1) + // - sample tau using the inverse sampling method and increment time, use uniform random numbers initialized in r48 + + time += 0.1; // 0.1 is a dummy + + // TODO: Task 1a) (STEP 2) + // - sample a reaction, use uniform random numbers initialized in r48 + + // TODO: Task 1a) (STEP 3) + // - increment Sa, Sb + + Sa += 0; + Sb += 0; + + iter++; + } + + curT[s] = time; + x0[s] = Sa; + x1[s] = Sb; + + bNotDone = time <= tend && Sa!=0 && Sb!=0; + } + + for(int t = 0; t < num_threads; ++t) + { + for (int i = 0; i < nbins; ++i) { + trajS1[i] += trajS1L[i+t*nbins]; + trajS2[i] += trajS2L[i+t*nbins]; + ntraj[i] += ntrajL[i+t*nbins]; // bins * (3 FLOP, 3 READ, 3 WRITE) (assuming trajS1L, trajS2L, ntrajL) in cache + } + } + + delete[] ntrajL; + delete[] trajS2L; + delete[] trajS1L; + stopTiming(); + + pass++; + } + + delete[] x1; + delete[] x0; + delete[] curT; + delete[] r48; + + normalize_bins(); +} + +void SSA_CPU::normalize_bins() +{ + assert( trajS2.size() == trajS1.size() ); + assert( ntraj.size() == trajS1.size() ); + const int nbins = trajS1.size(); + +#ifdef _OPENMP + #pragma omp parallel for +#endif + for(int i=0; i < nbins; ++i) + { + trajS1[i]/=ntraj[i]; + trajS2[i]/=ntraj[i]; // 2 FLOP, 3 READ, 2 WRITE + } +} + +double SSA_CPU::getTransfers() const +{ + // TODO: (Optional) Task 1c) + // - return number of read writes in [BYTES] + + return 1.0; +} + +double SSA_CPU::getFlops() const +{ + // TODO: (Optional) Task 1c) + // - return number of floating point operations + return 1.0; +} diff --git a/hw4/skeleton/p1_p2/includes/ArgumentParser.hpp b/hw4/skeleton/p1_p2/includes/ArgumentParser.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f16f84f23951b30d7622f1a3a03b3bcbfbe0e673 --- /dev/null +++ b/hw4/skeleton/p1_p2/includes/ArgumentParser.hpp @@ -0,0 +1,140 @@ +#pragma once + +#include +#include +#include +#include + +using namespace std; + +class Value +{ +private: + string content; + +public: + + Value() : content("") {} + + Value(string content_) : content(content_) { } + + double asDouble(double def=0) const + { + if (content == "") return def; + return (double) atof(content.c_str()); + } + + int asInt(int def=0) const + { + if (content == "") return def; + return atoi(content.c_str()); + } + + bool asBool(bool def=false) const + { + if (content == "") return def; + if (content == "0") return false; + if (content == "false") return false; + + return true; + } + + string asString(string def="") const + { + if (content == "") return def; + + return content; + } +}; + +class ArgumentParser +{ +private: + + map mapArguments; + + const int iArgC; + const char** vArgV; + bool bStrictMode; + +public: + + Value operator()(const string arg) + { + if (bStrictMode) + { + map::const_iterator it = mapArguments.find(arg); + + if (it == mapArguments.end()) + { + printf("Runtime option NOT SPECIFIED! ABORTING! name: %s\n",arg.data()); + abort(); + } + } + + return mapArguments[arg]; + } + + ArgumentParser(const int argc, const char ** argv) : mapArguments(), iArgC(argc), vArgV(argv), bStrictMode(false) + { + for (int i=1; i::const_iterator it=mapArguments.begin(); it!=mapArguments.end(); it++) + { + options+= it->first + " " + it->second.asString() + " "; + } + string filepath = (path + "/" + string("argumentparser.log")); + FILE * f = fopen(filepath.data(), "a"); + if (f == NULL) + { + printf("impossible to write %s.\n", filepath.data()); + return; + } + fprintf(f, "%s\n", options.data()); + fclose(f); + } +}; diff --git a/hw4/skeleton/p1_p2/includes/SSABase.hpp b/hw4/skeleton/p1_p2/includes/SSABase.hpp new file mode 100644 index 0000000000000000000000000000000000000000..1df106c13a81a84cad3d28f8bbc96a6b6e9b6908 --- /dev/null +++ b/hw4/skeleton/p1_p2/includes/SSABase.hpp @@ -0,0 +1,73 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +class SSABase +{ +protected: + /** Multiplier for number of molecules. */ + const int omega; + /** Number of samples of stochastic algo to run. */ + const int numSamples; + /** End time for algorithm. */ + const double tend; + /** Width of trajectory bin. */ + const double bin_dt; + + /** Trajectories (binned) for S1*omega and S2*omega (trajS(i) = avg of S for t in [i,i+1]*bin_dt). */ + std::vector trajS1; + std::vector trajS2; + std::vector ntraj; + + /** Store timing result here. */ + double time; + +public: + SSABase(int _omega, int num_samples, double T, double bin_dt) + : omega(_omega) + , numSamples(num_samples) + , tend(T) + , bin_dt(bin_dt) + , trajS1(static_cast(std::ceil(T/bin_dt))) + , trajS2(static_cast(std::ceil(T/bin_dt))) + , ntraj(static_cast(std::ceil(T/bin_dt))) + , time(0.0) + { } + + /** Return time taken by start/stopTiming. */ + double getTime() { + return time; + } + /** Get omega. */ + double getOmega() { + return omega; + } + + /** + * Dump trajectories to file (nbins rows and 2 columns). + */ + void dumpTrajToFile(std::string filename) { + // Open file stream + std::ofstream outfile(filename.c_str()); + + // Dump it + const int nbins = trajS1.size(); + int totalevals = 0; + for (int i = 0; i < nbins; ++i) { + // must rescale + outfile << i*bin_dt+bin_dt/2 << " " + << (trajS1[i] / omega) << " " + << (trajS2[i] / omega) << std::endl; + totalevals += ntraj[i]; + } + std::cout << "Average number of time steps per sample: " << double(totalevals)/numSamples << std::endl; + } +}; diff --git a/hw4/skeleton/p1_p2/includes/SSABenchmark.hpp b/hw4/skeleton/p1_p2/includes/SSABenchmark.hpp new file mode 100644 index 0000000000000000000000000000000000000000..8938a276a83a691921c173803f0bf820ab0f4b6c --- /dev/null +++ b/hw4/skeleton/p1_p2/includes/SSABenchmark.hpp @@ -0,0 +1,67 @@ +#include +#include +#include + +class SSABenchmark +{ +public: + template< typename SSAType> + static void timeIt(int omega, int num_samples, double T, double dt, int runs) + { +#ifdef _OPENMP + #pragma omp parallel +#endif + { + printf ("Printing one line per active thread....\n"); + } + + // init + vector times(runs); + vector flops(runs); + vector trfs(runs); + vector perfs(runs); + vector rs(runs); + + printf("\n==========================================================================\n"); + for (int i=0; i(runs*.05); + const int i40 = static_cast(runs*.4); + const int i60 = static_cast(runs*.6); + const int i95 = static_cast(runs*.95); + + std::cerr<<"==========================================================================\n"; + std::cerr<<"(05th perc.) time: "<< times[i05] <<", "<< flops[i05] <<" FLOP, Trf: "<< trfs[i05] <<" Byte, R: "< +#include + +#include "SSABase.hpp" +#include "Timer.hpp" + +using namespace std; + +class SSA_CPU : public SSABase +{ +private: + double k[4]; // rates + +protected: + Timer timer; + int pass; + +public: + SSA_CPU(const int omega, const int num_samples, const double T, const double bin_dt) + : SSABase(omega,num_samples,T,bin_dt) + ,pass(0) + { + k[0] = 1; + k[1] = 1; + k[2] = 0.2/omega; + k[3] = 20.0*omega; + } + + void operator()(); + void normalize_bins(); + + void setRates(double k1, double k2, double k3, double k4) { + k[0] = k1; + k[1] = k2; + k[2] = k3/omega; + k[3] = k4*omega; + } + double getS1() const { + return trajS1.back()/omega ; + }; + double getS2() const { + return trajS2.back()/omega ; + }; + + double getTransfers() const; + double getFlops() const; + + void startTiming() { + timer.start(); + } + void stopTiming() { + time += timer.stop(); + } +}; diff --git a/hw4/skeleton/p1_p2/includes/Timer.hpp b/hw4/skeleton/p1_p2/includes/Timer.hpp new file mode 100644 index 0000000000000000000000000000000000000000..44182aaeee96aecb89bfdc607e20583326c6ec47 --- /dev/null +++ b/hw4/skeleton/p1_p2/includes/Timer.hpp @@ -0,0 +1,22 @@ +#pragma once + +#include + +class Timer +{ + struct timeval t_start, t_end; + struct timezone t_zone; + +public: + + void start() + { + gettimeofday(&t_start, &t_zone); + } + + double stop() + { + gettimeofday(&t_end, &t_zone); + return (t_end.tv_usec - t_start.tv_usec)*1e-6 + (t_end.tv_sec - t_start.tv_sec); + } +}; diff --git a/hw4/skeleton/p1_p2/includes/objective.hpp b/hw4/skeleton/p1_p2/includes/objective.hpp new file mode 100644 index 0000000000000000000000000000000000000000..132a065c0ce1bce18a98a2f1c4d938148a359285 --- /dev/null +++ b/hw4/skeleton/p1_p2/includes/objective.hpp @@ -0,0 +1,27 @@ +#ifndef _DIRECT_HPP_ +#define _DIRECT_HPP_ + +#include "korali.hpp" +#include "SSA_CPU.hpp" + +void direct(korali::Sample& k) +{ + double k1 = k["Parameters"][0]; + double k2 = k["Parameters"][1]; + double k3 = k["Parameters"][2]; + double k4 = k["Parameters"][3]; + + // TODO: TASK 2b) + // - Initialize SSA_CPU class + // - set the rates k1, k2, k3, and k4 + // - run + // - get S1 and S2 + // - calculate objective function + + double sse = 0.0; // TODO + + k["F(x)"] = -sse; +} + + +#endif diff --git a/hw4/skeleton/p1_p2/main_cmaes.cpp b/hw4/skeleton/p1_p2/main_cmaes.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c22aa089d1727930e50debf6ba426d764cd1b0cd --- /dev/null +++ b/hw4/skeleton/p1_p2/main_cmaes.cpp @@ -0,0 +1,40 @@ +#include "includes/objective.hpp" +#include "korali.hpp" + +#include "SSA_CPU.hpp" + +int main(int argc, char* argv[]) +{ +// Configuring Problem. + auto e = korali::Experiment(); + e["Problem"]["Type"] = "Optimization/Stochastic"; + e["Problem"]["Objective Function"] = &direct; + +// Defining the problem's variables. + e["Variables"][0]["Name"] = "k1"; + e["Variables"][0]["Lower Bound"] = 0.0; + e["Variables"][0]["Upper Bound"] = 5.0; + + e["Variables"][1]["Name"] = "k2"; + e["Variables"][1]["Lower Bound"] = 0.0; + e["Variables"][1]["Upper Bound"] = 5.0; + + e["Variables"][2]["Name"] = "k3"; + e["Variables"][2]["Lower Bound"] = 0.0; + e["Variables"][2]["Upper Bound"] = 5.0; + + e["Variables"][3]["Name"] = "k4"; + e["Variables"][3]["Lower Bound"] = 0.0; + e["Variables"][3]["Upper Bound"] = 50.0; + + +// Configuring CMA-ES parameters + e["Solver"]["Type"] = "CMAES"; + e["Solver"]["Population Size"] = 8; + e["Solver"]["Termination Criteria"]["Min Value Difference Threshold"] = 1e-7; + e["Solver"]["Termination Criteria"]["Max Generations"] = 100; + + // Running Korali + auto k = korali::Engine(); + k.run(e); +} diff --git a/hw4/skeleton/p1_p2/main_ssa.cpp b/hw4/skeleton/p1_p2/main_ssa.cpp new file mode 100644 index 0000000000000000000000000000000000000000..dd2a696b623203e6a5f6a2a7c70c5a9c6d143f65 --- /dev/null +++ b/hw4/skeleton/p1_p2/main_ssa.cpp @@ -0,0 +1,44 @@ +#include +#include +#include +#include +#include + +#include "ArgumentParser.hpp" +#include "SSA_CPU.hpp" +#include "SSABenchmark.hpp" + +typedef std::chrono::system_clock Clock; + +using namespace std; + + +int main(int argc, const char ** argv) +{ + ArgumentParser parser(argc, argv); + + // params: + // -omega -> number of molecules -> i.e. result of i means i*omega mol. (def = 1) + // -samples -> how many samples of stoch. algo. to run (def = 2000) + // -T -> end time for algo (def = 5) + // -dt -> size of bin for trajectory binning (def = 0.1) + // -seed -> seed for random number generator (def = random) + // -runs -> how many repetitions for benchmarking (def = 100) + + auto now = Clock::now(); + auto ms = std::chrono::time_point_cast(now); + auto fraction = now - ms; + + // get params with def. values + int omega = parser("-omega").asInt(10); + int samples = parser("-samples").asInt(2000); + double T = parser("-T").asDouble(5.0); + double dt = parser("-dt").asDouble(0.1); + int rnd_seed = parser("-seed").asInt(fraction.count()); + int runs = parser("-runs").asInt(100); + + srand48(rnd_seed); + SSABenchmark::timeIt(omega, samples, T, dt, runs); + + return 0; +} diff --git a/hw4/skeleton/p1_p2/modules.src b/hw4/skeleton/p1_p2/modules.src new file mode 100644 index 0000000000000000000000000000000000000000..1d481a4dd9c4a4f27b282e1af062934dca05ce0c --- /dev/null +++ b/hw4/skeleton/p1_p2/modules.src @@ -0,0 +1,4 @@ +module purge +module load new +module load gcc/6.3.0 +module load python/3.7.1 diff --git a/hw4/skeleton/p1_p2/python/plot_output.py b/hw4/skeleton/p1_p2/python/plot_output.py new file mode 100755 index 0000000000000000000000000000000000000000..362699dd08b8f5aa81a953ff2409e588016c9321 --- /dev/null +++ b/hw4/skeleton/p1_p2/python/plot_output.py @@ -0,0 +1,26 @@ +#!/usr/local/bin/python3 +import matplotlib.pyplot as plt + + +fname = '../output.txt' + +if __name__ == '__main__': + print("Plotting results..") + + time = [] + s1 = [] + s2 = [] + with open(fname) as f: + for line in f: + t, a, b = line.split() + time.append(float(t)) + s1.append(float(a)) + s2.append(float(b)) + + + plt.plot(time, s1, label='S1') + plt.plot(time, s2, label='S2') + plt.legend() + plt.xlabel('Time') + plt.ylabel('Sepcies Quantity') + plt.show() diff --git a/hw4/skeleton/p3/Makefile b/hw4/skeleton/p3/Makefile new file mode 100755 index 0000000000000000000000000000000000000000..1d0b2414bf8c0c4baf06e9a853e247dfbf3ac0fd --- /dev/null +++ b/hw4/skeleton/p3/Makefile @@ -0,0 +1,17 @@ +CXX = upcxx +CXXFLAGS += -O3 -std=c++17 +DEPS = factor.a + +all: sequential divideAndConquer producerConsumer + +sequential: sequential.cpp factor.a + $(CXX) $(CXXFLAGS) -o $@ $^ + +divideAndConquer: divideAndConquer.cpp factor.a + $(CXX) $(CXXFLAGS) -o $@ $^ + +producerConsumer: producerConsumer.cpp factor.a + $(CXX) $(CXXFLAGS) -o $@ $^ + + +clean:; rm -rf *.o diff --git a/hw4/skeleton/p3/divideAndConquer.cpp b/hw4/skeleton/p3/divideAndConquer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..96379de03caa940a5dd638cd3121705852265d9c --- /dev/null +++ b/hw4/skeleton/p3/divideAndConquer.cpp @@ -0,0 +1,106 @@ +/********************************************************************** + * Code: UPC++ - Homework 3 + * Author: Vlachas Pantelis (pvlachas@ethz.ch) + * ETH Zuerich - HPCSE II (Spring 2020) + **********************************************************************/ + +// Loading necessary libraries +#include +#include +#include +#include + +#include +#include "factor.h" + +// using ofstream constructors. +#include +#include + +#define NUM_FACTORS 240000 + +int main(int argc, char* argv[]) +{ + // Measuring the total time needed. + auto start = std::chrono::system_clock::now(); + + // Intializing UPCXX + upcxx::init(); + int rankId = upcxx::rank_me(); + int rankCount = upcxx::rank_n(); + + if (rankId == 0){ + printf("Approximating the value of PI with %d series coefficients.\n", NUM_FACTORS); + } + int nFactors = NUM_FACTORS; + + // TODO: Specify the variables factorsPerRank, initFactor and endFactor + int factorsPerRank = 0; + int initFactor = 0; + int endFactor = 0; + + // TODO: Initialize a global pointer factorArray. Rank zero has to initialize the array. Do not forget to finally broadcast the global pointer to all ranks from rank 0 so that all ranks have access to the same global adress space + + + + + + + // TODO: After broadcasting the array, each rank needs to compute the portion of the factors it is assigned, and then \textbf{place} the result back to the \texttt{factorArray}. Do not use RPCs in this question, use the \texttt{upcxx::rput} command. + upcxx::future<> fut_all = upcxx::make_future(); + + + + + + + + auto end = std::chrono::system_clock::now(); + double rankTime = std::chrono::duration(end-start).count(); + + // Saving to a separate file for each rank + std::stringstream filename; + filename << "./Results/divide_and_conquer_time_rank_" << rankId << ".txt"; + std::string filenameStr = filename.str(); + std::ofstream outfile(filenameStr); + outfile << rankTime << "\n" << std::endl; + outfile.close(); + + // TODO ?: + + + + + // TODO: Finally, rank zero needs to compute the approximate value $\tilde{\pi}$ and save it to the results file, along with the total time. \textbf{Downcast} the global pointer to a local one and use it to compute the final approximation. + if (rankId == 0) + { + + double pi_approx = 0.0; + + + + + + + // Reporting the result + printf("PI approximate: %.17g\n", pi_approx); + printf("PI: %.10f\n", M_PI); + double error = abs(pi_approx - M_PI); + printf("Absolute error: %.17g\n", error); + + // Computing the total time + auto end = std::chrono::system_clock::now(); + double totalTime = std::chrono::duration(end-start).count(); + printf("Total Running Time: %.17gs\n", totalTime); + + // Saving the result and the total time + std::ofstream outfile ("./Results/divide_and_conquer.txt"); + outfile << pi_approx << "," << totalTime << "\n" << std::endl; + outfile.close(); + } + + // Finalize UPCXX + upcxx::finalize(); + + return 0; +} diff --git a/hw4/skeleton/p3/factor.a b/hw4/skeleton/p3/factor.a new file mode 100644 index 0000000000000000000000000000000000000000..f1ab503889cb0f39500c7c105cc19912c70836bd Binary files /dev/null and b/hw4/skeleton/p3/factor.a differ diff --git a/hw4/skeleton/p3/factor.h b/hw4/skeleton/p3/factor.h new file mode 100644 index 0000000000000000000000000000000000000000..74ea7966b6417d470918dc2c1786b2f3545a28b6 --- /dev/null +++ b/hw4/skeleton/p3/factor.h @@ -0,0 +1 @@ +double FACTOR(int k); \ No newline at end of file diff --git a/hw4/skeleton/p3/plot.py b/hw4/skeleton/p3/plot.py new file mode 100755 index 0000000000000000000000000000000000000000..ce8fd9d8544b63471cf0d315b56f4c379ff5aa64 --- /dev/null +++ b/hw4/skeleton/p3/plot.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python +# # -*- coding: utf-8 -*- +""" +Code: UPC++ - Homework 3 +Author: Vlachas Pantelis (pvlachas@ethz.ch) +ETH Zuerich - HPCSE II (Spring 2020) +""" +#!/usr/bin/env python + +############################################################################################# +# Function to plot the results of the runs +# Run with command: python3 plot.py --NUM_RANKS 24 --strategy divide_and_conquer +# Run with command: python3 plot.py --NUM_RANKS 24 --strategy producer_consumer +############################################################################################# +import numpy as np +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt +import os +from argparse import ArgumentParser + +def main(): + # GETTING THE NUMBER OF RANKS (USED TO RUN THE CODE) + parser = ArgumentParser() + parser.add_argument("-N", "--NUM_RANKS", dest="NUM_RANKS", type=int, help="number of ranks", required=True) + parser.add_argument("-S", "--strategy", dest="strategy", type=str, help="the strategy, either divide_and_conquer or producer_consumer", required=True) + args = parser.parse_args() + NUM_RANKS = args.NUM_RANKS + strategy = args.strategy + strategies = [strategy] + # strategies = ["divide_and_conquer", "producer_consumer"] + + # Make figures directory if it does not exist + os.makedirs("./Figures", exist_ok=True) + + if not os.path.isdir("./Results"): + raise ValueError("./Results directory does not exist. Did you created it before running the UPCXX programmes? Did you run the programmes?") + + + filename_base = "./Results/{:}_time_rank_{:}.txt" + for i in range(len(strategies)): + strategy = strategies[i] + title = strategy.replace("_", " ") + rank_times = [] + for rank_id in range(NUM_RANKS): + filetemp = filename_base.format(strategy, rank_id) + timetemp = np.loadtxt(filetemp) + rank_times.append(timetemp) + + rank_times = np.array(rank_times) + # print(rank_times) + plt.bar(np.arange(len(rank_times)), height=rank_times) + plt.title("Histogram of rank times for {:} tasking".format(title)) + plt.ylabel("Time [s]") + # plt.xticks(np.arange(len(rank_times))) + plt.xlabel("Rank id") + # plt.show() + plt.tight_layout() + fig_path = "./Figures/Ranktimes_histogram_{:}.png".format(strategy) + plt.savefig(fig_path) + plt.close() + + filename_base = "./Results/{:}.txt" + + time_sequential = np.loadtxt(filename_base.format("sequential"), delimiter=",")[1] + + strategies = ["divide_and_conquer", "producer_consumer"] + for strategy in strategies: + time = float(np.loadtxt(filename_base.format(strategy), delimiter=",")[1]) + # Computing speed-up + speed_up = time_sequential / time + print("Strategy: {:} - Speedup={:}".format(strategy, speed_up)) + + + + +if __name__ == '__main__': + main() + + + diff --git a/hw4/skeleton/p3/producerConsumer.cpp b/hw4/skeleton/p3/producerConsumer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..00463d83b6da5b8898f5f4d681ec76393541fd03 --- /dev/null +++ b/hw4/skeleton/p3/producerConsumer.cpp @@ -0,0 +1,165 @@ +/********************************************************************** + * Code: UPC++ - Homework 3 + * Author: Vlachas Pantelis (pvlachas@ethz.ch) + * ETH Zuerich - HPCSE II (Spring 2020) + **********************************************************************/ + +#include +#include +#include +#include +#include +#include "factor.h" + +// using ofstream constructors. +#include +#include + +#include + +#define NUM_FACTORS 240000 + +std::queue workers; + +upcxx::global_ptr factorArray; + +// Dummy function initialization +void master(int rankCount); +void worker(int rankId); + +bool continueFactorEvaluations; + +int main(int argc, char* argv[]) +{ + auto start = std::chrono::system_clock::now(); + + // Intializing UPCXX + upcxx::init(); + int rankId = upcxx::rank_me(); + int rankCount = upcxx::rank_n(); + + continueFactorEvaluations = true; + + if (rankId == 0){ + printf("Approximating the value of PI with %d series coefficients.\n", NUM_FACTORS); + } + int nFactors = NUM_FACTORS; + + // factorArray is now initialized as a global array. (please see some lines above) + // TODO: Similarly to the divide and conquer strategy, the master needs to initialize the array with the upcxx::new_array command. Then the array needs to be broadcasted so that all ranks have access and point to the shared address space. + + + + + + if (rankId == 0) master(rankCount); + else worker(rankId); + upcxx::barrier(); + + // TODO: Finally, the master rank needs to compute the approximate value $\tilde{\pi}$ and save it to the results file, along with the total time. \textbf{Downcast} the global pointer to a local one and use it to compute the final approximation. (Similarly to the divide and conquer strategy) + if (rankId == 0) + { + + double pi_approx = 0.0; + + + + + + printf("PI approximate: %.17g\n", pi_approx); + printf("PI: %.10f\n", M_PI); + double error = abs(pi_approx - M_PI); + printf("Absolute error: %.17g\n", error); + + auto end = std::chrono::system_clock::now(); + double rankTime = std::chrono::duration(end-start).count(); + + // Saving to a separate file for the master rank + std::stringstream filename; + filename << "./Results/producer_consumer" << ".txt"; + // assign to std::string + std::string filenameStr = filename.str(); + std::ofstream outfile(filenameStr); + outfile << pi_approx << "," << rankTime << "\n" << std::endl; + outfile.close(); + printf("Total Running Time: %.17gs\n", rankTime); + + // Saving to a separate file for each rank + std::stringstream filenameTime; + filenameTime << "./Results/producer_consumer_time_rank_" << rankId << ".txt"; + // assign to std::string + std::string filenameTimeStr = filenameTime.str(); + std::ofstream outfile2(filenameTimeStr); + outfile2 << rankTime << "\n" << std::endl; + outfile2.close(); + } + + + // Finalize UPCXX + upcxx::finalize(); + + return 0; +} + +void workerComputeFactor(int rankId, int factorId) +{ + // TODO: Compute factor + + + // TODO: As a worker, use a RPC to update the master's factorArray with the computed factor, and notify the master that you are again available to compute by pushing your rankId to the queue + + + + + + +} + +void master(int rankCount) +{ + // Creating a queue with all the ranks, without master rank ! + for (int rank = 1; rank < rankCount; rank++) workers.push(rank); + + // Sending the factors to be computed in the ranks + for (int factorId = NUM_FACTORS-1; factorId >= 0; --factorId) + { + // Waiting for a rank to become available + // When no rank available call progress() + while(workers.empty()) upcxx::progress(); + //TODO: Whenever any worker is available, the master has to get his \texttt{workerId}, by popping out the first \texttt{workerId} from the queue of the available workers. + + // TODO: After identifying an available worker, the master needs to send an RPC, the task that the consumer has to complete. This is no other than the workerComputeFactor(workerId, factorId) + + } + // Master is notifying the workers to stop the evaluation of factors + for (int i = 1; i < rankCount; i++) upcxx::rpc_ff(i, [](){continueFactorEvaluations = false;}); +} + + +void worker(int rankId) +{ + // Worker is calling progress(); until the master sends an RPC notifying to stop the evaluation of any more factors + auto t0 = std::chrono::system_clock::now(); + while(continueFactorEvaluations) + { + upcxx::progress(); + } + auto t1 = std::chrono::system_clock::now(); + double rankTime = std::chrono::duration(t1-t0).count(); + + // Saving the runtime of each rank to a separate file + std::stringstream filename; + filename << "./Results/producer_consumer_time_rank_" << rankId << ".txt"; + // assign to std::string + std::string filenameStr = filename.str(); + std::ofstream outfile(filenameStr); + outfile << rankTime << "\n" << std::endl; + outfile.close(); +} + + + + + + + diff --git a/hw4/skeleton/p3/sequential.cpp b/hw4/skeleton/p3/sequential.cpp new file mode 100644 index 0000000000000000000000000000000000000000..56efafd40ead3bbff50821c123090dacfb92a474 --- /dev/null +++ b/hw4/skeleton/p3/sequential.cpp @@ -0,0 +1,54 @@ +/********************************************************************** + * Code: UPC++ - Homework 3 + * Author: Vlachas Pantelis (pvlachas@ethz.ch) + * ETH Zuerich - HPCSE II (Spring 2020) + **********************************************************************/ + +// Loading necessary libraries +#include +#include +#include +#include + +// Library of the factor F(n) +#include "factor.h" + +// ofstream constructors. +#include +#include + +#define NUM_FACTORS 240000 + +int main(int argc, char* argv[]) +{ + printf("Approximating the value of PI with %d series coefficients.\n", NUM_FACTORS); + + // Measuring the total time needed. + auto start = std::chrono::system_clock::now(); + + double pi_approx = 0.0; + for(int k=1; k<=NUM_FACTORS; ++k){ + pi_approx += FACTOR(k); + } + pi_approx = 4. * pi_approx; + auto end = std::chrono::system_clock::now(); + + printf("PI approximate: %.17g\n", pi_approx); + printf("PI: %.17g\n", M_PI); + double error = abs(pi_approx - M_PI); + printf("Absolute error: %.17g\n", error); + + double totalTime = std::chrono::duration(end-start).count(); + printf("Total Running Time: %.17g s\n", totalTime); + + // Saving the results to a file + std::ofstream outfile1 ("./Results/sequential_time.txt"); + outfile1 << totalTime << "\n" << std::endl; + outfile1.close(); + + std::ofstream outfile2 ("./Results/sequential.txt"); + outfile2 << pi_approx << ", " << totalTime << "\n" << std::endl; + outfile2.close(); + + return 0; +}