diff --git a/hw2/solution/p1/Makefile b/hw2/solution/p1/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..4a7d586573dd3ab2102693c7aa1d1dcd2c00986a
--- /dev/null
+++ b/hw2/solution/p1/Makefile
@@ -0,0 +1,19 @@
+CXX=h5pcc
+CXXFLAGS = -Wpedantic -Wall -Wextra -std=c++11 -lstdc++ -O3 -g
+
+all: main
+
+%.o: %.cpp
+ $(CXX) -c -o $@ $< $(CXXFLAGS)
+clean:
+ rm -rf *.o *.xmf *h5 *txt main
+
+main: main.o auxiliary.o wave.o
+ $(CXX) $(CXXFLAGS)-I. -o main $^
+
+run:
+ make clean ; make ; mpirun -n 8 ./main 100 2 1.0
+run1:
+ make clean ; make ; mpirun -n 1 ./main 100 1 1.0
+
+.PHONY: all clean run run1
diff --git a/hw2/solution/p1/README b/hw2/solution/p1/README
new file mode 100644
index 0000000000000000000000000000000000000000..08d42dbd0ec10656b73d356c2e371162a8b5fe8d
--- /dev/null
+++ b/hw2/solution/p1/README
@@ -0,0 +1,33 @@
+HOW TO RUN
+----------
+To run the code on Euler remove all previously loaded nodules (purge) and load the following modules:
+
+module purge;
+module load new;
+module load gcc;
+module load open_mpi/3.0.0;
+module load hdf5
+
+You can request 8 cores and an interative shell to work with:
+bsub -n 8 -W 04:00 -Is bash
+
+Then compile and run by typing
+make run
+
+
+WHAT TO MODIFY
+--------------
+You only need to change the code in wave.cpp
+When the code runs, it prints a checksum. This number should be the same for the solution you provide.
+
+
+CODE OUTPUT
+-----------
+You are not asked to visualize any results in this question, but we thought I'd be nice if you can see what you are actually solving.
+Other outputs are .h5 files which can be used to visualize the solution. Each .h5 file corresponds to an .xmf file.
+You can use Paraview to open the .xmf files and visualize the results.
+Alternatively, you may comment out the line
+#define USE_HDF5
+with '//' in wave.h. In this case, the code will output simple .txt files with the solution, saved as
+x y z u
+...
diff --git a/hw2/solution/p1/auxiliary.cpp b/hw2/solution/p1/auxiliary.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ef943bab2c8ef68bd29f1aa2aca90f8da4309052
--- /dev/null
+++ b/hw2/solution/p1/auxiliary.cpp
@@ -0,0 +1,244 @@
+#include "wave.h"
+
+WaveEquation::WaveEquation(int a_N, int a_procs_per_dim)
+{
+ MPI_Comm_size(MPI_COMM_WORLD, &size);
+ procs_per_dim = a_procs_per_dim;
+ Ntot = a_N;
+ h = L / a_N;
+ N = a_N / procs_per_dim;
+
+ // the chosen numerical method is stable if dt <= h/sqrt(3)
+ dt = h / sqrt(3.0);
+
+ FindCoordinates();
+
+ origin[0] = coords[0] * N * h;
+ origin[1] = coords[1] * N * h;
+ origin[2] = coords[2] * N * h;
+ u = new double[(N + 2) * (N + 2) * (N + 2)];
+ for (int i0 = 0; i0 < N; i0++)
+ {
+ double x0 = origin[0] + i0 * h + 0.5 * h;
+ for (int i1 = 0; i1 < N; i1++)
+ {
+ double x1 = origin[1] + i1 * h + 0.5 * h;
+ for (int i2 = 0; i2 < N; i2++)
+ {
+ double x2 = origin[2] + i2 * h + 0.5 * h;
+ u[(i0 + 1) * (N + 2) * (N + 2) + (i1 + 1) * (N + 2) + (i2 + 1)] =
+ Initial_Condition(x0, x1, x2);
+ }
+ }
+ }
+
+ u_old = new double[(N + 2) * (N + 2) * (N + 2)];
+ u_new = new double[(N + 2) * (N + 2) * (N + 2)];
+
+ for (int i0 = 1; i0 <= N; i0++)
+ for (int i1 = 1; i1 <= N; i1++)
+ for (int i2 = 1; i2 <= N; i2++)
+ {
+ int m = i2 + i1 * (N + 2) + i0 * (N + 2) * (N + 2);
+ u_new[m] = u[m];
+ u_old[m] = u[m];
+ // assuming that u_old = u is equivalent to du/dt(t=0) = 0
+ }
+
+ aux = dt * dt / h / h;
+}
+
+WaveEquation::~WaveEquation()
+{
+ delete[] u;
+ delete[] u_old;
+ delete[] u_new;
+}
+
+double WaveEquation::Initial_Condition(double x0, double x1, double x2)
+{
+ double r =
+ (x0 - 0.5) * (x0 - 0.5) + (x1 - 0.5) * (x1 - 0.5) + (x2-0.5)*(x2-0.5);
+ return exp(-r / 0.1);
+}
+
+// do not change this function
+void WaveEquation::Print(int kt = 0)
+{
+#ifdef USE_HDF5
+
+ std::string name = "Wave__" + std::to_string(kt);
+ if (rank == 0)
+ {
+ FILE *xmf = 0;
+ xmf = fopen((name + ".xmf").c_str(), "w");
+
+ std::stringstream s_head;
+
+ s_head << "\n";
+ s_head << "\n";
+ s_head << "\n";
+ s_head << " \n";
+
+ s_head << " \n\n";
+
+ fprintf(xmf, (s_head.str()).c_str());
+
+ std::stringstream s;
+ s << " \n";
+ s << " \n\n";
+ s << " \n";
+ s << " \n";
+ s << " " << 0 << " " << 0 << " " << 0 << "\n";
+ s << " \n";
+ s << " \n";
+ s << " " << std::setprecision(10) << std::setw(15) << h
+ << " " << std::setw(15) << h << " " << std::setw(15) << h << "\n";
+ s << " \n";
+ s << " \n\n";
+
+ s << " \n";
+ s << " \n";
+ std::stringstream name_ss;
+ name_ss << "data"; // << std::setfill('0') << std::setw(10) << mm ;
+ std::string tmp = name_ss.str();
+ s << " " << (name + ".h5").c_str() << ":/" << tmp << "\n";
+ s << " \n";
+ s << " \n";
+
+ s << " \n";
+
+ std::string st = s.str();
+
+ fprintf(xmf, st.c_str());
+
+ std::stringstream s_tail;
+ s_tail << " \n";
+ s_tail << "\n";
+
+ fprintf(xmf, (s_tail.str()).c_str());
+ fclose(xmf);
+ }
+
+ hid_t file_id, dataset_id, fspace_id, fapl_id, mspace_id;
+
+ H5open();
+ fapl_id = H5Pcreate(H5P_FILE_ACCESS);
+
+ H5Pset_fapl_mpio(fapl_id, cart_comm, MPI_INFO_NULL);
+
+ file_id =
+ H5Fcreate((name + ".h5").c_str(), H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);
+
+ H5Pclose(fapl_id);
+
+ double *array_all = new double[N * N * N];
+
+ hsize_t count[4] = {static_cast(N),
+ static_cast(N),
+ static_cast(N), 1};
+ hsize_t dims[4] = {static_cast(Ntot),
+ static_cast(Ntot),
+ static_cast(Ntot), 1};
+ hsize_t offset[4] = {static_cast(coords[0] * N),
+ static_cast(coords[1] * N),
+ static_cast(coords[2] * N), 0};
+
+ for (int i0 = 1; i0 <= N; i0++)
+ {
+ for (int i1 = 1; i1 <= N; i1++)
+ {
+ for (int i2 = 1; i2 <= N; i2++)
+ {
+ array_all[(i2 - 1) + (i1 - 1) * N + (i0 - 1) * N * N] =
+ u[i2 + i1 * (N + 2) + i0 * (N + 2) * (N + 2)];
+ }
+ }
+ }
+
+ fapl_id = H5Pcreate(H5P_DATASET_XFER);
+ H5Pset_dxpl_mpio(fapl_id, H5FD_MPIO_COLLECTIVE);
+ fspace_id = H5Screate_simple(4, dims, NULL);
+
+ dataset_id = H5Dcreate(file_id, "data", H5T_NATIVE_DOUBLE, fspace_id,
+ H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
+
+ fspace_id = H5Dget_space(dataset_id);
+
+ H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, offset, NULL, count, NULL);
+
+ mspace_id = H5Screate_simple(4, count, NULL);
+
+ H5Dwrite(dataset_id, H5T_NATIVE_DOUBLE, mspace_id, fspace_id, fapl_id,
+ array_all);
+
+ H5Sclose(mspace_id);
+ H5Sclose(fspace_id);
+ H5Dclose(dataset_id);
+ H5Pclose(fapl_id);
+ H5Fclose(file_id);
+ H5close();
+
+ delete[] array_all;
+
+#else
+
+ std::string name = "Wave__" + std::to_string(kt) + ".txt";
+ MPI_File file;
+ MPI_File_open(cart_comm, name.c_str(), MPI_MODE_RDWR | MPI_MODE_CREATE,
+ MPI_INFO_NULL, &file);
+
+ std::stringstream ss;
+
+ for (int i0 = 0; i0 < N; i0++)
+ {
+ double x0 = origin[0] + i0 * h + 0.5 * h;
+ for (int i1 = 0; i1 < N; i1++)
+ {
+ double x1 = origin[1] + i1 * h + 0.5 * h;
+ for (int i2 = 0; i2 < N; i2++)
+ {
+ double x2 = origin[2] + i2 * h + 0.5 * h;
+ ss << x0 << " " << x1 << " " << x2 << " "
+ << u[(i2 + 1) + (i1 + 1) * (N + 2) + (i0 + 1) * (N + 2) * (N + 2)]
+ << "\n";
+ }
+ }
+ }
+
+ std::string asciiData = ss.str();
+
+ MPI_Offset my_length = asciiData.size() * sizeof(char);
+
+ MPI_Offset offset;
+ MPI_Exscan(&my_length, &offset, 1, MPI_OFFSET, MPI_SUM, cart_comm);
+
+ MPI_File_write_at_all(file, offset, asciiData.data(), asciiData.size(),
+ MPI_CHAR, MPI_STATUS_IGNORE);
+ MPI_File_close(&file);
+
+#endif
+}
+
+// do not change this function
+void WaveEquation::UpdateGridPoint(int i0, int i1, int i2)
+{
+ int m = i0 * (N + 2) * (N + 2) + i1 * (N + 2) + i2;
+ int ip1 = (i0 + 1) * (N + 2) * (N + 2) + (i1) * (N + 2) + (i2);
+ int im1 = (i0 - 1) * (N + 2) * (N + 2) + (i1) * (N + 2) + (i2);
+ int jp1 = (i0) * (N + 2) * (N + 2) + (i1 + 1) * (N + 2) + (i2);
+ int jm1 = (i0) * (N + 2) * (N + 2) + (i1 - 1) * (N + 2) + (i2);
+ int kp1 = (i0) * (N + 2) * (N + 2) + (i1) * (N + 2) + (i2 + 1);
+ int km1 = (i0) * (N + 2) * (N + 2) + (i1) * (N + 2) + (i2 - 1);
+ u_new[m] =
+ 2.0 * u[m] - u_old[m] +
+ aux * (u[ip1] + u[im1] + u[jp1] + u[jm1] + u[kp1] + u[km1] - 6.0 * u[m]);
+}
diff --git a/hw2/solution/p1/main.cpp b/hw2/solution/p1/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..be8512ff4e96e71b0edf3d5827a366651484df09
--- /dev/null
+++ b/hw2/solution/p1/main.cpp
@@ -0,0 +1,61 @@
+#include "wave.h"
+
+int main(int argc, char **argv)
+{
+ MPI_Init(&argc, &argv);
+
+ int rank, size;
+ MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+ MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+ if (argc != 4)
+ {
+ if (rank == 0)
+ {
+ std::cout << " Incorrect number of inputs. Run as:\n";
+ std::cout << " mpirun -n x ./main N y t \n";
+ std::cout << " where \n";
+ std::cout << " N= number of grid points per direction\n";
+ std::cout << " y= number of ranks per direction (=(x)^(1/3))\n";
+ std::cout << " t= final time of simulation \n";
+ std::cout << " Aborting...\n";
+ }
+ int err = 1;
+ MPI_Abort(MPI_COMM_WORLD, err);
+ }
+
+ int points = atoi(argv[1]);
+ int procs_per_dim = atoi(argv[2]);
+ double t_end = atof(argv[3]);
+
+ if (size != procs_per_dim * procs_per_dim * procs_per_dim)
+ {
+ if (rank == 0)
+ std::cout << " Incorrect number of ranks per direction. Aborting... \n";
+ int err = 2;
+ MPI_Abort(MPI_COMM_WORLD, err);
+ }
+
+ if (points % procs_per_dim != 0)
+ {
+ if (rank == 0)
+ std::cout << " Grid points per direction must be divisible by the number "
+ "of ranks per direction. Aborting... \n";
+ int err = 3;
+ MPI_Abort(MPI_COMM_WORLD, err);
+ }
+
+ WaveEquation *Simulation = new WaveEquation(points, procs_per_dim);
+
+ double start = MPI_Wtime();
+ Simulation->run(t_end);
+ double finish = MPI_Wtime();
+
+ if (rank == 0)
+ std::cout << "Total time = " << finish - start << "\n";
+
+ delete Simulation;
+
+ MPI_Finalize();
+ return 0;
+}
\ No newline at end of file
diff --git a/hw2/solution/p1/solution.cpp b/hw2/solution/p1/solution.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9f86233a583520a5b33fe2ec58e0c4f5c5c4336a
--- /dev/null
+++ b/hw2/solution/p1/solution.cpp
@@ -0,0 +1,485 @@
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define USE_HDF5
+
+#ifdef USE_HDF5
+#include
+#endif
+
+using namespace std;
+
+#define L 1.0
+
+struct WaveEquation
+{
+ int N; //grid points per dimension
+ double h; //grid spacing (dx = dy = dz = h)
+ double dt; //timestep
+ double t; //current time
+ double * u; //solution vector
+ double * u_old;
+ double * u_new;
+
+ int Ntot;
+ int procs_per_dim;
+
+ int size;
+ int rank;
+
+ int rank_plus [3];
+ int rank_minus [3];
+
+ MPI_Comm cart_comm;
+
+ double origin[3];
+ double aux;
+ int coords[3];
+
+ ~WaveEquation()
+ {
+ delete [] u;
+ delete [] u_old;
+ delete [] u_new;
+ MPI_Comm_free(&cart_comm);
+ }
+
+ WaveEquation (int a_N, int a_procs_per_dim)
+ {
+ MPI_Comm_size(MPI_COMM_WORLD,&size);
+
+
+ procs_per_dim = a_procs_per_dim;
+ Ntot = a_N;
+
+ h = L / a_N;
+ N = a_N / procs_per_dim;
+
+ dt = h / sqrt(3.0);
+
+ int nums [3] = {procs_per_dim,procs_per_dim,procs_per_dim};
+ int periodic[3] = {true,true,true};
+ //int periodic[3] = {false,false,false};
+
+ MPI_Cart_create(MPI_COMM_WORLD, 3, nums, periodic, true, &cart_comm);
+ MPI_Comm_rank(cart_comm,&rank);
+
+ MPI_Cart_shift(cart_comm, 0, 1, &rank_plus[0], &rank_minus[0]);
+ MPI_Cart_shift(cart_comm, 1, 1, &rank_plus[1], &rank_minus[1]);
+ MPI_Cart_shift(cart_comm, 2, 1, &rank_plus[2], &rank_minus[2]);
+
+ MPI_Cart_coords(cart_comm,rank,3,&coords[0]);
+
+
+ origin[0] = coords[0] * N * h;
+ origin[1] = coords[1] * N * h;
+ origin[2] = coords[2] * N * h;
+
+ u = new double [(N+2)*(N+2)*(N+2)];
+
+ for (int i0=0; i0\n";
+ s_head << "\n";
+ s_head << "\n";
+ s_head << " \n";
+
+ s_head << " \n";
+ s_tail << "\n";
+
+ fprintf(xmf, (s_tail.str()).c_str());
+ fclose(xmf);
+ }
+
+ hid_t file_id, dataset_id, fspace_id, fapl_id, mspace_id;
+
+ H5open();
+ fapl_id = H5Pcreate(H5P_FILE_ACCESS);
+
+ H5Pset_fapl_mpio(fapl_id, cart_comm, MPI_INFO_NULL);
+
+ file_id = H5Fcreate((name + ".h5").c_str(), H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);
+
+ H5Pclose(fapl_id);
+
+ double * array_all = new double[N * N * N];
+
+ hsize_t count [4] = {static_cast(N) , static_cast(N) , static_cast(N) , 1};
+ hsize_t dims [4] = {static_cast(Ntot) , static_cast(Ntot) , static_cast(Ntot) , 1};
+ hsize_t offset[4] = {static_cast(coords[0] * N), static_cast(coords[1] * N), static_cast(coords[2] * N), 0};
+
+ for(int i0=1; i0<=N; i0++)
+ {
+ for(int i1=1; i1<=N; i1++)
+ {
+ for(int i2=1; i2<=N; i2++)
+ {
+ array_all[(i2-1)+(i1-1)*N + (i0-1)*N*N] = u[i2 + i1*(N+2)+i0*(N+2)*(N+2)];
+ }
+ }
+ }
+
+ fapl_id = H5Pcreate(H5P_DATASET_XFER);
+ H5Pset_dxpl_mpio(fapl_id, H5FD_MPIO_COLLECTIVE);
+ fspace_id = H5Screate_simple(4, dims, NULL);
+
+ dataset_id = H5Dcreate(file_id, "data", H5T_NATIVE_DOUBLE, fspace_id, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
+
+ fspace_id = H5Dget_space(dataset_id);
+
+ H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, offset, NULL, count, NULL);
+
+ mspace_id = H5Screate_simple(4, count, NULL);
+
+
+ H5Dwrite(dataset_id, H5T_NATIVE_DOUBLE, mspace_id, fspace_id, fapl_id, array_all);
+
+ H5Sclose(mspace_id );
+ H5Sclose(fspace_id );
+ H5Dclose(dataset_id);
+ H5Pclose(fapl_id );
+ H5Fclose(file_id );
+ H5close();
+
+ delete [] array_all;
+
+ #else
+
+ string a_name = "Wave__" + to_string(kt);
+ char name [30] = a_name.c_str();
+ MPI_File file;
+ MPI_File_open( cart_comm, name, MPI_MODE_RDWR | MPI_MODE_CREATE , MPI_INFO_NULL, &file);
+
+ stringstream ss;
+
+
+ for (int i0=0; i0run(t_end);
+ double finish = MPI_Wtime();
+
+ if (rank==0)std::cout << "Total time = " << finish - start << "\n";
+
+ delete Simulation;
+
+ MPI_Finalize();
+ return 0;
+}
\ No newline at end of file
diff --git a/hw2/solution/p1/wave.cpp b/hw2/solution/p1/wave.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d3a2ded01faf0e4a080442618b468b7555a66444
--- /dev/null
+++ b/hw2/solution/p1/wave.cpp
@@ -0,0 +1,151 @@
+#include "wave.h"
+
+/********************************************************************/
+/* Subquestion a: change the following function and use a Cartesian */
+/* topology to find coords[3], rank_plus[3] and rank_minus[3] */
+/********************************************************************/
+void WaveEquation::FindCoordinates()
+{
+ int nums [3] = {procs_per_dim,procs_per_dim,procs_per_dim};
+ int periodic[3] = {true,true,true};
+ //int periodic[3] = {false,false,false};
+
+ MPI_Cart_create(MPI_COMM_WORLD, 3, nums, periodic, true, &cart_comm);
+ MPI_Comm_rank(cart_comm,&rank);
+
+ MPI_Cart_shift(cart_comm, 0, 1, &rank_plus[0], &rank_minus[0]);
+ MPI_Cart_shift(cart_comm, 1, 1, &rank_plus[1], &rank_minus[1]);
+ MPI_Cart_shift(cart_comm, 2, 1, &rank_plus[2], &rank_minus[2]);
+
+ MPI_Cart_coords(cart_comm,rank,3,&coords[0]);
+}
+
+void WaveEquation::run(double t_end)
+{
+ t = 0;
+
+ /********************************************************************/
+ /* Subquestion b: you need to define 12 custom datatypes. */
+ /* For sending data, six datatypes (one per face) are required. */
+ /* For receiving data, six more datatypes are required. */
+ /* You should use MPI_Type_create_subarray for those datatypes. */
+ /********************************************************************/
+
+ /* Subquestion b: Create and commit custom datatypes here */
+ /************************************************************************************************/
+ MPI_Datatype SEND_FACE_PLUS[3];
+ MPI_Datatype SEND_FACE_MINUS[3];
+
+ MPI_Datatype RECV_FACE_PLUS[3];
+ MPI_Datatype RECV_FACE_MINUS[3];
+
+ int ndims = 3;
+ int order = MPI_ORDER_C;
+ int array_of_sizes [3] = {N+2,N+2,N+2};
+ int array_of_subsizes [3];
+ int array_of_starts [3];
+ for (int d0 = 0; d0 < 3; d0++)
+ {
+ int d1 = (d0+1)%3;
+ int d2 = (d0+2)%3;
+
+ array_of_subsizes[d0] = 1;
+ array_of_subsizes[d1] = N;
+ array_of_subsizes[d2] = N;
+
+ array_of_starts[d0] = 1;
+ array_of_starts[d1] = 1;
+ array_of_starts[d2] = 1;
+ MPI_Type_create_subarray(ndims,array_of_sizes,array_of_subsizes,array_of_starts,order,MPI_DOUBLE,&SEND_FACE_MINUS[d0]);
+
+ array_of_starts[d0] = N;
+ array_of_starts[d1] = 1;
+ array_of_starts[d2] = 1;
+ MPI_Type_create_subarray(ndims,array_of_sizes,array_of_subsizes,array_of_starts,order,MPI_DOUBLE,&SEND_FACE_PLUS[d0]);
+
+
+ array_of_starts[d0] = 0;
+ array_of_starts[d1] = 1;
+ array_of_starts[d2] = 1;
+ MPI_Type_create_subarray(ndims,array_of_sizes,array_of_subsizes,array_of_starts,order,MPI_DOUBLE,&RECV_FACE_MINUS[d0]);
+
+ array_of_starts[d0] = N+1;
+ array_of_starts[d1] = 1;
+ array_of_starts[d2] = 1;
+ MPI_Type_create_subarray(ndims,array_of_sizes,array_of_subsizes,array_of_starts,order,MPI_DOUBLE,&RECV_FACE_PLUS[d0]);
+
+ MPI_Type_commit(&SEND_FACE_PLUS [d0]); MPI_Type_commit(&RECV_FACE_PLUS [d0]);
+ MPI_Type_commit(&SEND_FACE_MINUS[d0]); MPI_Type_commit(&RECV_FACE_MINUS[d0]);
+ }
+ /************************************************************************************************/
+
+ int count = 0;
+ do
+ {
+ if (count % 5 == 0)
+ {
+ if (rank == 0)
+ std::cout << count << " t=" << t << "\n";
+ Print(count);
+ }
+
+ MPI_Request request[12];
+
+ /* Subquestion b: Replace the sends and receives with ones that correspond
+ * to custom datatypes*/
+ /**********************************************************************************************/
+ for (int d=0; d<3; d++)
+ {
+ MPI_Irecv(u,1,RECV_FACE_MINUS[d],rank_minus[d],100*d ,cart_comm,&request[2*d ]);
+ MPI_Irecv(u,1,RECV_FACE_PLUS [d],rank_plus [d],100*d+1,cart_comm,&request[2*d+1]);
+ }
+
+ for (int d=0; d<3; d++)
+ {
+ MPI_Isend(u,1,SEND_FACE_PLUS [d],rank_plus [d],100*d ,cart_comm,&request[6+2*d ]);
+ MPI_Isend(u,1,SEND_FACE_MINUS[d],rank_minus[d],100*d+1,cart_comm,&request[6+2*d+1]);
+ }
+ /**********************************************************************************************/
+
+ // Wait for communication to finish
+ MPI_Waitall(12, &request[0], MPI_STATUSES_IGNORE);
+
+
+ for (int i0 = 1; i0 <= N; i0++)
+ for (int i1 = 1; i1 <= N; i1++)
+ for (int i2 = 1; i2 <= N; i2++)
+ UpdateGridPoint(i0, i1, i2);
+
+ double *temp = u_old;
+ u_old = u;
+ u = u_new;
+ u_new = temp;
+ t += dt;
+ count++;
+ } while (t < t_end);
+
+ double s = 0;
+ double Checksum = 0;
+ for (int k = 1; k <= N; k++)
+ for (int j = 1; j <= N; j++)
+ for (int i = 1; i <= N; i++)
+ {
+ int m = k + j * (N + 2) + i * (N + 2) * (N + 2);
+ s += u[m] * u[m];
+ }
+
+ MPI_Reduce(&s, &Checksum, 1, MPI_DOUBLE, MPI_SUM, 0, cart_comm);
+ if (rank == 0)
+ std::cout << "Checksum = " << Checksum << "\n";
+
+ /* Subquestion b: You should free the custom datatypes and the communicator
+ * here. */
+ for (int d=0; d<3; d++)
+ {
+ MPI_Type_free(&RECV_FACE_PLUS[d]) ;
+ MPI_Type_free(&RECV_FACE_MINUS[d]);
+ MPI_Type_free(&SEND_FACE_PLUS[d]) ;
+ MPI_Type_free(&SEND_FACE_MINUS[d]);
+ }
+ MPI_Comm_free(&cart_comm);
+}
\ No newline at end of file
diff --git a/hw2/solution/p1/wave.h b/hw2/solution/p1/wave.h
new file mode 100644
index 0000000000000000000000000000000000000000..b613b6af5b8dddb01da289ebfc538d3514b52429
--- /dev/null
+++ b/hw2/solution/p1/wave.h
@@ -0,0 +1,58 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define USE_HDF5 // comment this line if you do not want hdf5 file output
+ // (suitable for visualization with Paraview)
+
+#ifdef USE_HDF5
+#include
+#endif
+
+
+#define L 1.0 // domain size (unit cube) with Ntot x Ntot x Ntot grid points
+
+struct WaveEquation
+{
+ int N; // grid points per direction for this rank
+ double h; // grid spacing (dx = dy = dz = h)
+ double dt; // timestep
+ double t; // current time
+ double *u; // solution vector
+ double *u_old;
+ double *u_new;
+
+ int Ntot; // total grid points per direction
+ int procs_per_dim; // ranks per direction
+
+ int size; // =total ranks = procs_per_dim*procs_per_dim*procs_per_dim
+ int rank;
+
+ int rank_plus[3]; // neighboring ranks
+ int rank_minus[3]; //
+
+ MPI_Comm cart_comm; // should be a Cartesian topology communicator
+
+ double origin[3]; // physical space (x0,x1,x2) coordinates of 1st grid point
+ // of this rank
+ int coords[3]; // index space coordinates of this rank
+
+ double aux;
+
+ ~WaveEquation();
+ WaveEquation(int a_N, int a_procs_per_dim);
+ void FindCoordinates();
+ double Initial_Condition(double x0, double x1, double x2);
+ void UpdateGridPoint(int i0, int i1, int i2);
+ void Print(int kt);
+ void pack_face(double *pack, int array_of_sizes[3], int array_of_subsizes[3],
+ int array_of_starts[3]);
+ void unpack_face(double *pack, int array_of_sizes[3],
+ int array_of_subsizes[3], int array_of_starts[3]);
+ void run(double t_end);
+};
diff --git a/hw2/solution/p2/.gitignore b/hw2/solution/p2/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..1585d5aeeb13ccf24b680417b7c4af287492e9a2
--- /dev/null
+++ b/hw2/solution/p2/.gitignore
@@ -0,0 +1,2 @@
+cannon
+*.o
diff --git a/hw2/solution/p2/Makefile b/hw2/solution/p2/Makefile
new file mode 100755
index 0000000000000000000000000000000000000000..19cb8a897d730f3e4e0c40ed17833d109aa9171f
--- /dev/null
+++ b/hw2/solution/p2/Makefile
@@ -0,0 +1,26 @@
+CFLAGS = -O3
+LDFLAGS = -lopenblas
+CC =$(shell which mpicxx)
+
+.SECONDARY:
+BINARIES = cannon
+
+.PHONY: all auxiliar
+
+all: auxiliar $(BINARIES)
+
+auxiliar:
+ $(MAKE) -C auxiliar
+
+cannon: cannon.o auxiliar/auxiliar.o
+ $(CC) -o $@ $^ $(LDFLAGS)
+
+%.o: %.cpp
+ $(CC) -c $(CFLAGS) $<
+
+.PHONY: clean
+clean:
+ $(RM) $(BINARIES) auxiliar/*.o *.o *.ti
+ $(MAKE) -C auxiliar clean
+
+
diff --git a/hw2/solution/p2/auxiliar/Makefile b/hw2/solution/p2/auxiliar/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..728e3e1e276792049e691ed33413d299b401ad26
--- /dev/null
+++ b/hw2/solution/p2/auxiliar/Makefile
@@ -0,0 +1,18 @@
+CFLAGS = -O3
+LDFLAGS = -lblas
+CC =$(shell which mpicxx)
+
+.SECONDARY:
+OBJECTS = auxiliar.o
+
+.PHONY: all
+all: $(OBJECTS)
+
+%.o: %.cpp
+ $(CC) -c $(CFLAGS) $<
+
+.PHONY: clean
+clean:
+ $(RM) $(BINARIES) *.o *.ti
+
+
diff --git a/hw2/solution/p2/auxiliar/auxiliar.cpp b/hw2/solution/p2/auxiliar/auxiliar.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a90492fd4e213feac39775856fadbaf6a8c12caa
--- /dev/null
+++ b/hw2/solution/p2/auxiliar/auxiliar.cpp
@@ -0,0 +1,46 @@
+#include "auxiliar.hpp"
+#include
+#include
+
+void initializeMatrices(double* A, double* B, const int n, const int N, const int rankx, const int ranky)
+{
+ //initialize value and align for A, B
+ double *aptr, *bptr;
+ aptr = A; bptr = B;
+ for(int i = n*rankx; i < n*(rankx+1); i++)
+ for(int j = n*ranky; j < n*(ranky+1); j++)
+ {
+ *aptr = 1.0/((i +ranky*n)%N + j +1);
+ *bptr = 1.0/(i+ (j+rankx*n)%N +1);
+ aptr++; bptr++;
+ }
+}
+
+void verifyMatMulResults(double* C, const int n, const int N, const int rankx, const int ranky, double execTime)
+{
+ if (rankx == 0 && ranky == 0) printf("Verifying Result... ");
+
+ double tolerance = 1e-6;
+ int error = 0;
+ double* cptr = C;
+ for(int i = n*rankx; i < n*(rankx+1) && !error; i++)
+ for(int j = n*ranky; j < n*(ranky+1) && !error; j++, cptr++)
+ {
+ double tmp = 0;
+ for(int k = 0; k < N; k++) tmp += 1.0/((i+k+1)*(k+j+1));
+ error = fabs(*cptr-tmp) > tolerance;
+ }
+ int tempErr = error;
+ MPI_Reduce(&tempErr, &error, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);
+
+ if (rankx == 0 && ranky == 0)
+ {
+ if (error) { printf("\n[Error] Verification Failed!\n"); exit(-1); }
+
+ double gflops = (((2e-9)*N)*N)*N/execTime;
+ printf("Passed! \n");
+ printf("Execution time: %.3fs \n",execTime);
+ printf("GFlop/s: %.4f \n",gflops);
+ }
+}
+
diff --git a/hw2/solution/p2/auxiliar/auxiliar.hpp b/hw2/solution/p2/auxiliar/auxiliar.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..bb5d400cf4bf37ee0434bde13c2b3369464b8eb2
--- /dev/null
+++ b/hw2/solution/p2/auxiliar/auxiliar.hpp
@@ -0,0 +1,8 @@
+#ifndef _HPCSE2_AUXILIAR_
+#define _HPCSE2_AUXILIAR_
+#include
+#include
+void verifyMatMulResults(double* C, const int n, const int N, const int rankx, const int ranky, double execTime);
+void initializeMatrices(double* A, double* B, const int n, const int N, const int rankx, const int ranky);
+
+#endif
diff --git a/hw2/solution/p2/cannon.cpp b/hw2/solution/p2/cannon.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..f148349384752a0dff0aabca64957f42366a943e
--- /dev/null
+++ b/hw2/solution/p2/cannon.cpp
@@ -0,0 +1,114 @@
+#include
+#include
+#include
+#include
+#include
+#include "auxiliar/auxiliar.hpp"
+
+extern "C" void dgemm_(const char *transa, const char *transb, const int *m, const int *n, const int *k, const double *alpha, const double *a, const int *lda, const double *b, const int *ldb, const double *beta, double *c, const int *ldc);
+
+int main(int argc, char* argv[])
+{
+ /**********************************************************
+ * Initial Setup
+ **********************************************************/
+ size_t N = 1024;
+ double one = 1.0;
+ double *A,*B,*C, *tmpA, *tmpB;
+ MPI_Request request[2];
+
+ int rank, size;
+ MPI_Init(&argc, &argv);
+ MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+ MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+ /**********************************************************
+ * Determining rank geometry < You need to improve this >
+ **********************************************************/
+
+ int p = sqrt(size);
+ if (size != p*p) { printf("[Error] Number of processors must be a square integer.\n"); MPI_Abort(MPI_COMM_WORLD, -1); }
+
+ int coords[2] = {0, 0};
+ int nums[2] = {p, p};
+ int periodic[2] = {true, true};
+
+ MPI_Comm cannonComm;
+ MPI_Cart_create(MPI_COMM_WORLD, 2, nums, periodic, true, &cannonComm);
+
+ int up, down, left, right;
+ MPI_Cart_shift(cannonComm, 0, 1, &left, &right);
+ MPI_Cart_shift(cannonComm, 1, 1, &up, &down);
+
+ MPI_Cart_coords(cannonComm, rank, 2, coords);
+ int rankx = coords[0];
+ int ranky = coords[1];
+
+ /********************************************
+ * Initializing Matrices
+ *******************************************/
+
+ // Determining local matrix side size (n)
+ const int n = N/p;
+
+ // Allocating space for local matrices A, B, C
+ A = (double*) calloc (n*n, sizeof(double));
+ B = (double*) calloc (n*n, sizeof(double));
+ C = (double*) calloc (n*n, sizeof(double));
+
+ // Allocating space for recieve buffers for A and B local matrices
+ tmpA = (double*) calloc (n*n, sizeof(double));
+ tmpB = (double*) calloc (n*n, sizeof(double));
+
+ // Initializing values for input matrices A and B
+ initializeMatrices(A, B, n, N, rankx, ranky);
+
+ /*******************************************************
+ * Creating Contiguous Datatype
+ ******************************************************/
+
+ MPI_Datatype subMatrixType;
+ MPI_Type_contiguous(n*n, MPI_DOUBLE, &subMatrixType);
+ MPI_Type_commit(&subMatrixType);
+
+ /**************************************************************
+ * Running Cannon's Matrix-Matrix Multiplication Algorithm
+ **************************************************************/
+
+ if (rank == 0) printf("Running Matrix-Matrix Multiplication...\n");
+
+ // Registering initial time. It's important to precede this with a barrier to make sure
+ // all ranks are ready to start when we take the time.
+ MPI_Barrier(MPI_COMM_WORLD);
+ double execTime = -MPI_Wtime();
+
+ dgemm_("N", "N", &n, &n, &n, &one, A, &n, B, &n, &one, C, &n);
+ for(int step =1; step < p; step++)
+ {
+ MPI_Irecv(tmpA, 1, subMatrixType, right, 0, cannonComm, &request[0]);
+ MPI_Irecv(tmpB, 1, subMatrixType, down, 1, cannonComm, &request[1]);
+
+ MPI_Send(A, 1, subMatrixType, left, 0, cannonComm);
+ MPI_Send(B, 1, subMatrixType, up, 1, cannonComm);
+
+ MPI_Waitall(2, request, MPI_STATUS_IGNORE);
+
+ double* holdA= A; A = tmpA; tmpA = holdA;
+ double* holdB= B; B = tmpB; tmpB = holdB;
+
+ dgemm_("N", "N", &n, &n, &n, &one, A, &n, B, &n, &one, C, &n);
+ }
+
+ // Registering final time. It's important to precede this with a barrier to make sure all ranks have finished before we take the time.
+ MPI_Barrier(MPI_COMM_WORLD);
+ execTime += MPI_Wtime();
+
+ /**************************************************************
+ * Verification Stage
+ **************************************************************/
+
+ verifyMatMulResults(C, n, N, rankx, ranky, execTime);
+ free(A); free(B); free(C); free(tmpA); free(tmpB);
+ MPI_Comm_free(&cannonComm);
+ return MPI_Finalize();
+}
diff --git a/hw2/solution/p3/particle.cpp b/hw2/solution/p3/particle.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..85840126f046207da3e1699acda57e7ac7ee7ce4
--- /dev/null
+++ b/hw2/solution/p3/particle.cpp
@@ -0,0 +1,88 @@
+#include
+#include
+#include
+
+
+struct particle
+{
+ int id;
+ double x[3];
+ bool state;
+ double gamma;
+};
+
+
+
+int main(int argc, char ** argv)
+{
+ MPI_Init(&argc, &argv);
+
+ int rank,size;
+ MPI_Comm_rank(MPI_COMM_WORLD,&rank);
+ MPI_Comm_size(MPI_COMM_WORLD,&size);
+
+ particle p;
+
+ MPI_Datatype MPI_PARTICLE;
+ int count = 4;
+ int array_of_blocklengths [4] = {1,3,1,1};
+ MPI_Aint array_of_displacements [4];
+ MPI_Datatype array_of_types [4] = {MPI_INT,MPI_DOUBLE,MPI_C_BOOL,MPI_DOUBLE};
+
+
+ MPI_Aint base;
+ MPI_Aint id_address;
+ MPI_Aint x_address;
+ MPI_Aint state_address;
+ MPI_Aint gamma_address;
+
+ MPI_Get_address(&p,&base);
+ MPI_Get_address(&p.id,&id_address);
+ MPI_Get_address(&p.x,&x_address);
+ MPI_Get_address(&p.state,&state_address);
+ MPI_Get_address(&p.gamma,&gamma_address);
+
+ array_of_displacements[0] = id_address - base;
+ array_of_displacements[1] = x_address - base;
+ array_of_displacements[2] = state_address - base;
+ array_of_displacements[3] = gamma_address - base;
+
+
+ MPI_Type_create_struct(count,array_of_blocklengths,array_of_displacements,array_of_types,&MPI_PARTICLE);
+ MPI_Type_commit(&MPI_PARTICLE);
+
+
+
+ if (rank == 0)
+ {
+ p.id = 1;
+ p.x[0] = 3.14159265359;
+ p.x[1] = 2.71828182846;
+ p.x[2] = ( 1.0 + sqrt(5.0) ) / 2.0;
+ p.state = false;
+ p.gamma = 0.57721566490;
+
+ MPI_Send(&p,1,MPI_PARTICLE,1,100,MPI_COMM_WORLD);
+ }
+ else if (rank == 1)
+ {
+ MPI_Recv(&p,1,MPI_PARTICLE,0,100,MPI_COMM_WORLD,MPI_STATUS_IGNORE);
+
+ printf("%d \n", p.id );
+ printf("%10.8f \n", p.x[0] );
+ printf("%10.8f \n", p.x[1] );
+ printf("%10.8f \n", p.x[2] );
+ printf("%d \n" , p.state );
+ printf("%10.8f \n", p.gamma );
+ }
+ else
+ {
+ printf("Run with exactly two ranks!\n");
+ int err = 1;
+ MPI_Abort(MPI_COMM_WORLD,err);
+ }
+ MPI_Type_free(&MPI_PARTICLE);
+
+ MPI_Finalize();
+ return 0;
+}