Commit fc85c0fd authored by carnet's avatar carnet

LU implementations clean-up

parent a40adb2f
# Node: karim-System-Product-Name
# UTC date and time: Thu Dec 23 10:47:35 2021
# Local date and time: Thu Dec 23 10:47:35 2021
# Measurements are in microseconds
type run matrix_size id execution_time
GPU_Kokkos 0 512 0 26515.968000
GPU_Kokkos 1 512 1 27696.640000
GPU_Kokkos 2 512 2 27016.704000
GPU_Kokkos 3 512 3 27295.744000
GPU_Kokkos 4 512 4 21188.608000
GPU_Kokkos 5 512 5 21981.184000
GPU_Kokkos 6 512 6 21570.560000
GPU_Kokkos 7 512 7 21513.216000
GPU_Kokkos 8 512 8 20719.360000
GPU_Kokkos 9 512 9 21357.824000
GPU_Kokkos 10 512 10 21954.048000
GPU_Kokkos 11 512 11 21742.592000
GPU_Kokkos 12 512 12 20018.432000
GPU_Kokkos 13 512 13 20927.744000
GPU_Kokkos 14 512 14 21308.928000
GPU_Kokkos 15 512 15 21918.976000
GPU_Kokkos 16 512 16 21622.528000
GPU_Kokkos 17 512 17 19906.816000
GPU_Kokkos 18 512 18 21087.744000
GPU_Kokkos 19 512 19 21323.776000
GPU_Kokkos 0 1024 20 124157.440000
GPU_Kokkos 1 1024 21 123315.968000
GPU_Kokkos 2 1024 22 122346.496000
GPU_Kokkos 3 1024 23 123382.016000
GPU_Kokkos 4 1024 24 122657.280000
GPU_Kokkos 5 1024 25 122931.968000
GPU_Kokkos 6 1024 26 122963.968000
GPU_Kokkos 7 1024 27 123654.912000
GPU_Kokkos 8 1024 28 123560.704000
GPU_Kokkos 9 1024 29 126196.224000
GPU_Kokkos 10 1024 30 126064.640000
GPU_Kokkos 11 1024 31 123512.064000
GPU_Kokkos 12 1024 32 125294.336000
GPU_Kokkos 13 1024 33 123454.208000
GPU_Kokkos 14 1024 34 122553.088000
GPU_Kokkos 15 1024 35 123980.032000
GPU_Kokkos 16 1024 36 122219.008000
GPU_Kokkos 17 1024 37 123228.928000
GPU_Kokkos 18 1024 38 122706.944000
GPU_Kokkos 19 1024 39 123118.336000
GPU_Kokkos 0 2048 40 553269.504000
GPU_Kokkos 1 2048 41 551142.656000
GPU_Kokkos 2 2048 42 552811.008000
GPU_Kokkos 3 2048 43 551164.672000
GPU_Kokkos 4 2048 44 552062.720000
GPU_Kokkos 5 2048 45 552402.688000
GPU_Kokkos 6 2048 46 551868.928000
GPU_Kokkos 7 2048 47 552023.808000
GPU_Kokkos 8 2048 48 552028.928000
GPU_Kokkos 9 2048 49 552433.664000
GPU_Kokkos 10 2048 50 552578.816000
GPU_Kokkos 11 2048 51 550965.248000
GPU_Kokkos 12 2048 52 553518.336000
GPU_Kokkos 13 2048 53 551093.504000
GPU_Kokkos 14 2048 54 552593.152000
GPU_Kokkos 15 2048 55 553346.816000
GPU_Kokkos 16 2048 56 554440.960000
GPU_Kokkos 17 2048 57 554329.856000
GPU_Kokkos 18 2048 58 554860.032000
GPU_Kokkos 19 2048 59 554696.704000
GPU_Kokkos 0 4096 60 2496947.456000
GPU_Kokkos 1 4096 61 2499862.784000
GPU_Kokkos 2 4096 62 2493820.928000
GPU_Kokkos 3 4096 63 2498381.824000
GPU_Kokkos 4 4096 64 2494387.200000
GPU_Kokkos 5 4096 65 2494673.408000
GPU_Kokkos 6 4096 66 2500543.232000
GPU_Kokkos 7 4096 67 2510030.592000
GPU_Kokkos 8 4096 68 2502733.056000
GPU_Kokkos 9 4096 69 2507410.432000
GPU_Kokkos 10 4096 70 2505050.368000
GPU_Kokkos 11 4096 71 2505374.464000
GPU_Kokkos 12 4096 72 2503054.848000
GPU_Kokkos 13 4096 73 2510338.560000
GPU_Kokkos 14 4096 74 2506091.776000
GPU_Kokkos 15 4096 75 2507724.288000
GPU_Kokkos 16 4096 76 2502790.656000
GPU_Kokkos 17 4096 77 2511044.096000
GPU_Kokkos 18 4096 78 2504792.320000
GPU_Kokkos 19 4096 79 2505710.848000
# Node: karim-System-Product-Name
# UTC date and time: Thu Dec 23 10:47:50 2021
# Local date and time: Thu Dec 23 10:47:50 2021
# Measurements are in microseconds
type run matrix_size id execution_time
CUDA 0 512 0 109089.280000
CUDA 1 512 1 6350.592000
CUDA 2 512 2 4999.168000
CUDA 3 512 3 4339.968000
CUDA 4 512 4 3279.360000
CUDA 5 512 5 2836.224000
CUDA 6 512 6 2782.208000
CUDA 7 512 7 2865.152000
CUDA 8 512 8 3084.800000
CUDA 9 512 9 3272.704000
CUDA 10 512 10 3553.024000
CUDA 11 512 11 3214.848000
CUDA 12 512 12 3085.568000
CUDA 13 512 13 3084.800000
CUDA 14 512 14 3054.336000
CUDA 15 512 15 3684.096000
CUDA 16 512 16 3081.984000
CUDA 17 512 17 2945.024000
CUDA 18 512 18 3055.360000
CUDA 19 512 19 3099.392000
CUDA 0 1024 20 19790.592000
CUDA 1 1024 21 18706.688000
CUDA 2 1024 22 20468.480000
CUDA 3 1024 23 22644.480000
CUDA 4 1024 24 20789.760000
CUDA 5 1024 25 20819.968000
CUDA 6 1024 26 19899.136000
CUDA 7 1024 27 19925.504000
CUDA 8 1024 28 23503.616000
CUDA 9 1024 29 21276.416000
CUDA 10 1024 30 20332.288000
CUDA 11 1024 31 20904.960000
CUDA 12 1024 32 22030.848000
CUDA 13 1024 33 20324.864000
CUDA 14 1024 34 19805.696000
CUDA 15 1024 35 19712.256000
CUDA 16 1024 36 20812.800000
CUDA 17 1024 37 20640.000000
CUDA 18 1024 38 29102.848000
CUDA 19 1024 39 23834.880000
CUDA 0 2048 40 103647.744000
CUDA 1 2048 41 100470.528000
CUDA 2 2048 42 100770.304000
CUDA 3 2048 43 102636.032000
CUDA 4 2048 44 117466.880000
CUDA 5 2048 45 105467.648000
CUDA 6 2048 46 135898.880000
CUDA 7 2048 47 119165.440000
CUDA 8 2048 48 114978.304000
CUDA 9 2048 49 102383.616000
CUDA 10 2048 50 102063.360000
CUDA 11 2048 51 101471.232000
CUDA 12 2048 52 101383.168000
CUDA 13 2048 53 106080.512000
CUDA 14 2048 54 103068.928000
CUDA 15 2048 55 101213.184000
CUDA 16 2048 56 118488.320000
CUDA 17 2048 57 104392.192000
CUDA 18 2048 58 105839.104000
CUDA 19 2048 59 104246.528000
CUDA 0 4096 60 600614.656000
CUDA 1 4096 61 582457.856000
CUDA 2 4096 62 591685.632000
CUDA 3 4096 63 584352.256000
CUDA 4 4096 64 583844.608000
CUDA 5 4096 65 588819.200000
CUDA 6 4096 66 583927.040000
CUDA 7 4096 67 581686.016000
CUDA 8 4096 68 583331.840000
CUDA 9 4096 69 593907.200000
CUDA 10 4096 70 599530.752000
CUDA 11 4096 71 582878.464000
CUDA 12 4096 72 602914.304000
CUDA 13 4096 73 583299.584000
CUDA 14 4096 74 601964.288000
CUDA 15 4096 75 582338.304000
CUDA 16 4096 76 581960.192000
CUDA 17 4096 77 588377.600000
CUDA 18 4096 78 583119.360000
CUDA 19 4096 79 584276.480000
Blocked LU Decomposition using OpenMP and MPI
=============================================
Build
-----
To build, simply run `make`; the resulting binary is then launched with `mpirun lu_block_omp`.
For testing on a PC I recommend setting `export OMP_NUM_THREADS=1`, so that OpenMP does not spawn multiple threads per MPI rank, and
running the program with `mpirun --oversubscribe -np 4 lu_block_omp` (replace 4 with your core count or the number of ranks you want to test).
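For local testing as described above, a complete session (assuming a 4-core machine) might look like:

```sh
export OMP_NUM_THREADS=1                    # one OpenMP thread per MPI rank
make                                        # builds the lu_block_omp binary
mpirun --oversubscribe -np 4 lu_block_omp   # run with 4 MPI ranks
make clean                                  # remove build artifacts
```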
make
mpirun --oversubscribe -np 4 lu_block_omp
make clean
#include <iostream>
#include <chrono>
#include <thread>
#include <vector>
#include <map>
#include <iterator>
#include <utility>
#include <omp.h>
#include <mpi.h>
#include <stdexcept>
#include "lu.h"
#include "matrix_util.h"
#include "matrix_operations.h"
// Configuration
const int matrix_size = 16;
const int block_size = 2;
const int blocks_n = (matrix_size + block_size - 1) / block_size;
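// Example: with matrix_size = 16 and block_size = 2 this gives blocks_n = 8 blocks
// per dimension (64 blocks in total); the ceiling division also covers matrix sizes
// that are not an exact multiple of block_size, in which case the blocks in the last
// row/column are truncated (see the bn/bm clamping in the functions below).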
// Assign blocks to a rank
int blockij_to_proc_id(int num_procs, int blocks_n, int i, int j)
{
    return (i + j) % num_procs;
}
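// Example: with num_procs = 4, block (2, 3) is assigned to rank (2 + 3) % 4 = 1;
// the mapping is cyclic along the anti-diagonals of the block grid, so the
// diagonal blocks (i, i) end up on ranks (2 * i) % num_procs.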
// distribute matrix A to all ranks in blocks and copy data to GPU memory
BlockMap distribute_matrix(const std::vector<double>& A, int id, int num_procs, MPI_Comm comm) {
    BlockMap local_blocks;
    MPI_Status status;
    if (id == 0) {
        std::vector<double> temp(block_size * block_size);
        for (int i = 0; i < blocks_n; i++) {
            for (int j = 0; j < blocks_n; j++) {
                // block (i, j) starts at element (bi, bj) and has size bn x bm;
                // blocks in the last row/column are truncated to fit the matrix
                int bi = i * block_size;
                int bj = j * block_size;
                int bn = block_size;
                int bm = block_size;
                if (bi + bn > matrix_size) bn = matrix_size - bi;
                if (bj + bm > matrix_size) bm = matrix_size - bj;
                extract_submatrix(bi, bj, bn, bm, matrix_size, A.data(), temp.data());
                int proc_id = blockij_to_proc_id(num_procs, blocks_n, i, j);
                if (proc_id == id) {
                    // put received data into the GPU memory and store the pointer into the Map
                    int bytes = temp.size() * sizeof(double);
                    double* temp_dev = gpuMalloc(bytes);
                    HostToDevice(temp.data(), temp_dev, bytes);
                    local_blocks[{i, j}] = {
                        bi, bj, bn, bm,
                        temp_dev
                    };
                }
                else {
                    MPI_Send(temp.data(), bn * bm, MPI_DOUBLE, proc_id, 0, comm);
                }
            }
        }
    }
    else {
        for (int i = 0; i < blocks_n; i++) {
            for (int j = 0; j < blocks_n; j++) {
                int proc_id = blockij_to_proc_id(num_procs, blocks_n, i, j);
                if (proc_id == id) {
                    int bi = i * block_size;
                    int bj = j * block_size;
                    int bn = block_size;
                    int bm = block_size;
                    if (bi + bn > matrix_size) bn = matrix_size - bi;
                    if (bj + bm > matrix_size) bm = matrix_size - bj;
                    std::vector<double> matrix_data(bn * bm);
                    MPI_Recv(matrix_data.data(), bn * bm, MPI_DOUBLE, 0, 0, comm, &status);
                    // put received data into the GPU memory and store the pointer into the Map
                    int bytes = matrix_data.size() * sizeof(double);
                    double* temp_dev = gpuMalloc(bytes);
                    HostToDevice(matrix_data.data(), temp_dev, bytes);
                    local_blocks[{i, j}] = {
                        bi, bj, bn, bm,
                        temp_dev
                    };
                }
            }
        }
    }
    return local_blocks;
}
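// After distribution each rank only holds the blocks assigned to it by
// blockij_to_proc_id; for the default 8 x 8 block grid and 4 ranks this is
// 16 blocks per rank (two blocks per grid row for each rank).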
// for each block perform the necessary TRSM operations
void perform_trsm(BlockMap& local_blocks, int i, int id, int num_procs, MPI_Comm comm)
{
    double* A00;
    MPI_Status status;
    // rank responsible for the LU-decomposition of the top-left (diagonal) block
    int first_lu_id = blockij_to_proc_id(num_procs, blocks_n, i, i);
    if (id == first_lu_id) {
        // factor the diagonal block in place and broadcast the factor
        Block& block = local_blocks[{i, i}];
        double* lu = block.data;
        lu_simple(block_size, lu);
        A00 = lu;
        MPI_Bcast(A00, block_size * block_size, MPI_DOUBLE, first_lu_id, comm);
    }
    else {
        A00 = gpuMalloc(block_size * block_size * sizeof(double));
        MPI_Bcast(A00, block_size * block_size, MPI_DOUBLE, first_lu_id, comm);
    }
    // set the diagonal of the broadcast factor to one (the L factor of the
    // compact LU storage has an implicit unit diagonal)
    double* tlu = A00;
    for (int k = 0; k < block_size; k++) tlu[k * block_size + k] = 1.0;
    // Iterate over all locally owned blocks.
    BlockMap::iterator it = local_blocks.begin();
    while (it != local_blocks.end())
    {
        // block indices of the current map entry
        int block_i = it->first.first;
        int block_j = it->first.second;
        if (block_j > i && block_i == i) {
            // block (i, block_j): panel row block to the right of the diagonal
            int bi = i * block_size;
            int bj = block_j * block_size;
            int bn = block_size;
            int bm = block_size;
            if (bi + bn > matrix_size) bn = matrix_size - bi;
            if (bj + bm > matrix_size) bm = matrix_size - bj;
            Block& block = local_blocks[{i, block_j}];
            trsm(block_size, bm, tlu, block.data);
        }
        else if (block_i > i && block_j == i) {
            // block (block_i, i): panel column block below the diagonal
            int bi = i * block_size;
            int bj = block_i * block_size;
            int bn = block_size;
            int bm = block_size;
            if (bi + bn > matrix_size) bn = matrix_size - bi;
            if (bj + bm > matrix_size) bm = matrix_size - bj;
            Block& block = local_blocks[{block_i, i}];
            trans_trsm(block_size, bn, A00, block.data);
        }
        // advance to the next local block
        it++;
    }
}
void perform_matmul(BlockMap& local_blocks, int i, int id, int num_procs, MPI_Comm comm)
{
    MPI_Status status;
    for (int ai = i + 1; ai < blocks_n; ai++) {
        for (int aj = i + 1; aj < blocks_n; aj++) {
            // updating block (ai, aj) needs the panel blocks (i, aj) and (ai, i)
            int sender_1 = blockij_to_proc_id(num_procs, blocks_n, i, aj);
            int sender_2 = blockij_to_proc_id(num_procs, blocks_n, ai, i);
            int receiver = blockij_to_proc_id(num_procs, blocks_n, ai, aj);
            int bi = ai * block_size;
            int bj = i * block_size;
            int bn = block_size;
            int bm = block_size;
            if (bi + bn > matrix_size) bn = matrix_size - bi;
            if (bj + bm > matrix_size) bm = matrix_size - bj;
            if (receiver == id) {
                // use local panel blocks directly; only allocate receive
                // buffers for blocks owned by other ranks
                double* matA;
                double* matB;
                if (sender_1 == id) {
                    matA = local_blocks[{i, aj}].data;
                }
                else {
                    matA = gpuMalloc(bn * bm * sizeof(double));
                    MPI_Recv(matA, bn * bm, MPI_DOUBLE, sender_1, 0, comm, &status);
                }
                if (sender_2 == id) {
                    matB = local_blocks[{ai, i}].data;
                }
                else {
                    matB = gpuMalloc(bn * bm * sizeof(double));
                    MPI_Recv(matB, bn * bm, MPI_DOUBLE, sender_2, 0, comm, &status);
                }
                Block& block = local_blocks[{ai, aj}];
                mat_mult_minus(bn, bm, bn, matB, matA, block.data);
            }
            else {
                // ranks owning one of the needed panel blocks send it to the receiver
                if (sender_1 == id) {
                    Block& block = local_blocks[{i, aj}];
                    MPI_Send(block.data, bn * bm, MPI_DOUBLE, receiver, 0, comm);
                }
                if (sender_2 == id) {
                    Block& block = local_blocks[{ai, i}];
                    MPI_Send(block.data, bn * bm, MPI_DOUBLE, receiver, 0, comm);
                }
            }
        }
    }
}
std::vector<double> perform_gather(BlockMap& local_blocks, int id, int num_procs, MPI_Comm comm)
{
    MPI_Status status;
    std::vector<double> solution;
    if (id == 0) {
        solution = std::vector<double>(matrix_size * matrix_size);
    }
    // Gather solution
    for (int i = 0; i < blocks_n; i++) {
        for (int j = 0; j < blocks_n; j++) {
            int bi = i * block_size;
            int bj = j * block_size;
            int bn = block_size;
            int bm = block_size;
            if (bi + bn > matrix_size) bn = matrix_size - bi;
            if (bj + bm > matrix_size) bm = matrix_size - bj;
            int proc_id = blockij_to_proc_id(num_procs, blocks_n, i, j);
            if (id == 0 && proc_id != 0) {
                // receive a remote block and copy it into the solution matrix
                double* matrix_data = gpuMalloc(bn * bm * sizeof(double));
                MPI_Recv(matrix_data, bn * bm, MPI_DOUBLE, proc_id, 0, comm, &status);
                insert_submatrix(bi, bj, bn, bm, matrix_size, solution.data(), matrix_data);
            }
            else if (id == 0) {
                // block is local to rank 0: copy it back from the GPU
                const Block& block = local_blocks[{i, j}];
                std::vector<double> host_block_data(bn * bm);
                DeviceToHost(block.data, host_block_data.data(), bn * bm * sizeof(double));
                insert_submatrix(bi, bj, bn, bm, matrix_size, solution.data(), host_block_data.data());
            }
            else if (proc_id == id) {
                if (local_blocks.find({i, j}) == local_blocks.end()) {
                    std::cout << "Error, block not local" << std::endl;
                    exit(-1);
                }
                const Block& block = local_blocks[{i, j}];
                std::vector<double> host_block_data(bn * bm);
                DeviceToHost(block.data, host_block_data.data(), bn * bm * sizeof(double));
                //std::cout << "process " << id << " sending block " << i << ", " << j << std::endl;
                MPI_Send(host_block_data.data(), bn * bm, MPI_DOUBLE, 0, 0, comm);
            }
        }
    }
    return solution;
}
void load_counter(const BlockMap& local_blocks, int i, int* trsm_counter, int* matmul_counter){
    // Iterate over the map using Iterator till end.
    for (auto it = local_blocks.begin(); it != local_blocks.end(); it++)
    {
        int block_i = it->first.first;
        int block_j = it->first.second;
        if (block_i > i && block_j > i) {
            *matmul_counter = *matmul_counter + 1;
        } else if (block_i == i && block_j > i) {
            *trsm_counter = *trsm_counter + 1;
        } else if (block_i > i && block_j == i) {
            *trsm_counter = *trsm_counter + 1;
        }
    }
}
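//
// The driver below runs the blocked LU factorization in blocks_n "rounds".
// In round i the owner of the diagonal block (i, i) factors it with lu_simple
// and broadcasts the factor, all ranks apply triangular solves to their blocks
// in panel row i and panel column i (perform_trsm), and the trailing blocks
// (ai > i, aj > i) are updated by subtracting the product of the corresponding
// panel blocks (perform_matmul). Finally all blocks are gathered on rank 0 and
// compared against a sequential LU of the same matrix.
//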
int main(int argc, char** argv)
{
    int err;
    int id, num_procs;
    // initialize MPI and query rank / communicator size
    err = MPI_Init(&argc, &argv);
    err = MPI_Comm_rank(MPI_COMM_WORLD, &id);
    err = MPI_Comm_size(MPI_COMM_WORLD, &num_procs);
    MPI_Status status;
    // counters for load balancing
    int trsm_counter = 0;
    int matmul_counter = 0;
    // full input matrix (and a reference copy) on rank 0
    std::vector<double> A;
    std::vector<double> original;
    // rank 0 is responsible for creating the original matrix
    if (id == 0) {
        A = std::vector<double>(matrix_size * matrix_size);
        randomize_matrix(matrix_size, A.data());
        original = A;
    }
    auto begin_moment = std::chrono::steady_clock::now();
    BlockMap local_blocks = distribute_matrix(A, id, num_procs, MPI_COMM_WORLD);
    //std::cout << "PASSED THROUGH DISTRIBUTION" << std::endl;
    // run algorithm in "rounds"
    for (int i = 0; i < blocks_n; i++) {
        //std::cout << "Rank: " << id << " round " << i << std::endl;
        load_counter(local_blocks, i, &trsm_counter, &matmul_counter);
        //std::cout << "process " << id << " round " << i << std::endl;
        perform_trsm(local_blocks, i, id, num_procs, MPI_COMM_WORLD);
        //std::cout << "PASSED THROUGH TRSM" << std::endl;
        perform_matmul(local_blocks, i, id, num_procs, MPI_COMM_WORLD);
        //std::cout << "PASSED THROUGH DISTRIBUTED MATMUL" << std::endl;
        //std::cout << "process " << id << " fin round " << i << std::endl;
    }
    // send all blocks to rank 0
    std::vector<double> solution = perform_gather(local_blocks, id, num_procs, MPI_COMM_WORLD);
    auto end_moment = std::chrono::steady_clock::now();
    long ms_duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_moment - begin_moment).count();
    if (id == 0) {
        // compare the distributed result against a simple sequential LU of the same matrix
        std::vector<double> slu = A;
        lu_simple(matrix_size, slu.data());
        //print_matrix(matrix_size, matrix_size, solution.data());
        verify_matrix(matrix_size, matrix_size, solution.data(), slu.data());
    }
    std::cout << "Rank: " << id << " finished in " << ms_duration << " ms (n = " << matrix_size << ")" << std::endl;
    std::cout << "Rank: " << id << "; TRSM operations: " << trsm_counter << "; Mat Mul Operations: " << matmul_counter << std::endl;
    err = MPI_Finalize();
    return 0;
}
#pragma once
#ifndef BLOCK_MPI_OMP_LU_H
#define BLOCK_MPI_OMP_LU_H

#include <map>
#include <utility>
#include <vector>
#include <mpi.h>

struct Block {
    /// top-left element offsets of the block within the full matrix
    int i, j;
    /// block dimensions (rows, columns)
    int n, m;
    /// matrix data in row major
    double* data;
};

using BlockMap = std::map<std::pair<int, int>, Block>;

///
/// \brief performs the block distribution step for all MPI ranks
/// \returns a map containing all blocks local to this rank
///
BlockMap distribute_matrix(const std::vector<double>& A, int id, int num_procs, MPI_Comm comm);
void perform_trsm(BlockMap& local_blocks, int i, int id, int num_procs, MPI_Comm comm);
void perform_matmul(BlockMap& local_blocks, int i, int id, int num_procs, MPI_Comm comm);
std::vector<double> perform_gather(BlockMap& local_blocks, int id, int num_procs, MPI_Comm comm);
void load_counter(const BlockMap& local_blocks, int i, int* trsm_counter, int* matmul_counter);
void blocked_lu(int n, double* A);

// GPU memory helpers and matrix kernels
extern void HostToDevice(double* host, double* dev, int bytes);
extern void DeviceToHost(double* dev, double* host, int bytes);
extern double* gpuMalloc(int bytes);
extern void transpose(int n, int m, const double* A, double* A_trans);
extern void mat_mult_minus(int n, int m, int p, const double* A, const double* B, double* result);
extern void trsm(int n, int m, const double* L, double* A);
extern void trans_trsm(int n, int m, const double* L, double* A);
extern void lu_simple(int n, double* A);

#endif // BLOCK_MPI_OMP_LU_H
CC := gcc
CXX := g++
MPICXX := mpic++
NVCC := nvcc
OPTS := -std=c++11 -mavx
# /apps/ault/spack/opt/spack/linux-centos8-zen2/gcc-10.2.0/cuda-11.4.0-udyaakpt7oztg7gnj764dhkhdf5ory5d/
all: lu_block_mpi_cuda
lu_block_mpi_cuda: lu.cpp matrix_util.cpp matrix_operations.o
	$(MPICXX) lu.cpp matrix_operations.o matrix_util.cpp -O3 -L/apps/ault/spack/opt/spack/linux-centos8-zen2/gcc-10.2.0/cuda-11.4.0-udyaakpt7oztg7gnj764dhkhdf5ory5d/lib64 -lcuda -lcudart $(OPTS) -o $@
matrix_operations.o: matrix_operations.cu
	$(NVCC) -c matrix_operations.cu
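# Note: the -L path in the link step points to the cluster's CUDA installation
# (see the comment above); when building elsewhere, adjust it to the lib64
# directory of your local CUDA installation.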
.PHONY: clean
clean: