Commit 698b50bc authored by Nicolas Winkler's avatar Nicolas Winkler
Browse files

Merge branch 'euler/bench' of https://gitlab.ethz.ch/umarka/dphpc_project into euler/bench

parents 24cb42f6 e6ed1859
......@@ -14,7 +14,7 @@ CONFIG_NAME = "compile_run_config.json"
# set placeholder to projname
RESULTS_NAME = "{0}(_r[0-9]+)?-result-[0-9]+.txt"
NUM_RUNS = 10
NUM_RUNS = 20
def compile_all(base_build_dir: str, kokkos_dir: str):
......@@ -37,7 +37,9 @@ def run_all_single_node(base_build_dir: str):
f_config = open(os.path.join(base_build_dir, CONFIG_NAME))
config_data = json.load(f_config)
size_configs = ' '.join([' '.join([str(config["matrix-size"]), str(config["block-size"])])
for config in config_data["size-configs"] if config['matrix-size'] < 4096])
for config in config_data["size-configs"]])
# size_configs = ' '.join([' '.join([str(config["matrix-size"]), str(config["block-size"])])
# for config in config_data["size-configs"] if config['matrix-size'] < 4096])
for run_config in config_data["run-configs-single-node"]:
build_dir = os.path.join(base_build_dir, run_config["build-dir"])
print(
......
......@@ -7,6 +7,10 @@
{
"build-dir": "lu_kokkos_gpu",
"build-cmd": "make -j4"
},
{
"build-dir": "lu_cuda_blocked",
"build-cmd": "cmake ../. -DCMAKE_BUILD_TYPE=Release; make -j4"
}
],
......@@ -18,6 +22,10 @@
{
"build-dir": "lu_kokkos_gpu",
"run-cmd": "./lu_bench.cuda {1} {2}"
},
{
"build-dir": "lu_cuda_blocked",
"run-cmd": "./lu_cuda_blocked {1} {2}"
}
],
"bench-configs-single-node": [ 1 ],
......
# Build script for the lu_cuda_blocked executable.
# Kokkos minimally requires 3.10 right now,
# but your project can set it higher
cmake_minimum_required(VERSION 3.10)
# Project can mix languages - must have C++ support
# Kokkos flags are only applied to C++ files
# Both C++ and CUDA are enabled here because the target below mixes .cpp and .cu sources.
project(Lu LANGUAGES CXX CUDA)
# NOTE(review): CUDA is already enabled as a language by project() above and by
# enable_language() below; this find_package(CUDA) looks redundant - confirm before removing.
find_package(CUDA)
# add_executable(lu_kokkos_naive lu.cpp matrix_operations.cpp)
enable_language("CUDA")
# Single target: the benchmark driver, the matrix helpers and the CUDA source.
add_executable(lu_cuda_blocked lu_bench.cpp matrix_util.cpp matrix_operations.cu)
#include <fstream>
#include <sys/types.h>
#include <dirent.h>
#include <iostream>
#include <ctime>
#include <iomanip>
#include "bench_util.h"
#ifdef _WIN32
#include <Windows.h>
#else
#include <unistd.h>
#endif
/// Create a benchmark recorder; `projname` becomes the prefix of the result file name.
BenchUtil::BenchUtil(string projname)
    : projname(projname)
{
}
/// Count the entries of the current working directory whose name contains
/// `prefix` (anywhere in the name, not only at the start).
/// If the directory cannot be opened, a perror() diagnostic is printed and 0 is returned.
unsigned int get_number_of_files(string prefix)
{
    DIR *dir = opendir("./");
    if (dir == NULL)
    {
        perror("Couldn't open the directory");
        return 0;
    }

    unsigned int matches = 0;
    for (struct dirent *entry = readdir(dir); entry != NULL; entry = readdir(dir))
    {
        const string entry_name(entry->d_name);
        if (entry_name.find(prefix) != string::npos)
        {
            ++matches;
        }
    }
    (void)closedir(dir);
    return matches;
}
/// Width of a column that has to fit both its header text and one of its values:
/// the length of the longer of the two strings.
unsigned int get_column_width(string col_name, string col_value)
{
    const string::size_type name_len = col_name.size();
    const string::size_type value_len = col_value.size();
    return name_len < value_len ? value_len : name_len;
}
void BenchUtil::bench_finalize()
{
// output data to file
string output_filename = projname + "-result-";
unsigned int num_output_files = get_number_of_files(output_filename);
ofstream outputfile;
outputfile.open(output_filename + to_string(num_output_files) + ".txt", ios::out | ios::trunc);
printf("Outputting results to file '%s'\n", (output_filename + to_string(num_output_files) + ".txt").c_str());
if (outputfile.is_open())
{
char hostname[256];
if (gethostname(hostname, sizeof(hostname)) != -1)
{
outputfile << "# Node: " << hostname << endl;
}
time_t now = time(0);
char *dt = ctime(&now);
tm *gmtm = gmtime(&now);
outputfile << "# UTC date and time: " << asctime(gmtm);
outputfile << "# Local date and time: " << dt;
outputfile << "# Measurements are in microseconds" << endl;
// set column names
for (list<tuple<string, string>>::iterator param_it = parameters.begin()->second.begin(); param_it != parameters.begin()->second.end(); ++param_it)
{
outputfile << setw(get_column_width(get<0>(*param_it) + " ", get<1>(*param_it) + " ")) << right << " " + get<0>(*param_it);
}
map<unsigned int, list<tuple<string, double>>>::iterator measurement_it = measurements.begin();
auto id = measurement_it->first;
outputfile << setw(8) << right << " id";
for (list<tuple<string, double>>::iterator region_measurement = measurement_it->second.begin(); region_measurement != measurement_it->second.end(); ++region_measurement)
{
string region = " " + get<0>(*region_measurement);
outputfile << setw(max((int)region.size(), 20)) << right << region;
}
outputfile << endl;
// fill columns
for (map<unsigned int, list<tuple<string, double>>>::iterator it = measurements.begin(); it != measurements.end(); ++it)
{
unsigned int id = it->first;
for (list<tuple<string, string>>::iterator param_it = parameters[id].begin(); param_it != parameters[id].end(); ++param_it)
{
outputfile << setw(get_column_width(get<0>(*param_it) + " ", get<1>(*param_it) + " ")) << right << " " + get<1>(*param_it);
}
outputfile << setw(8) << right << id;
for (list<tuple<string, double>>::iterator measurement = it->second.begin(); measurement != it->second.end(); ++measurement)
{
string region = " " + get<0>(*measurement);
double duration_ns = get<1>(*measurement);
outputfile << setw(max((int)region.size(), 20)) << setprecision(6) << fixed << right << duration_ns / 1000;
}
outputfile << endl;
}
outputfile.close();
}
}
void BenchUtil::bench_start(unsigned int id, string region)
{
start_ns = chrono::high_resolution_clock::now().time_since_epoch().count();
curr_measurement_id = id;
curr_region = region;
}
/// Finish the measurement opened by bench_start(): compute the clock ticks
/// elapsed since then, append them under the current id and region, and return them.
double BenchUtil::bench_stop()
{
    const auto stopped_at = chrono::high_resolution_clock::now();
    const double elapsed = stopped_at.time_since_epoch().count() - start_ns;
    measurements[curr_measurement_id].emplace_back(curr_region, elapsed);
    return elapsed;
}
/// Append the (key, value) pair to the parameter list of experiment run `id`.
void BenchUtil::bench_param(unsigned int id, string key, string value)
{
    list<tuple<string, string>> &run_params = parameters[id];
    run_params.emplace_back(key, value);
}
/*
example usage:
int example(int argc, char *argv[])
{
BenchUtil benchUtil("test");
benchUtil.bench_param(0, "key", "val_0");
benchUtil.bench_start(0, "test_region_0");
sleep(1);
auto duration_ns = benchUtil.bench_stop();
printf("slept for %f ms\n", duration_ns / 1000000);
benchUtil.bench_start(0, "test_region_1");
sleep(1.5); // note: sleep() takes whole seconds, so 1.5 is truncated to 1
duration_ns = benchUtil.bench_stop();
printf("slept for %f ms\n", duration_ns / 1000000);
benchUtil.bench_param(1, "key", "val_1");
benchUtil.bench_start(1, "test_region_0");
sleep(2);
duration_ns = benchUtil.bench_stop();
printf("slept for %f ms\n", duration_ns / 1000000);
benchUtil.bench_start(1, "test_region_1");
sleep(1);
duration_ns = benchUtil.bench_stop();
printf("slept for %f ms\n", duration_ns / 1000000);
benchUtil.bench_finalize();
}
*/
\ No newline at end of file
#ifndef BENCH_UTIL_H
#define BENCH_UTIL_H
#include <map>
#include <list>
#include <tuple>
#include <chrono>
using namespace std;
/**
 * @brief Records parameters and timed regions per experiment run and writes them to a result file.
 *
 * Only one region can be timed at a time: bench_start() opens it and
 * bench_stop() closes it.
 */
class BenchUtil
{
private:
    /// Project name; used as the prefix of the result file name
    string projname;
    /// Clock reading stored by the most recent bench_start()
    double start_ns = 0.0;
    /// Per run id: ordered (key, value) parameter pairs
    map<unsigned int, list<tuple<string, string>>> parameters;
    /// Per run id: ordered (region, duration) pairs
    map<unsigned int, list<tuple<string, double>>> measurements;
    /// Run id passed to the most recent bench_start()
    unsigned int curr_measurement_id = 0;
    /// Region name passed to the most recent bench_start()
    string curr_region;

public:
    /**
     * @brief Construct a new Bench Util object
     *
     * @param projname Project name to be used in experiment output
     */
    BenchUtil(string projname);

    /// Output all recorded parameters and measurements to a result file in pretty format
    void bench_finalize();

    // NOTE(review): no definition of bench_debug() is visible next to the other members - confirm it exists before calling it
    void bench_debug();

    /**
     * @brief Set the start time of a particular region of an experiment run.
     *
     * The implementation assumes for convenience that each id offers the same regions
     *
     * @param id identifier of the experiment run
     * @param region name of the region being measured
     */
    void bench_start(unsigned int id, string region);

    /**
     * @brief End the current measurement and store duration
     *
     * Note that nested regions are not supported in this implementation
     *
     * @return double measured execution time, in ticks of std::chrono::high_resolution_clock
     *         (the result file treats them as nanoseconds)
     */
    double bench_stop();

    /**
     * @brief Set a parameter for the current experiment
     *
     * The implementation assumes for convenience that each id offers the same parameters
     *
     * @param id identifier of the experiment run
     * @param key name of the parameter to store
     * @param value value of the parameter to store
     */
    void bench_param(unsigned int id, string key, string value);
};
#endif
\ No newline at end of file
......@@ -16,13 +16,8 @@ using Scalar = double;
using Scalar = float;
#endif
void blocked_lu(int n, Scalar* A);
const int test_size = 256 ;
const int block_size = 128;
/*
int main(int argc, char** argv)
{
//dgetrf2_(1, 2, 0, 1, 0, 0);
......@@ -38,10 +33,10 @@ int main(int argc, char** argv)
std::vector<Scalar> original = A;
std::cout << "Using data type: " << typeid(Scalar).name() << std::endl;
print_matrix(test_size, test_size, A.data());
// print_matrix(test_size, test_size, A.data());
const auto start = std::chrono::steady_clock::now();
blocked_lu(test_size, A.data());
wrapper_lu(test_size, A.data(), block_size);
const auto end = std::chrono::steady_clock::now();
std::cout << "decomposed a random " << test_size << "x" << test_size
......@@ -50,14 +45,10 @@ int main(int argc, char** argv)
//std::cout << "Decomp result: " << std::endl;
//print_matrix(test_size, test_size, A.data());
verify_matrix(test_size, test_size, A.data(), original.data());
print_matrix(test_size, test_size, A.data());
// print_matrix(test_size, test_size, A.data());
std::cout << std::endl;
//print_matrix(test_size, U.data());
}
void blocked_lu(int n, Scalar* A){
wrapper_lu(n, A);
}
*/
......@@ -23,6 +23,6 @@ extern void wrapper_mat_mult_minus(int n, int m, int p, const Scalar* A, const S
extern void wrapper_trsm(int n, int m, const Scalar* L, Scalar* A);
extern void wrapper_trans_trsm(int n, int m, const Scalar* L, Scalar* A);
*/
extern void wrapper_lu(int n, Scalar* A);
extern void wrapper_lu(int n, Scalar* A, int bs);
#endif // BLOCK_MPI_OMP_LU_H
#include <stdio.h>
#include <vector>
#include <omp.h>
//#include <Kokkos_Core.hpp>
#include "lu.cpp"
#include "bench_util.cpp"
// Element type of the matrices: double when DOUBLE_PRECISION is defined, float otherwise.
#ifdef DOUBLE_PRECISION
using Scalar = double;
#else
using Scalar = float;
#endif
// NOTE(review): test_size and blocks_n are not referenced in the visible part of this
// file - presumably they are used by the included lu.cpp; confirm.
int test_size;
// Block size that bench_cuda() passes to wrapper_lu().
int block_size = 128;
int blocks_n;
// LU decomposition entry point; only declared here, its definition is not part of this file's visible code.
extern void wrapper_lu(int N, Scalar* a, int bs);
/// Benchmark for OMP implementation
int bench_cuda(int run_id, BenchUtil &bench, int num_runs, int matrix_size)
{
for (int i = 0; i < num_runs; i++)
{
/* Variable declaration/allocation. */
Scalar *A;
bench.bench_param(run_id, "type", "CUDA");
bench.bench_param(run_id, "run", std::to_string(i));
bench.bench_param(run_id, "matrix size", std::to_string(matrix_size));
bench.bench_start(run_id, "execution_time");
A = (Scalar*)malloc(sizeof(Scalar)*matrix_size*matrix_size);
if (!A) {
printf("Memory allocation failed on host \n");
return -1;
}
randomize_matrix(matrix_size, A);
/* Perform the operation */
wrapper_lu(matrix_size, A, block_size);
/* Record and complete the current measure */
free(A);
bench.bench_stop();
++run_id;
}
return run_id;
}
/// call as: ./lu_bench num_runs matrix_size_0 block_size_0 matrix_size_1 block_size_1 ...
///
/// Runs bench_cuda() once per (matrix size, block size) pair and writes the
/// collected measurements to a result file. Returns -1 when the argument
/// count is wrong, 1 when a benchmark run fails and 0 otherwise.
int main(int argc, char **argv)
{
    // init
    BenchUtil bench("lu_cuda");

    // num_runs plus at least one complete (matrix size, block size) pair
    const int num_value_args = argc - 2;
    if (argc < 4 || num_value_args % 2 != 0)
    {
        printf("incorrect number of args\n");
        return -1;
    }
    const int num_runs = stoi(argv[1]);

    const int num_configs = num_value_args / 2;
    std::vector<int> matrix_sizes(num_configs);
    std::vector<int> block_sizes(num_configs);
    for (int j = 0; j < num_configs; j++)
    {
        matrix_sizes[j] = stoi(argv[2 + 2 * j]);
        block_sizes[j] = stoi(argv[3 + 2 * j]);
    }

    int run_id = 0;
    /* Perform the measurements */
    for (int j = 0; j < num_configs; j++)
    {
        // NOTE(review): the block size given on the command line is only printed;
        // bench_cuda() uses the global block_size. Confirm whether it should be applied.
        printf("Running measurements for img size %d x %d \n", matrix_sizes[j], block_sizes[j]);
        fflush(stdout);
        run_id = bench_cuda(run_id, bench, num_runs, matrix_sizes[j]);
        if (run_id == -1)
        {
            printf("Failure \n");
            return 1;
        }
        printf("done\n");
    }
    bench.bench_finalize();
    return 0;
}
......@@ -7,10 +7,10 @@ OPTS := -std=c++11 -mavx
# /apps/ault/spack/opt/spack/linux-centos8-zen2/gcc-10.2.0/cuda-11.4.0-udyaakpt7oztg7gnj764dhkhdf5ory5d/
# /opt/cuda/
all: lu_block_cuda
all: lu_cuda_blocked
lu_block_cuda: lu.cpp matrix_util.cpp matrix_operations.o matrix_operations_seq.cpp
lu_cuda_blocked: lu.cpp matrix_util.cpp matrix_operations.o matrix_operations_seq.cpp
$(CXX) lu.cpp matrix_operations.o matrix_util.cpp matrix_operations_seq.cpp -O3 -fPIE -L/apps/ault/spack/opt/spack/linux-centos8-zen2/gcc-10.2.0/cuda-11.4.0-udyaakpt7oztg7gnj764dhkhdf5ory5d/lib64 -lcuda -lcudart $(OPTS) -o $@
matrix_operations.o: matrix_operations.cu
......
......@@ -16,8 +16,9 @@ using Scalar = double;
using Scalar = float;
#endif
//int test_size = 2000;
void blocked_lu(int n, Scalar* A);
int test_size = 4096;
/*
int main(int argc, char** argv)
{
......@@ -41,7 +42,7 @@ int main(int argc, char** argv)
std::cout << "decomposed a random " << test_size << "x" << test_size
<< " matrix in " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << " ms" << std::endl;
verify_matrix(test_size, test_size, A.data(), original.data());
// verify_matrix(test_size, test_size, A.data(), original.data());
//print_matrix(test_size, A.data());
std::cout << std::endl;
......
......@@ -11,7 +11,6 @@ DOUBLE :=
all: lu_cuda
lu_cuda: lu_bench.cpp matrix_operations_global.o
$(CXX) -g ${DOUBLE} lu_bench.cpp matrix_operations_global.o -Og -L/apps/ault/spack/opt/spack/linux-centos8-zen2/gcc-10.2.0/cuda-11.4.0-udyaakpt7oztg7gnj764dhkhdf5ory5d/lib64 -lcuda -lcudart $(OPTS) -o $@
......
......@@ -18,8 +18,8 @@ using Scalar = float;
void blocked_lu(int n, Scalar* A);
const int test_size = 256 ;
const int block_size = 128;
int test_size = 256 ;
int block_size = 128;
......@@ -52,7 +52,7 @@ int main(int argc, char** argv)
verify_matrix(test_size, test_size, A.data(), original.data());
print_matrix(test_size, test_size, A.data());
// print_matrix(test_size, test_size, A.data());
std::cout << std::endl;
//print_matrix(test_size, U.data());
......
......@@ -12,10 +12,10 @@ DOUBLE :=
all: lu_cuda
lu_cuda: lu.cpp matrix_util.cpp matrix_operations.o matrix_operations_seq.cpp
lu_cuda: lu.cpp matrix_util.cpp matrix_operations_global.o matrix_operations_seq.cpp
$(CXX) ${DOUBLE} lu.cpp matrix_operations_global.o matrix_util.cpp matrix_operations_seq.cpp -O3 -L/apps/ault/spack/opt/spack/linux-centos8-zen2/gcc-10.2.0/cuda-11.4.0-udyaakpt7oztg7gnj764dhkhdf5ory5d/lib64 -lcuda -lcudart $(OPTS) -o $@
matrix_operations.o: matrix_operations.cu
matrix_operations_global.o: matrix_operations_global.cu
$(NVCC) ${DOUBLE} -c matrix_operations_global.cu
.PHONY: clean
......
......@@ -20,13 +20,13 @@ using Scalar = float;
#endif
__global__ void add( float *a, float *b, float *c) {
__global__ void add(Scalar *a, Scalar *b, Scalar *c) {
int tid = blockIdx.x; //Handle the data at the index
c[tid] = a[tid] + b[tid];
}
__global__ void scale(float *a, int size, int index){
__global__ void scale(Scalar *a, int size, int index){
int i;
int start=(index*size+index);
int end=(index*size+size);
......@@ -37,7 +37,7 @@ __global__ void scale(float *a, int size, int index){
}
__global__ void reduce(float *a, int size, int index, int b_size){
__global__ void reduce(Scalar *a, int size, int index, int b_size){
extern __shared__ float pivot[];
int i;
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment