
Commit cbfd504d authored by kicici

HW6 skeleton codes

parent c04e4d7a
reduction_sum
reduction_argmax
CUFLAGS=-O3 -std=c++14 --compiler-options "-Wall -Wextra -fopenmp"
.PHONY: all clean
all: reduction_sum reduction_argmax
@true
clean:
rm -f reduction_sum reduction_argmax
reduction_sum: reduction_sum.cu utils.h reduction_sum.h
nvcc $(CUFLAGS) $< -o $@
reduction_argmax: reduction_argmax.cu utils.h reduction_argmax.h
nvcc $(CUFLAGS) $< -o $@
#include "utils.h"
#include <cassert>
#include <limits>
struct Pair {
double max;
int idx;
};
/// Find the maximum of the values `a` within a warp and return {max value, index
/// of the max}. The result must be correct at least on the 0th thread of each warp.
__device__ Pair argMaxWarp(double a) {
// TODO: 1.b) Compute the argmax of the given value.
// Return the maximum and the location of the maximum (0..31).
Pair result;
result.max = 0.0;
result.idx = 0;
// ...
return result;
}
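// Illustrative sketch (not part of the required solution): one common way to do a
// warp-level argmax is a shuffle-based butterfly reduction. The helper name
// `argMaxWarpSketch` is an assumption introduced here for illustration only.
__device__ Pair argMaxWarpSketch(double a) {
    Pair p{a, (int)(threadIdx.x % warpSize)};
    for (int offset = warpSize / 2; offset > 0; offset /= 2) {
        double otherMax = __shfl_xor_sync(0xffffffff, p.max, offset);
        int otherIdx = __shfl_xor_sync(0xffffffff, p.idx, offset);
        // Prefer the smaller index on ties, matching std::max_element in the checks.
        if (otherMax > p.max || (otherMax == p.max && otherIdx < p.idx)) {
            p.max = otherMax;
            p.idx = otherIdx;
        }
    }
    return p;  // After log2(32) steps every lane holds the warp-wide argmax.
}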
/// Returns the argmax of all values `a` within a block,
/// with the correct answer returned at least by the 0th thread of a block.
__device__ Pair argMaxBlock(double a) {
// TODO: 1.c) Compute the argmax of the given value.
// Return the maximum and the location of the maximum (0..1023).
// NOTE: For 1.c) implement either this or `sumBlock`!
Pair result;
result.max = 0.0;
result.idx = 0;
// ...
return result;
}
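// Illustrative sketch (not part of the required solution): a block-level argmax can
// be built on top of the warp-level primitive by staging one Pair per warp in shared
// memory and letting warp 0 reduce those partials. Assumes argMaxWarp (1.b) is
// implemented; the helper name `argMaxBlockSketch` is an assumption.
__device__ Pair argMaxBlockSketch(double a) {
    __shared__ Pair warpResults[32];      // enough for 1024 threads / 32 lanes
    int lane = threadIdx.x % warpSize;
    int warp = threadIdx.x / warpSize;
    Pair p = argMaxWarp(a);               // warp-local argmax, idx in 0..31
    p.idx += warp * warpSize;             // convert to a block-local index (0..1023)
    if (lane == 0)
        warpResults[warp] = p;
    __syncthreads();
    if (warp == 0) {
        int numWarps = (blockDim.x + warpSize - 1) / warpSize;
        p = warpResults[0];
        for (int i = 1; i < numWarps; ++i)  // serial pass over at most 32 entries
            if (warpResults[i].max > p.max)
                p = warpResults[i];
    }
    return p;                             // correct at least on thread 0
}
/// Compute the argmax of aDev[0]..aDev[N-1] for N <= 1024^2 and store the
/// result (max value and its global index) in bDev[0].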
void argMax1M(const double *aDev, Pair *bDev, int N) {
assert(N <= 1024 * 1024);
// TODO: 1.d) Implement either this or `sum1M`.
// Avoid copying any data back to the host.
// Hint: The solution requires more CUDA operations than just
// calling a single kernel. Feel free to use whatever you find
// necessary.
}
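// Illustrative sketch (not part of the required solution): argMax1M can be organized
// as a two-pass reduction that keeps all intermediate data on the device. The kernel
// name and the temporary `partial` buffer are assumptions.
__global__ void argMaxPassKernelSketch(const double *a, Pair *partial, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Padding with 0.0 mirrors the provided test kernels and is fine for the
    // nonnegative test data; -INFINITY would be the more general choice.
    Pair p = argMaxBlock(idx < N ? a[idx] : 0.0);
    if (threadIdx.x == 0) {
        p.idx += blockIdx.x * blockDim.x;  // make the index global
        partial[blockIdx.x] = p;           // one Pair per block
    }
}
// Host side (sketch): for N <= 1024^2 the first pass produces at most 1024 Pairs,
// so a second, small kernel launched with a single block can reduce `partial`
// into bDev[0] without ever copying data back to the host.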
#include "reduction_argmax.h"
int main() {
testSmallArgMax(argMaxWarpTestKernel, argMaxWarpCheck, 32, 3);
testSmallArgMax(argMaxWarpTestKernel, argMaxWarpCheck, 32, 32);
testSmallArgMax(argMaxWarpTestKernel, argMaxWarpCheck, 32, 320);
testSmallArgMax(argMaxWarpTestKernel, argMaxWarpCheck, 32, 1023123);
printf("argMaxWarp OK.\n");
testSmallArgMax(argMaxBlockTestKernel, argMaxBlockCheck, 1024, 32);
testSmallArgMax(argMaxBlockTestKernel, argMaxBlockCheck, 1024, 1024);
testSmallArgMax(argMaxBlockTestKernel, argMaxBlockCheck, 1024, 12341);
testSmallArgMax(argMaxBlockTestKernel, argMaxBlockCheck, 1024, 1012311);
printf("argMaxBlock OK.\n");
testLargeArgMax("argMax1M", argMax1M, 32);
testLargeArgMax("argMax1M", argMax1M, 1024);
testLargeArgMax("argMax1M", argMax1M, 12341);
testLargeArgMax("argMax1M", argMax1M, 1012311);
printf("argMax1M OK.\n");
}
#include <algorithm>
#include <numeric>
#include <cstdio>
constexpr int kWarpSize = 32;
// Kernel for testing `argMaxWarp`. Do not edit.
__global__ void argMaxWarpTestKernel(const double *a, Pair *b, int N) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
// All threads of a warp must call `argMaxWarp`!
Pair argMax = argMaxWarp(idx < N ? a[idx] : 0.0);
if (threadIdx.x % warpSize == 0 && idx / warpSize < (N + warpSize - 1) / warpSize)
b[idx / warpSize] = argMax;
}
/// Check results of `argMaxWarp`. Do not edit.
void argMaxWarpCheck(int N, int K, const double *aHost, const Pair *bHost) {
for (int k = 0; k < K; ++k) {
int expectedIdx = std::max_element(aHost + k * kWarpSize, aHost + std::min((k + 1) * kWarpSize, N))
- (aHost + k * kWarpSize);
Pair expected{aHost[k * kWarpSize + expectedIdx], expectedIdx};
Pair received = bHost[k];
if (expected.idx != received.idx || expected.max != received.max) {
printf("argMaxWarp incorrect result: k=%d expected=%d %f received=%d %f\n",
k, expected.idx, expected.max, received.idx, received.max);
exit(1);
}
}
}
// Do not edit.
__global__ void argMaxBlockTestKernel(const double *a, Pair *b, int N) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
Pair out = argMaxBlock(idx < N ? a[idx] : 0.0);
if (threadIdx.x == 0)
b[blockIdx.x] = out;
}
// Do not edit.
void argMaxBlockCheck(int N, int K, const double *aHost, const Pair *bHost) {
for (int k = 0; k < K; ++k) {
int expectedIdx = std::max_element(aHost + k * 1024, aHost + std::min((k + 1) * 1024, N))
- (aHost + k * 1024);
Pair expected{aHost[k * 1024 + expectedIdx], expectedIdx};
Pair received = bHost[k];
if (expected.idx != received.idx || expected.max != received.max) {
printf("argMaxBlock incorrect result: k=%d expected=%d %f received=%d %f\n",
k, expected.idx, expected.max, received.idx, received.max);
exit(1);
}
}
}
/// Test small argmax reductions (warp-level and block-level).
template <typename Kernel, typename CheckFunc>
void testSmallArgMax(Kernel kernel, CheckFunc checkFunc, int div, int N) {
int K = (N + div - 1) / div;
double *aHost;
Pair *bHost;
double *aDev;
Pair *bDev;
CUDA_CHECK(cudaMallocHost(&aHost, N * sizeof(double)));
CUDA_CHECK(cudaMallocHost(&bHost, K * sizeof(Pair)));
CUDA_CHECK(cudaMalloc(&aDev, N * sizeof(double)));
CUDA_CHECK(cudaMalloc(&bDev, K * sizeof(Pair)));
for (int i = 0; i < N; ++i)
aHost[i] = (long long)i * i % 12345;
CUDA_CHECK(cudaMemcpy(aDev, aHost, N * sizeof(double), cudaMemcpyHostToDevice));
const int threads = 1024;
const int blocks = (N + threads - 1) / threads;
kernel<<<blocks, threads>>>(aDev, bDev, N);
CUDA_CHECK(cudaMemcpy(bHost, bDev, K * sizeof(Pair), cudaMemcpyDeviceToHost));
checkFunc(N, K, aHost, bHost);
CUDA_CHECK(cudaFree(bDev));
CUDA_CHECK(cudaFree(aDev));
CUDA_CHECK(cudaFreeHost(bHost));
CUDA_CHECK(cudaFreeHost(aHost));
}
/// Test large reductions (up to 1024^2 elements in this exercise).
template <typename Func>
void testLargeArgMax(const char *name, Func func, int N) {
double *aHost;
double *aDev;
Pair *bDev;
CUDA_CHECK(cudaMallocHost(&aHost, N * sizeof(double)));
CUDA_CHECK(cudaMalloc(&aDev, N * sizeof(double)));
CUDA_CHECK(cudaMalloc(&bDev, 1 * sizeof(Pair)));
for (int i = 0; i < N; ++i)
aHost[i] = (N + 13241LL * i * i) % 432141;
CUDA_CHECK(cudaMemcpy(aDev, aHost, N * sizeof(double), cudaMemcpyHostToDevice));
func(aDev, bDev, N);
int expectedIdx = std::max_element(aHost, aHost + N) - aHost;
Pair expected{aHost[expectedIdx], expectedIdx};
Pair received;
CUDA_CHECK(cudaMemcpy(&received, bDev, 1 * sizeof(Pair), cudaMemcpyDeviceToHost));
if (expected.idx != received.idx || expected.max != received.max) {
printf("large %s incorrect result: N=%d expected=%d %f received=%d %f\n",
name, N, expected.idx, expected.max, received.idx, received.max);
exit(1);
}
CUDA_CHECK(cudaFree(bDev));
CUDA_CHECK(cudaFree(aDev));
CUDA_CHECK(cudaFreeHost(aHost));
}
#include "utils.h"
#include <cassert>
#include <algorithm>
/// Returns the sum of all values `a` within a warp,
/// with the correct answer returned only by the 0th thread of a warp.
__device__ double sumWarp(double a) {
// TODO: 1.a) Compute sum of all values within a warp.
// Only the threads with threadIdx.x % warpSize == 0 have to
// return the correct result.
// (although this function operates only on a single warp, it
// will be called with many threads for testing)
return 0.0;
}
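// Illustrative sketch (not part of the required solution): warp-level sums are
// typically written with register shuffles; `sumWarpSketch` is a name introduced
// here for illustration only.
__device__ double sumWarpSketch(double a) {
    // Each step adds the value held by the lane `offset` positions higher;
    // after 5 steps lane 0 holds the sum over all 32 lanes.
    for (int offset = warpSize / 2; offset > 0; offset /= 2)
        a += __shfl_down_sync(0xffffffff, a, offset);
    return a;
}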
/// Returns the sum of all values `a` within a block,
/// with the correct answer returned only by the 0th thread of a block.
__device__ double sumBlock(double a) {
// TODO: 1.c) Compute the sum of values `a` for all threads within a block.
// Only threadIdx.x == 0 has to return the correct result.
// NOTE: For 1.c) implement either this or `argMaxBlock`!
return 0.0;
}
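// Illustrative sketch (not part of the required solution): the block-level sum can
// reuse the warp-level primitive, staging one partial sum per warp in shared memory.
// Assumes sumWarp (1.a) is implemented; `sumBlockSketch` is a name used only here.
__device__ double sumBlockSketch(double a) {
    __shared__ double warpSums[32];       // enough for 1024 threads / 32 lanes
    int lane = threadIdx.x % warpSize;
    int warp = threadIdx.x / warpSize;
    double s = sumWarp(a);                // lane 0 of each warp holds its warp's sum
    if (lane == 0)
        warpSums[warp] = s;
    __syncthreads();
    if (warp == 0) {
        int numWarps = (blockDim.x + warpSize - 1) / warpSize;
        s = lane < numWarps ? warpSums[lane] : 0.0;
        s = sumWarp(s);                   // thread 0 now holds the block-wide sum
    }
    return s;
}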
/// Compute the sum of all values aDev[0]..aDev[N-1] for N <= 1024^2 and store the result in bDev[0].
void sum1M(const double *aDev, double *bDev, int N) {
assert(N <= 1024 * 1024);
// TODO: 1.d) Implement either this or `argMax1M`.
// Avoid copying any data back to the host.
// Hint: The solution requires more CUDA operations than just
// calling a single kernel. Feel free to use whatever you find
// necessary.
}
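// Illustrative sketch (not part of the required solution): sum1M can be a two-pass
// reduction that never leaves the device. The kernel name and the temporary
// `partial` buffer are assumptions.
__global__ void sumPassKernelSketch(const double *a, double *partial, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    double s = sumBlock(idx < N ? a[idx] : 0.0);
    if (threadIdx.x == 0)
        partial[blockIdx.x] = s;          // one partial sum per block
}
// Host side (sketch): for N <= 1024^2 the first pass yields at most 1024 partial
// sums, so a second launch of the same kernel with a single block reduces `partial`
// (of length "number of blocks") directly into bDev[0].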
#include "reduction_sum.h"
int main() {
testSmallSum(sumWarpTestKernel, sumWarpCheck, 32, 3);
testSmallSum(sumWarpTestKernel, sumWarpCheck, 32, 32);
testSmallSum(sumWarpTestKernel, sumWarpCheck, 32, 320);
testSmallSum(sumWarpTestKernel, sumWarpCheck, 32, 1023123);
printf("sumWarp OK.\n");
/*
// OPTIONAL: 1.a reduce-all. If you want to try it, implement a device
// function `__device__ double sumWarpAll(double x)` and uncomment the
// sumWarpAll* test functions in reduction_sum.h.
testSmallSum(sumWarpAllTestKernel, sumWarpAllCheck, 1, 3);
testSmallSum(sumWarpAllTestKernel, sumWarpAllCheck, 1, 32);
testSmallSum(sumWarpAllTestKernel, sumWarpAllCheck, 1, 320);
testSmallSum(sumWarpAllTestKernel, sumWarpAllCheck, 1, 1023123);
printf("sumWarpAll OK.\n");
*/
testSmallSum(sumBlockTestKernel, sumBlockCheck, 1024, 32);
testSmallSum(sumBlockTestKernel, sumBlockCheck, 1024, 1024);
testSmallSum(sumBlockTestKernel, sumBlockCheck, 1024, 12341);
testSmallSum(sumBlockTestKernel, sumBlockCheck, 1024, 1012311);
printf("sumBlock OK.\n");
testLargeSum("sum1M", sum1M, 32);
testLargeSum("sum1M", sum1M, 1024);
testLargeSum("sum1M", sum1M, 12341);
testLargeSum("sum1M", sum1M, 1012311);
printf("sum1M OK.\n");
}
#pragma once
#include "utils.h"
#include <algorithm>
#include <numeric>
constexpr int kWarpSize = 32;
/// Kernel for testing `sumWarp`. Do not edit.
__global__ void sumWarpTestKernel(const double *a, double *b, int N) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
// All threads of a warp must call `sumWarp`!
double sum = sumWarp(idx < N ? a[idx] : 0.0);
if (threadIdx.x % warpSize == 0 && idx / warpSize < (N + warpSize - 1) / warpSize)
b[idx / warpSize] = sum;
}
/// Check results of `sumWarp`. Do not edit.
inline void sumWarpCheck(int N, int K, const double *aHost, const double *bHost) {
for (int k = 0; k < K; ++k) {
double expected = std::accumulate(aHost + k * kWarpSize, aHost + std::min((k + 1) * kWarpSize, N), 0.0);
double received = bHost[k];
if (expected != received) {
printf("sumWarp incorrect result: k=%d expected=%f received=%f\n", k, expected, received);
exit(1);
}
}
}
/*
/// Kernel for testing `sumWarpAll`. Do not edit.
__global__ void sumWarpAllTestKernel(const double *a, double *b, int N) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
// All threads of a warp must call `sumWarpAll`!
double sum = sumWarpAll(idx < N ? a[idx] : 0.0);
if (idx < N)
b[idx] = sum;
}
/// Check result of `sumWarpAll`. Do not edit.
inline void sumWarpAllCheck(int N, int K, const double *aHost, const double *bHost) {
for (int k = 0; k < K; k += kWarpSize) {
double expected = std::accumulate(aHost + k, aHost + std::min(k + kWarpSize, N), 0.0);
for (int j = k; j < std::min(k + kWarpSize, N); ++j) {
double received = bHost[j];
if (expected != received) {
printf("sumWarpAll incorrect result: k=%d j=%d expected=%f received=%f\n", k, j, expected, received);
exit(1);
}
}
}
}
*/
// Do not edit.
__global__ void sumBlockTestKernel(const double *a, double *b, int N) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
double sum = sumBlock(idx < N ? a[idx] : 0.0);
if (threadIdx.x == 0)
b[blockIdx.x] = sum;
}
// Do not edit.
inline void sumBlockCheck(int N, int K, const double *aHost, const double *bHost) {
for (int k = 0; k < K; ++k) {
double expected = std::accumulate(aHost + k * 1024, aHost + std::min((k + 1) * 1024, N), 0.0);
double received = bHost[k];
if (expected != received) {
printf("sumBlock incorrect result: k=%d expected=%f received=%f\n", k, expected, received);
exit(1);
}
}
}
/// Test small reductions (warp-level and block-level).
template <typename Kernel, typename CheckFunc>
void testSmallSum(Kernel kernel, CheckFunc checkFunc, int div, int N) {
int K = (N + div - 1) / div;
double *aHost;
double *bHost;
double *aDev;
double *bDev;
CUDA_CHECK(cudaMallocHost(&aHost, N * sizeof(double)));
CUDA_CHECK(cudaMallocHost(&bHost, K * sizeof(double)));
CUDA_CHECK(cudaMalloc(&aDev, N * sizeof(double)));
CUDA_CHECK(cudaMalloc(&bDev, K * sizeof(double)));
for (int i = 0; i < N; ++i)
aHost[i] = i;
CUDA_CHECK(cudaMemcpy(aDev, aHost, N * sizeof(double), cudaMemcpyHostToDevice));
const int threads = 1024;
const int blocks = (N + threads - 1) / threads;
kernel<<<blocks, threads>>>(aDev, bDev, N);
CUDA_CHECK(cudaMemcpy(bHost, bDev, K * sizeof(double), cudaMemcpyDeviceToHost));
checkFunc(N, K, aHost, bHost);
CUDA_CHECK(cudaFree(bDev));
CUDA_CHECK(cudaFree(aDev));
CUDA_CHECK(cudaFreeHost(bHost));
CUDA_CHECK(cudaFreeHost(aHost));
}
/// Test large reductions (up to 1024^2 elements in this exercise).
template <typename Func>
void testLargeSum(const char *name, Func func, int N) {
double *aHost;
double *aDev;
double *bDev;
CUDA_CHECK(cudaMallocHost(&aHost, N * sizeof(double)));
CUDA_CHECK(cudaMalloc(&aDev, N * sizeof(double)));
CUDA_CHECK(cudaMalloc(&bDev, 1 * sizeof(double)));
for (int i = 0; i < N; ++i)
aHost[i] = i % 12345678;
CUDA_CHECK(cudaMemcpy(aDev, aHost, N * sizeof(double), cudaMemcpyHostToDevice));
func(aDev, bDev, N);
double expected = std::accumulate(aHost, aHost + N, 0.0);
double received;
CUDA_CHECK(cudaMemcpy(&received, bDev, 1 * sizeof(double), cudaMemcpyDeviceToHost));
if (expected != received) {
printf("large %s incorrect result: N=%d expected=%f received=%f\n", name, N, expected, received);
exit(1);
}
CUDA_CHECK(cudaFree(bDev));
CUDA_CHECK(cudaFree(aDev));
CUDA_CHECK(cudaFreeHost(aHost));
}
#pragma once
#include <chrono>
#include <cstdio>
#include <cuda_runtime.h>
/*
* Benchmark the given function `N` times and return the average execution time.
*/
template <typename Func>
double benchmark(int N, Func func) {
for (int i = 0; i < N / 10 + 1; ++i)
func(); // Warmup.
cudaDeviceSynchronize(); // Ensure warmup kernels have finished before timing starts.
auto t0 = std::chrono::steady_clock::now();
for (int i = 0; i < N; ++i)
func();
cudaDeviceSynchronize();
auto t1 = std::chrono::steady_clock::now();
auto ns = (double)std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count();
double dt = 1e-9 * ns / N;
return dt;
}
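/// Example use (sketch; `myKernel`, `blocks`, `threads`, and `args` are placeholders,
/// not names defined in this skeleton):
///     double dt = benchmark(100, [&]() { myKernel<<<blocks, threads>>>(args); });
///     printf("average launch time: %g s\n", dt);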
/// Print the error message if a CUDA API execution failed.
void _cudaCheck(const char *file, int line, const char *func, cudaError_t code) {
if (code != cudaSuccess) {
fprintf(stderr, "CUDA Error at %s:%d in %s:%s\n",
file, line, func, cudaGetErrorString(code));
exit(1);
}
}
// Wrap every CUDA API call with this check.
#define CUDA_CHECK(cmd) _cudaCheck(__FILE__, __LINE__, __PRETTY_FUNCTION__, (cmd))
/// Replace
/// kernel<<<blocks, threads>>>(arg1, ...);
/// with
/// CUDA_LAUNCH(kernel, blocks, threads, arg1, ...);
#define CUDA_LAUNCH(kernel, blocks, threads, ...) do { \
cudaGetLastError(); \
kernel<<<(blocks), (threads)>>>(__VA_ARGS__); \
_cudaCheck(__FILE__, __LINE__, __PRETTY_FUNCTION__, cudaGetLastError()); \
} while (0)
*.d
*.o
*.orig
*.swo
*.swp
build
output.txt
cmake_minimum_required(VERSION 3.10)
if(POLICY CMP0074)
cmake_policy(SET CMP0074 NEW)
endif()
project(HW6_SSA LANGUAGES CXX CUDA)
find_package(CUDA 9.2 REQUIRED)
cuda_add_executable(
ssa
src/main.cpp
src/ssa.cu
src/kernels.cu
src/test.cu
src/utils.cpp)
target_compile_features(ssa PUBLIC cxx_std_14)
target_link_libraries(ssa curand)
# Choose RelWithDebInfo mode as default.
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
"Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel." FORCE)
endif()
Compilation
===========
From the skeleton/p2/ folder, run
mkdir -p build
cd build
cmake ..
make -j8
Running
=======
From the build/ folder, run
./ssa
The code will generate a file called output.txt.
Visualization
=============
From the build/ folder, run
../python/plot_output.py
You will need X11 forwarding (the `-Y` flag) when connecting with ssh.
#!/usr/bin/env python3
import matplotlib.pyplot as plt
fname = 'output.txt'
if __name__ == '__main__':
print("Plotting results..")
time = []
sa = []
sb = []
with open(fname) as f:
for line in f:
t, a, b = line.split()
time.append(float(t))
sa.append(float(a))
sb.append(float(b))
plt.plot(time, sa, label='Sa')
plt.plot(time, sb, label='Sb')
plt.legend()
plt.xlabel('Time')
plt.ylabel('Species Quantity')
plt.show()
#pragma once
#include <map>
#include <string>
class ArgumentParser
{
private:
std::map<std::string, std::string> args;
public:
int operator()(const std::string &key, int def) {
auto it = args.find(key);
if (it == args.end())
return def;
return std::stoi(it->second);
}
double operator()(const std::string &key, double def) {
auto it = args.find(key);
if (it == args.end())
return def;
return std::stod(it->second);
}
ArgumentParser(int argc, const char **argv) {
if (argc % 2 != 1)
throw std::invalid_argument("Expected an even number of arguments.");
for (int i = 1; i < argc; i += 2) {
if (argv[i][0] != '-')
throw std::invalid_argument("Flags should start with a -.");
args[argv[i]] = argv[i + 1];
}
}
};
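// Example use (sketch; the flag names and default values are placeholders, not
// options prescribed by the assignment):
//     ArgumentParser parser(argc, argv);          // e.g. "./ssa -samples 5000 -endTime 10.0"
//     int numSamples = parser("-samples", 1000);  // int overload
//     double endTime = parser("-endTime", 5.0);   // double overload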
#include "kernels.h"
/// Initialize reaction states.
__global__ void initializationKernel(
short Sa, short Sb, short *x,
float *t, int numSamples, int numIters)
{
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx >= numSamples)
return;
// Every sample starts with (Sa, Sb) at t=0.
t[idx] = 0.f;
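// `x` appears to be laid out as [species][iteration][sample]; the two writes
// below store the iteration-0 state of Sa and Sb for this sample.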
x[0 * numIters * numSamples + idx] = Sa;
x[1 * numIters * numSamples + idx] = Sb;
}
/// Reaction simulation. This kernel uses precomputed random uniform samples
/// (from 0 to 1) to compute up to `numIters` steps of the SSA algorithm. The
/// values of Sa and Sb are stored in `x`, the time values in `t`. Buffer
/// `iters` stores the number of performed iterations, and `isSampleDone`
/// whether or not the sample has reached the final state (t >= endTime).
__global__ void dimerizationKernel(
int pass, const float *u,
short *x, float *t, int *iters, char *isSampleDone,
float endTime, int omega, int numIters, int numSamples)
{
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
int rngOffset = blockIdx.x * blockDim.x * 2 * numIters + threadIdx.x;
if (idx >= numSamples)
return;