
Commit d431e582 authored by ahuegli

hw6 ex1 and ex3

parent ee4f84b4
@@ -12,11 +12,21 @@ struct Pair {
__device__ Pair argMaxWarp(double a) {
    // TODO: 1.b) Compute the argmax of the given value.
    // Return the maximum and the location of the maximum (0..31).
    int lane = threadIdx.x & 31;  // lane index within the warp

    Pair result;
    result.max = a;
    result.idx = lane;

    // Tree reduction with shuffle-down: after log2(32) steps, lane 0 holds
    // the warp maximum and the lane index where it occurred.
    Pair tmp;
    for (int i = 1; i < 32; i <<= 1) {
        tmp.max = __shfl_down_sync(0xffffffff, result.max, i, warpSize);
        tmp.idx = __shfl_down_sync(0xffffffff, result.idx, i, warpSize);
        // Lanes whose source lane (lane + i) falls outside the warp read back
        // their own value, so the comparison is a no-op for them.
        if (tmp.max > result.max)
            result = tmp;
    }
    return result;
}
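As a side note (not part of the commit): the shuffle-down version above leaves the correct result only in lane 0. A butterfly reduction with __shfl_xor_sync costs the same five steps but leaves the maximum, and a lane where it occurs, in every lane. A minimal sketch; the name argMaxWarpAll is invented here for illustration:

// Illustrative variant, not required by the assignment.
__device__ Pair argMaxWarpAll(double a) {
    Pair result;
    result.max = a;
    result.idx = threadIdx.x & 31;
    for (int i = 16; i > 0; i >>= 1) {
        // Exchange the running result with the lane whose index differs in bit i.
        double otherMax = __shfl_xor_sync(0xffffffff, result.max, i);
        int otherIdx = __shfl_xor_sync(0xffffffff, result.idx, i);
        if (otherMax > result.max) {
            result.max = otherMax;
            result.idx = otherIdx;
        }
    }
    return result;  // valid in every lane
}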
@@ -37,6 +47,21 @@ __device__ Pair argMaxBlock(double a) {
return result;
}
__global__ void argMax1MKernel(const double *a, Pair *blockArgMax, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Out-of-range threads contribute 0.0, as in the original code
    // (this assumes the input contains at least one non-negative value).
    Pair blockMax = argMaxBlock(idx < N ? a[idx] : 0.0);
    if (threadIdx.x == 0) {
        // Store the per-block result with a global element index. Reducing
        // blockArgMax down to bDev[0] needs a second kernel launch; doing it
        // here from block 0 would race with the other blocks.
        blockMax.idx += blockIdx.x * blockDim.x;
        blockArgMax[blockIdx.x] = blockMax;
    }
}
void argMax1M(const double *aDev, Pair *bDev, int N) {
assert(N <= 1024 * 1024);
@@ -45,6 +70,12 @@ void argMax1M(const double *aDev, Pair *bDev, int N) {
// Hint: The solution requires more CUDA operations than just
// calling a single kernel. Feel free to use whatever you find
// necessary.
    int threadsPerBlock = 1024;
    int numBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;
    printf("launching with <<<%i, %i>>>\n", numBlocks, threadsPerBlock);

    Pair *blockArgMax;
    CUDA_CHECK(cudaMalloc(&blockArgMax, numBlocks * sizeof(Pair)));

    // First pass: one Pair (block maximum + global index) per block.
    argMax1MKernel<<<numBlocks, threadsPerBlock>>>(aDev, blockArgMax, N);
    // A second, single-block pass over blockArgMax is still needed to write
    // the final result to bDev[0], and blockArgMax should be freed afterwards;
    // see the sketch after this function.
}
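The second reduction pass hinted at in the TODO is not part of this commit. A minimal sketch of what it could look like, assuming argMaxBlock returns the block maximum together with the in-block thread index of that maximum; the kernel name argMax1MFinalKernel and the trailing launch lines are introduced here for illustration only:

// Hypothetical second pass (not in the original commit): a single block
// reduces the per-block results produced by argMax1MKernel into bDev[0].
__global__ void argMax1MFinalKernel(const Pair *blockArgMax, Pair *b, int numBlocks) {
    int t = threadIdx.x;
    // Pad with 0.0 beyond numBlocks, matching the padding used in the first pass.
    Pair best = argMaxBlock(t < numBlocks ? blockArgMax[t].max : 0.0);
    if (t == 0) {
        // best.idx identifies the winning block; its stored Pair already
        // carries the global element index.
        b[0] = blockArgMax[best.idx];
    }
}

// In argMax1M, after the first launch (numBlocks <= 1024 because N <= 1024 * 1024):
//     argMax1MFinalKernel<<<1, 1024>>>(blockArgMax, bDev, numBlocks);
//     CUDA_CHECK(cudaDeviceSynchronize());
//     CUDA_CHECK(cudaFree(blockArgMax));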
#include "reduction_argmax.h"
......
@@ -10,7 +10,9 @@ __device__ double sumWarp(double a) {
// return the correct result.
// (although this function operates only on a single warp, it
// will be called with many threads for testing)
    // Tree reduction with shuffle-down: lane 0 ends up with the warp sum.
    for (int i = 1; i < 32; i <<= 1)
        a += __shfl_down_sync(0xffffffff, a, i, warpSize);
    return a;
}
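The contract above (only lane 0 of each warp needs the correct value) is what a test harness would rely on. A minimal sketch of such a kernel; the name sumWarpTestKernel and the partial-sums buffer are assumptions, not part of the assignment code:

// Hypothetical test kernel: lane 0 of every warp writes its partial sum.
// `partial` must hold one entry per launched warp.
__global__ void sumWarpTestKernel(const double *a, double *partial, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    double s = sumWarp(idx < N ? a[idx] : 0.0);
    if ((threadIdx.x & 31) == 0)   // lane 0 of each warp
        partial[idx / 32] = s;
}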
/// Returns the sum of all values `a` within a block,
@@ -19,7 +21,21 @@ __device__ double sumBlock(double a) {
// TODO: 1.c) Compute the sum of values `a` for all threads within a block.
// Only threadIdx.x == 0 has to return the correct result.
// NOTE: For 1.c) implement either this or `argMaxBlock`!

    // Stage 1: each warp reduces its values; lane 0 holds the warp sum.
    a = sumWarp(a);

    __shared__ double s[32];
    int warpId = threadIdx.x >> 5;
    int numWarps = (blockDim.x + 31) / 32;
    __syncthreads();  // make sure a previous call is done reading `s`
    if ((threadIdx.x & 31) == 0)
        s[warpId] = a;
    __syncthreads();

    // Stage 2: the first warp reduces the per-warp partial sums.
    // Slots beyond numWarps are never written, so feed 0.0 instead.
    if (threadIdx.x < warpSize)
        a = sumWarp(threadIdx.x < numWarps ? s[threadIdx.x] : 0.0);
    return a;
}
/// Compute the sum of all values aDev[0]..aDev[N-1] for N <= 1024^2 and store the result to bDev[0].
......
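The implementation following this comment is collapsed in the diff view. For orientation only, a minimal sketch of the usual two-pass approach, assuming sumBlock as above; the kernel names and the partial-sums buffer are introduced here and are not taken from the commit:

// Hypothetical sketch; the actual implementation is not shown in this diff.
__global__ void sum1MKernel(const double *a, double *partialSums, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    double s = sumBlock(idx < N ? a[idx] : 0.0);
    if (threadIdx.x == 0)
        partialSums[blockIdx.x] = s;   // one partial sum per block
}

// A second launch with a single block reduces the <= 1024 partial sums.
__global__ void sum1MFinalKernel(const double *partialSums, double *b, int numBlocks) {
    double s = sumBlock(threadIdx.x < numBlocks ? partialSums[threadIdx.x] : 0.0);
    if (threadIdx.x == 0)
        b[0] = s;
}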
@@ -53,6 +53,16 @@ void runAsync(const char *kernelName, Kernel kernel, int N, int chunkSize, int numStreams) {
aHost[i] = 10.0 * i;
// TODO 3.a) Allocate chunks and create streams.
    int chunks = (N + chunkSize - 1) / chunkSize;

    double *aDev;
    double *bDev;
    CUDA_CHECK(cudaMalloc(&aDev, N * sizeof(double)));
    CUDA_CHECK(cudaMalloc(&bDev, N * sizeof(double)));

    std::vector<cudaStream_t> streams(numStreams);
    for (int i = 0; i < numStreams; ++i) {
        CUDA_CHECK(cudaStreamCreate(&streams[i]));
    }
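One caveat not visible in this excerpt: cudaMemcpyAsync only overlaps with kernel execution when the host buffers are page-locked. The allocation of aHost/bHost is outside the shown hunks; if it uses pageable memory, it would need to look roughly like this (a sketch, not part of the commit):

// Hypothetical allocation of the host buffers (the real one is not shown):
// pinned memory so the async copies below can overlap with kernels.
double *aHost;
double *bHost;
CUDA_CHECK(cudaMallocHost(&aHost, N * sizeof(double)));
CUDA_CHECK(cudaMallocHost(&bHost, N * sizeof(double)));
// ... later released with cudaFreeHost(aHost) and cudaFreeHost(bHost).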
// Instead of benchmark() we use a simplified measure() which invokes the
@@ -64,8 +74,32 @@ void runAsync(const char *kernelName, Kernel kernel, int N, int chunkSize, int numStreams) {
//
// Note: you can use CUDA_CHECK and CUDA_LAUNCH_EX from
// utils.h for error checking.
        int threads = 1024;
        int maxBlocks = 65'536;   // cap on blocks per kernel launch
        int currStream = 0;
        for (int c = 0; c < chunks; ++c) {
            int base = c * chunkSize;
            int thisChunk = std::min(chunkSize, N - base);   // last chunk may be smaller
            int blocks = (thisChunk + threads - 1) / threads;

            // (1) Upload this chunk of `a`.
            CUDA_CHECK(cudaMemcpyAsync(aDev + base, aHost + base, thisChunk * sizeof(double),
                                       cudaMemcpyHostToDevice, streams[currStream]));

            // (2) Launch the kernel, splitting the chunk if it needs more than maxBlocks blocks.
            for (int i = 0; i < blocks; i += maxBlocks) {
                int offset = base + i * threads;
                CUDA_LAUNCH_EX(kernel, std::min(maxBlocks, blocks - i), threads, 0, streams[currStream],
                               aDev + offset, bDev + offset, thisChunk - i * threads);
            }

            // (3) Download this chunk of `b`.
            CUDA_CHECK(cudaMemcpyAsync(bHost + base, bDev + base, thisChunk * sizeof(double),
                                       cudaMemcpyDeviceToHost, streams[currStream]));

            currStream = (currStream + 1) % numStreams;
        }

        // TODO 3.b) Synchronize the streams.
        for (int i = 0; i < numStreams; ++i) {
            CUDA_CHECK(cudaStreamSynchronize(streams[i]));
        }
});
checkResults(bHost, N);
......
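The remainder of runAsync is collapsed in this view. If the cleanup is not already there, it would look roughly like this (a sketch; placing it after checkResults is an assumption):

// Hypothetical cleanup (not visible in this excerpt): release the streams
// and device buffers once the results have been checked.
for (int i = 0; i < numStreams; ++i)
    CUDA_CHECK(cudaStreamDestroy(streams[i]));
CUDA_CHECK(cudaFree(aDev));
CUDA_CHECK(cudaFree(bDev));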