#!/usr/bin/env python
# File : grade.py
# Description: Generate grading submission file
# Copyright 2020 ETH Zurich. All Rights Reserved.
'''
Example:
python grade.py -q1 7 -c1 "This is why I scored 7 points" -q2 20 -q3 15 -c3 "Second comment here" -c3 "More comments"
'''
# Modify this according to the given homework
exercise_conf = {
    'Name': 'Homework 6',
    'Questions': {
        'Question 1': {'Total Points': 60},
        'Question 2': {'Total Points': 40},
        'Question 3': {'Total Points': 40},
    }
}
'''
==========================================
Do not modify anything below this comment
==========================================
'''
import argparse
import datetime
import sys
def parse_args():
    parser = argparse.ArgumentParser()
    for i in range(1, len(exercise_conf['Questions']) + 1):
        parser.add_argument('-q{:d}'.format(i), '--question{:d}'.format(i),
                type=int, default=0,
                help='Scored points for Question {:d}'.format(i))
        parser.add_argument('-c{:d}'.format(i), '--comment{:d}'.format(i),
                type=str, action='append', nargs='*',
                help='Comments for Question {:d} (you can add multiple comments)'.format(i))
    return vars(parser.parse_args())

if __name__ == "__main__":
    args = parse_args()
    # Map the achieved points linearly onto the grade scale:
    # 0 points -> 2.0, full points -> 6.0.
    grade = lambda s, m: 2.0 + (6.0 - 2.0) * float(s) / m
    summary = {}
    score = 0
    maxpoints = 0
    header = '{name:s}: {date:s}\n'.format(
        name=exercise_conf['Name'], date=str(datetime.datetime.now()))
    width = len(header.rstrip())
    summary[0] = [header]
    for i in range(1, len(exercise_conf['Questions']) + 1):
        content = []
        qscore = args['question{:d}'.format(i)]
        qmax = exercise_conf['Questions']['Question {:d}'.format(i)]['Total Points']
        qscore = max(0, min(qscore, qmax))  # clamp to [0, qmax]
        content.append('Question {id:d}: {score:d}/{max:d}\n'.format(
            id=i, score=qscore, max=qmax))
        comments = args['comment{:d}'.format(i)]
        if comments is not None:
            # argparse collects one nested list per flag occurrence; flatten them.
            for j, comment in enumerate([s for x in comments for s in x]):
                content.append(' -Comment {id:d}: {issue:s}\n'.format(
                    id=j + 1, issue=comment.strip()))
        for line in content:
            width = max(width, len(line.rstrip()))
        score += qscore
        maxpoints += qmax
        summary[i] = content
    assert maxpoints > 0
    with open('grade.txt', 'w') as out:
        out.write(width * '*' + '\n')
        for lines in summary.values():
            for line in lines:
                out.write(line)
        out.write(width * '*' + '\n')
        out.write('Grade: {:.2f}'.format(grade(score, maxpoints)))
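# Worked example: with the configuration above (60 + 40 + 40 = 140 total points),
# the invocation from the docstring scores 7 + 20 + 15 = 42 points, so the final
# line of grade.txt reads "Grade: 3.20" (2.0 + 4.0 * 42 / 140 = 3.20).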
reduction_sum
reduction_argmax
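# File : Makefile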
# -O3 and -std=c++14 apply to nvcc itself; --compiler-options forwards the quoted
# warning and OpenMP flags to the host compiler.
CUFLAGS=-O3 -std=c++14 --compiler-options "-Wall -Wextra -fopenmp"

.PHONY: all clean

all: reduction_sum reduction_argmax
	@true

clean:
	rm -f reduction_sum reduction_argmax

reduction_sum: reduction_sum.cu utils.h reduction_sum.h
	nvcc $(CUFLAGS) $< -o $@

reduction_argmax: reduction_argmax.cu utils.h reduction_argmax.h
	nvcc $(CUFLAGS) $< -o $@
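// File : reduction_argmax.cu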
#include "utils.h"
#include <cassert>
#include <limits>
struct Pair {
double max;
int idx;
};
/*
__device__ Pair shfl_xor_sync(Pair value, unsigned delta) {
return Pair{
__shfl_xor_sync(0xFFFFFFFF, value.max, delta),
__shfl_xor_sync(0xFFFFFFFF, value.idx, delta),
};
}
__device__ Pair argMaxOp(Pair a, Pair b) {
return a.max > b.max ? a : b;
}
__device__ Pair argMaxWarp(double a) {
Pair t{a, (int)threadIdx.x & 31};
t = argMaxOp(t, shfl_xor_sync(t, 1));
t = argMaxOp(t, shfl_xor_sync(t, 2));
t = argMaxOp(t, shfl_xor_sync(t, 4));
t = argMaxOp(t, shfl_xor_sync(t, 8));
t = argMaxOp(t, shfl_xor_sync(t, 16));
return t;
}
*/
/// Find the maximum of `a` across the 32 threads of a warp and return {max value,
/// lane index of the max}. The result must be correct on at least the 0th thread of each warp.
__device__ Pair argMaxWarp(double a) {
double t = a; // max;
t = max(t, __shfl_xor_sync(0xFFFFFFFF, t, 1));
t = max(t, __shfl_xor_sync(0xFFFFFFFF, t, 2));
t = max(t, __shfl_xor_sync(0xFFFFFFFF, t, 4));
t = max(t, __shfl_xor_sync(0xFFFFFFFF, t, 8));
t = max(t, __shfl_xor_sync(0xFFFFFFFF, t, 16));
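    // At this point every lane holds the warp-wide maximum in `t`. __ballot_sync
    // builds a mask with bit i set iff lane i's original `a` equals that maximum,
    // and __ffs returns the 1-based position of the lowest set bit, so ties resolve
    // to the smallest lane index. E.g. if lanes 5 and 17 both hold the maximum,
    // the mask has bits 5 and 17 set and idx becomes 5.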
unsigned ballot = __ballot_sync(0xFFFFFFFF, a == t);
int idx = __ffs(ballot) - 1;
return {t, idx};
}
/// Second stage of argMaxBlock: reduce one Pair per warp.
/// Returns {max of a.max, thread index 0..1023 of the thread holding that max}.
__device__ Pair argMaxWarp(Pair a) {
Pair partial = argMaxWarp(a.max);
double maxBlock = partial.max;
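    // partial.idx is the lane that owns the maximum, so fetch that lane's
    // block-level index field with a shuffle; lane 0 then returns the correct pair.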
int idxBlock = __shfl_sync(0xFFFFFFFF, a.idx, partial.idx);
return {maxBlock, idxBlock};
}
/// Returns the argmax of all values `a` within a block,
/// with the correct answer returned at least by the 0th thread of a block.
__device__ Pair argMaxBlock(double a) {
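    // Two-stage block reduction: each of the 32 warps computes its own argmax, lane 0
    // of every warp publishes {max, index within block} to shared memory, and warp 0
    // reduces those 32 partials with the Pair overload of argMaxWarp. This assumes
    // blockDim.x == 1024 so that all entries of `partials` are written.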
__shared__ Pair partials[32];
int warpIdx = threadIdx.x / warpSize;
Pair partial = argMaxWarp(a);
if (threadIdx.x % warpSize == 0) {
partials[warpIdx].max = partial.max;
partials[warpIdx].idx = partial.idx + threadIdx.x;
}
__syncthreads();
if (warpIdx == 0)
return argMaxWarp(partials[threadIdx.x]);
return Pair{0, 0};
}
/// Returns the argmax of all values `a` within a block,
/// with the correct answer returned only by the 0th thread of a block.
__device__ Pair argMaxBlock(Pair a) {
__shared__ Pair partials[32];
int warpIdx = threadIdx.x / warpSize;
Pair partial = argMaxWarp(a);
if (threadIdx.x % warpSize == 0)
partials[warpIdx] = partial;
__syncthreads();
if (warpIdx == 0)
return argMaxWarp(partials[threadIdx.x]);
return Pair{-1, -1};
}
__global__ void argMax1MKernel1(const double *a, Pair *tmp, int N) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
    double value = idx < N ? a[idx] : -1e100; // pad out-of-range threads with a very negative value (stands in for -infinity)
Pair result = argMaxBlock(value);
if (threadIdx.x == 0) {
tmp[blockIdx.x].max = result.max;
tmp[blockIdx.x].idx = result.idx + blockIdx.x * blockDim.x;
}
}
__global__ void argMax1MKernel2(const Pair *tmp, Pair *b, int numBlocks) {
int idx = threadIdx.x;
    Pair partial = idx < numBlocks ? tmp[idx] : Pair{-1e100, -1}; // -1e100 stands in for -infinity padding
Pair result = argMaxBlock(partial);
if (threadIdx.x == 0)
*b = result;
}
void argMax1M(const double *aDev, Pair *bDev, int N) {
assert(N <= 1024 * 1024);
int blocks = (N + 1023) / 1024;
Pair *tmpDev;
    CUDA_CHECK(cudaMalloc(&tmpDev, blocks * sizeof(Pair))); // note: sizeof(Pair), not sizeof(double)
argMax1MKernel1<<<blocks, 1024>>>(aDev, tmpDev, N);
argMax1MKernel2<<<1, 1024>>>(tmpDev, bDev, blocks);
CUDA_CHECK(cudaFree(tmpDev));
}
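// Two-pass scheme: argMax1MKernel1 emits one {max, index} pair per 1024-thread block
// (at most 1024 pairs), and argMax1MKernel2 reduces those pairs with a single block.
// One block handles at most 1024 partials, hence the N <= 1024*1024 limit asserted above.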
#include "reduction_argmax.h"
int main() {
testSmallArgMax(argMaxWarpTestKernel, argMaxWarpCheck, 32, 3);
testSmallArgMax(argMaxWarpTestKernel, argMaxWarpCheck, 32, 32);
testSmallArgMax(argMaxWarpTestKernel, argMaxWarpCheck, 32, 320);
testSmallArgMax(argMaxWarpTestKernel, argMaxWarpCheck, 32, 1023123);
printf("argMaxWarp OK.\n");
testSmallArgMax(argMaxBlockTestKernel, argMaxBlockCheck, 1024, 32);
testSmallArgMax(argMaxBlockTestKernel, argMaxBlockCheck, 1024, 1024);
testSmallArgMax(argMaxBlockTestKernel, argMaxBlockCheck, 1024, 12341);
testSmallArgMax(argMaxBlockTestKernel, argMaxBlockCheck, 1024, 1012311);
printf("argMaxBlock OK.\n");
testLargeArgMax("argMax1M", argMax1M, 32);
testLargeArgMax("argMax1M", argMax1M, 1024);
testLargeArgMax("argMax1M", argMax1M, 12341);
testLargeArgMax("argMax1M", argMax1M, 1012311);
printf("argMax1M OK.\n");
}
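// File : reduction_argmax.h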
#pragma once
#include "utils.h"
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <random>
constexpr int kWarpSize = 32;
// Kernel for testing `argMaxWarp`. Do not edit.
__global__ void argMaxWarpTestKernel(const double *a, Pair *b, int N) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
// All threads of a warp must call `argMaxWarp`!
Pair argMax = argMaxWarp(idx < N ? a[idx] : 0.0);
if (threadIdx.x % warpSize == 0 && idx / warpSize < (N + warpSize - 1) / warpSize)
b[idx / warpSize] = argMax;
}
/// Check results of `argMaxWarp`. Do not edit.
void argMaxWarpCheck(int N, int K, const double *aHost, const Pair *bHost) {
for (int k = 0; k < K; ++k) {
int expectedIdx = std::max_element(aHost + k * kWarpSize, aHost + std::min((k + 1) * kWarpSize, N))
- (aHost + k * kWarpSize);
Pair expected{aHost[k * kWarpSize + expectedIdx], expectedIdx};
Pair received = bHost[k];
if (expected.idx != received.idx || expected.max != received.max) {
printf("argMaxWarp incorrect result: k=%d expected=%d %f received=%d %f\n",
k, expected.idx, expected.max, received.idx, received.max);
exit(1);
}
}
}
// Do not edit.
__global__ void argMaxBlockTestKernel(const double *a, Pair *b, int N) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
Pair out = argMaxBlock(idx < N ? a[idx] : 0.0);
if (threadIdx.x == 0)
b[blockIdx.x] = out;
}
// Do not edit.
void argMaxBlockCheck(int N, int K, const double *aHost, const Pair *bHost) {
for (int k = 0; k < K; ++k) {
int expectedIdx = std::max_element(aHost + k * 1024, aHost + std::min((k + 1) * 1024, N))
- (aHost + k * 1024);
Pair expected{aHost[k * 1024 + expectedIdx], expectedIdx};
Pair received = bHost[k];
if (expected.idx != received.idx || expected.max != received.max) {
printf("argMaxBlock incorrect result: k=%d expected=%d %f received=%d %f\n",
k, expected.idx, expected.max, received.idx, received.max);
exit(1);
}
}
}
/// Test small argmax reductions (warp-level and block-level).
template <typename Kernel, typename CheckFunc>
void testSmallArgMax(Kernel kernel, CheckFunc checkFunc, int div, int N) {
int K = (N + div - 1) / div;
double *aHost;
Pair *bHost;
double *aDev;
Pair *bDev;
CUDA_CHECK(cudaMallocHost(&aHost, N * sizeof(double)));
CUDA_CHECK(cudaMallocHost(&bHost, K * sizeof(Pair)));
CUDA_CHECK(cudaMalloc(&aDev, N * sizeof(double)));
CUDA_CHECK(cudaMalloc(&bDev, K * sizeof(Pair)));
for (int i = 0; i < N; ++i) {
// aHost[i] = (long long)i * i % 12345;
aHost[i] = 10 * i;
}
std::mt19937 gen;
std::shuffle(aHost, aHost + N, gen);
CUDA_CHECK(cudaMemcpy(aDev, aHost, N * sizeof(double), cudaMemcpyHostToDevice));
const int threads = 1024;
const int blocks = (N + threads - 1) / threads;
kernel<<<blocks, threads>>>(aDev, bDev, N);
CUDA_CHECK(cudaMemcpy(bHost, bDev, K * sizeof(Pair), cudaMemcpyDeviceToHost));
checkFunc(N, K, aHost, bHost);
CUDA_CHECK(cudaFree(bDev));
CUDA_CHECK(cudaFree(aDev));
CUDA_CHECK(cudaFreeHost(bHost));
CUDA_CHECK(cudaFreeHost(aHost));
}
/// Test large argmax reductions (argMax1M supports up to 1024^2 elements).
template <typename Func>
void testLargeArgMax(const char *name, Func func, int N) {
double *aHost;
double *aDev;
Pair *bDev;
CUDA_CHECK(cudaMallocHost(&aHost, N * sizeof(double)));
CUDA_CHECK(cudaMalloc(&aDev, N * sizeof(double)));
CUDA_CHECK(cudaMalloc(&bDev, 1 * sizeof(Pair)));
for (int i = 0; i < N; ++i) {
// aHost[i] = (N + 13241LL * i * i) % 432141;
aHost[i] = 10 * i;
}
std::mt19937 gen;
std::shuffle(aHost, aHost + N, gen);
CUDA_CHECK(cudaMemcpy(aDev, aHost, N * sizeof(double), cudaMemcpyHostToDevice));
func(aDev, bDev, N);
int expectedIdx = std::max_element(aHost, aHost + N) - aHost;
Pair expected{aHost[expectedIdx], expectedIdx};
Pair received;
CUDA_CHECK(cudaMemcpy(&received, bDev, 1 * sizeof(Pair), cudaMemcpyDeviceToHost));
if (expected.idx != received.idx || expected.max != received.max) {
printf("large %s incorrect result: N=%d expected=%d %f received=%d %f\n",
name, N, expected.idx, expected.max, received.idx, received.max);
exit(1);
}
CUDA_CHECK(cudaFree(bDev));
CUDA_CHECK(cudaFree(aDev));
CUDA_CHECK(cudaFreeHost(aHost));
}
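// File : reduction_sum.cu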
#include "utils.h"
#include <cassert>
#include <algorithm>
/// Returns the sum of all values `a` within a warp,
/// with the correct answer returned only by the 0th thread of a warp.
__device__ double sumWarp(double a) {
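    // Shuffle-based tree reduction: after the rounds with offsets 1, 2, 4, 8 and 16,
    // lane 0 has accumulated the values of all 32 lanes; the other lanes end up with
    // partial sums only, hence the "only the 0th thread" guarantee above.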
a += __shfl_down_sync(0xFFFFFFFF, a, 1);
a += __shfl_down_sync(0xFFFFFFFF, a, 2);
a += __shfl_down_sync(0xFFFFFFFF, a, 4);
a += __shfl_down_sync(0xFFFFFFFF, a, 8);
a += __shfl_down_sync(0xFFFFFFFF, a, 16);
return a;
}
/// Returns the sum of all values `a` within a warp,
/// with the correct answer returned by all threads of a warp.
__device__ double sumWarpAll(double a) {
a += __shfl_xor_sync(0xFFFFFFFF, a, 1);
a += __shfl_xor_sync(0xFFFFFFFF, a, 2);
a += __shfl_xor_sync(0xFFFFFFFF, a, 4);
a += __shfl_xor_sync(0xFFFFFFFF, a, 8);
a += __shfl_xor_sync(0xFFFFFFFF, a, 16);
return a;
}
/// Returns the sum of all values `a` within a block,
/// with the correct answer returned only by the 0th thread of a block.
__device__ double sumBlock(double a) {
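    // Two-stage pattern: every warp reduces its 32 values with sumWarp, lane 0 of each
    // warp stores the partial sum in shared memory, and warp 0 then sums the 32 partials.
    // Assumes blockDim.x == 1024 so that all entries of `warpSums` are written.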
__shared__ double warpSums[32];
int warpIdx = threadIdx.x / warpSize;
double warpSum = sumWarp(a);
if (threadIdx.x % warpSize == 0)
warpSums[warpIdx] = warpSum;
__syncthreads();
double blockSum = 0;
if (warpIdx == 0)
blockSum = sumWarp(warpSums[threadIdx.x]);
return blockSum;
}
__global__ void sum1MKernel(const double *a, double *b, int N) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
double value = idx < N ? a[idx] : 0.0;
double sum = sumBlock(value);
if (threadIdx.x == 0)
b[blockIdx.x] = sum;
}
/// Compute the sum of all values aDev[0]..aDev[N-1] for N <= 1024^2 and store the result to bDev[0].
void sum1M(const double *aDev, double *bDev, int N) {
assert(N <= 1024 * 1024);
int blocks = (N + 1023) / 1024;
double *tmpDev;
CUDA_CHECK(cudaMalloc(&tmpDev, blocks * sizeof(double)));
sum1MKernel<<<blocks, 1024>>>(aDev, tmpDev, N);
sum1MKernel<<<1, 1024>>>(tmpDev, bDev, blocks);
CUDA_CHECK(cudaFree(tmpDev));
}
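// As with argMax1M above: the first pass yields at most 1024 partial sums, which a
// single 1024-thread block reduces in the second pass; hence the 1024*1024 limit.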
__global__ void sumVeryLargeKernel(const double *a, double *b, int N) {
double value = 0.0;
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < N; idx += blockDim.x * gridDim.x)
value += a[idx];
double sum = sumBlock(value);
if (threadIdx.x == 0)
b[blockIdx.x] = sum;
}
// Implementation of (1e) (note: it was not required to implement it).
void sumVeryLarge(const double *aDev, double *bDev, int N) {
int blocks = std::min(1024, (N + 1023) / 1024);
double *tmpDev;
CUDA_CHECK(cudaMalloc(&tmpDev, blocks * sizeof(double)));
sumVeryLargeKernel<<<blocks, 1024>>>(aDev, tmpDev, N);
    sumVeryLargeKernel<<<1, 1024>>>(tmpDev, bDev, blocks); // second pass: reduce the per-block partial sums
CUDA_CHECK(cudaFree(tmpDev));
}
#include "reduction_sum.h"
int main() {
testSmallSum(sumWarpTestKernel, sumWarpCheck, 32, 3);
testSmallSum(sumWarpTestKernel, sumWarpCheck, 32, 32);
testSmallSum(sumWarpTestKernel, sumWarpCheck, 32, 320);
testSmallSum(sumWarpTestKernel, sumWarpCheck, 32, 1023123);
printf("sumWarp OK.\n");
testSmallSum(sumWarpAllTestKernel, sumWarpAllCheck, 1, 3);
testSmallSum(sumWarpAllTestKernel, sumWarpAllCheck, 1, 32);
testSmallSum(sumWarpAllTestKernel, sumWarpAllCheck, 1, 320);
testSmallSum(sumWarpAllTestKernel, sumWarpAllCheck, 1, 1023123);
printf("sumWarpAll OK.\n");
testSmallSum(sumBlockTestKernel, sumBlockCheck, 1024, 32);
testSmallSum(sumBlockTestKernel, sumBlockCheck, 1024, 1024);
testSmallSum(sumBlockTestKernel, sumBlockCheck, 1024, 12341);
testSmallSum(sumBlockTestKernel, sumBlockCheck, 1024, 1012311);
printf("sumBlock OK.\n");
testLargeSum("sum1M", sum1M, 32);
testLargeSum("sum1M", sum1M, 1024);
testLargeSum("sum1M", sum1M, 12341);
testLargeSum("sum1M", sum1M, 1012311);
printf("sum1M OK.\n");
testLargeSum("sumVeryLarge", sumVeryLarge, 32);
testLargeSum("sumVeryLarge", sumVeryLarge, 1024);
testLargeSum("sumVeryLarge", sumVeryLarge, 12341);
testLargeSum("sumVeryLarge", sumVeryLarge, 1012311);
testLargeSum("sumVeryLarge", sumVeryLarge, 1001002003);
printf("sumVeryLarge OK.\n");
}
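// File : reduction_sum.h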
#pragma once
#include "utils.h"
#include <algorithm>
#include <numeric>
constexpr int kWarpSize = 32;
/// Kernel for testing `sumWarp`. Do not edit.
__global__ void sumWarpTestKernel(const double *a, double *b, int N) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
// All threads of a warp must call `sumWarp`!
double sum = sumWarp(idx < N ? a[idx] : 0.0);
if (threadIdx.x % warpSize == 0 && idx / warpSize < (N + warpSize - 1) / warpSize)
b[idx / warpSize] = sum;
}
/// Check results of `sumWarp`. Do not edit.
inline void sumWarpCheck(int N, int K, const double *aHost, const double *bHost) {
for (int k = 0; k < K; ++k) {
double expected = std::accumulate(aHost + k * kWarpSize, aHost + std::min((k + 1) * kWarpSize, N), 0.0);
double received = bHost[k];
if (expected != received) {
printf("sumWarp incorrect result: k=%d expected=%f received=%f\n", k, expected, received);
exit(1);
}
}
}
/// Kernel for testing `sumWarpAll`. Do not edit.
__global__ void sumWarpAllTestKernel(const double *a, double *b, int N) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
// All threads of a warp must call `sumWarpAll`!
double sum = sumWarpAll(idx < N ? a[idx] : 0.0);
if (idx < N)
b[idx] = sum;
}
/// Check result of `sumWarpAll`. Do not edit.
inline void sumWarpAllCheck(int N, int K, const double *aHost, const double *bHost) {
for (int k = 0; k < K; k += kWarpSize) {
double expected = std::accumulate(aHost + k, aHost + std::min(k + kWarpSize, N), 0.0);
for (int j = k; j < std::min(k + kWarpSize, N); ++j) {
double received = bHost[j];
if (expected != received) {
printf("sumWarpAll incorrect result: k=%d j=%d expected=%f received=%f\n", k, j, expected, received);
exit(1);
}
}
}
}
// Do not edit.
__global__ void sumBlockTestKernel(const double *a, double *b, int N) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
double sum = sumBlock(idx < N ? a[idx] : 0.0);
if (threadIdx.x == 0)
b[blockIdx.x] = sum;
}
// Do not edit.
inline void sumBlockCheck(int N, int K, const double *aHost, const double *bHost) {
for (int k = 0; k < K; ++k) {
double expected = std::accumulate(aHost + k * 1024, aHost + std::min((k + 1) * 1024, N), 0.0);
double received = bHost[k];
if (expected != received) {
printf("sumBlock incorrect result: k=%d expected=%f received=%f\n", k, expected, received);
exit(1);
}
}
}
/// Test small reductions (warp-level and block-level).
template <typename Kernel, typename CheckFunc>
void testSmallSum(Kernel kernel, CheckFunc checkFunc, int div, int N) {
int K = (N + div - 1) / div;
double *aHost;
double *bHost;
double *aDev;
double *bDev;
CUDA_CHECK(cudaMallocHost(&aHost, N * sizeof(double)));
CUDA_CHECK(cudaMallocHost(&bHost, K * sizeof(double)));
CUDA_CHECK(cudaMalloc(&aDev, N * sizeof(double)));
CUDA_CHECK(cudaMalloc(&bDev, K * sizeof(double)));
for (int i = 0; i < N; ++i)
aHost[i] = i;
CUDA_CHECK(cudaMemcpy(aDev, aHost, N * sizeof(double), cudaMemcpyHostToDevice));
const int threads = 1024;
const int blocks = (N + threads - 1) / threads;
kernel<<<blocks, threads>>>(aDev, bDev, N);
CUDA_CHECK(cudaMemcpy(bHost, bDev, K * sizeof(double), cudaMemcpyDeviceToHost));
checkFunc(N, K, aHost, bHost);
CUDA_CHECK(cudaFree(bDev));
CUDA_CHECK(cudaFree(aDev));
CUDA_CHECK(cudaFreeHost(bHost));
CUDA_CHECK(cudaFreeHost(aHost));
}
/// Test large reductions (up to 1024^3 and larger).
template <typename Func>
void testLargeSum(const char *name, Func func, int N) {
double *aHost;
double *aDev;
double *bDev;
CUDA_CHECK(cudaMallocHost(&aHost, N * sizeof(double)));