To receive notifications about scheduled maintenance, please subscribe to the mailing-list gitlab-operations@sympa.ethz.ch. You can subscribe to the mailing-list at https://sympa.ethz.ch

Commit 882738f9 authored by ahuegli's avatar ahuegli

finished p1

parent 15945110
// TODO: (OPTIONAL) Implement `benchmark` in the utils file.
#include "utils.h"
#include <omp.h>
#include <chrono>
// Number of timed benchmark iterations.
// NOTE: no trailing semicolon — `#define N 10000;` would expand the `;` into
// every use site (e.g. `i < N` -> `i < 10000;`), breaking for-loops and
// arithmetic expressions that use N.
#define N 10000
// Task 1a.1) Kernel that performs no work. Launching it repeatedly isolates
// the bare kernel-launch overhead, which is what the benchmark below measures.
__global__ void emptyKernel(){}
/// Invoke `emptyKernel` with given number of blocks and threads/block and
......@@ -11,7 +15,7 @@ void invokeEmpty(bool synchronize, int numBlocks, int threadsPerBlock) {
// TODO: Benchmark invocation of the `emptyKernel` code with given number
// of blocks and threads/block.
double dt = 0.0; // Time per invocation in seconds.
double dt = benchmark(N, emptyKernel, numBlocks, threadsPerBlock, synchronize); // Time per invocation in seconds.
printf("synchronize=%d blocks=%5d threads/block=%4d iteration=%.1f us\n",
(int)synchronize, numBlocks, threadsPerBlock, 1e6 * dt);
};
......@@ -42,8 +46,22 @@ int main() {
static constexpr int numThreads = 12;
// TODO: Task 1a.4) Benchmark `emptyParallelRegion`.
for(int i = 0; i < (0.1*N)+1; ++i){
emptyParallelRegion(numThreads);
}
auto t0 = std::chrono::high_resolution_clock::now();
for(int i = 0; i < N; ++i){
emptyParallelRegion(numThreads);
}
auto t1 = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> dt = t1 - t0;
printf("time of openmp = %d \n", dt);
double dt = 0.0; // Time per invocation in seconds.
//double dt = 0.0; // Time per invocation in seconds.
printf("Empty OpenMP parallel region with %d threads --> %.1f us\n",
numThreads, 1e6 * dt);
}
#include "utils.h"
#include <algorithm>
#include <random>
#include <chrono>
/// Buffer sizes we consider. The numbers are odd such that p[i]=(2*i)%K are all different.
static constexpr int kBufferSizes[] = {
......@@ -29,11 +30,13 @@ void subtask_b() {
// For example,
// CUDA_CHECK(cudaMalloc(...));
// CUDA_CHECK(cudaCmd) check whether `cudaCmd` completed successfully.
CUDA_CHECK(cudaMalloc(&aDev, maxK*sizeof(double)));
CUDA_CHECK(cudaMalloc(&bDev, maxK*sizeof(double)));
CUDA_CHECK(cudaMalloc(&pDev, maxK*sizeof(int)));
CUDA_CHECK(cudaMallocHost(&aHost, maxK*sizeof(double)));
CUDA_CHECK(cudaMallocHost(&pHost, maxK*sizeof(int)));
// TODO: Delete this once done with allocation.
printf("Implement allocation first.\n");
return;
// Set aDev, bDev and aHost to 0.0 (not really that important).
......@@ -44,23 +47,38 @@ void subtask_b() {
// Task 1b.1)
for (int K : kBufferSizes) {
// TODO: Measure the execution time of synchronously uploading K doubles from the host to the device. Report GB/s
auto t0 = std::chrono::high_resolution_clock::now();
for(int i = 0; i < N; ++i){
CUDA_CHECK(cudaMemcpy(aDev, aHost, K*sizeof(double), cudaMemcpyHostToDevice));
cudaDeviceSynchronize();
}
auto t1 = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::nano> dt = t1 - t0;
double gbps = 1e-9*K*sizeof(double)/(dt/double(N)); // Gigabytes per second here;
double gbps = 0.0; // Gigabytes per second here;
printf("upload K=%8d --> %5.2f GB/s\n", K, gbps);
}
// Task 1b.2)
bool synchronize = false;
/// Benchmark copying for a given access pattern (permutation).
auto benchmarkPermutedCopy = [=](const char *description, auto permutationFunc) {
for (int K : kBufferSizes) {
// Compute the permutation p[i].
permutationFunc(K);
const int numBlocks = K/threadsPerBlock;
/// TODO: Copy pHost to pDev. Don't forget CUDA_CHECK.
CUDA_CHECK(cudaMemcpy(pDev, pHost, K * sizeof(int), cudaMemcpyHostToDevice));
cudaDeviceSynchronize();
/// TODO: Benchmark the a_i = b_{p_i} kernel.
double dtABP = 0.0;
double dtABP = 1e-9 * benchmark(N, cpyBtoA, numBlocks, threadsPerBlock, synchronize, K, aDev, bDev, pDev);
/// TODO: (OPTIONAL) Benchmark the a_{p_i} = b_i kernel;
double dtAPB = 0.0;
......@@ -103,18 +121,25 @@ void subtask_b() {
// Task 1b.3) and 1b.4)
for (int K : kBufferSizes) {
// TODO: Benchmark a_i += b_i kernel.
double dt1 = 0.0;
const int numBlocks = K/threadsPerBlock;
double dt1 = 1e-9 * benchmark(1, aibi, numBlocks, threadsPerBlock, synchronize, K, aDev, bDev);
// TODO: Benchmark the kernel that repeats a_i += b_i 100x times.
double dt100 = 0.0;
double dt100 = 1e-9 * benchmark(100, aibi, numBlocks, threadsPerBlock, synchronize, K, aDev, bDev);
double gflops1 = 0.0;
double gflops100 = 0.0;
double gflops1 = 1e-9*K / dt1;
double gflops100 = 1e-9*K*100 / dt100;
printf("a+b 1x -> %4.1f GFLOP/s 100x -> %5.1f GFLOP/s\n", gflops1, gflops100);
}
// TODO: Free all host and all device buffers.
CUDA_CHECK(cudaFree(aDev));
CUDA_CHECK(cudaFree(bDev));
CUDA_CHECK(cudaFree(pDev));
CUDA_CHECK(cudaFreeHost(aHost));
CUDA_CHECK(cudaFreeHost(pHost));
}
int main() {
......
......@@ -10,6 +10,24 @@ __global__ void leibnizKernel(ll K, double *partialSums) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
double sum = 0.0;
double sign;
ll iterPerBlock = K / gridDim.x;
ll iterPerThread = iterPerBlock / blockDim.x;
ll startIdx = idx * iterPerThread;
ll endIdx = min(K, startIdx + iterPerThread);
for(int i = startIdx; i < endIdx; ++i){
if(i%2)
sign = -1.0;
else
sign = 1.0
sum += sign / (2*i + 1);
}
// TODO: Compute the partial sum. Pick however you like which terms are computed by which thread.
// Avoid using std::pow for computing (-1)^k!
......@@ -26,14 +44,20 @@ void runCUDA(ll K, int numBlocks, int threadsPerBlock) {
double *partialSumsHost;
// TODO: Allocate the temporary buffers for partial sums.
cudaMallocHost(&partialSumsHost, numThreads * sizeof(double));
cudaMalloc(&partialSumsDev, numThreads * sizeof(double));
// TODO: Run the kernel and benchmark execution time.
double dt = 0.0;
double dt = 1e-9 * benchmark(1, leibnizKernel, numBlocks, threadsPerBlock, true, K, partialSumsDev);
// TODO: Copy the sumsDev to host and accumulate, and sum them up.
double sum = 0.0;
for(int i = 0; i < numThreads; ++i){
sum += partialSumsHost[i];
}
double pi = 4 * sum;
......@@ -42,6 +66,9 @@ void runCUDA(ll K, int numBlocks, int threadsPerBlock) {
1e-9 * K / dt);
// TODO: Deallocate cuda buffers.
cudaFree(partialSumsDev);
cudaFree(partialSumsHost);
}
/// Run the OpenMP variant of the code.
......@@ -49,10 +76,24 @@ void runOpenMP(ll K, int numThreads) {
double sum = 0.0;
// TODO: Implement the Leibniz series summation with OpenMP.
auto t0 = std::chrono::high_resolution_clock::now();
omp_set_num_threads(numThreads);
#pragma omp parallel for reduction(+ : sum)
for(int i = 0; i < K; ++i){
if(i%2)
sign = -1.0;
else
sign = 1.0;
sum += sign / (2*i + 1);
}
auto t1 = std::chrono::high_resolution_clock::now();
// TODO: Benchmark execution time.
double dt = 0.0;
std::chrono::duration<double, std::nano> dt = t1 - t0;
double pi = 4 * sum;
......
......@@ -5,13 +5,34 @@
#include <cuda_runtime.h>
/// Measure the average wall-clock time of one launch of kernel `func`
/// with `numBlocks` blocks of `threadsPerBlock` threads each.
///
/// `args...` are forwarded to the kernel on every launch (may be empty).
/// If `synchronize` is true, the host blocks on cudaDeviceSynchronize()
/// after every launch, so per-launch synchronization overhead is included
/// in the measurement; otherwise only one final synchronization is done
/// and the launches are timed back-to-back.
///
/// Returns the time per invocation in SECONDS (callers converting units
/// should not apply an additional 1e-9 factor).
template <typename Func, typename... Args>
double benchmark(int N, Func func, const int numBlocks, const int threadsPerBlock,
                 const bool synchronize, Args... args) {
    // Warm-up: ~10% of N extra launches, excluded from the timing, to
    // amortize one-time driver/JIT costs before measuring.
    for (int i = 0; i < N / 10 + 1; ++i)
        func<<<numBlocks, threadsPerBlock>>>(args...);
    cudaDeviceSynchronize();

    auto t0 = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < N; ++i) {
        func<<<numBlocks, threadsPerBlock>>>(args...);
        if (synchronize)
            cudaDeviceSynchronize();
    }
    // Ensure all queued launches have finished before stopping the clock.
    cudaDeviceSynchronize();
    auto t1 = std::chrono::high_resolution_clock::now();

    // duration<double> is in seconds; .count() extracts the raw double
    // (the original returned the duration object itself, which does not
    // convert to the declared double return type).
    std::chrono::duration<double> dt = t1 - t0;
    return dt.count() / static_cast<double>(N);
}
/// Print the error message if a CUDA API execution failed.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment