
Commit bdd4b34f authored by ahuegli

Merge branch 'master' of gitlab.ethz.ch:ahuegli/exercise

parents 35ac3cd7 e1468820
// TODO: (OPTIONAL) Implement `benchmark` in the utils file.
#include "utils.h"
#include <omp.h>
#include <chrono>
#define N 10000 // Number of benchmark repetitions. (No trailing semicolon: it would break expressions that use N.)
// TODO: Task 1a.1) create an empty kernel `emptyKernel`.
__global__ void emptyKernel(){}
/// Invoke `emptyKernel` with given number of blocks and threads/block and
......@@ -11,7 +15,7 @@ void invokeEmpty(bool synchronize, int numBlocks, int threadsPerBlock) {
// TODO: Benchmark invocation of the `emptyKernel` code with given number
// of blocks and threads/block.
double dt = benchmark(N, emptyKernel, numBlocks, threadsPerBlock, synchronize); // Time per invocation in seconds.
printf("synchronize=%d blocks=%5d threads/block=%4d iteration=%.1f us\n",
(int)synchronize, numBlocks, threadsPerBlock, 1e6 * dt);
};
......@@ -42,8 +46,22 @@ int main() {
static constexpr int numThreads = 12;
// TODO: Task 1a.4) Benchmark `emptyParallelRegion`.
// Warm-up so that thread-pool creation does not skew the measurement.
for (int i = 0; i < N / 10 + 1; ++i)
    emptyParallelRegion(numThreads);
auto t0 = std::chrono::high_resolution_clock::now();
for (int i = 0; i < N; ++i)
    emptyParallelRegion(numThreads);
auto t1 = std::chrono::high_resolution_clock::now();
// Time per invocation in seconds.
double dt = std::chrono::duration<double>(t1 - t0).count() / N;
printf("Empty OpenMP parallel region with %d threads --> %.1f us\n",
       numThreads, 1e6 * dt);
}
#include "utils.h"
#include <algorithm>
#include <random>
#include <chrono>
/// Buffer sizes we consider. The numbers are odd such that p[i]=(2*i)%K are all different.
static constexpr int kBufferSizes[] = {
......@@ -29,11 +30,13 @@ void subtask_b() {
// CUDA_CHECK(cudaCmd) checks whether `cudaCmd` completed successfully.
// For example,
//     CUDA_CHECK(cudaMalloc(...));
CUDA_CHECK(cudaMalloc(&aDev, maxK*sizeof(double)));
CUDA_CHECK(cudaMalloc(&bDev, maxK*sizeof(double)));
CUDA_CHECK(cudaMalloc(&pDev, maxK*sizeof(int)));
CUDA_CHECK(cudaMallocHost(&aHost, maxK*sizeof(double)));
CUDA_CHECK(cudaMallocHost(&pHost, maxK*sizeof(int)));
// Set aDev, bDev and aHost to 0.0 (not really that important).
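// (The zero-initialization itself is elided from this hunk; a minimal sketch,
// assuming the `maxK`-sized buffers allocated above:)
//
//   CUDA_CHECK(cudaMemset(aDev, 0, maxK * sizeof(double)));
//   CUDA_CHECK(cudaMemset(bDev, 0, maxK * sizeof(double)));
//   memset(aHost, 0, maxK * sizeof(double));  // Plain memset for host memory.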
......@@ -44,23 +47,38 @@ void subtask_b() {
// Task 1b.1)
for (int K : kBufferSizes) {
// TODO: Measure the execution time of synchronously uploading K doubles from the host to the device. Report GB/s
auto t0 = std::chrono::high_resolution_clock::now();
for (int i = 0; i < N; ++i) {
    // cudaMemcpy blocks until the copy completes, so no per-iteration
    // synchronization is needed.
    CUDA_CHECK(cudaMemcpy(aDev, aHost, K * sizeof(double), cudaMemcpyHostToDevice));
}
CUDA_CHECK(cudaDeviceSynchronize());
auto t1 = std::chrono::high_resolution_clock::now();
double dt = std::chrono::duration<double>(t1 - t0).count() / N; // Seconds per copy.
double gbps = 1e-9 * K * sizeof(double) / dt; // Gigabytes per second.
printf("upload K=%8d --> %5.2f GB/s\n", K, gbps);
}
// Task 1b.2)
bool synchronize = false;
/// Benchmark copying for a given access pattern (permutation).
auto benchmarkPermutedCopy = [=](const char *description, auto permutationFunc) {
for (int K : kBufferSizes) {
// Compute the permutation p[i].
permutationFunc(K);
const int numBlocks = (K + threadsPerBlock - 1) / threadsPerBlock; // Round up: K may not be divisible.
/// TODO: Copy pHost to pDev. Don't forget CUDA_CHECK.
CUDA_CHECK(cudaMemcpy(pDev, pHost, K * sizeof(int), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaDeviceSynchronize());
/// TODO: Benchmark the a_i = b_{p_i} kernel.
double dtABP = benchmark(N, cpyBtoA, numBlocks, threadsPerBlock, synchronize, K, aDev, bDev, pDev); // Seconds per launch.
/// TODO: (OPTIONAL) Benchmark the a_{p_i} = b_i kernel;
double dtAPB = 0.0;
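// (For reference: the copy kernels themselves are defined elsewhere in this
// file. A minimal sketch of what `cpyBtoA` presumably looks like, with the
// parameter order assumed from the call site above:)
//
//   __global__ void cpyBtoA(int K, double *a, const double *b, const int *p) {
//       int i = blockIdx.x * blockDim.x + threadIdx.x;
//       if (i < K)
//           a[i] = b[p[i]];  // Gather: reads of b follow the permutation p.
//   }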
......@@ -103,18 +121,25 @@ void subtask_b() {
// Task 1b.3) and 1b.4)
for (int K : kBufferSizes) {
// TODO: Benchmark a_i += b_i kernel.
const int numBlocks = (K + threadsPerBlock - 1) / threadsPerBlock; // Round up.
double dt1 = benchmark(1, aibi, numBlocks, threadsPerBlock, synchronize, K, aDev, bDev); // Seconds per launch.
// TODO: Benchmark the kernel that repeats a_i += b_i 100x times.
double dt100 = benchmark(100, aibi, numBlocks, threadsPerBlock, synchronize, K, aDev, bDev);
// `benchmark` returns seconds per launch; each launch performs K additions,
// so dt100 is already the per-launch time averaged over 100 repetitions.
double gflops1 = 1e-9 * K / dt1;
double gflops100 = 1e-9 * K / dt100;
printf("a+b 1x -> %4.1f GFLOP/s 100x -> %5.1f GFLOP/s\n", gflops1, gflops100);
}
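// (Similarly, a minimal sketch of the `aibi` kernel assumed by the benchmark
// calls above; the actual definition lives elsewhere in this file:)
//
//   __global__ void aibi(int K, double *a, const double *b) {
//       int i = blockIdx.x * blockDim.x + threadIdx.x;
//       if (i < K)
//           a[i] += b[i];  // One FLOP per element per launch.
//   }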
// TODO: Free all host and all device buffers.
CUDA_CHECK(cudaFree(aDev));
CUDA_CHECK(cudaFree(bDev));
CUDA_CHECK(cudaFree(pDev));
CUDA_CHECK(cudaFreeHost(aHost));
CUDA_CHECK(cudaFreeHost(pHost));
}
int main() {
......
......@@ -10,6 +10,24 @@ __global__ void leibnizKernel(ll K, double *partialSums) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
double sum = 0.0;
double sign;
// Split the K terms evenly across all threads; round up so that no terms
// are lost when K is not divisible by the total thread count.
ll totalThreads = (ll)gridDim.x * blockDim.x;
ll iterPerThread = (K + totalThreads - 1) / totalThreads;
ll startIdx = idx * iterPerThread;
ll endIdx = min(K, startIdx + iterPerThread);
for (ll i = startIdx; i < endIdx; ++i) {
    sign = (i % 2) ? -1.0 : 1.0;
    sum += sign / (2 * i + 1);
}
partialSums[idx] = sum; // Store this thread's partial sum.
// TODO: Compute the partial sum. Pick however you like which terms are computed by which thread.
// Avoid using std::pow for computing (-1)^k!
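// For reference, the series being summed is the Leibniz formula for pi:
//
//     pi/4 = sum_{k=0}^{inf} (-1)^k / (2k+1) = 1 - 1/3 + 1/5 - 1/7 + ...
//
// Each thread accumulates sign/(2k+1) over its contiguous slice of k, and
// the host multiplies the total by 4 (see `runCUDA` below).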
......@@ -26,14 +44,20 @@ void runCUDA(ll K, int numBlocks, int threadsPerBlock) {
double *partialSumsHost;
// TODO: Allocate the temporary buffers for partial sums.
cudaMallocHost(&partialSumsHost, numThreads * sizeof(double));
cudaMalloc(&partialSumsDev, numThreads * sizeof(double));
// TODO: Run the kernel and benchmark execution time.
double dt = benchmark(1, leibnizKernel, numBlocks, threadsPerBlock, true, K, partialSumsDev); // Seconds.
// Copy the partial sums to the host and accumulate them.
cudaMemcpy(partialSumsHost, partialSumsDev, numThreads * sizeof(double), cudaMemcpyDeviceToHost);
double sum = 0.0;
for (int i = 0; i < numThreads; ++i)
    sum += partialSumsHost[i];
double pi = 4 * sum;
......@@ -42,6 +66,9 @@ void runCUDA(ll K, int numBlocks, int threadsPerBlock) {
1e-9 * K / dt);
// TODO: Deallocate cuda buffers.
cudaFree(partialSumsDev);
cudaFreeHost(partialSumsHost); // Pinned host memory needs cudaFreeHost, not cudaFree.
}
/// Run the OpenMP variant of the code.
......@@ -49,10 +76,24 @@ void runOpenMP(ll K, int numThreads) {
double sum = 0.0;
// TODO: Implement the Leibniz series summation with OpenMP.
omp_set_num_threads(numThreads); // Set the thread count before timing starts.
auto t0 = std::chrono::high_resolution_clock::now();
#pragma omp parallel for reduction(+ : sum)
for (ll i = 0; i < K; ++i) {
    const double sign = (i % 2) ? -1.0 : 1.0;
    sum += sign / (2 * i + 1);
}
auto t1 = std::chrono::high_resolution_clock::now();
double dt = std::chrono::duration<double>(t1 - t0).count(); // Seconds.
double pi = 4 * sum;
......
......@@ -5,13 +5,34 @@
#include <cuda_runtime.h>
/// Benchmark `func<<<numBlocks, threadsPerBlock>>>(args...)` over N launches
/// and return the average time per launch in seconds. Assumes the CUDA_LAUNCH
/// helper from this header forwards the trailing kernel arguments.
template <typename Func, typename... Args>
double benchmark(int N, Func func, int numBlocks, int threadsPerBlock,
                 bool synchronize, Args... args) {
    // Warm-up with ~10% of the iterations so that lazy initialization does
    // not pollute the measurement.
    for (int i = 0; i < N / 10 + 1; ++i)
        CUDA_LAUNCH(func, numBlocks, threadsPerBlock, args...);
    cudaDeviceSynchronize();

    auto t0 = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < N; ++i) {
        CUDA_LAUNCH(func, numBlocks, threadsPerBlock, args...);
        if (synchronize)
            cudaDeviceSynchronize();
    }
    cudaDeviceSynchronize();
    auto t1 = std::chrono::high_resolution_clock::now();

    // Time per invocation in seconds.
    return std::chrono::duration<double>(t1 - t0).count() / N;
}
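/// Example use, with a hypothetical kernel `myKernel(int n, double *xDev)`:
///
///     double dt = benchmark(1000, myKernel, numBlocks, threadsPerBlock,
///                           /*synchronize=*/false, n, xDev);
///
/// `dt` is then the average wall-clock seconds per launch.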
/// Print the error message if a CUDA API execution failed.
......
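// (The CUDA_CHECK macro definition itself is elided above; a typical
// definition of this pattern, shown only as a sketch:)
//
//   #define CUDA_CHECK(cmd)                                              \
//       do {                                                             \
//           cudaError_t err_ = (cmd);                                    \
//           if (err_ != cudaSuccess) {                                   \
//               fprintf(stderr, "CUDA error: %s at %s:%d\n",             \
//                       cudaGetErrorString(err_), __FILE__, __LINE__);   \
//               exit(1);                                                 \
//           }                                                            \
//       } while (0)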
......@@ -6,6 +6,21 @@ __global__ void computeForcesKernel(int N, const double3 *p, double3 *f) {
return;
// TODO: Copy the code from `nbody_0.cu` and fix the redundant memory accesses.
const double3 pos = p[idx]; // Cache this particle's position in a register
                            // instead of re-reading p[idx] from global
                            // memory on every loop iteration.
double3 f_tmp = double3{0.0, 0.0, 0.0};
for (int i = 0; i < N; ++i) {
    double dx = p[i].x - pos.x;
    double dy = p[i].y - pos.y;
    double dz = p[i].z - pos.z;
    // Instead of skipping the i == idx case, add 1e-150 to avoid division
    // by zero. (dx * inv_r will be exactly 0.0)
    double r = sqrt(1e-150 + dx * dx + dy * dy + dz * dz);
    double inv_r = 1 / r;
    f_tmp.x += dx * inv_r * inv_r * inv_r;
    f_tmp.y += dy * inv_r * inv_r * inv_r;
    f_tmp.z += dz * inv_r * inv_r * inv_r;
}
f[idx] = f_tmp;
}
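// For reference, the quantity accumulated above is the (unit-mass, unit-G)
// gravitational force
//
//     f_i = sum_j (r_j - r_i) / |r_j - r_i|^3,
//
// i.e. each component is the displacement times 1/r^3, which is why 1/r is
// cubed in the inner loop.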
void computeForces(int N, const double3 *p, double3 *f) {
......
......@@ -6,6 +6,22 @@ __global__ void computeForcesKernel(int N, const double3 *p, double3 *f) {
return;
// TODO: Copy the code from `nbody_a.cu` and fix the redundant arithmetic operations.
const double3 pos = p[idx]; // Keep the register caching from `nbody_a.cu`.
double3 f_tmp = double3{0.0, 0.0, 0.0};
for (int i = 0; i < N; ++i) {
    double dx = p[i].x - pos.x;
    double dy = p[i].y - pos.y;
    double dz = p[i].z - pos.z;
    // Instead of skipping the i == idx case, add 1e-150 to avoid division
    // by zero. (dx * inv_r will be exactly 0.0)
    double r = sqrt(1e-150 + dx * dx + dy * dy + dz * dz);
    double inv_r = 1 / r;
    double inv_r3 = inv_r * inv_r * inv_r; // Compute 1/r^3 once per particle.
    f_tmp.x += dx * inv_r3;
    f_tmp.y += dy * inv_r3;
    f_tmp.z += dz * inv_r3;
}
f[idx] = f_tmp;
}
void computeForces(int N, const double3 *p, double3 *f) {
......
......@@ -6,6 +6,22 @@ __global__ void computeForcesKernel(int N, const double3 *p, double3 *f) {
return;
// TODO: Copy the code from `nbody_b.cu` and utilize rsqrt.
const double3 pos = p[idx];
double3 f_tmp = double3{0.0, 0.0, 0.0};
for (int i = 0; i < N; ++i) {
    double dx = p[i].x - pos.x;
    double dy = p[i].y - pos.y;
    double dz = p[i].z - pos.z;
    // Instead of skipping the i == idx case, add 1e-150 to avoid division
    // by zero. (dx * inv_r will be exactly 0.0)
    // rsqrt(x) already returns 1/sqrt(x), so no extra division is needed.
    double inv_r = rsqrt(1e-150 + dx * dx + dy * dy + dz * dz);
    double inv_r3 = inv_r * inv_r * inv_r;
    f_tmp.x += dx * inv_r3;
    f_tmp.y += dy * inv_r3;
    f_tmp.z += dz * inv_r3;
}
f[idx] = f_tmp;
}
void computeForces(int N, const double3 *p, double3 *f) {
......
......@@ -2,6 +2,41 @@
__global__ void computeForcesKernel(int N, const double3 *p, double3 *f) {
// TODO: Copy the code from `nbody_c.cu` and utilize shared memory.
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int tid = threadIdx.x;
extern __shared__ double3 sPos[];
// Note: no early return here. Every thread of the block must reach the
// __syncthreads() calls below, even threads with idx >= N.
double3 pos = idx < N ? p[idx] : double3{0.0, 0.0, 0.0};
double3 f_tmp = double3{0.0, 0.0, 0.0};
// Process the particles tile by tile; each tile is staged in shared memory
// so that every position is read from global memory only once per block.
for (int tile = 0; tile < N; tile += blockDim.x) {
    if (tile + tid < N)
        sPos[tid] = p[tile + tid];
    __syncthreads();
    int tileSize = min((int)blockDim.x, N - tile);
    for (int i = 0; i < tileSize; ++i) {
        double dx = sPos[i].x - pos.x;
        double dy = sPos[i].y - pos.y;
        double dz = sPos[i].z - pos.z;
        // Instead of skipping the i == idx case, add 1e-150 to avoid
        // division by zero. (dx * inv_r will be exactly 0.0)
        double inv_r = rsqrt(1e-150 + dx * dx + dy * dy + dz * dz);
        double inv_r3 = inv_r * inv_r * inv_r;
        f_tmp.x += dx * inv_r3;
        f_tmp.y += dy * inv_r3;
        f_tmp.z += dz * inv_r3;
    }
    __syncthreads();
}
if (idx < N)
    f[idx] = f_tmp;
}
void computeForces(int N, const double3 *p, double3 *f) {
......@@ -10,5 +45,6 @@ void computeForces(int N, const double3 *p, double3 *f) {
// TODO: Set the required shared memory size.
// Don't bother with checking errors here.
int sharedMem = numThreads * sizeof(double3); // One tile of positions per block.
computeForcesKernel<<<numBlocks, numThreads, sharedMem>>>(N, p, f);
}
......@@ -8,20 +8,44 @@
__global__ void jacobiStepKernel(int N, double hh, double invhh,
const double *rho, const double *phik, double *phik1) {
// TODO: Task 3b) Compute phik1[iy * N + ix]. Don't forget about 0 boundary conditions!
// Global 2D index (note: blockIdx must be multiplied by blockDim).
int ix = threadIdx.x + blockIdx.x * blockDim.x;
int iy = threadIdx.y + blockIdx.y * blockDim.y;
if (ix >= N || iy >= N)
    return;
if (ix == 0 || ix == N - 1 || iy == 0 || iy == N - 1) {
    phik1[iy * N + ix] = 0.0; // Zero Dirichlet boundary conditions.
    return;
}
double sum = phik[iy * N + (ix + 1)] + phik[iy * N + (ix - 1)]
           + phik[(iy + 1) * N + ix] + phik[(iy - 1) * N + ix];
phik1[iy * N + ix] = 0.25 * (sum + hh * rho[iy * N + ix]);
}
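// The update above is the Jacobi iteration for the discretized Poisson
// equation. With the standard 5-point Laplacian and zero Dirichlet
// boundaries,
//
//     (phi[ix+1,iy] + phi[ix-1,iy] + phi[ix,iy+1] + phi[ix,iy-1]
//         - 4*phi[ix,iy]) / h^2 = -rho[ix,iy],
//
// and solving for the center value gives
//
//     phi^(k+1)[ix,iy] = 0.25 * (sum of the four phi^(k) neighbors
//         + h^2 * rho[ix,iy]),
//
// which is exactly the `0.25 * (sum + hh * rho[...])` computed above.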
void jacobiStep(int N, double h, const double *rhoDev, const double *phikDev, double *phik1Dev) {
/// TODO: Task 3b) Invoke the kernel `jacobiStepKernel`. Consider using dim3
/// for the number of blocks and the number of threads!
dim3 numBlocks((N + 31) / 32, (N + 31) / 32); // Round up for N not divisible by 32.
dim3 numThreads(32, 32);
double hh = h*h;
jacobiStepKernel<<<numBlocks, numThreads>>>(N, hh, 1.0/hh, rhoDev, phikDev, phik1Dev);
}
__global__ void computeAphiKernel(int N, double invhh, const double *phi, double *Aphi) {
// TODO: Task 3d) Compute Aphi[iy * N + ix]. Don't forget about 0 boundary conditions!
// Global 2D index (note: blockIdx must be multiplied by blockDim).
int ix = threadIdx.x + blockIdx.x * blockDim.x;
int iy = threadIdx.y + blockIdx.y * blockDim.y;
if (ix >= N || iy >= N)
    return;
// Zero Dirichlet boundary: out-of-domain neighbors contribute 0.
double xp = (ix + 1 < N) ? phi[iy * N + (ix + 1)] : 0.0;
double xm = (ix > 0)     ? phi[iy * N + (ix - 1)] : 0.0;
double yp = (iy + 1 < N) ? phi[(iy + 1) * N + ix] : 0.0;
double ym = (iy > 0)     ? phi[(iy - 1) * N + ix] : 0.0;
Aphi[iy * N + ix] = invhh * (xp + xm + yp + ym - 4 * phi[iy * N + ix]);
}
void computeAphi(int N, double h, const double *xDev, double *AphiDev) {
/// TODO: Task 3d) Invoke the kernel `computeAphiKernel`.
dim3 numBlocks((N + 31) / 32, (N + 31) / 32); // Round up for N not divisible by 32.
dim3 numThreads(32, 32);
computeAphiKernel<<<numBlocks, numThreads>>>(N, 1.0/(h*h), xDev, AphiDev);
}
// Print L1 and L2 error. Do not edit!
......@@ -74,10 +98,18 @@ int main() {
// TODO: Task 3b) Allocate buffers of N^2 doubles.
// (You might need additional temporary buffers to complete all tasks.)
cudaMalloc(&rhoDev, N2*sizeof(double));
cudaMalloc(&phikDev, N2*sizeof(double));
cudaMalloc(&phik1Dev, N2*sizeof(double));
cudaMalloc(&AphiDev, N2*sizeof(double));
cudaMallocHost(&rhoHost, N2*sizeof(double));
cudaMallocHost(&AphiHost, N2*sizeof(double));
cudaMallocHost(&xHost, N2*sizeof(double));
// RHS with three non-zero elements at (x, y)=(0.3, 0.1), (0.4, 0.1) and (0.5, 0.6).
for (int i = 0; i < N * N; ++i)
......@@ -87,14 +119,32 @@ int main() {
rhoHost[(6 * N / 10) * N + (5 * N / 10)] = -2.0;
// TODO: Task 3b) Upload rhoHost to rhoDev.
cudaMemcpy(rhoDev, rhoHost, N2*sizeof(double), cudaMemcpyHostToDevice);
// Initial guess x^(0)_i = 0.
// TODO: Task 3b) Memset phikDev to 0.
cudaMemset(phikDev, 0, N2*sizeof(double));
// TODO: Task 3c) Run the jacobiStep numIterations times.
// TODO: Task 3d) Call computeAphi, download the result and call printL1L2.
// Ensure that L1 and L2 drop over time.
for (int i = 0; i < numIterations; ++i) {
    jacobiStep(N, h, rhoDev, phikDev, phik1Dev);
    // Swap the iterates so the next step reads the newest values; without
    // this swap every step would start again from the initial guess.
    double *tmp = phikDev;
    phikDev = phik1Dev;
    phik1Dev = tmp;
    // Residual check: download A*phi and verify the L1/L2 errors drop.
    computeAphi(N, h, phikDev, AphiDev);
    cudaMemcpy(AphiHost, AphiDev, N2 * sizeof(double), cudaMemcpyDeviceToHost);
    // Call the provided printL1L2 helper on the downloaded result here
    // (its exact signature is elided from this diff).
}
// TODO: (OPTIONAL) Download the vector phik1 (or phik) and call dumpCSV for visualization.
// TODO: Task 3a) Deallocate buffers.
cudaFree(rhoDev);
cudaFree(phikDev);
cudaFree(phik1Dev);
cudaFree(AphiDev);
cudaFreeHost(rhoHost);  // cudaMallocHost allocations need cudaFreeHost.
cudaFreeHost(AphiHost);
cudaFreeHost(xHost);
}