Commit bbee2bcb authored by kicici

HW5 solution codes

parent 4d7fcc1b
benchmarks_a
benchmarks_b
benchmarks_c
benchmarks_d
CUFLAGS=-O3 -std=c++14 --compiler-options "-Wall -Wextra -fopenmp"
.PHONY: all clean
all: benchmarks_a benchmarks_b benchmarks_c
@true
clean:
rm -f benchmarks_a benchmarks_b benchmarks_c
%: %.cu utils.h
nvcc $(CUFLAGS) $< -o $@
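# Example usage (assuming nvcc is available in the environment):
#   make            # builds benchmarks_a, benchmarks_b and benchmarks_c
#   ./benchmarks_a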
#include "utils.h"
#include <omp.h>
__global__ void emptyKernel() {
}
/// Invoke `emptyKernel` with given number of blocks and threads/block and
/// report the execution time.
void invokeEmpty(bool synchronize, int numBlocks, int threadsPerBlock) {
double dt = benchmark(100'000, [=]() {
// emptyKernel<<<numBlocks, threadsPerBlock>>>();
CUDA_LAUNCH(emptyKernel, numBlocks, threadsPerBlock);
if (synchronize)
CUDA_CHECK(cudaDeviceSynchronize());
});
printf("synchronize=%d blocks=%5d threads/block=%4d iteration=%.1f us\n",
(int)synchronize, numBlocks, threadsPerBlock, 1e6 * dt);
}
/// Run an empty parallel region with `numThreads` threads.
void emptyParallelRegion(int numThreads) {
#pragma omp parallel num_threads(numThreads)
{
// With this command we prevent the compiler from optimizing away the
// whole parallel region.
__asm__ volatile("");
}
}
int main() {
invokeEmpty(false, 1, 1); // Task 1a) #1
invokeEmpty(true, 1, 1); // Task 1a) #2
invokeEmpty(true, 1, 32); // Task 1a) #3
invokeEmpty(true, 1, 1024);
invokeEmpty(true, 32, 1024);
invokeEmpty(true, 1024, 32);
invokeEmpty(true, 32768, 1);
invokeEmpty(true, 32768, 32);
invokeEmpty(true, 32768, 1024);
static constexpr int numThreads = 12;
double dt = benchmark(100'000, []() {
emptyParallelRegion(numThreads);
});
printf("Empty OpenMP parallel region with %d threads --> %.1f us\n",
numThreads, 1e6 * dt);
}
#include "utils.h"
#include <algorithm>
#include <random>
/// a_i <-- b_{p_i}
__global__ void permutedCopyKernelABP(int K, double *a, const double *b, const int *p) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < K)
a[idx] = b[p[idx]];
}
/// a_{p_i} <-- b_i
__global__ void permutedCopyKernelAPB(int K, double *a, const double *b, const int *p) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < K)
a[p[idx]] = b[idx];
}
/// a_i <-- a_i + b_i
__global__ void additionKernel(int K, double *a, const double *b) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < K)
a[idx] += b[idx];
}
/// a_i <-- a_i + b_i (100x)
__global__ void repeatedAdditionKernel(int K, double *a, const double *b) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < K) {
for (int i = 0; i < 100; ++i)
a[idx] += b[idx];
}
}
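// In `repeatedAdditionKernel` the compiler must assume that `a` and `b` may
// alias, so a[idx] is re-read from and written back to global memory in every
// loop iteration. The two kernels below remove that assumption: the first via
// `__restrict__`, the second by accumulating in a local (register) variable
// and writing the result back once.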
/// a_i <-- a_i + b_i (100x) (optimized with restrict)
__global__ void optimizedRepeatedAdditionKernel1(
int K, double * __restrict__ a, const double * __restrict__ b) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < K) {
for (int i = 0; i < 100; ++i)
a[idx] += b[idx];
}
}
/// a_i <-- a_i + b_i (100x) (optimized with a temporary variable)
__global__ void optimizedRepeatedAdditionKernel2(int K, double *a, const double *b) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < K) {
double aFinal = a[idx];
for (int i = 0; i < 100; ++i)
aFinal += b[idx];
a[idx] = aFinal;
}
}
/// Buffer sizes we consider. The numbers are odd such that p[i]=(2*i)%K are all different.
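/// (For odd K, gcd(2, K) = gcd(4, K) = 1, so i -> (2*i) % K and i -> (4*i) % K
/// are bijections on [0, K); with an even K, distinct i would collide.)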
static constexpr int kBufferSizes[] = {
17, 65, 251, 1001, 2001, 5001,
10'001, 25'001, 50'001, 100'001, 250'001, 500'001, 1'000'001,
5'000'001, 20'000'001, 50'000'001,
};
void subtask_b() {
constexpr int threadsPerBlock = 1024;
int maxK = kBufferSizes[sizeof(kBufferSizes) / sizeof(kBufferSizes[0]) - 1];
/// Pick an N (number of benchmark repetitions) as a function of K so that the total running time is roughly uniform.
auto pickN = [](int K) {
return 100'000 / (int)std::sqrt(K) + 5; // Simple heuristic.
};
// Allocate all necessary device and host buffers.
// CUDA_CHECK(cudaCmd) checks whether `cudaCmd` completed successfully.
double *aDev;
double *bDev;
int *pDev;
double *aHost;
int *pHost;
CUDA_CHECK(cudaMalloc(&aDev, maxK * sizeof(double)));
CUDA_CHECK(cudaMalloc(&bDev, maxK * sizeof(double)));
CUDA_CHECK(cudaMalloc(&pDev, maxK * sizeof(int)));
CUDA_CHECK(cudaMallocHost(&aHost, maxK * sizeof(double)));
CUDA_CHECK(cudaMallocHost(&pHost, maxK * sizeof(int)));
// Set aDev, bDev and aHost to 0.0 (not really that important).
CUDA_CHECK(cudaMemset(aDev, 0, maxK * sizeof(double)));
CUDA_CHECK(cudaMemset(bDev, 0, maxK * sizeof(double)));
memset(aHost, 0, maxK * sizeof(double));
// Task 1b.1)
for (int K : kBufferSizes) {
double dt = benchmark(pickN(K), [=]() {
// Synchronous upload.
CUDA_CHECK(cudaMemcpy(aDev, aHost, K * sizeof(double), cudaMemcpyHostToDevice));
});
printf("upload K=%8d --> %5.2f GB/s\n", K, 1e-9 * K * sizeof(double) / dt);
}
// Task 1b.2)
/// Benchmark copying for a given access pattern (permutation).
auto benchmarkPermutedCopy = [=](const char *description, auto permutationFunc) {
for (int K : kBufferSizes) {
// Compute the permutation p[i].
permutationFunc(K);
CUDA_CHECK(cudaMemcpy(pDev, pHost, K * sizeof(int), cudaMemcpyHostToDevice));
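// Ceiling division: enough blocks so that numBlocks * threadsPerBlock >= K.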
int numBlocks = (K + threadsPerBlock - 1) / threadsPerBlock;
// Test both variants of permuted copying.
double dtABP = benchmark(pickN(K), [=]() {
// permutedCopyKernelABP<<<numBlocks, threadsPerBlock>>>(K, aDev, bDev, pDev);
CUDA_LAUNCH(permutedCopyKernelABP, numBlocks, threadsPerBlock, K, aDev, bDev, pDev);
});
double dtAPB = benchmark(pickN(K), [=]() {
// permutedCopyKernelAPB<<<numBlocks, threadsPerBlock>>>(K, aDev, bDev, pDev);
CUDA_LAUNCH(permutedCopyKernelAPB, numBlocks, threadsPerBlock, K, aDev, bDev, pDev);
});
// Report how many bytes per second were written.
printf("Case %s --> K=%8d [a=b_p] %6.2f GB/s [a_p=b] %6.2f GB/s written\n",
description, K,
1e-9 * K * sizeof(double) / dtABP,
1e-9 * K * sizeof(double) / dtAPB);
}
};
// The patterns are already implemented, do not modify!
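// What these patterns probe (roughly): p[i]=i is fully coalesced; the strided
// patterns (2*i)%K and (4*i)%K touch only every 2nd/4th element of each cache
// line, wasting bus bandwidth; the 32-shuffled pattern permutes indices only
// within 32-element blocks, so each warp still touches the same memory
// segments as the identity; the fully shuffled pattern is essentially random
// access.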
std::mt19937 gen;
benchmarkPermutedCopy("p[i]=i", [pHost](int K) {
for (int i = 0; i < K; ++i)
pHost[i] = i;
});
benchmarkPermutedCopy("p[i]=(2*i)%K", [pHost](int K) {
for (int i = 0; i < K; ++i)
pHost[i] = (2 * i) % K;
});
benchmarkPermutedCopy("p[i]=(4*i)%K", [pHost](int K) {
for (int i = 0; i < K; ++i)
pHost[i] = (4 * i) % K;
});
benchmarkPermutedCopy("p[i]=i, 32-shuffled", [pHost, &gen](int K) {
for (int i = 0; i < K; ++i)
pHost[i] = i;
for (int i = 0; i < K; i += 32)
std::shuffle(pHost + i, pHost + std::min(i + 32, K), gen);
});
benchmarkPermutedCopy("fully shuffled", [pHost, &gen](int K) {
for (int i = 0; i < K; ++i)
pHost[i] = i;
std::shuffle(pHost, pHost + K, gen);
});
// Task 1b.3) and 1b.4)
for (int K : kBufferSizes) {
int numBlocks = (K + threadsPerBlock - 1) / threadsPerBlock;
double dt1 = benchmark(pickN(K), [=]() {
// additionKernel<<<numBlocks, threadsPerBlock>>>(K, aDev, bDev);
CUDA_LAUNCH(additionKernel, numBlocks, threadsPerBlock, K, aDev, bDev);
});
double dt2 = benchmark(pickN(K), [=]() {
// repeatedAdditionKernel<<<numBlocks, threadsPerBlock>>>(K, aDev, bDev);
CUDA_LAUNCH(repeatedAdditionKernel, numBlocks, threadsPerBlock, K, aDev, bDev);
});
double dt3 = benchmark(pickN(K), [=]() {
// optimizedRepeatedAdditionKernel1<<<numBlocks, threadsPerBlock>>>(K, aDev, bDev);
CUDA_LAUNCH(optimizedRepeatedAdditionKernel1, numBlocks, threadsPerBlock, K, aDev, bDev);
});
double dt4 = benchmark(pickN(K), [=]() {
// optimizedRepeatedAdditionKernel2<<<numBlocks, threadsPerBlock>>>(K, aDev, bDev);
CUDA_LAUNCH(optimizedRepeatedAdditionKernel2, numBlocks, threadsPerBlock, K, aDev, bDev);
});
printf("a+b 1x -> %4.1f GFLOP/s 100x -> %5.1f GFLOP/s restrict %6.1f GFLOP/s tmp var %6.1f GFLOP/s\n",
1e-9 * K / dt1,
1e-9 * 100 * K / dt2,
1e-9 * 100 * K / dt3,
1e-9 * 100 * K / dt4);
}
CUDA_CHECK(cudaFreeHost(aHost));
CUDA_CHECK(cudaFreeHost(pHost));
CUDA_CHECK(cudaFree(pDev));
CUDA_CHECK(cudaFree(bDev));
CUDA_CHECK(cudaFree(aDev));
}
int main() {
subtask_b();
}
#include "utils.h"
#include <numeric>
#include <omp.h>
#include <vector>
using ll = long long;
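// Leibniz series: pi/4 = 1 - 1/3 + 1/5 - 1/7 + ... = sum_{k>=0} (-1)^k / (2k+1).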
/// Uniformly split the range [0..K-1] among `numThreads` threads and evaluate
/// the part that belongs to the current thread.
/// This function is used by both the CPU and the GPU code.
__device__ __host__ double leibnizSeries(ll K, int threadId, int numThreads) {
// [0..K) --> [begin..end)
ll begin = K * threadId / numThreads;
ll end = K * (threadId + 1) / numThreads;
// We go from end to begin because, in principle, summing from smaller to
// larger numbers gives better accuracy. It does not make a noticeable
// difference here, though.
// We also avoid keeping a separate variable for (-1)^k (let alone calling
// std::pow). Note that this trick may be counterproductive for CPU code,
// since it can prevent autovectorization. However, it seems that neither
// clang nor gcc vectorizes this loop anyway (not even with -ffast-math).
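// Unrolling the recurrence: starting from sum = 0 and applying
// sum = 1/(2k+1) - sum for k = end-1 down to begin gives
// sum = sum_{k=begin}^{end-1} (-1)^(k-begin) / (2k+1),
// which is (-1)^begin times the desired partial Leibniz sum; hence the sign
// correction after the loop.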
double sum = 0.0;
for (ll k = end - 1; k >= begin; --k)
sum = 1.0 / (2 * k + 1) - sum;
if (begin % 2 == 1)
sum = -sum; // Correct the sign.
// This is the same code, but going from begin to end.
// for (ll k = begin; k < end; ++k)
// sum = 1.0 / (2 * k + 1) - sum;
// if (end % 2 == 0)
// sum = -sum; // Correct the sign.
return sum;
}
// Compute the sum of the Leibniz series. Each thread takes care of a subset of terms.
__global__ void leibnizKernel(ll K, double *partialSums) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int numThreads = gridDim.x * blockDim.x;
partialSums[idx] = leibnizSeries(K, idx, numThreads);
}
/// Run the CUDA code for the given number of blocks and threads/block.
void runCUDA(ll K, int numBlocks, int threadsPerBlock) {
int numThreads = numBlocks * threadsPerBlock;
// Allocate the device and host buffers.
double *sumsDev;
double *sumsHost;
CUDA_CHECK(cudaMalloc(&sumsDev, numThreads * sizeof(double)));
CUDA_CHECK(cudaMallocHost(&sumsHost, numThreads * sizeof(double)));
CUDA_CHECK(cudaMemset(sumsDev, 0, numThreads * sizeof(double)));
double dt = benchmark(100, [numBlocks, threadsPerBlock, K, sumsDev]() {
// leibnizKernel<<<numBlocks, threadsPerBlock>>>(K, sumsDev);
CUDA_LAUNCH(leibnizKernel, numBlocks, threadsPerBlock, K, sumsDev);
});
CUDA_CHECK(cudaMemcpy(sumsHost, sumsDev, numThreads * sizeof(double), cudaMemcpyDeviceToHost));
double sum = std::accumulate(sumsHost, sumsHost + numThreads, 0.0);
double pi = 4 * sum;
printf("CUDA blocks=%5d threads/block=%4d iter/thread=%5lld pi=%.12f rel error=%.2g Gterms/s=%.1f\n",
numBlocks, threadsPerBlock, K / numThreads, pi, (pi - M_PI) / M_PI,
1e-9 * K / dt);
CUDA_CHECK(cudaFreeHost(sumsHost));
CUDA_CHECK(cudaFree(sumsDev));
}
/// Run the OpenMP variant of the code.
void runOpenMP(ll K, int numThreads) {
double sum;
double dt = benchmark(10, [K, numThreads, &sum]() {
sum = 0.0;
#pragma omp parallel num_threads(numThreads)
{
int threadId = omp_get_thread_num();
int numThreads = omp_get_num_threads();
double local = leibnizSeries(K, threadId, numThreads);
#pragma omp atomic
sum += local;
}
});
double pi = 4 * sum;
printf("OpenMP threads=%d pi=%.16g rel error=%.2g Gterms/s=%.1f\n",
numThreads, pi, (pi - M_PI) / M_PI, 1e-9 * K / dt);
}
void subtask_c() {
constexpr ll K = 2LL << 30;
runCUDA(K, 512, 512);
runCUDA(K, 512, 1024);
runCUDA(K, 1024, 128);
runCUDA(K, 1024, 256);
runCUDA(K, 1024, 512);
runCUDA(K, 1024, 1024);
runCUDA(K, 32768, 64);
runCUDA(K, 32768, 128);
runCUDA(K, 32768, 1024);
runOpenMP(K, 12);
}
int main() {
subtask_c();
}
#pragma once
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
/*
 * Benchmark the given function by calling it `N` times (plus a short warmup)
 * and return the average execution time per call, in seconds.
 */
template <typename Func>
double benchmark(int N, Func func) {
for (int i = 0; i < N / 10 + 1; ++i)
func(); // Warmup.
auto t0 = std::chrono::steady_clock::now();
for (int i = 0; i < N; ++i)
func();
cudaDeviceSynchronize();
auto t1 = std::chrono::steady_clock::now();
auto ns = (double)std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count();
double dt = 1e-9 * ns / N;
return dt;
}
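// Example:
//   double dt = benchmark(1000, [] { /* code to measure */ });  // dt = seconds per call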
/// Print an error message and abort if a CUDA API call failed.
inline void _cudaCheck(const char *file, int line, const char *func, cudaError_t code) {
if (code != cudaSuccess) {
fprintf(stderr, "CUDA Error at %s:%d in %s:%s\n",
file, line, func, cudaGetErrorString(code));
exit(1);
}
}
// Wrap every CUDA API call with this check.
#define CUDA_CHECK(cmd) _cudaCheck(__FILE__, __LINE__, __PRETTY_FUNCTION__, (cmd))
/// Replace
/// kernel<<<blocks, threads>>>(arg1, ...);
/// with
/// CUDA_LAUNCH(kernel, blocks, threads, arg1, ...);
#define CUDA_LAUNCH(kernel, blocks, threads, ...) do { \
cudaGetLastError(); \
kernel<<<(blocks), (threads)>>>(__VA_ARGS__); \
_cudaCheck(__FILE__, __LINE__, __PRETTY_FUNCTION__, cudaGetLastError()); \
} while (0)
nbody_0
nbody_a
nbody_b
nbody_c
nbody_d
CUFLAGS=-O3 -std=c++14 --compiler-options "-Wall -Wextra"
.PHONY: all clean
all: nbody_0 nbody_a nbody_b nbody_c nbody_d
@true
clean:
rm -f *.o nbody_0 nbody_a nbody_b nbody_c nbody_d
%.o: %.cu utils.h
nvcc $(CUFLAGS) $< -c -o $@
%: %.o main.o
nvcc $(CUFLAGS) $^ -o $@
// Do not edit!
#include "utils.h"
#include <chrono>
#include <random>
/// Depending on which target is being compiled, a different .o object file
/// providing this function will be linked in.
void computeForces(int N, const double3 *pDev, double3 *fDev);
int main() {
const int M = 60; // Lattice size.
const int N = M * M * M; // Number of atoms.
const double L = 100.0; // Domain size.
const int numSteps = 4;
double3 *pDev;
double3 *fDev;
double3 *pHost;
double3 *fHost;
CUDA_CHECK(cudaMalloc(&pDev, N * sizeof(double3)));
CUDA_CHECK(cudaMalloc(&fDev, N * sizeof(double3)));
CUDA_CHECK(cudaMallocHost(&pHost, N * sizeof(double3)));
CUDA_CHECK(cudaMallocHost(&fHost, N * sizeof(double3)));
// Generate a lattice of points (+ noise).
std::mt19937 gen{12345};
const double h = L / M;
std::uniform_real_distribution<double> distr(-0.1 * h, 0.1 * h);
for (int iz = 0; iz < M; ++iz)
for (int iy = 0; iy < M; ++iy)
for (int ix = 0; ix < M; ++ix) {
const int i = ix + M * (iy + M * iz);
pHost[i] = double3{ix * h + distr(gen),
iy * h + distr(gen),
iz * h + distr(gen)};
}
// Upload initial state to the GPU.
CUDA_CHECK(cudaMemcpy(pDev, pHost, N * sizeof(double3), cudaMemcpyHostToDevice));
auto compute = [=]() {
computeForces(N, pDev, fDev);
};
// Verification.
compute();
CUDA_CHECK(cudaMemcpy(fHost, fDev, N * sizeof(double3), cudaMemcpyDeviceToHost));
double fx = 0.0;
double fy = 0.0;
double fz = 0.0;
double ff = 0.0;
for (int i = 0; i < N; ++i) {
fx += fHost[i].x;
fy += fHost[i].y;
fz += fHost[i].z;
ff += fHost[i].x * fHost[i].x + fHost[i].y * fHost[i].y + fHost[i].z * fHost[i].z;
}
printf("Statistics: <F>=%g %g %g <F^2>=%g\n", fx, fy, fz, ff);
// More warmup.
for (int step = 0; step < 1; ++step)
compute();
// Benchmark.
CUDA_CHECK(cudaDeviceSynchronize());
auto t0 = std::chrono::steady_clock::now();
for (int step = 0; step < numSteps; ++step)
compute();
CUDA_CHECK(cudaDeviceSynchronize());
auto t1 = std::chrono::steady_clock::now();
double ns = (double)std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count();
double seconds_per_ts = 1e-9 * ns / numSteps;
printf("Average execution time: %.1f ms\n", 1e3 * seconds_per_ts);
CUDA_CHECK(cudaFreeHost(fHost));
CUDA_CHECK(cudaFreeHost(pHost));
CUDA_CHECK(cudaFree(fDev));
CUDA_CHECK(cudaFree(pDev));
return 0;
}
#include <cuda_runtime.h>
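// Baseline all-pairs force kernel: one thread per particle, force accumulated
// directly in global memory (f[idx]) inside the inner loop.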
__global__ void computeForcesKernel(int N, const double3 *p, double3 *f) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N)
return;
f[idx] = double3{0.0, 0.0, 0.0};
for (int i = 0; i < N; ++i) {
double dx = p[i].x - p[idx].x;
double dy = p[i].y - p[idx].y;
double dz = p[i].z - p[idx].z;
// Instead of skipping the i == idx case, add 1e-150 inside the sqrt to avoid
// a division by zero; the i == idx contribution is then exactly 0.0, since
// dx = dy = dz = 0.
double r = sqrt(1e-150 + dx * dx + dy * dy + dz * dz);
double inv_r = 1 / r;
f[idx].x += dx * inv_r * inv_r * inv_r;
f[idx].y += dy * inv_r * inv_r * inv_r;
f[idx].z += dz * inv_r * inv_r * inv_r;
}
}
void computeForces(int N, const double3 *p, double3 *f) {
constexpr int numThreads = 1024;
int numBlocks = (N + numThreads - 1) / numThreads;
computeForcesKernel<<<numBlocks, numThreads>>>(N, p, f);
}
#include <cuda_runtime.h>
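// Variant: accumulate the force in a local double3 (kept in registers) and
// write it to global memory once at the end, instead of updating f[idx] in
// global memory on every iteration.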
__global__ void computeForcesKernel(int N, const double3 *p, double3 *f) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N)
return;
double3 ftot{0.0, 0.0, 0.0};
for (int i = 0; i < N; ++i) {
double dx = p[i].x - p[idx].x;
double dy = p[i].y - p[idx].y;
double dz = p[i].z - p[idx].z;
double r = sqrt(1e-150 + dx * dx + dy * dy + dz * dz);
double inv_r = 1 / r;
ftot.x += dx * inv_r * inv_r * inv_r;
ftot.y += dy * inv_r * inv_r * inv_r;
ftot.z += dz * inv_r * inv_r * inv_r;
}
f[idx] = ftot;
}
void computeForces(int N, const double3 *p, double3 *f) {
constexpr int numThreads = 1024;
int numBlocks = (N + numThreads - 1) / numThreads;
computeForcesKernel<<<numBlocks, numThreads>>>(N, p, f);
}
#include <cuda_runtime.h>
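// Variant: compute the reciprocal distance in a single expression and hoist
// inv_r^3 into one variable (inv_rrr) that is reused for all three components.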
__global__ void computeForcesKernel(int N, const double3 *p, double3 *f) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N)
return;
double3 ftot{0.0, 0.0, 0.0};
for (int i = 0; i < N; ++i) {
double dx = p[i].x - p[idx].x;
double dy = p[i].y - p[idx].y;
double dz = p[i].z - p[idx].z;
double inv_r = 1 / sqrt(1e-150 + dx * dx + dy * dy + dz * dz);
double inv_rrr = inv_r * inv_r * inv_r;
ftot.x += dx * inv_rrr;
ftot.y += dy * inv_rrr;
ftot.z += dz * inv_rrr;
}
f[idx] = ftot;
}
void computeForces(int N, const double3 *p, double3 *f) {
constexpr int numThreads = 1024;
int numBlocks = (N + numThreads - 1) / numThreads;
computeForcesKernel<<<numBlocks, numThreads>>>(N, p, f);
}
#include <cuda_runtime.h>
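// Variant: use CUDA's rsqrt() to obtain 1/sqrt(...) directly, instead of a
// separate sqrt followed by a division.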
__global__ void computeForcesKernel(int N, const double3 *p, double3 *f) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N)
return;
double3 ftot{0.0, 0.0, 0.0};
for (int i = 0; i < N; ++i) {
double dx = p[i].x - p[idx].x;
double dy = p[i].y - p[idx].y;
double dz = p[i].z - p[idx].z;
double inv_r = rsqrt(1e-150 + dx * dx + dy * dy + dz * dz);
double inv_rrr = inv_r * inv_r * inv_r;
ftot.x += dx * inv_rrr;
ftot.y += dy * inv_rrr;
ftot.z += dz * inv_rrr;
}
f[idx] = ftot;
}