Commit 4d7fcc1b authored by kicici

CUDA matmul and other codes

parent cbfd504d
*.nvvp
core.*
alignment
launch
matmul
matmul00
matmul01
matmul02
matmul03
matmul04
matmul05
matmul06
matmul07
matmul08
CUFLAGS=-O3 -std=c++14 --compiler-options "-Wall -Wextra -fopenmp" -gencode arch=compute_60,code=sm_60
CODES=alignment \
launch \
matmul00 \
matmul01 \
matmul02 \
matmul03 \
matmul04 \
matmul05 \
matmul06 \
matmul07 \
matmul08
.PHONY: all clean
all: $(CODES)
@true
clean:
rm -f $(CODES)
%: %.cu utils.h
nvcc $(CUFLAGS) $< -o $@
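# Typical usage (a sketch; assumes an nvcc toolchain targeting sm_60 is available):
#   make            builds every binary listed in CODES via the %: %.cu rule above
#   make matmul01   builds a single target
#   make clean      removes the built binaries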
// Use
// cuobjdump -sass alignment
// to see the difference between compiled functions.
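// Roughly what to expect (exact SASS depends on the nvcc version and -arch): copy4x1
// typically compiles to four 32-bit load/store pairs, copy2x2 to two 64-bit ones, and
// copy1x4 to a single 128-bit LDG/STG pair, since int2 and int4 are 8- and 16-byte
// aligned vector types.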
#include <cuda_runtime.h>
__global__ void copy4x1(const int *from, int * __restrict__ to) {
to[0] = from[0];
to[1] = from[1];
to[2] = from[2];
to[3] = from[3];
}
__global__ void copy2x2(const int2 *from, int2 * __restrict__ to) {
to[0] = from[0];
to[1] = from[1];
}
__global__ void copy1x4(const int4 *from, int4 * __restrict__ to) {
to[0] = from[0];
}
int main() {
}
#include "utils.h"
__global__ void dummyKernel() {
}
__global__ void dummyKernel2(double x) {
double x0 = 1.0;
double x1 = x0 * x;
double x2 = x1 * x;
double x3 = x2 * x;
double x4 = x3 * x;
double x5 = x4 * x;
double x6 = x5 * x;
double x7 = x6 * x;
double x8 = x7 * x;
double x9 = x8 * x;
double x10 = x9 * x;
double y = 0.0;
y += x10;
y += x9;
y += x8;
y += x7;
y += x6;
y += x5;
y += x4;
y += x3;
y += x2;
y += x1;
if (y == 0.1)
printf("Hello x=%f\n", x);
}
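// Notes on the benchmark below: the `if (y == 0.1)` branch above is practically never
// taken; it only keeps the compiler from optimizing the multiply chain away. Kernel
// launches are asynchronous, so timing the <<<...>>> call on the host measures only the
// launch/enqueue overhead, not kernel execution time; the very first launch is typically
// much slower because it also pays one-time CUDA initialization cost (hence the separate
// "first" and "avg" columns printed below).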
void benchmarkLaunch() {
for (int threads = 128; threads <= 1024; threads += 128) {
for (int i = 0; i < 3; ++i) {
auto t0 = std::chrono::steady_clock::now();
dummyKernel<<<1, threads>>>();
auto t1 = std::chrono::steady_clock::now();
double ns = (double)std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count();
printf("threads=%d dt=%.1f us\n", threads, ns / 1e3);
}
}
for (int threads = 128; threads <= 1024; threads += 128) {
auto t0 = std::chrono::steady_clock::now();
dummyKernel2<<<1, threads>>>(1.0);
auto t1 = std::chrono::steady_clock::now();
double dtFirst = 1e-9 * (double)std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count();
double dtLater = benchmark(1, [threads]() {
// dummyKernel2<<<1024, threads>>>(1.123);
CUDA_LAUNCH(dummyKernel2, 1024, threads,
1.123);
});
printf("threads=%4d first=%4.1f us avg=%4.1f us Meval/s=%6.1f\n",
threads, 1e6 * dtFirst, 1e6 * dtLater, 1e-6 * threads / dtLater);
}
}
int main() {
benchmarkLaunch();
}
#!/usr/bin/env python3
import sys
def generate(n):
return '\n'.join([
'__global__ void dummyKernel2(double x) {',
' double x0 = 1.0;',
*(f' double x{k+1} = x{k} * x;' for k in range(n)),
' double y = 0.0;',
*(f' y += x{k+1};' for k in reversed(range(n))),
' if (y == 0.1)',
' printf("Hello x=%f\\n", x);',
'}',
])
print(generate(int(sys.argv[1])))
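# Usage sketch (the script's filename is not part of this excerpt): run it with the
# chain length as the command-line argument. Passing 10 reproduces the dummyKernel2
# definition used in the launch benchmark above.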
// Skeleton code.
#include "utils.h"
using real = double;
__global__ void matMulKernel(int N, const real *a, const real *b, real *c) {
// TODO
}
// c = a * b
void matMulCPU(int N, const real *a, const real *b, real *c) {
for (int iy = 0; iy < N; ++iy)
for (int ix = 0; ix < N; ++ix) {
real sum = 0;
for (int k = 0; k < N; ++k)
sum += a[iy * N + k] * b[k * N + ix];
c[iy * N + ix] = sum;
}
}
void benchmarkMatrixMultiplication(int N, bool check) {
real *aHost;
real *bHost;
real *cHost;
real *aDev;
real *bDev;
real *cDev;
// Allocate.
CUDA_CHECK(cudaMallocHost(&aHost, N * N * sizeof(real)));
CUDA_CHECK(cudaMallocHost(&bHost, N * N * sizeof(real)));
CUDA_CHECK(cudaMallocHost(&cHost, N * N * sizeof(real)));
CUDA_CHECK(cudaMalloc(&aDev, N * N * sizeof(real)));
CUDA_CHECK(cudaMalloc(&bDev, N * N * sizeof(real)));
CUDA_CHECK(cudaMalloc(&cDev, N * N * sizeof(real)));
// Prepare A and B.
for (int iy = 0; iy < N; ++iy)
for (int ix = 0; ix < N; ++ix) {
aHost[iy * N + ix] = iy + ix;
bHost[iy * N + ix] = ix * ix + iy;
}
CUDA_CHECK(cudaMemcpy(aDev, aHost, N * N * sizeof(real), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(bDev, bHost, N * N * sizeof(real), cudaMemcpyHostToDevice));
// Compute C = A * B on GPU.
double dt = benchmark(100, []() {
// TODO
});
double gflops = 1e-9 * 2LL * N * N * N / dt;
printf("N=%d GFLOP/s=%.1f\n", N, gflops);
// Check correctness.
if (check) {
matMulCPU(N, aHost, bHost, cHost);
real *tmpHost = aHost;  // Reuse aHost as scratch space for the GPU result.
CUDA_CHECK(cudaMemcpy(tmpHost, cDev, N * N * sizeof(real), cudaMemcpyDeviceToHost));
for (int iy = 0; iy < N; ++iy)
for (int ix = 0; ix < N; ++ix) {
if (tmpHost[iy * N + ix] != cHost[iy * N + ix]) {
fprintf(stderr, "Incorrect result at [%d][%d] --> host=%f gpu=%f\n",
iy, ix, cHost[iy * N + ix], tmpHost[iy * N + ix]);
exit(1);
}
}
printf("GPU result correct.\n");
}
// Deallocate.
CUDA_CHECK(cudaFree(cDev));
CUDA_CHECK(cudaFree(bDev));
CUDA_CHECK(cudaFree(aDev));
CUDA_CHECK(cudaFreeHost(cHost));
CUDA_CHECK(cudaFreeHost(bHost));
CUDA_CHECK(cudaFreeHost(aHost));
}
int main() {
benchmarkMatrixMultiplication(256, true);
benchmarkMatrixMultiplication(3072, false);
}
// Basic matrix multiplication.
#include "utils.h"
using real = double;
__global__ void matMulKernel(int N, const real *a, const real *b, real *c) {
int ix = blockIdx.x * blockDim.x + threadIdx.x;
int iy = blockIdx.y * blockDim.y + threadIdx.y;
if (ix >= N || iy >= N)
return;
real sum = 0;
for (int k = 0; k < N; ++k)
sum += a[iy * N + k] * b[k * N + ix];
c[iy * N + ix] = sum;
}
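// Access pattern of the kernel above: threads with consecutive ix read consecutive
// elements of b (coalesced), while all threads sharing a row read the same element of a
// (a broadcast). Each element of a and b contributes to N different outputs, so without
// caching it would be fetched N times from global memory; the later versions try to
// reduce exactly this traffic.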
// c = a * b
void matMulCPU(int N, const real *a, const real *b, real *c) {
for (int iy = 0; iy < N; ++iy)
for (int ix = 0; ix < N; ++ix) {
real sum = 0;
for (int k = 0; k < N; ++k)
sum += a[iy * N + k] * b[k * N + ix];
c[iy * N + ix] = sum;
}
}
void benchmarkMatrixMultiplication(int N, bool check) {
real *aHost;
real *bHost;
real *cHost;
real *aDev;
real *bDev;
real *cDev;
// Allocate.
CUDA_CHECK(cudaMallocHost(&aHost, N * N * sizeof(real)));
CUDA_CHECK(cudaMallocHost(&bHost, N * N * sizeof(real)));
CUDA_CHECK(cudaMallocHost(&cHost, N * N * sizeof(real)));
CUDA_CHECK(cudaMalloc(&aDev, N * N * sizeof(real)));
CUDA_CHECK(cudaMalloc(&bDev, N * N * sizeof(real)));
CUDA_CHECK(cudaMalloc(&cDev, N * N * sizeof(real)));
// Prepare A and B.
for (int iy = 0; iy < N; ++iy)
for (int ix = 0; ix < N; ++ix) {
aHost[iy * N + ix] = iy + ix;
bHost[iy * N + ix] = ix * ix + iy;
}
CUDA_CHECK(cudaMemcpy(aDev, aHost, N * N * sizeof(real), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(bDev, bHost, N * N * sizeof(real), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemset(cDev, 0, N * N * sizeof(real)));
// Compute C = A * B on GPU.
double dt = benchmark(10, [N, aDev, bDev, cDev]() {
dim3 threads(32, 32, 1);
dim3 blocks((N + threads.x - 1) / threads.x,
(N + threads.y - 1) / threads.y,
1);
matMulKernel<<<blocks, threads>>>(N, aDev, bDev, cDev);
});
double gflops = 1e-9 * 2LL * N * N * N / dt;
printf("N=%d GFLOP/s=%.1f\n", N, gflops);
// Check correctness.
if (check) {
matMulCPU(N, aHost, bHost, cHost);
real *tmpHost = aHost;  // Reuse aHost as scratch space for the GPU result.
CUDA_CHECK(cudaMemcpy(tmpHost, cDev, N * N * sizeof(real), cudaMemcpyDeviceToHost));
for (int iy = 0; iy < N; ++iy)
for (int ix = 0; ix < N; ++ix) {
if (tmpHost[iy * N + ix] != cHost[iy * N + ix]) {
fprintf(stderr, "Incorrect result at [%d][%d] --> host=%f gpu=%f\n",
iy, ix, cHost[iy * N + ix], tmpHost[iy * N + ix]);
exit(1);
}
}
// printf("GPU result correct.\n");
}
// Deallocate.
CUDA_CHECK(cudaFree(cDev));
CUDA_CHECK(cudaFree(bDev));
CUDA_CHECK(cudaFree(aDev));
CUDA_CHECK(cudaFreeHost(cHost));
CUDA_CHECK(cudaFreeHost(bHost));
CUDA_CHECK(cudaFreeHost(aHost));
}
int main() {
benchmarkMatrixMultiplication(256, true);
benchmarkMatrixMultiplication(259, true);
benchmarkMatrixMultiplication(3072, false);
}
// Alignment issue: index the rows of a and b with [iy * M + ix] instead of [iy * N + ix],
// where M = (N + 31) / 32 * 32 is the row pitch rounded up to a multiple of 32.
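// With the padded pitch each row starts on a 256-byte boundary (32 doubles), so a warp's
// 32 consecutive loads from b fall into aligned memory segments. A hypothetical helper
// (not used below, where M is passed explicitly) could be:
//   constexpr int roundUpTo32(int n) { return (n + 31) / 32 * 32; }
// e.g. N = 253 pads to M = 256 and N = 3069 to M = 3072, as exercised in main().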
#include "utils.h"
using real = double;
__global__ void matMulKernel(int N, int M, const real *a, const real *b, real *c) {
int ix = blockIdx.x * blockDim.x + threadIdx.x;
int iy = blockIdx.y * blockDim.y + threadIdx.y;
if (ix >= N || iy >= N)
return;
real sum = 0;
for (int k = 0; k < N; ++k)
sum += a[iy * M + k] * b[k * M + ix];
c[iy * N + ix] = sum;
}
// c = a * b
void matMulCPU(int N, int M, const real *a, const real *b, real *c) {
for (int iy = 0; iy < N; ++iy)
for (int ix = 0; ix < N; ++ix) {
real sum = 0;
for (int k = 0; k < N; ++k)
sum += a[iy * M + k] * b[k * M + ix];
c[iy * N + ix] = sum;
}
}
void benchmarkMatrixMultiplication(int N, int M, bool check) {
real *aHost;
real *bHost;
real *cHost;
real *aDev;
real *bDev;
real *cDev;
// Allocate.
CUDA_CHECK(cudaMallocHost(&aHost, N * M * sizeof(real)));
CUDA_CHECK(cudaMallocHost(&bHost, N * M * sizeof(real)));
CUDA_CHECK(cudaMallocHost(&cHost, N * M * sizeof(real)));
CUDA_CHECK(cudaMalloc(&aDev, N * M * sizeof(real)));
CUDA_CHECK(cudaMalloc(&bDev, N * M * sizeof(real)));
CUDA_CHECK(cudaMalloc(&cDev, N * M * sizeof(real)));
// Prepare A and B.
for (int iy = 0; iy < N; ++iy)
for (int ix = 0; ix < N; ++ix) {
aHost[iy * M + ix] = iy + ix;
bHost[iy * M + ix] = ix * ix + iy;
}
CUDA_CHECK(cudaMemcpy(aDev, aHost, N * M * sizeof(real), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(bDev, bHost, N * M * sizeof(real), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemset(cDev, 0, N * M * sizeof(real)));
// Compute C = A * B on GPU.
double dt = benchmark(10, [N, M, aDev, bDev, cDev]() {
dim3 threads(32, 32, 1);
dim3 blocks((N + threads.x - 1) / threads.x,
(N + threads.y - 1) / threads.y,
1);
matMulKernel<<<blocks, threads>>>(N, M, aDev, bDev, cDev);
});
double gflops = 1e-9 * 2LL * N * N * N / dt;
printf("N=%d M=%d GFLOP/s=%.1f\n", N, M, gflops);
// Check correctness.
if (check) {
matMulCPU(N, M, aHost, bHost, cHost);
real *tmpHost = aHost;  // Reuse aHost as scratch space for the GPU result.
CUDA_CHECK(cudaMemcpy(tmpHost, cDev, N * M * sizeof(real), cudaMemcpyDeviceToHost));
for (int iy = 0; iy < N; ++iy)
for (int ix = 0; ix < N; ++ix) {
if (tmpHost[iy * N + ix] != cHost[iy * N + ix]) {  // c is stored with pitch N, not M.
fprintf(stderr, "Incorrect result at [%d][%d] --> host=%f gpu=%f\n",
iy, ix, cHost[iy * N + ix], tmpHost[iy * N + ix]);
exit(1);
}
}
// printf("GPU result correct.\n");
}
// Deallocate.
CUDA_CHECK(cudaFree(cDev));
CUDA_CHECK(cudaFree(bDev));
CUDA_CHECK(cudaFree(aDev));
CUDA_CHECK(cudaFreeHost(cHost));
CUDA_CHECK(cudaFreeHost(bHost));
CUDA_CHECK(cudaFreeHost(aHost));
}
int main() {
benchmarkMatrixMultiplication(253, 253, true);
benchmarkMatrixMultiplication(253, 256, true);
benchmarkMatrixMultiplication(3069, 3069, false);
benchmarkMatrixMultiplication(3069, 3072, false);
}
// (M removed for simplicity; only the N % 32 == 0 cases are considered from here on.)
// __syncthreads() is used to improve cache reuse across the block.
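// The idea: without the barrier, warps of a block drift apart in k and fetch different
// cache lines of b, evicting one another's data. Synchronizing after every iteration
// keeps the whole block at the same k, so each line of b is loaded once and then reused
// from cache by all warps. The barrier inside the loop is only safe because, with 32x32
// blocks and N a multiple of 32, no thread of a block returns early.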
#include "utils.h"
using real = double;
__global__ void matMulKernel(int N, const real *a, const real *b, real *c) {
int ix = blockIdx.x * blockDim.x + threadIdx.x;
int iy = blockIdx.y * blockDim.y + threadIdx.y;
if (ix >= N || iy >= N)
return;
real sum = 0;
for (int k = 0; k < N; ++k) {
sum += a[iy * N + k] * b[k * N + ix];
__syncthreads();
}
c[iy * N + ix] = sum;
}
// c = a * b
void matMulCPU(int N, const real *a, const real *b, real *c) {
for (int iy = 0; iy < N; ++iy)
for (int ix = 0; ix < N; ++ix) {
real sum = 0;
for (int k = 0; k < N; ++k)
sum += a[iy * N + k] * b[k * N + ix];
c[iy * N + ix] = sum;
}
}
void benchmarkMatrixMultiplication(int N, bool check) {
real *aHost;
real *bHost;
real *cHost;
real *aDev;
real *bDev;
real *cDev;
// Allocate.
CUDA_CHECK(cudaMallocHost(&aHost, N * N * sizeof(real)));
CUDA_CHECK(cudaMallocHost(&bHost, N * N * sizeof(real)));
CUDA_CHECK(cudaMallocHost(&cHost, N * N * sizeof(real)));
CUDA_CHECK(cudaMalloc(&aDev, N * N * sizeof(real)));
CUDA_CHECK(cudaMalloc(&bDev, N * N * sizeof(real)));
CUDA_CHECK(cudaMalloc(&cDev, N * N * sizeof(real)));
// Prepare A and B.
for (int iy = 0; iy < N; ++iy)
for (int ix = 0; ix < N; ++ix) {
aHost[iy * N + ix] = iy + ix;
bHost[iy * N + ix] = ix * ix + iy;
}
CUDA_CHECK(cudaMemcpy(aDev, aHost, N * N * sizeof(real), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(bDev, bHost, N * N * sizeof(real), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemset(cDev, 0, N * N * sizeof(real)));
// Compute C = A * B on GPU.
double dt = benchmark(10, [N, aDev, bDev, cDev]() {
dim3 threads(32, 32, 1);
dim3 blocks((N + threads.x - 1) / threads.x,
(N + threads.y - 1) / threads.y,
1);
matMulKernel<<<blocks, threads>>>(N, aDev, bDev, cDev);
});
double gflops = 1e-9 * 2LL * N * N * N / dt;
printf("N=%d GFLOP/s=%.1f\n", N, gflops);
// Check correctness.
if (check) {
matMulCPU(N, aHost, bHost, cHost);
real *tmpHost = aHost;  // Reuse aHost as scratch space for the GPU result.
CUDA_CHECK(cudaMemcpy(tmpHost, cDev, N * N * sizeof(real), cudaMemcpyDeviceToHost));
for (int iy = 0; iy < N; ++iy)
for (int ix = 0; ix < N; ++ix) {
if (tmpHost[iy * N + ix] != cHost[iy * N + ix]) {
fprintf(stderr, "Incorrect result at [%d][%d] --> host=%f gpu=%f\n",
iy, ix, cHost[iy * N + ix], tmpHost[iy * N + ix]);
exit(1);
}
}
// printf("GPU result correct.\n");
}
// Deallocate.
CUDA_CHECK(cudaFree(cDev));
CUDA_CHECK(cudaFree(bDev));
CUDA_CHECK(cudaFree(aDev));
CUDA_CHECK(cudaFreeHost(cHost));
CUDA_CHECK(cudaFreeHost(bHost));
CUDA_CHECK(cudaFreeHost(aHost));
}
int main() {
benchmarkMatrixMultiplication(256, true);
benchmarkMatrixMultiplication(3072, false);
}
// Instruction-level parallelism: each thread now computes four output elements
// (the same column ix in four consecutive rows 4*iy .. 4*iy+3).
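// Two effects combine here: the four accumulators form independent multiply-add chains,
// which helps hide arithmetic latency, and each loaded b[k * N + ix] is reused for four
// rows of a, reducing global-memory traffic for b. Note that the launch configuration
// further down only covers N / (4 * blockDim.y) row-blocks, so this version assumes N is
// a multiple of 4 * blockDim.y (128 with the 32x32 blocks used below).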
#include "utils.h"
using real = double;
__global__ void matMulKernel(int N, const real *a, const real *b, real *c) {
int ix = blockIdx.x * blockDim.x + threadIdx.x;
int iy = blockIdx.y * blockDim.y + threadIdx.y;
if (ix >= N || iy >= N)
return;
real sum0 = 0;
real sum1 = 0;
real sum2 = 0;
real sum3 = 0;
for (int k = 0; k < N; ++k) {
sum0 += a[(4 * iy + 0) * N + k] * b[k * N + ix];
sum1 += a[(4 * iy + 1) * N + k] * b[k * N + ix];
sum2 += a[(4 * iy + 2) * N + k] * b[k * N + ix];
sum3 += a[(4 * iy + 3) * N + k] * b[k * N + ix];
__syncthreads();
}
c[(4 * iy + 0) * N + ix] = sum0;
c[(4 * iy + 1) * N + ix] = sum1;
c[(4 * iy + 2) * N + ix] = sum2;
c[(4 * iy + 3) * N + ix] = sum3;
}
// c = a * b
void matMulCPU(int N, const real *a, const real *b, real *c) {
for (int iy = 0; iy < N; ++iy)
for (int ix = 0; ix < N; ++ix) {
real sum = 0;
for (int k = 0; k < N; ++k)
sum += a[iy * N + k] * b[k * N + ix];
c[iy * N + ix] = sum;
}
}
void benchmarkMatrixMultiplication(int N, bool check) {
real *aHost;
real *bHost;
real *cHost;
real *aDev;
real *bDev;
real *cDev;
// Allocate.
CUDA_CHECK(cudaMallocHost(&aHost, N * N * sizeof(real)));
CUDA_CHECK(cudaMallocHost(&bHost, N * N * sizeof(real)));
CUDA_CHECK(cudaMallocHost(&cHost, N * N * sizeof(real)));
CUDA_CHECK(cudaMalloc(&aDev, N * N * sizeof(real)));
CUDA_CHECK(cudaMalloc(&bDev, N * N * sizeof(real)));
CUDA_CHECK(cudaMalloc(&cDev, N * N * sizeof(real)));
// Prepare A and B.
for (int iy = 0; iy < N; ++iy)
for (int ix = 0; ix < N; ++ix) {
aHost[iy * N + ix] = iy + ix;
bHost[iy * N + ix] = ix * ix + iy;
}
CUDA_CHECK(cudaMemcpy(aDev, aHost, N * N * sizeof(real), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(bDev, bHost, N * N * sizeof(real), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemset(cDev, 0, N * N * sizeof(real)));
// Compute C = A * B on GPU.
double dt = benchmark(10, [N, aDev, bDev, cDev]() {
dim3 threads(32, 32, 1);
dim3 blocks((N + threads.x - 1) / threads.x,
(N + threads.y * 4 - 1) / (threads.y * 4),
1);
matMulKernel<<<blocks, threads>>>(N, aDev, bDev, cDev);
});
double gflops = 1e-9 * 2LL * N * N * N / dt;
printf("N=%d GFLOP/s=%.1f\n", N, gflops);
// Check correctness.
if (check) {
matMulCPU(N, aHost, bHost, cHost);
real *tmpHost = aHost;  // Reuse aHost as scratch space for the GPU result.
CUDA_CHECK(cudaMemcpy(tmpHost, cDev, N * N * sizeof(real), cudaMemcpyDeviceToHost));