[ex10] Solutions

parent 3477d708
# Programming Techniques for Scientific Simulations I
# HS 2020
# Exercise 10
cmake_minimum_required (VERSION 3.15)
project (ex10-matrix-multiplication)
set (CMAKE_CXX_STANDARD 17)
set (CMAKE_CXX_STANDARD_REQUIRED TRUE)
set (CMAKE_CXX_EXTENSIONS FALSE)
add_compile_options (-Wall -Wextra -Wpedantic -march=native)
add_executable (main main.cpp mm0.cpp mm1.cpp mm2.cpp mm3.cpp mm_blas.cpp mm_eigen.cpp)
find_package (BLAS REQUIRED)
target_link_libraries (main ${BLAS_LINKER_FLAGS} ${BLAS_LIBRARIES})
find_package (Eigen3 REQUIRED NO_MODULE)
target_link_libraries (main Eigen3::Eigen)
/*
* Programming Techniques for Scientific Simulations I
* HS 2019
* Exercise 10
*/
#include <iostream>
#include <chrono>
#include <cstdlib>
#include <random>
#include "matrix_multiplication.hpp"
double benchmark(function_t f, matrix_t const & A, matrix_t const & B, matrix_t & C, std::size_t N) noexcept {
for (std::size_t i = 0; i < N * N; ++i) {
C[i] = 0;
}
auto start = std::chrono::high_resolution_clock::now();
f(A, B, C, N);
auto end = std::chrono::high_resolution_clock::now();
return std::chrono::duration<double>(end - start).count();
}
int main() {
std::mt19937 gen;
std::uniform_real_distribution<double> dis(0, 1);
{
constexpr int N = 16;
constexpr double tolerance = 1e-6;
matrix_t A(N * N);
matrix_t B(N * N);
for (std::size_t j = 0; j < N; ++j) {
for (std::size_t i = 0; i < N; ++i) {
A[i + j * N] = dis(gen);
B[i + j * N] = dis(gen);
}
}
#define TEST(f) do { \
matrix_t C(N * N); \
for (int i = 0; i < N * N; ++i) { \
C[i] = 0; \
} \
f(A, B, C, N); \
for (int i = 0; i < N; ++i) { \
for (int j = 0; j < N; ++j) { \
if (std::abs(C0[i + j * N] - C[i + j * N]) > tolerance) { \
std::cerr << #f << " incorrect" << '\n' << "C0" << '\n'; \
for (int j = 0; j < N; ++j) { \
for (int i = 0; i < N; ++i) { \
std::cerr << C0[i + j * N] << ' '; \
} \
std::cerr << '\n'; \
} \
std::cerr << "C" << '\n'; \
for (int j = 0; j < N; ++j) { \
for (int i = 0; i < N; ++i) { \
std::cerr << C[i + j * N] << ' '; \
} \
std::cerr << '\n'; \
} \
exit(1); \
} \
} \
} \
} while (false)
matrix_t C0(N * N);
mm0(A, B, C0, N);
TEST(mm0);
TEST(mm1);
TEST(mm2);
TEST(mm3);
TEST(mm_blas);
TEST(mm_eigen);
}
constexpr int runs = 10;
#define BENCHMARK(f) do { \
for (int i = 0; i < runs; ++i) { \
auto time = benchmark(f, A, B, C, N); \
std::cout << #f << ',' << N << ',' << time << std::endl; \
} \
} while (false)
std::cout << "function,size,time" << std::endl;
for (int N = 4; N <= 1024; N *= 2) {
matrix_t A(N * N);
matrix_t B(N * N);
matrix_t C(N * N);
for (std::size_t i = 0; i < N * N; ++i) {
A[i] = dis(gen);
B[i] = dis(gen);
}
BENCHMARK(mm0);
BENCHMARK(mm1);
BENCHMARK(mm2);
BENCHMARK(mm3);
BENCHMARK(mm_blas);
BENCHMARK(mm_eigen);
}
#undef BENCHMARK
}
/*
* Programming Techniques for Scientific Simulations I
* HS 2019
* Exercise 10
*/
#pragma once
#include <cstdint>
#include <vector>
using value_t = double;
using matrix_t = std::vector<value_t>;
using function_t = void (*)(matrix_t const &, matrix_t const &, matrix_t &, std::size_t N);
void mm0( matrix_t const & A, matrix_t const & B, matrix_t & C, std::size_t N) noexcept;
void mm1( matrix_t const & A, matrix_t const & B, matrix_t & C, std::size_t N) noexcept;
void mm2( matrix_t const & A, matrix_t const & B, matrix_t & C, std::size_t N) noexcept;
void mm3( matrix_t const & A, matrix_t const & B, matrix_t & C, std::size_t N) noexcept;
void mm_blas( matrix_t const & A, matrix_t const & B, matrix_t & C, std::size_t N) noexcept;
void mm_eigen(matrix_t const & A, matrix_t const & B, matrix_t & C, std::size_t N) noexcept;
/*
* Programming Techniques for Scientific Simulations I
* HS 2019
* Exercise 10
*/
#include "matrix_multiplication.hpp"
// Trivial implementation
void mm0(matrix_t const & A, matrix_t const & B, matrix_t & C, std::size_t N) noexcept {
for (std::size_t i = 0; i < N; ++i) {
for (std::size_t j = 0; j < N; ++j) {
for (std::size_t k = 0; k < N; ++k) {
C[i + j * N] += A[i + k * N] * B[k + j * N];
}
}
}
}
/*
* Programming Techniques for Scientific Simulations I
* HS 2020
* Exercise 10
*/
#include "matrix_multiplication.hpp"
// Invert loop order for locality
// Lift B out of innermost loop
void mm1(matrix_t const & A, matrix_t const & B, matrix_t & C, std::size_t N) noexcept {
for (std::size_t j = 0; j < N; ++j) {
for (std::size_t k = 0; k < N; ++k) {
auto b = B[k + j * N];
for (std::size_t i = 0; i < N; ++i) {
C[i + j * N] += A[i + k * N] * b;
}
}
}
}
/*
* Programming Techniques for Scientific Simulations I
* HS 2020
* Exercise 10
*/
#include "matrix_multiplication.hpp"
#include <cmath> // for std::min
// Blocking
void mm2(matrix_t const & A, matrix_t const & B, matrix_t & C, std::size_t N) noexcept {
constexpr std::size_t L1 = 1 << 15;
constexpr std::size_t n = std::sqrt(L1 / (3.0 * sizeof(double)));
for (std::size_t j = 0; j < N; j += n) {
for (std::size_t k = 0; k < N; k += n) {
for (std::size_t i = 0; i < N; i += n) {
// Macro-kernel
for (std::size_t jj = j; jj < std::min(j + n, N); ++jj) {
for (std::size_t kk = k; kk < std::min(k + n, N); ++kk) {
auto b = B[kk + jj * N];
for (std::size_t ii = i; ii < std::min(i + n, N); ++ii) {
C[ii + jj * N] += A[ii + kk * N] * b;
}
}
}
}
}
}
}
/*
* Programming Techniques for Scientific Simulations I
* HS 2020
* Exercise 10
*/
#include "matrix_multiplication.hpp"
#include <cmath> // for std::min
#include <immintrin.h>
// Manual vectorization using intrinsics
void mm3(matrix_t const & A, matrix_t const & B, matrix_t & C, std::size_t N) noexcept {
constexpr std::size_t L1 = 1 << 15;
constexpr std::size_t n = std::sqrt(L1 / (3.0 * sizeof(double)));
constexpr int v = 4;
for (std::size_t j = 0; j < N; j += n) {
for (std::size_t k = 0; k < N; k += n) {
for (std::size_t i = 0; i < N; i += n) {
// Macro-kernel
for (std::size_t jj = j; jj < std::min(j + n, N); jj += v) {
for (std::size_t kk = k; kk < std::min(k + n, N); kk += v) {
for (std::size_t ii = i; ii < std::min(i + n, N); ii += v) {
// Micro-kernel
auto a = A.data() + ii + kk * N;
auto b = B.data() + kk + jj * N;
auto c = C.data() + ii + jj * N;
auto a0_ = _mm256_loadu_pd(a + 0 * N);
auto a1_ = _mm256_loadu_pd(a + 1 * N);
auto a2_ = _mm256_loadu_pd(a + 2 * N);
auto a3_ = _mm256_loadu_pd(a + 3 * N);
for (int cc = 0; cc < 4; ++cc) {
auto b00 = _mm256_broadcast_sd(b + 0 + cc * N);
auto b01 = _mm256_broadcast_sd(b + 1 + cc * N);
auto b02 = _mm256_broadcast_sd(b + 2 + cc * N);
auto b03 = _mm256_broadcast_sd(b + 3 + cc * N);
auto c0_ = _mm256_loadu_pd(c + cc * N);
c0_ = _mm256_add_pd(c0_,
_mm256_add_pd(
_mm256_add_pd(
_mm256_mul_pd(a0_, b00),
_mm256_mul_pd(a1_, b01)
),
_mm256_add_pd(
_mm256_mul_pd(a2_, b02),
_mm256_mul_pd(a3_, b03)
)
)
);
_mm256_storeu_pd(c + cc * N, c0_);
}
}
}
}
}
}
}
}
/*
* Programming Techniques for Scientific Simulations I
* HS 2020
* Exercise 10
*/
#include "matrix_multiplication.hpp"
extern "C" void dgemm_(
char const & TRANSA,
char const & TRANSB,
int const & M,
int const & N,
int const & K,
double const & alpha,
double const * A,
int const & LDA,
double const * B,
int const & LDB,
double const & beta,
double * C,
int const & LDC
);
void mm_blas(matrix_t const & A, matrix_t const & B, matrix_t & C, std::size_t N) noexcept {
dgemm_('N', 'N', N, N, N, 1, A.data(), N, B.data(), N, 0, C.data(), N);
}
/*
* Programming Techniques for Scientific Simulations I
* HS 2020
* Exercise 10
*/
#include "matrix_multiplication.hpp"
#include <Eigen/Core>
void mm_eigen(matrix_t const & A, matrix_t const & B, matrix_t & C, std::size_t N) noexcept {
auto const Ae = Eigen::Map<Eigen::MatrixXd const>(A.data(), N, N);
auto const Be = Eigen::Map<Eigen::MatrixXd const>(B.data(), N, N);
auto Ce = Eigen::Map<Eigen::MatrixXd>( C.data(), N, N);
Ce = Ae * Be;
}
#!/usr/bin/env python3
# Programming Techniques for Scientific Simulations I
# HS 2020
# Exercise 10
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
def main():
if not (2 <= len(sys.argv) and len(sys.argv) <= 3):
print('Usage: plot.py input_file [output_file]')
exit(1)
input_file = sys.argv[1]
if len(sys.argv) == 3:
output_file = sys.argv[2]
data = pd.read_csv(input_file)
# sns.barplot(x='function', y='time', data=data)
sns.lineplot(x='size', y='time', hue='function', data=data)
plt.gca().set_xscale('log', basex=2)
plt.gca().set_yscale('log', basey=2)
plt.title('Matrix-multiplication benchmark')
plt.xlabel('Function')
plt.ylabel('Time in seconds', rotation=0, horizontalalignment='left')
plt.gca().yaxis.set_label_coords(0, 1)
if len(sys.argv) == 3:
plt.savefig(output_file)
else:
plt.show()
if __name__ == '__main__':
main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment