To receive notifications about scheduled maintenance, please subscribe to the mailing-list gitlab-operations@sympa.ethz.ch. You can subscribe to the mailing-list at https://sympa.ethz.ch

Commit e4b4178c authored by Donjan Rodic's avatar Donjan Rodic
Browse files

added ex07 solution

parent 17ee0544
CXX ?= c++
CXXFLAGS ?= -std=c++14 -Wall -Wextra -Wpedantic -fopenmp -O3 -march=native -DNDEBUG
LIBS = -lblas
all: gemm
gemm: gemm.cpp matrix.hpp aligned_allocator.hpp
$(CXX) $(CXXFLAGS) gemm.cpp -o gemm $(LIBS)
// Example codes for HPCSE course
// (c) 2012 Andreas Hehn, ETH Zurich
#ifndef HPCSE_ALIGNED_ALLOCATOR_HPP
#define HPCSE_ALIGNED_ALLOCATOR_HPP
#ifdef _WIN32
#include <malloc.h>
#else
#include <cstdlib>
#endif
#include <memory>
#if __cplusplus >= 201103L
#define NOEXCEPT_SPEC noexcept
#else
#define NOEXCEPT_SPEC
#endif
namespace hpcse {
// Alignment must be a power of 2 !
template <typename T, unsigned int Alignment>
class aligned_allocator {
public:
typedef T* pointer;
typedef T const* const_pointer;
typedef T& reference;
typedef T const& const_reference;
typedef T value_type;
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;
template <typename U>
struct rebind {
typedef aligned_allocator<U,Alignment> other;
};
aligned_allocator() NOEXCEPT_SPEC {
}
aligned_allocator(aligned_allocator const&) NOEXCEPT_SPEC {
}
template <typename U>
aligned_allocator(aligned_allocator<U,Alignment> const& b) NOEXCEPT_SPEC {
}
pointer allocate(size_type n) {
pointer p;
#ifdef _WIN32
p = reinterpret_cast<pointer>(_aligned_malloc(n*sizeof(T), Alignment));
if(p == 0)
throw std::bad_alloc();
#else
if(posix_memalign(reinterpret_cast<void**>(&p), Alignment, n * sizeof(T) ))
throw std::bad_alloc();
#endif
return p;
}
void deallocate(pointer p, size_type) NOEXCEPT_SPEC {
#ifdef _WIN32
_aligned_free(p);
#else
std::free(p);
#endif
}
size_type max_size() const NOEXCEPT_SPEC {
std::allocator<T> a;
return a.max_size();
}
#if __cplusplus >= 201103L
template <typename C, class... Args>
void construct(C* c, Args&&... args) {
new ((void*)c) C(std::forward<Args>(args)...);
}
#else
void construct(pointer p, const_reference t) {
new((void *)p) T(t);
}
#endif
template <typename C>
void destroy(C* c) {
c->~C();
}
bool operator == (aligned_allocator const& a2) const NOEXCEPT_SPEC {
return true;
}
bool operator != (aligned_allocator const& a2) const NOEXCEPT_SPEC {
return false;
}
template <typename U, unsigned int UAlignment>
bool operator == (aligned_allocator<U,UAlignment> const& b) const NOEXCEPT_SPEC {
return false;
}
template <typename U, unsigned int UAlignment>
bool operator != (aligned_allocator<U,UAlignment> const& b) const NOEXCEPT_SPEC {
return true;
}
};
}
#undef NOEXPECT_SPEC
#endif //HPCSE_ALIGNED_ALLOCATOR_HPP
// Example codes for HPC course
// (c) 2016 Donjan Rodic, ETH Zurich
#include "matrix.hpp"
#include <cblas.h>
#include <omp.h>
#include <x86intrin.h>
#include <iostream>
#include <cassert>
#include <algorithm>
#include <chrono>
typedef hpcse::matrix<double,hpcse::row_major> matrix_type;
// WARNING: all security and boundary checks have been removed to facilitate code clarity
// REUSE AT YOUR OWN RISK
////////////////////////////////////////////////////////////////////////////////
// naive implementation
void gemm(
matrix_type const& a
, matrix_type const& b
, matrix_type& c
){
for(size_t i=0; i < num_rows(a); ++i) {
for(size_t j=0; j < num_cols(b); ++j) {
for(size_t k=0; k < num_cols(a); ++k) {
c(i,j) += a(i,k) * b(k,j);
}
}
}
}
////////////////////////////////////////////////////////////////////////////////
// do away with function calling overhead
void gemm_array(
matrix_type const& a
, matrix_type const& b
, matrix_type& c
){
const double* A = a.data();
const double* B = b.data();
double* C = c.data();
const size_t n = num_rows(a);
for(size_t i = 0; i < n; ++i)
for(size_t j = 0; j < n; ++j) {
for(size_t k = 0; k < n; ++k)
C[n*i + j] += A[n*i + k] * B[n*k + j];
}
}
////////////////////////////////////////////////////////////////////////////////
// think about the access order
void gemm_ord(
matrix_type const& a
, matrix_type const& b
, matrix_type& c
){
for(size_t i=0; i < num_rows(a); ++i) {
for(size_t k=0; k < num_cols(a); ++k) {
for(size_t j=0; j < num_cols(b); ++j) {
c(i,j) += a(i,k) * b(k,j);
}
}
}
}
////////////////////////////////////////////////////////////////////////////////
// same as order, but caching values from A
void gemm_ord2(
matrix_type const& a
, matrix_type const& b
, matrix_type& c
){
for(size_t i=0; i < num_rows(a); ++i) {
for(size_t k=0; k < num_cols(a); ++k) {
const double tmp = a(i,k);
for(size_t j=0; j < num_cols(b); ++j) {
c(i,j) += tmp * b(k,j);
}
}
}
}
////////////////////////////////////////////////////////////////////////////////
// same as order, but caching values from A (copying or transposing may help!)
void gemm_copy(
matrix_type const& a
, matrix_type const& b
, matrix_type& c
){
const size_t n = num_rows(a);
double * T = new double[n];
for(size_t j = 0; j < n; ++j) {
for(size_t l = 0; l < n; ++l)
T[l] = b(l,j); // copy j-th B column
for(size_t i = 0; i < n; ++i)
for(size_t k = 0; k < n; ++k)
c(i,j) += a(i,k) * T[k];
}
delete T;
}
////////////////////////////////////////////////////////////////////////////////
// use mtp for unrolling
template<int I>
inline double colexpand(const double * a, double * b, size_t offset) {
return colexpand<I-1>(a, b, offset) + a[offset + I]*b[I] ; // left to right!
}
template<>
inline double colexpand<0>(const double * a, double * b, size_t offset) {
return a[offset+0]*b[0];
}
template<int LEN>
void gemm_exp(
matrix_type const& a
, matrix_type const& b
, matrix_type& c
){
const size_t n = num_rows(a);
double * T = new double[n];
for(size_t j = 0; j < n; ++j) {
for(size_t l = 0; l < n; ++l)
T[l] = b(l,j); // copy j-th B column
for(size_t i = 0; i < n; ++i)
c(i,j) = colexpand<LEN>(a.data(), T, n*i);
}
delete T;
}
////////////////////////////////////////////////////////////////////////////////
// static single (scalar) assignment for ILP
void gemm_ssa(
matrix_type const& a
, matrix_type const& b
, matrix_type& c
){
const double* A = a.data();
const double* B = b.data();
double* C = c.data();
const size_t n = num_rows(a);
for(size_t i = 0; i < n; i++) {
for(size_t j = 0; j < n; j++) {
const double* ap = A + i*n;
const double* bp = B + i*n+j;
double* cp = (double*)C + i*n;
const double* end = cp + n;
while(cp < end)
*cp++ += (*ap) * (*bp++);
}
}
}
////////////////////////////////////////////////////////////////////////////////
// blocking the naive version
void gemm_block(
matrix_type const& a
, matrix_type const& b
, matrix_type& c
){
const size_t X = 32;
const size_t n = num_rows(a);
for(size_t ib = 0; ib < n; ib += X)
for(size_t jb = 0; jb < n; jb += X)
for(size_t kb = 0; kb < n; kb += X)
for(size_t i = ib; i < std::min(ib + X, n); ++i)
for(size_t j = jb; j < std::min(jb + X, n); ++j)
for(size_t k = kb; k < std::min(kb + X, n); ++k)
c(i,j) += a(i,k) * b(k,j);
}
////////////////////////////////////////////////////////////////////////////////
// slightly smarter blocking
void gemm_block2(
matrix_type const& a
, matrix_type const& b
, matrix_type& c
){
const size_t X = 64;
const size_t n = num_rows(a);
for(size_t ib = 0; ib < n; ib += X)
for(size_t kb = 0; kb < n; kb += X)
for(size_t jb = 0; jb < n; jb += X) {
const size_t iend = std::min(ib+X, n);
for(size_t i = ib; i < iend; ++i) {
const size_t kend = std::min(kb + X, n);
for(size_t k = kb; k < kend; ++k) {
const double tmp = a(i,k);
const size_t jend = std::min(jb + X, n);
for(size_t j = jb; j < jend; ++j)
c(i,j) += tmp * b(k,j);
}
}
}
}
////////////////////////////////////////////////////////////////////////////////
// omp naive version
void gemm_omp(
matrix_type const& a
, matrix_type const& b
, matrix_type& c
){
#pragma omp parallel for
for(size_t i=0; i < num_rows(a); ++i) {
for(size_t j=0; j < num_cols(b); ++j) {
for(size_t k=0; k < num_cols(a); ++k) {
c(i,j) += a(i,k) * b(k,j);
}
}
}
}
////////////////////////////////////////////////////////////////////////////////
// omp with better loop order, see SIMD lecture
void gemm_omp2(
matrix_type const& a
, matrix_type const& b
, matrix_type& c
){
#pragma omp parallel for
for(size_t k=0; k < num_cols(a); ++k) {
for(size_t i=0; i < num_rows(a); ++i) {
for(size_t j=0; j < num_cols(b); ++j) {
c(i,j) += a(i,k) * b(k,j);
}
}
}
}
////////////////////////////////////////////////////////////////////////////////
// Strassen, starts being noticeable for 2000x2000 matrices
void gemm_blockStep(const double * A, const double * B, double * C, size_t n) {
double tmp;
size_t b = 32;
for(size_t ib = 0; ib < n; ib += b)
for(size_t jb = 0; jb < n; jb += b)
for(size_t kb = 0; kb < n; kb += b)
for(size_t i = ib; i < std::min(ib + b, n); ++i)
for(size_t k = kb; k < std::min(kb + b, n); ++k) {
tmp = A[n*i + k];
for(size_t j = jb; j < std::min(jb + b, n); ++j)
C[n*i + j] += tmp * B[n*k + j];
}
}
void gemm_strass_impl(const double * A, const double * B, double * C, size_t n, size_t recurse = 1) {
size_t m = n/2;
size_t N = m*m;
double * M1 = new double[N], * M2 = new double[N], * M3 = new double[N],
* M4 = new double[N], * M5 = new double[N], * M6 = new double[N],
* M7 = new double[N];
std::fill_n(M1, N, 0); std::fill_n(M2, N, 0); std::fill_n(M3, N, 0); std::fill_n(M4, N, 0);
std::fill_n(M5, N, 0); std::fill_n(M6, N, 0); std::fill_n(M7, N, 0);
// M1 = (A11 + A22) * (B11 + B22)
// M2 = (A21 + A22) * B11
// M3 = A11 * (B12 - B22)
// M4 = A22 * (B21 - B11)
// M5 = (A11 + A12) * B22
// M6 = (A21 - A11) * (B11 + B12)
// M7 = (A12 - A22) * (B21 + B22)
//~ for(size_t i = 0; i < m; ++i) // trivial algorithm
//~ for(size_t j = 0; j < m; ++j)
//~ for(size_t k = 0; k < m; ++k) {
//~ M1[m*i + j] += (A11 + A22) * (B11 + B22);
//~ M2[m*i + j] += (A21 + A22) * B11;
//~ M3[m*i + j] += A11 * (B12 - B22);
//~ M4[m*i + j] += A22 * (B21 - B11);
//~ M5[m*i + j] += (A11 + A12) * B22;
//~ M6[m*i + j] += (A21 - A11) * (B11 + B12);
//~ M7[m*i + j] += (A12 - A22) * (B21 + B22);
//~ }
//~ for(size_t i = 0; i < m; ++i) // too slow unit step
//~ for(size_t k = 0; k < m; ++k) {
//~ double tmp1 = (A11 + A22), tmp2 = (A21 + A22), tmp3 = A11, tmp4 = A22,
//~ tmp5 = (A11 + A12), tmp6 = (A21 - A11), tmp7 = (A12 - A22);
//~ for(size_t j = 0; j < m; ++j) {
//~ M1[m*i + j] += tmp1 * (B11 + B22);
//~ M2[m*i + j] += tmp2 * B11;
//~ M3[m*i + j] += tmp3 * (B12 - B22);
//~ M4[m*i + j] += tmp4 * (B21 - B11);
//~ M5[m*i + j] += tmp5 * B22;
//~ M6[m*i + j] += tmp6 * (B11 + B12);
//~ M7[m*i + j] += tmp7 * (B21 + B22);
//~ }
//~ }
double * a1122 = new double[N], * a2122 = new double[N], * a11 = new double[N],
* a22 = new double[N], * a1112 = new double[N], * a21m11 = new double[N],
* a12m22 = new double[N],
* b1122 = new double[N], * b11 = new double[N], * b12m22 = new double[N],
* b21m11 = new double[N], * b22 = new double[N], * b1112 = new double[N],
* b2122 = new double[N];
for(size_t i = 0; i < m; ++i)
for(size_t j = 0; j < m; ++j) {
const size_t Z = m*i + j;
a1122[Z] = A[n*i+j] + A[n*(i+m)+j+m];
a2122[Z] = A[n*(i+m)+j] + A[n*(i+m)+j+m];
a11[Z] = A[n*i+j];
a22[Z] = A[n*(i+m)+j+m];
a1112[Z] = A[n*i+j] + A[n*i+j+m];
a21m11[Z] = A[n*(i+m)+j] - A[n*i+j];
a12m22[Z] = A[n*i+j+m] - A[n*(i+m)+j+m];
b1122[Z] = B[n*i+j] + B[n*(i+m)+j+m];
b11[Z] = B[n*i+j];
b12m22[Z] = B[n*i+j+m] - B[n*(i+m)+j+m];
b21m11[Z] = B[n*(i+m)+j] - B[n*i+j];
b22[Z] = B[n*(i+m)+j+m];
b1112[Z] = B[n*i+j] + B[n*i+j+m];
b2122[Z] = B[n*(i+m)+j] + B[n*(i+m)+j+m];
}
if(recurse > 0) {
gemm_strass_impl(a1122, b1122, M1, m, recurse - 1);
gemm_strass_impl(a2122, b11, M2, m, recurse - 1);
gemm_strass_impl(a11, b12m22, M3, m, recurse - 1);
gemm_strass_impl(a22, b21m11, M4, m, recurse - 1);
gemm_strass_impl(a1112, b22, M5, m, recurse - 1);
gemm_strass_impl(a21m11, b1112, M6, m, recurse - 1);
gemm_strass_impl(a12m22, b2122, M7, m, recurse - 1);
} else {
// Use gemm_blockStep
gemm_blockStep(a1122, b1122, M1, m);
gemm_blockStep(a2122, b11, M2, m);
gemm_blockStep(a11, b12m22, M3, m);
gemm_blockStep(a22, b21m11, M4, m);
gemm_blockStep(a1112, b22, M5, m);
gemm_blockStep(a21m11, b1112, M6, m);
gemm_blockStep(a12m22, b2122, M7, m);
}
delete a1122; delete a2122; delete a11; delete a22; delete a1112; delete a21m11; delete a12m22;
delete b1122; delete b11; delete b12m22; delete b21m11; delete b22; delete b1112; delete b2122;
// C11 = M1 + M4 - M5 + M7
// C12 = M3 + M5
// C21 = M2 + M4
// C22 = M1 - M2 + M3 + M6
for(size_t i = 0; i < m; ++i)
for(size_t j = 0; j < m; ++j) {
const size_t Z = m*i + j;
C[n*i + j] = M1[Z] + M4[Z] - M5[Z] + M7[Z];
C[n*i + j+m] = M3[Z] + M5[Z];
C[n*(i+m) + j] = M2[Z] + M4[Z];
C[n*(i+m) + j+m] = M1[Z] - M2[Z] + M3[Z] + M6[Z];
}
delete M1; delete M2; delete M3; delete M4; delete M5; delete M6; delete M7;
}
void gemm_strass(
matrix_type const& a
, matrix_type const& b
, matrix_type& c
, size_t recurse = 3 // 1 repeat is best with sizes lower than 2000x2000
){
gemm_strass_impl(a.data(), b.data(), c.data(), num_rows(a), recurse);
}
////////////////////////////////////////////////////////////////////////////////
// vectorisation
void gemm_avx(
matrix_type const& a
, matrix_type const& b
, matrix_type& c
){
const double* A = a.data();
const double* B = b.data();
double* C = c.data();
const size_t n = num_rows(a);
size_t column = 0;
for (size_t i = 0; i < n; ++i) {
size_t transposedRow = 0;
for (size_t j = 0; j < n; ++j) {
__m256d sum_avx = _mm256_setzero_pd();
for (size_t k = 0; k < n; k += 4) {
//~ __m256d a = _mm256_load_pd(&At[transposedRow + k]);
__m256d a = _mm256_load_pd(&A[transposedRow + k]);
__m256d b = _mm256_load_pd(&B[column + k]);
#ifdef FMA_AVAILABLE
sum_avx = _mm256_fmadd_pd(a, b, sum_avx);
#else
__m256d dummy = _mm256_mul_pd(a, b);
sum_avx = _mm256_add_pd(dummy, sum_avx);
#endif
}
alignas(64) double sums[4];
_mm256_store_pd(&sums[0], sum_avx);
double sum1 = sums[0] + sums[1];
double sum2 = sums[2] + sums[3];
double sum = sum1 + sum2;
C[column + j] = sum;
transposedRow += n;
}
column += n;
}
}
////////////////////////////////////////////////////////////////////////////////
// blas
void gemm_blas(
matrix_type const& a
, matrix_type const& b
, matrix_type& c
){
const size_t n = num_rows(a); // assuming same n for all (square) matrices
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
n, n, n, 1.0, a.data(), n, b.data(), n, 1.0, c.data(), n);
}
////////////////////////////////////////////////////////////////////////////////
#define REPEATS 7
std::chrono::time_point<std::chrono::high_resolution_clock> start, end;
double elapsed_seconds;
#define test(alg, a, b, c) \
start = std::chrono::high_resolution_clock::now(); \
for(size_t i = 0; i < REPEATS; ++i) { \
alg(a,b,c); \
} \
end = std::chrono::high_resolution_clock::now(); \
elapsed_seconds = std::chrono::duration<double>(end-start).count(); \
std::cout << #alg << ": \t" << elapsed_seconds << "s" << std::endl; //
int main() {
//~ const int n = 1536;
const int n = 512;
//~ const int n = 2;
matrix_type b(n,n);
matrix_type a(n,n);