Commit b7d27dae authored by Donjan Rodic
Browse files

cleanup 3

parent 25801813
......@@ -391,7 +391,7 @@ void gemm_strass(
}
////////////////////////////////////////////////////////////////////////////////
// vectorisation
// vectorisation for just AVX2
void gemm_avx(
matrix_type const& a
......@@ -399,27 +399,43 @@ void gemm_avx(
, matrix_type& c
){
const double* A = a.data();
const double* B = b.data();
double* C = c.data();
const size_t n = num_rows(a);
size_t J = 0;
for (size_t i = 0; i < n; ++i) {
size_t step = 0;
for (size_t j = 0; j < n; ++j) {
//~ size_t J = 0;
//~ for(size_t i = 0; i < n; ++i) {
//~ size_t step = 0;
//~ for(size_t j = 0; j < n; ++j) {
//~ __m256d sum = _mm256_setzero_pd();
//~ for(size_t k = 0; k < n; k += 4) {
//~ __m256d a = _mm256_load_pd(&A[step + k]);
//~ __m256d b = _mm256_load_pd(&B[J + k]);
//~ __m256d dummy = _mm256_mul_pd(a, b);
//~ sum = _mm256_add_pd(dummy, sum);
//~ }
//~ alignas(64) double sums[4];
//~ _mm256_store_pd(&sums[0], sum);
//~ C[J + j] = sums[0] + sums[1] + sums[2] + sums[3];
//~ step += n;
//~ }
//~ J += n;
//~ }
double * T = new double[n];
for(size_t j = 0; j < n; ++j) {
for(size_t l = 0; l < n; ++l)
T[l] = b(l,j); // copy j-th B column
for(size_t i = 0; i < n; ++i) {
__m256d sum = _mm256_setzero_pd();
for (size_t k = 0; k < n; k += 4) {
__m256d a = _mm256_load_pd(&A[step + k]);
__m256d b = _mm256_load_pd(&B[J + k]);
__m256d dummy = _mm256_mul_pd(a, b);
sum = _mm256_add_pd(dummy, sum);
for(size_t k = 0; k < n; k+=4) {
__m256d a = _mm256_load_pd(&A[n*i+k]);
__m256d b = _mm256_load_pd(&T[k]);
__m256d tmp = _mm256_mul_pd(a, b);
sum = _mm256_add_pd(tmp, sum);
}
alignas(64) double sums[4];
_mm256_store_pd(&sums[0], sum);
C[J + j] = sums[0] + sums[1] + sums[2] + sums[3];
step += n;
alignas(64) double s[4];
_mm256_store_pd(&s[0], sum);
c(i,j) += s[0] + s[1] + s[2] + s[3];
}
J += n;
}
}
......@@ -468,20 +484,20 @@ int main() {
std::generate_n( b.data(), num_rows(b)*num_cols(b), [&x]() -> double { x-=0.15; return x; });
test(gemm, a, b, c);
test(gemm_array, a, b, c);
test(gemm_ord, a, b, c);
test(gemm_ord2, a, b, c);
test(gemm_copy, a, b, c);
test(gemm_exp<n>, a, b, c);
test(gemm_ssa, a, b, c);
test(gemm_block, a, b, c);
test(gemm_block2, a, b, c);
test(gemm_omp, a, b, c);
test(gemm_omp2, a, b, c);
test(gemm_strass, a, b, c);
test(gemm_avx, a, b, c);
test(gemm_blas, a, b, c);
//~ test(gemm, a, b, c);
//~ test(gemm_array, a, b, c);
//~ test(gemm_ord, a, b, c);
//~ test(gemm_ord2, a, b, c);
//~ test(gemm_copy, a, b, c);
//~ test(gemm_exp<n>, a, b, c);
//~ test(gemm_ssa, a, b, c);
//~ test(gemm_block, a, b, c);
//~ test(gemm_block2, a, b, c);
//~ test(gemm_omp, a, b, c);
//~ test(gemm_omp2, a, b, c);
//~ test(gemm_strass, a, b, c);
//~ test(gemm_avx, a, b, c);
//~ test(gemm_blas, a, b, c);
return 0;
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment