Commit 280671c3 authored by Donjan Rodic
Browse files

cleanup

parent e4b4178c
......@@ -403,31 +403,24 @@ void gemm_avx(
double* C = c.data();
const size_t n = num_rows(a);
size_t column = 0;
size_t J = 0;
for (size_t i = 0; i < n; ++i) {
size_t transposedRow = 0;
size_t step = 0;
for (size_t j = 0; j < n; ++j) {
__m256d sum_avx = _mm256_setzero_pd();
__m256d sum = _mm256_setzero_pd();
for (size_t k = 0; k < n; k += 4) {
//~ __m256d a = _mm256_load_pd(&At[transposedRow + k]);
__m256d a = _mm256_load_pd(&A[transposedRow + k]);
__m256d b = _mm256_load_pd(&B[column + k]);
#ifdef FMA_AVAILABLE
sum_avx = _mm256_fmadd_pd(a, b, sum_avx);
#else
//~ __m256d a = _mm256_load_pd(&At[step + k]);
__m256d a = _mm256_load_pd(&A[step + k]);
__m256d b = _mm256_load_pd(&B[J + k]);
__m256d dummy = _mm256_mul_pd(a, b);
sum_avx = _mm256_add_pd(dummy, sum_avx);
#endif
sum = _mm256_add_pd(dummy, sum);
}
alignas(64) double sums[4];
_mm256_store_pd(&sums[0], sum_avx);
double sum1 = sums[0] + sums[1];
double sum2 = sums[2] + sums[3];
double sum = sum1 + sum2;
C[column + j] = sum;
transposedRow += n;
_mm256_store_pd(&sums[0], sum);
C[J + j] = sums[0] + sums[1] + sums[2] + sums[3];
step += n;
}
column += n;
J += n;
}
}
......@@ -476,16 +469,16 @@ int main() {
std::generate_n( b.data(), num_rows(b)*num_cols(b), [&x]() -> double { x-=0.15; return x; });
//~ test(gemm, a, b, c);
test(gemm, a, b, c);
test(gemm_array, a, b, c);
//~ test(gemm_ord, a, b, c);
test(gemm_ord, a, b, c);
test(gemm_ord2, a, b, c);
test(gemm_copy, a, b, c);
test(gemm_exp<n>, a, b, c);
test(gemm_ssa, a, b, c);
//~ test(gemm_block, a, b, c);
test(gemm_block, a, b, c);
test(gemm_block2, a, b, c);
//~ test(gemm_omp, a, b, c);
test(gemm_omp, a, b, c);
test(gemm_omp2, a, b, c);
test(gemm_strass, a, b, c);
test(gemm_avx, a, b, c);
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment