Commit a834eeab authored by novatig
Browse files

merged

parents ea0c6594 4b91f402
...@@ -391,7 +391,7 @@ void gemm_strass( ...@@ -391,7 +391,7 @@ void gemm_strass(
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// vectorisation // vectorisation for just AVX2
void gemm_avx( void gemm_avx(
matrix_type const& a matrix_type const& a
...@@ -399,27 +399,43 @@ void gemm_avx( ...@@ -399,27 +399,43 @@ void gemm_avx(
, matrix_type& c , matrix_type& c
){ ){
const double* A = a.data(); const double* A = a.data();
const double* B = b.data();
double* C = c.data();
const size_t n = num_rows(a); const size_t n = num_rows(a);
size_t J = 0; //~ size_t J = 0;
for (size_t i = 0; i < n; ++i) { //~ for(size_t i = 0; i < n; ++i) {
size_t step = 0; //~ size_t step = 0;
for (size_t j = 0; j < n; ++j) { //~ for(size_t j = 0; j < n; ++j) {
__m256d sum = _mm256_setzero_pd(); //~ __m256d sum = _mm256_setzero_pd();
for (size_t k = 0; k < n; k += 4) { //~ for(size_t k = 0; k < n; k += 4) {
__m256d a = _mm256_load_pd(&A[step + k]); //~ __m256d a = _mm256_load_pd(&A[step + k]);
__m256d b = _mm256_load_pd(&B[J + k]); //~ __m256d b = _mm256_load_pd(&B[J + k]);
__m256d dummy = _mm256_mul_pd(a, b); //~ __m256d dummy = _mm256_mul_pd(a, b);
sum = _mm256_add_pd(dummy, sum); //~ sum = _mm256_add_pd(dummy, sum);
} //~ }
alignas(64) double sums[4]; //~ alignas(64) double sums[4];
_mm256_store_pd(&sums[0], sum); //~ _mm256_store_pd(&sums[0], sum);
C[J + j] = sums[0] + sums[1] + sums[2] + sums[3]; //~ C[J + j] = sums[0] + sums[1] + sums[2] + sums[3];
step += n; //~ step += n;
//~ }
//~ J += n;
//~ }
double * T = new double[n];
for(size_t j = 0; j < n; ++j) {
for(size_t l = 0; l < n; ++l)
T[l] = b(l,j); // copy j-th B column
for(size_t i = 0; i < n; ++i) {
__m256d sum = _mm256_setzero_pd();
for(size_t k = 0; k < n; k+=4) {
__m256d a = _mm256_load_pd(&A[n*i+k]);
__m256d b = _mm256_load_pd(&T[k]);
__m256d tmp = _mm256_mul_pd(a, b);
sum = _mm256_add_pd(tmp, sum);
} }
J += n; alignas(64) double s[4];
_mm256_store_pd(&s[0], sum);
c(i,j) += s[0] + s[1] + s[2] + s[3];
}
} }
} }
...@@ -468,20 +484,20 @@ int main() { ...@@ -468,20 +484,20 @@ int main() {
std::generate_n( b.data(), num_rows(b)*num_cols(b), [&x]() -> double { x-=0.15; return x; }); std::generate_n( b.data(), num_rows(b)*num_cols(b), [&x]() -> double { x-=0.15; return x; });
test(gemm, a, b, c); //~ test(gemm, a, b, c);
test(gemm_array, a, b, c); //~ test(gemm_array, a, b, c);
test(gemm_ord, a, b, c); //~ test(gemm_ord, a, b, c);
test(gemm_ord2, a, b, c); //~ test(gemm_ord2, a, b, c);
test(gemm_copy, a, b, c); //~ test(gemm_copy, a, b, c);
test(gemm_exp<n>, a, b, c); //~ test(gemm_exp<n>, a, b, c);
test(gemm_ssa, a, b, c); //~ test(gemm_ssa, a, b, c);
test(gemm_block, a, b, c); //~ test(gemm_block, a, b, c);
test(gemm_block2, a, b, c); //~ test(gemm_block2, a, b, c);
test(gemm_omp, a, b, c); //~ test(gemm_omp, a, b, c);
test(gemm_omp2, a, b, c); //~ test(gemm_omp2, a, b, c);
test(gemm_strass, a, b, c); //~ test(gemm_strass, a, b, c);
test(gemm_avx, a, b, c); //~ test(gemm_avx, a, b, c);
test(gemm_blas, a, b, c); //~ test(gemm_blas, a, b, c);
return 0; return 0;
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment