Commit 27127e6a authored by Panagiotis's avatar Panagiotis
Browse files

added simd examples

parent 1c1dc1cd
// Example codes for HPC course
// (c) 2012 Matthias Troyer, ETH Zurich
// workaround for missing alignas in g++-4.7 and MSVC
#ifndef HPC12_ALIGNAS_HPP
#define HPC12_ALIGNAS_HPP
#if defined(_MSC_VER) and _MSC_VER<=1700
#define alignas(A) __declspec(align(A))
#elif (__GNUC__==4) and (__GNUC_MINOR__ <= 7)
#define alignas(A) __attribute__((aligned(A)))
#endif
#endif
// Example codes for HPC course
// (c) 2012 Matthias Troyer, ETH Zurich
// workaround for missing alignas in g++-4
#include <alignas.hpp>
int alignas(32) x[16];
GCC=g++-mp-4.8
ICC=iCC
CLANG=clang++
$GCC -std=c++11 -O2 saxpy.cpp -o saxpy_gcc_novec
$ICC -no-vec -O3 -std=c++11 saxpy.cpp -o saxpy_icc_novec
$CLANG -fno-vectorize -stdlib=libc++ -std=c++11 -O3 saxpy.cpp -o saxpy_clang_novec
$CLANG -stdlib=libc++ -std=c++11 -O3 saxpy.cpp -o saxpy_clang
$GCC -ftree-vectorize -std=c++11 -O2 saxpy.cpp -o saxpy_gcc1
$GCC -std=c++11 -O3 saxpy.cpp -o saxpy_gcc2
$GCC -ftree-vectorize -std=c++11 -O2 saxpy_restrict.cpp -o saxpy_gcc3
$GCC -ftree-vectorize -std=c++11 -O2 saxpy_aligned_gcc.cpp -o saxpy_gcc4
$ICC -mavx -qopt-report=2 -qopt-report-phase=vec saxpy_restrict.cpp -o saxpy_icc1
$ICC -mavx -qopt-report=2 -qopt-report-phase=vec -O3 -restrict -std=c++11 saxpy_aligned_icc.cpp -o saxpy_icc2
$ICC -mavx -fast -O3 -std=c++11 saxpy_sse.cpp -o saxpy_sse
$ICC -mavx -fast -O3 -std=c++11 saxpy_avx.cpp -o saxpy_avx
$ICC -mavx -fast -no-vec -O0 -std=c++11 saxpy_avx.cpp -o saxpy_avx_noopt
$GCC -c -save-temps -ftree-vectorize -std=c++11 -O3 saxpy_simple_gcc.cpp
$ICC -save-temps -c -restrict -mavx -qopt-report=3 -qopt-report-phase=vec -O3 -std=c++11 saxpy_simple_icc.cpp
$ICC -save-temps -c -restrict -mavx -no-vec -O0 -std=c++11 saxpy_simple_avx.cpp
// Example codes for HPC course
// (c) 2012 Matthias Troyer, ETH Zurich
#include <vector>
#include <cassert>
#include <iostream>
#include <cmath>
#include <x86intrin.h>
#include <immintrin.h>
// a dscal assuming alignment
void dscal(int n, double a, double* x)
{
// broadcast the scale factor into a register
__m256d x0 = _mm256_broadcast_sd(&a);
// we assume alignment
std::size_t xv = *reinterpret_cast<std::size_t*>(&x);
assert(xv % 16 == 0);
int ndiv4 = n/4;
// loop over chunks of 4 values
for (int i=0; i<ndiv4; ++i) {
__m256d x1 = _mm256_load_pd(x+4*i); // aligned (fast) load
__m256d x2 = _mm256_mul_pd(x0,x1); // multiply
_mm256_store_pd(x+4*i,x2); // store back aligned
}
// do the remaining entries
for (int i=ndiv4*4 ; i< n ; ++i)
x[i] *= a;
}
int main()
{
// initialize a vector
std::vector<double> x(1000);
for (int i=0; i<x.size(); ++i)
x[i] = i;
// call sscal
std::cout << "The address is " << &x[0] << "\n";
dscal(x.size(),4.f,&x[0]);
// calculate error
double d=0.;
for (int i=0; i<x.size(); ++i)
d += std::fabs(x[i]-4.*i);
std::cout << "l1-norm of error: " << d << "\n";
}
//==============================================================
//
// SAMPLE SOURCE CODE - SUBJECT TO THE TERMS OF SAMPLE CODE LICENSE AGREEMENT,
// http://software.intel.com/en-us/articles/intel-sample-source-code-license-agreement/
//
// Copyright 2012 Intel Corporation
//
// THIS FILE IS PROVIDED "AS IS" WITH NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
// PURPOSE, NON-INFRINGEMENT OF INTELLECTUAL PROPERTY RIGHTS.
//
// ===============================================================
void copy(char *cp_a, char *cp_b, int n)
{
for (int i = 0; i < n; i++)
cp_a[i] = cp_b[i];
}
//==============================================================
//
// SAMPLE SOURCE CODE - SUBJECT TO THE TERMS OF SAMPLE CODE LICENSE AGREEMENT,
// http://software.intel.com/en-us/articles/intel-sample-source-code-license-agreement/
//
// Copyright 2012 Intel Corporation
//
// THIS FILE IS PROVIDED "AS IS" WITH NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
// PURPOSE, NON-INFRINGEMENT OF INTELLECTUAL PROPERTY RIGHTS.
//
// ===============================================================
void copy(char * restrict cp_a, char * restrict cp_b, int n)
{
for (int i = 0; i < n; i++)
cp_a[i] = cp_b[i];
}
//==============================================================
//
// SAMPLE SOURCE CODE - SUBJECT TO THE TERMS OF SAMPLE CODE LICENSE AGREEMENT,
// http://software.intel.com/en-us/articles/intel-sample-source-code-license-agreement/
//
// Copyright 2012 Intel Corporation
//
// THIS FILE IS PROVIDED "AS IS" WITH NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
// PURPOSE, NON-INFRINGEMENT OF INTELLECTUAL PROPERTY RIGHTS.
//
// ===============================================================
void copy(char *cp_a, char *cp_b, int n)
{
#pragma ivdep
for (int i = 0; i < n; i++)
cp_a[i] = cp_b[i];
}
//==============================================================
//
// SAMPLE SOURCE CODE - SUBJECT TO THE TERMS OF SAMPLE CODE LICENSE AGREEMENT,
// http://software.intel.com/en-us/articles/intel-sample-source-code-license-agreement/
//
// Copyright 2012 Intel Corporation
//
// THIS FILE IS PROVIDED "AS IS" WITH NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
// PURPOSE, NON-INFRINGEMENT OF INTELLECTUAL PROPERTY RIGHTS.
//
// ===============================================================
void test_scalar_dep(double *A, int n)
{
#ifdef TEST_GAP
#pragma loop count min (188)
for (int i=0; i<n; i++) {
double b = A[i];
if (A[i] > 0) {A[i] = 1 / A[i];}
if (A[i] > 1) {A[i] += b;}
}
#else
double b;
for (int i=0; i<n; i++) {
if (A[i] > 0) {b=A[i]; A[i] = 1 / A[i]; }
if (A[i] > 1) {A[i] += b;}
}
#endif
}
//==============================================================
//
// SAMPLE SOURCE CODE - SUBJECT TO THE TERMS OF SAMPLE CODE LICENSE AGREEMENT,
// http://software.intel.com/en-us/articles/intel-sample-source-code-license-agreement/
//
// Copyright 2012 Intel Corporation
//
// THIS FILE IS PROVIDED "AS IS" WITH NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
// PURPOSE, NON-INFRINGEMENT OF INTELLECTUAL PROPERTY RIGHTS.
//
// ===============================================================
void vec_copy(float *dest, float *src, int len)
{
float ii;
#pragma simd
for (int i = 0, ii = 0.0f; i < len; i++)
dest[i] = src[i] * ii++;
}
\ No newline at end of file
//==============================================================
//
// SAMPLE SOURCE CODE - SUBJECT TO THE TERMS OF SAMPLE CODE LICENSE AGREEMENT,
// http://software.intel.com/en-us/articles/intel-sample-source-code-license-agreement/
//
// Copyright 2012 Intel Corporation
//
// THIS FILE IS PROVIDED "AS IS" WITH NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
// PURPOSE, NON-INFRINGEMENT OF INTELLECTUAL PROPERTY RIGHTS.
//
// ===============================================================
char foo(char *A, int n){
int i;
char x = 0;
#ifdef SIMD
#pragma simd // Generates incorrect code
#endif
#ifdef REDUCTION
#pragma simd reduction(+:x) // Generates correct code
#endif
#ifdef IVDEP
#pragma ivdep
#endif
for (i=0; i<n; i++){
x = x + A[i];
}
return x;
}
//==============================================================
//
// SAMPLE SOURCE CODE - SUBJECT TO THE TERMS OF SAMPLE CODE LICENSE AGREEMENT,
// http://software.intel.com/en-us/articles/intel-sample-source-code-license-agreement/
//
// Copyright 2012 Intel Corporation
//
// THIS FILE IS PROVIDED "AS IS" WITH NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
// PURPOSE, NON-INFRINGEMENT OF INTELLECTUAL PROPERTY RIGHTS.
//
// ===============================================================
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#define N 100000
unsigned char a[N];
char b[N];
short sat2short(unsigned char *p, char *q, int n) {
int i;
short x = 0;
#ifdef SIMD
#pragma simd reduction(+:x)
#endif
for (i=0; i<n; i++) {
x = std::max(std::min(x + p[i]*q[i],32767),-32768);
}
return x;
}
int main(int argc, char **argv) {
short x = 0;
const __int64 startTime = __rdtsc();
for (int i=0; i<32767; i++) {
a[i] = 1;
b[i] = 1;
}
#ifdef SIMD
#pragma simd vectorlength(16)
#endif
for (int i=0; i<32767; i++) {
if (i >= 16 && i < 32767) {
b[i] = b[i-16] - 1;
}
printf("b[%d] = \n", b[i]);
}
x = sat2short(&a[0], &b[0], 32767);
const __int64 elapsedTime = __rdtsc() - startTime;
const double elapsedTimeInSecs = (double)elapsedTime / 1000000.0;
if (x==30847) {
printf("passed x = %d, time = %lf \n", x, elapsedTimeInSecs);
}
else {
printf("failed x = %d, time = %lf \n", x, elapsedTimeInSecs);
exit(-1);
}
}
//==============================================================
//
// SAMPLE SOURCE CODE - SUBJECT TO THE TERMS OF SAMPLE CODE LICENSE AGREEMENT,
// http://software.intel.com/en-us/articles/intel-sample-source-code-license-agreement/
//
// Copyright 2012 Intel Corporation
//
// THIS FILE IS PROVIDED "AS IS" WITH NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
// PURPOSE, NON-INFRINGEMENT OF INTELLECTUAL PROPERTY RIGHTS.
//
// ===============================================================
void foo(int *A, int *B, int *restrict C, int n){
int i;
int t = 0;
#ifdef PRIVATE
#pragma simd private(t)
#endif
for (i=0; i<n; i++){
t = 0.;
if (A[i] > 0) {
t = A[i];
}
if (B[i] < 0) {
t = B[i];
}
C[i] = t;
}
}
//==============================================================
//
// SAMPLE SOURCE CODE - SUBJECT TO THE TERMS OF SAMPLE CODE LICENSE AGREEMENT,
// http://software.intel.com/en-us/articles/intel-sample-source-code-license-agreement/
//
// Copyright 2012 Intel Corporation
//
// THIS FILE IS PROVIDED "AS IS" WITH NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
// PURPOSE, NON-INFRINGEMENT OF INTELLECTUAL PROPERTY RIGHTS.
//
// ===============================================================
#include <stdio.h>
#include <stdlib.h>
const int N = 1024;
int a[N], b[N], c[N];
void init() {
for (int i = 0; i < N; i++) a[i] = i;
}
__declspec(noinline)
__declspec(vector)
int vfun_add_one(int x)
{
return x+1;
}
__declspec(noinline)
int checksum()
{
int i;
int sum = 0;
for (i = 0; i < N; i++) {
sum += a[i];
}
return sum;
}
__declspec(noinline)
void d5() {
int h,g = 0;
init();
#pragma simd
for (h = 0; h < N; h++) {
a[h] = vfun_add_one(a[h]);
b[h] = c[h] - a[h];
c[h] = a[h] - b[h];
}
g = checksum();
printf("SIMD for loop: passed %d\n",g);
}
int main(int argc, char *argv[])
{
int g = 0, h = 0;
d5();
return 0;
}
// Example codes for HPC course
// (c) 2012 Matthias Troyer, ETH Zurich
// adapated from code by Wesley P. Petersen published first in
// Petersen and Arbenz "Intro. to Parallel Computing," Oxford Univ. Press, 2004
#include <iostream>
#include <cmath>
#include <vector>
#include <cassert>
#include "xmmintrin.h"
int isamax0(int n, float *x)
{
// assume alignment
assert( ((std::size_t) x % 16) == 0);
__m128 V7 = _mm_set_ps(3.0,2.0,1.0,0.0);
__m128 V2 = _mm_set_ps(3.0,2.0,1.0,0.0);
__m128 V6 = _mm_set_ps1(-0.0);
__m128 offset4 = _mm_set_ps1(4.0);
__m128 V3;
float *xp=x;
int nsegs = (n >> 2) - 2;
int eres = n - 4*(nsegs+2);
__m128 V0 = _mm_load_ps(xp); xp += 4; // first four in 4/time seq.
__m128 V1 = _mm_load_ps(xp); xp += 4; // next four in 4/time seq.
V0 = _mm_andnot_ps(V6,V0); // take absolute value
for(int i=0; i<nsegs; i++){
V1 = _mm_andnot_ps(V6,V1); // take absolute value
V3 = _mm_cmpnle_ps(V1,V0); // compare old max of 4 to new
int mb = _mm_movemask_ps(V3); // any of 4 bigger?
V2 = _mm_add_ps(V2,offset4); // add offset
if(mb > 0){
V0 = _mm_max_ps(V0,V1); // get the new maxima
V3 = _mm_and_ps(V2,V3); // the index if the element was bigger or 0 otherwise
V7 = _mm_max_ps(V7,V3); // get the maxima of the old and new indices
}
V1 = _mm_load_ps(xp); xp += 4; // bottom load next four
}
// finish up the last segment of 4
V1 = _mm_andnot_ps(V6,V1); // take absolute value
V3 = _mm_cmpnle_ps(V1,V0); // compare old max of 4 to new
int mb = _mm_movemask_ps(V3); // any of 4 bigger?
V2 = _mm_add_ps(V2,offset4); // add offset
if(mb > 0){
V0 = _mm_max_ps(V0,V1);
V3 = _mm_and_ps(V2,V3);
V7 = _mm_max_ps(V7,V3);
}
// Now finish up: segment maxima are in V0, indices in V7
float xbig[8], indx[8];
_mm_store_ps(xbig,V0);
_mm_store_ps(indx,V7);
// add remaining numbers
if(eres>0){
for(int i=0; i<eres; i++){
xbig[4+i] = fabsf(*(xp++));
indx[4+i] = (float) (n+i);
}
}
float ebig = 0.;
int iebig = 0;
for(int i=0; i<4+eres; i++){
if(xbig[i] > ebig){
ebig = xbig[i];
iebig = (int) indx[i];
}
}
return iebig;
}
main()
{
std::vector<float> x(20);
for(int i=0; i<x.size(); i++)
x[i] = -2.0 + i;
x[17] =33500.0;
int im = isamax0(x.size(),&x[0]);
std::cout << " maximum index = " << im << "\n";
std::cout << " maximum value = " << x[im] << "\n";
}
#!/bin/bash
function bench_run()
{
echo "Running $1"
$1
}
function bench_select()
{
if [ "$1" = "intel" ]; then
bench_run ./saxpy_icc_novec
bench_run ./saxpy_icc1
bench_run ./saxpy_icc2
bench_run ./saxpy_sse
bench_run ./saxpy_avx
bench_run ./saxpy_avx_noopt
elif [ "$1" = "gcc" ]; then
bench_run ./saxpy_gcc_novec
bench_run ./saxpy_gcc1
bench_run ./saxpy_gcc2
bench_run ./saxpy_gcc3
bench_run ./saxpy_gcc4
fi
}
for type in ${@}; do
echo "Benchmarks type: $type."
bench_select $type
done
// Example codes for HPC course
// (c) 2012 Matthias Troyer, ETH Zurich
#include <vector>
#include <iostream>
#include <cmath>
#include <cassert>
#include <chrono>
#define N 100000
#define REPETITIONS 1000
// a saxpy version without explicit SSE
void saxpy(int n, float a, float* x, float* y)
{
for (int i=0; i<n; ++i)
y[i] += a*x[i];
}
int main()
{
// initialize two vectors
std::vector<float> x(N);
std::vector<float> y(N);
for (int i=0; i<x.size(); ++i)
x[i] = 1.;
for (int i=0; i<y.size(); ++i)
y[i] = 2.;
// call saxpy and time it
std::chrono::time_point<std::chrono::high_resolution_clock> start, end;
start = std::chrono::high_resolution_clock::now();
for (int it=0; it<REPETITIONS; ++it )
saxpy(x.size(),4.f/REPETITIONS,&x[0],&y[0]);
end = std::chrono::high_resolution_clock::now();
int elapsed_time = std::chrono::duration_cast<std::chrono::microseconds>(end-start).count();
// calculate error
float d=0.;
for (int i=0; i<x.size(); ++i)
d += std::fabs(y[i]-6.);
std::cout << "elapsed time: " << elapsed_time/REPETITIONS << "mus\n";
std::cout << "l1-norm of error: " << d << "\n";
}
// Example codes for HPC course