Commit 1fc1e398 authored by Panagiotis's avatar Panagiotis
Browse files

Merge branch 'master' of gitlab.ethz.ch:hpcse_hs16/lecture

parents ffe61298 663299fb
/*
* bandwidth.cpp
* Copyright 2016 ETH Zurich. All rights reserved.
*
*/
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <cstring>
#include <map>
#include <algorithm>
#include <vector>
#include <string>
#include <cassert>
#include <sys/time.h>
#if defined(_OPENMP)
#include <omp.h>
#endif
#include <numa.h>
#include "ArgumentParser.hpp"
using namespace std;
//========================================================================================
//
// Benchmark
//
//========================================================================================
double mysecond() {
#if defined(_OPENMP)
return omp_get_wtime();
#else
struct timeval tp;
struct timezone tzp;
int i;
i = gettimeofday(&tp,&tzp);
return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
#endif
}
void mycopy(const float* const a, float* const b, const int N) {
#if !defined(_RANGE_)
#pragma omp parallel for
#endif
for (int j=0; j<N; j++)
b[j] = a[j];
}
void myadd(const float* const a, float* const b, float* const c, const int N) {
#if !defined(_RANGE_)
#pragma omp parallel for
#endif
for (int j=0; j<N; j++)
b[j] = a[j]+c[j];
}
vector<double> benchmark(int N, size_t NTIMES) {
#if !defined(_USE_NUMA_)
#define numa_alloc_interleaved malloc
#define numa_free(x,y) free(x)
#endif
float * a = (float*)numa_alloc_interleaved(sizeof(float)*N);
float * b = (float*)numa_alloc_interleaved(sizeof(float)*N);
#if defined(_TRIADD_)
float * c = (float*)numa_alloc_interleaved(sizeof(float)*N);
#endif
// check that the arrays are correctly allocated
if (a == NULL || ((size_t)a & 0xf) != 0) {
cout << "a is NULL! aborting" << endl;
abort();
}
if (b == NULL || ((size_t)b & 0xf) != 0) {
cout << "b is NULL! aborting" << endl;
abort();
}
#if defined(_TRIADD_)
if (c == NULL || ((size_t)c & 0xf) != 0) {
cout << "c is NULL! aborting" << endl;
abort();
}
#endif
// fill the arrays
#if !defined(_RANGE_)
#pragma omp parallel for
#endif
for (int j=0; j<N; j++) {
a[j] = 2.0;
b[j] = 1.0;
#if defined(_TRIADD_)
c[j] = 1.0;
#endif
}
// vector containing timing of each sample
vector<double> result(NTIMES);
// run benchmark
for (uint j=0; j<NTIMES; ++j) {
const double tstart = mysecond();
#if defined(_TRIADD_)
myadd(a, b, c, N);
#else
mycopy(a, b, N);
#endif
const double tend = mysecond();
result[j] = tend - tstart;
swap(a, b);
}
sort(result.begin(), result.end());
numa_free(a, sizeof(float)*N);
numa_free(b, sizeof(float)*N);
#if defined(_TRIADD_)
numa_free(c, sizeof(float)*N);
#endif
return result;
}
void driver(const int N, int NTIMES, const int wordsize) {
// running benchmarks
{
vector<double> timings;
timings = benchmark(N, NTIMES);
#if defined(_TRIADD_)
const double denom = 1.0E-9 * 3. * wordsize * N;
#else
const double denom = 1.0E-9 * 2. * wordsize * N;
#endif
if (timings.size() == 0) {
cout << "Error: 0 Samples collected. Aborting.\n";
abort();
}
static bool first = true;
if (first) {
cout << "Footprint [KB]\t\tN\t\tBandwidth[GB/s]\n";
first = false;
}
// we look at the 50th percentile
const int i2 = 0.50*NTIMES;
#if defined(_TRIADD_)
cout << 3*wordsize*N/1024. << "\t\t" << N << "\t\t" << denom/(timings[i2]) << endl;
#else
cout << 2*wordsize*N/1024. << "\t\t" << N << "\t\t" << denom/(timings[i2]) << endl;
#endif
}
}
int main(int argc, const char * argv[]) {
ArgumentParser arg(argc,argv);
const bool RANGE = arg.get("range", false);
const int N = arg.get("size", 40'000'000); // ticks require c++14
size_t NTIMES = arg.get("iter", 100);
const int wordsize = sizeof(float);
const float KB = 2.*wordsize*N/1024.;
if (KB < 2) {
cout << "This size is to small! aborting. " << KB << " KB\n" ;
abort();
}
// running benchmarks
if(!RANGE)
driver(16*((N+15)/16), NTIMES, wordsize);
else {
double myN = N;
const size_t NEND = N*5e5;
while (myN < NEND) {
driver(16*(int)((myN+15)/16), NTIMES, wordsize);
// this factor is arbitrary and can be tuned
myN *= 1.05;
}
}
}
...@@ -76,32 +76,32 @@ void ComputePower(double * s, int iSize) { ...@@ -76,32 +76,32 @@ void ComputePower(double * s, int iSize) {
// compute independently 8 polynomial evaluations with horner's scheme of degree 4N // compute independently 8 polynomial evaluations with horner's scheme of degree 4N
// 4*8*2*N FLOP // 4*8*2*N FLOP
for (int i=0; i<N; i++) { for (int i=0; i<N; i++) {
double gx0 = MULADD(b0, x0, a0); const double gx0 = MULADD(b0, x0, a0);
double gy0 = MULADD(b0, y0, a0); const double gy0 = MULADD(b0, y0, a0);
double gz0 = MULADD(b0, z0, a0); const double gz0 = MULADD(b0, z0, a0);
double gw0 = MULADD(b0, w0, a0); const double gw0 = MULADD(b0, w0, a0);
double gr0 = MULADD(b0, r0, a0); const double gr0 = MULADD(b0, r0, a0);
double gt0 = MULADD(b0, t0, a0); const double gt0 = MULADD(b0, t0, a0);
double gu0 = MULADD(b0, u0, a0); const double gu0 = MULADD(b0, u0, a0);
double gv0 = MULADD(b0, v0, a0); const double gv0 = MULADD(b0, v0, a0);
double tx0 = MULADD(b1, gx0, a1); const double tx0 = MULADD(b1, gx0, a1);
double ty0 = MULADD(b1, gy0, a1); const double ty0 = MULADD(b1, gy0, a1);
double tz0 = MULADD(b1, gz0, a1); const double tz0 = MULADD(b1, gz0, a1);
double tw0 = MULADD(b1, gw0, a1); const double tw0 = MULADD(b1, gw0, a1);
double tr0 = MULADD(b1, gr0, a1); const double tr0 = MULADD(b1, gr0, a1);
double tt0 = MULADD(b1, gt0, a1); const double tt0 = MULADD(b1, gt0, a1);
double tu0 = MULADD(b1, gu0, a1); const double tu0 = MULADD(b1, gu0, a1);
double tv0 = MULADD(b1, gv0, a1); const double tv0 = MULADD(b1, gv0, a1);
double ux0 = MULADD(b2, tx0, a2); const double ux0 = MULADD(b2, tx0, a2);
double uy0 = MULADD(b2, ty0, a2); const double uy0 = MULADD(b2, ty0, a2);
double uz0 = MULADD(b2, tz0, a2); const double uz0 = MULADD(b2, tz0, a2);
double uw0 = MULADD(b2, tw0, a2); const double uw0 = MULADD(b2, tw0, a2);
double ur0 = MULADD(b2, tr0, a2); const double ur0 = MULADD(b2, tr0, a2);
double ut0 = MULADD(b2, tt0, a2); const double ut0 = MULADD(b2, tt0, a2);
double uu0 = MULADD(b2, tu0, a2); const double uu0 = MULADD(b2, tu0, a2);
double uv0 = MULADD(b2, tv0, a2); const double uv0 = MULADD(b2, tv0, a2);
x0 = MULADD(b3, ux0, a3); x0 = MULADD(b3, ux0, a3);
y0 = MULADD(b3, uy0, a3); y0 = MULADD(b3, uy0, a3);
......
CXX=g++
CXXFLAGS=-Wall -std=c++11 -O2
all: sum_serial sum_avx sum_auto
clean:
rm -f sum_serial sum_avx sum_auto
sum_serial: sum_serial.cpp
$(CXX) $(CXXFLAGS) -o $@ $<
sum_avx: sum_avx.cpp
# $(CXX) $(CXXFLAGS) -mavx -Wa,-q -o $@ $< # for mac
$(CXX) $(CXXFLAGS) -mavx -o $@ $< # for linux
sum_auto: sum_serial.cpp
$(CXX) $(CXXFLAGS) -mavx -ftree-vectorize -o $@ $<
// Example codes for HPCSE course
// (c) 2012 Andreas Hehn, ETH Zurich
#ifndef HPCSE_ALIGNED_ALLOCATOR_HPP
#define HPCSE_ALIGNED_ALLOCATOR_HPP
#ifdef _WIN32
#include <malloc.h>
#else
#include <cstdlib>
#endif
#if __cplusplus >= 201103L
#define NOEXCEPT_SPEC noexcept
#else
#define NOEXCEPT_SPEC
#endif
namespace hpcse {
// Alignment must be a power of 2 !
template <typename T, unsigned int Alignment>
class aligned_allocator {
public:
typedef T* pointer;
typedef T const* const_pointer;
typedef T& reference;
typedef T const& const_reference;
typedef T value_type;
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;
template <typename U>
struct rebind {
typedef aligned_allocator<U,Alignment> other;
};
aligned_allocator() NOEXCEPT_SPEC {
}
aligned_allocator(aligned_allocator const& a) NOEXCEPT_SPEC {
}
template <typename U>
aligned_allocator(aligned_allocator<U,Alignment> const& b) NOEXCEPT_SPEC {
}
pointer allocate(size_type n) {
pointer p;
#ifdef _WIN32
p = reinterpret_cast<pointer>(_aligned_malloc(n*sizeof(T), Alignment));
if(p == 0)
throw std::bad_alloc();
#else
if(posix_memalign(reinterpret_cast<void**>(&p), Alignment, n * sizeof(T) ))
throw std::bad_alloc();
#endif
return p;
}
void deallocate(pointer p, size_type n) NOEXCEPT_SPEC {
#ifdef _WIN32
_aligned_free(p);
#else
std::free(p);
#endif
}
size_type max_size() const NOEXCEPT_SPEC {
std::allocator<T> a;
return a.max_size();
}
#if __cplusplus >= 201103L
template <typename C, class... Args>
void construct(C* c, Args&&... args) {
new ((void*)c) C(std::forward<Args>(args)...);
}
#else
void construct(pointer p, const_reference t) {
new((void *)p) T(t);
}
#endif
template <typename C>
void destroy(C* c) {
c->~C();
}
bool operator == (aligned_allocator const& a2) const NOEXCEPT_SPEC {
return true;
}
bool operator != (aligned_allocator const& a2) const NOEXCEPT_SPEC {
return false;
}
template <typename U, unsigned int UAlignment>
bool operator == (aligned_allocator<U,UAlignment> const& b) const NOEXCEPT_SPEC {
return false;
}
template <typename U, unsigned int UAlignment>
bool operator != (aligned_allocator<U,UAlignment> const& b) const NOEXCEPT_SPEC {
return true;
}
};
}
#undef NOEXPECT_SPEC
#endif //HPCSE_ALIGNED_ALLOCATOR_HPP
#include <vector>
#include <numeric>
#include <iostream>
#include <x86intrin.h>
#include "timer.hpp"
#include "aligned_allocator.hpp"
int main( int argc, char** argv )
{
// repetitions
const int nrep = 10000;
// vector size
const int n = 1<<18;
// initialize 64 byte aligned vectors
std::vector< float, hpcse::aligned_allocator<float,64> > x(n,-1.2), y(n,3.4), z(n);
hpcse::timer<> tim;
tim.start();
for (int k = 0; k < nrep; ++k)
{
const __m256 f2 = _mm256_set1_ps( 2.f );
for( int i = 0; i < n; i += 8 )
{
// z[i] = x[i]*x[i] + y[i]*y[i] + 2.*x[i]*y[i];
const __m256 xx = _mm256_load_ps( &x[i] );
const __m256 yy = _mm256_load_ps( &y[i] );
const __m256 z1 = _mm256_mul_ps( xx, xx );
const __m256 z2 = _mm256_mul_ps( yy, yy );
const __m256 z3 = _mm256_mul_ps( xx, yy );
const __m256 z4 = _mm256_mul_ps( f2, z3 );
const __m256 zz = _mm256_add_ps( z1, z2 );
const __m256 res = _mm256_add_ps( zz, z4 );
// Store back the result.
// Since the result won't be used any time soon it is useless to
// store it in the caches. It would only pollute the caches. Hence
// we could store it back to the RAM bypassing the cache-hierarchy by
// using _mm256_stream_ps instead of the regular _mm256_store_ps.
// However, _mm256_store_ps is faster for such small sizes.
_mm256_store_ps( &z[i], res );
}
}
tim.stop();
// print result checksum
std::cout << std::accumulate(z.begin(), z.end(), 0.) << std::endl;
std::cout << "Task took " << tim.get_timing() << " seconds." << std::endl;
}
#include <vector>
#include <numeric>
#include <iostream>
#include "timer.hpp"
#include "aligned_allocator.hpp"
int main( int argc, char** argv )
{
// repetitions
const int nrep = 10000;
// vector size
const int n = 1<<18;
// initialize 64 byte aligned vectors
std::vector< float, hpcse::aligned_allocator<float,64> > x(n,-1.2), y(n,3.4), z(n);
hpcse::timer<> tim;
tim.start();
for (int k = 0; k < nrep; ++k)
{
for( int i = 0; i < n; ++i )
{
z[i] = x[i]*x[i] + y[i]*y[i] + 2.*x[i]*y[i];
}
}
tim.stop();
// print result checksum
std::cout << std::accumulate(z.begin(), z.end(), 0.) << std::endl;
std::cout << "Task took " << tim.get_timing() << " seconds." << std::endl;
}
// Example codes for HPC course
// (c) 2012-2013 Andreas Hehn, ETH Zurich
#ifndef HPCSE_TIMER_HPP
#define HPCSE_TIMER_HPP
#include <chrono>
namespace hpcse
{
template <typename Clock = std::chrono::high_resolution_clock>
class timer {
public:
inline void start() {
start_time = Clock::now();
}
inline void stop() {
end_time = Clock::now();
}
double get_timing() const {
return std::chrono::nanoseconds(end_time - start_time).count()*1e-9;
}
private:
std::chrono::time_point<Clock> start_time, end_time;
};
}
#endif // HPCSE_TIMER_HPP
/* File: LennardJones.h */
/* Copyright 2015 ETH Zurich. All Rights Reserved. */
#ifndef LENNARD_JONES_H_AHQBPV5S
#define LENNARD_JONES_H_AHQBPV5S
#include <cassert>
#include <numeric>
#include <utility>
#include "types.h"
/* TODO: Step 0: Include appropriate header to access AVX intrinsics */
#include <x86intrin.h>
class LennardJones
{
public:
LennardJones(const value_type rm, const value_type epsilon)
: rm_sq_(rm*rm), eps_(epsilon)
{ }
// sequential
inline value_type diff(configuration_type const& c, std::pair<value_type,value_type> const& newpos) const
{
assert( c.first.size() == c.second.size() );
const std::size_t N = c.first.size()-1;
value_type dE = 0.0;
for(std::size_t i=0; i < N; ++i){
const value_type dx = c.first[N] - c.first[i];
const value_type new_dx = newpos.first - c.first[i];
const value_type dy = c.second[N] - c.second[i];
const value_type new_dy = newpos.second - c.second[i];
const value_type rm_d_sq = rm_sq_ / (dx*dx + dy*dy);
const value_type new_rm_d_sq = rm_sq_ / (new_dx*new_dx + new_dy*new_dy);
const value_type rm_d_6th = rm_d_sq*rm_d_sq*rm_d_sq;
const value_type new_rm_d_6th = new_rm_d_sq*new_rm_d_sq*new_rm_d_sq;
dE += eps_*((new_rm_d_6th*new_rm_d_6th - 2*new_rm_d_6th) - (rm_d_6th*rm_d_6th - 2*rm_d_6th));
}
return dE;
}
#if defined(_AVX256_)
inline value_type diff_avx(configuration_type const& c, std::pair<value_type,value_type> const& newpos) const
{
assert( c.first.size() == c.second.size() );
const std::size_t N = c.first.size()-1;
/* TODO: Step 1: */
const __m256 mmrm_sq = _mm256_set1_ps(rm_sq_);
const __m256 mmeps = _mm256_set1_ps(eps_);
const __m256 mmxp = _mm256_set1_ps(c.first[N]);
const __m256 mmyp = _mm256_set1_ps(c.second[N]);
const __m256 mmxnew = _mm256_set1_ps(newpos.first);
const __m256 mmynew = _mm256_set1_ps(newpos.second);
const __m256 mm2 = _mm256_set1_ps(-2.0f);
/* TODO: Step 2: */
__m256 mmdE = _mm256_setzero_ps();
/* TODO: Step 3: */
const unsigned r_width = 8;
for(std::size_t i=0; i < N/r_width; ++i){
const __m256 mmxi = _mm256_load_ps(&c.first[i*r_width]);
const __m256 mmyi = _mm256_load_ps(&c.second[i*r_width]);
const __m256 dx = _mm256_sub_ps(mmxp,mmxi);
const __m256 new_dx = _mm256_sub_ps(mmxnew,mmxi);
const __m256 dy = _mm256_sub_ps(mmyp,mmyi);
const __m256 new_dy = _mm256_sub_ps(mmynew,mmyi);
#ifdef _ACCURATE_DIV_
const __m256 rm_d_sq =
_mm256_div_ps(mmrm_sq,_mm256_add_ps(_mm256_mul_ps(dx,dx),_mm256_mul_ps(dy,dy)));
const __m256 new_rm_d_sq =
_mm256_div_ps(mmrm_sq,_mm256_add_ps(_mm256_mul_ps(new_dx,new_dx),_mm256_mul_ps(new_dy,new_dy)));
#else
const __m256 rm_d_sq =
_mm256_mul_ps(mmrm_sq,_mm256_rcp_ps(_mm256_add_ps(_mm256_mul_ps(<