Commit 757bab77 authored by Donjan Rodic's avatar Donjan Rodic
Browse files

Merge branch 'master' of gitlab.ethz.ch:hpcse_hs16/lecture

parents 5874d1b9 277d0cb5
CXX=g++
CXXFLAGS=-Wall -std=c++11 -O2
all: sum_serial sum_avx sum_auto
clean:
rm -f sum_serial sum_avx sum_auto
sum_serial: sum_serial.cpp
$(CXX) $(CXXFLAGS) -o $@ $<
sum_avx: sum_avx.cpp
# $(CXX) $(CXXFLAGS) -mavx -Wa,-q -o $@ $< # for mac
$(CXX) $(CXXFLAGS) -mavx -o $@ $< # for linux
sum_auto: sum_serial.cpp
$(CXX) $(CXXFLAGS) -mavx -ftree-vectorize -o $@ $<
// Example codes for HPCSE course
// (c) 2012 Andreas Hehn, ETH Zurich
#ifndef HPCSE_ALIGNED_ALLOCATOR_HPP
#define HPCSE_ALIGNED_ALLOCATOR_HPP
#ifdef _WIN32
#include <malloc.h>
#else
#include <cstdlib>
#endif
#if __cplusplus >= 201103L
#define NOEXCEPT_SPEC noexcept
#else
#define NOEXCEPT_SPEC
#endif
namespace hpcse {
// Alignment must be a power of 2 !
template <typename T, unsigned int Alignment>
class aligned_allocator {
public:
typedef T* pointer;
typedef T const* const_pointer;
typedef T& reference;
typedef T const& const_reference;
typedef T value_type;
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;
template <typename U>
struct rebind {
typedef aligned_allocator<U,Alignment> other;
};
aligned_allocator() NOEXCEPT_SPEC {
}
aligned_allocator(aligned_allocator const& a) NOEXCEPT_SPEC {
}
template <typename U>
aligned_allocator(aligned_allocator<U,Alignment> const& b) NOEXCEPT_SPEC {
}
pointer allocate(size_type n) {
pointer p;
#ifdef _WIN32
p = reinterpret_cast<pointer>(_aligned_malloc(n*sizeof(T), Alignment));
if(p == 0)
throw std::bad_alloc();
#else
if(posix_memalign(reinterpret_cast<void**>(&p), Alignment, n * sizeof(T) ))
throw std::bad_alloc();
#endif
return p;
}
void deallocate(pointer p, size_type n) NOEXCEPT_SPEC {
#ifdef _WIN32
_aligned_free(p);
#else
std::free(p);
#endif
}
size_type max_size() const NOEXCEPT_SPEC {
std::allocator<T> a;
return a.max_size();
}
#if __cplusplus >= 201103L
template <typename C, class... Args>
void construct(C* c, Args&&... args) {
new ((void*)c) C(std::forward<Args>(args)...);
}
#else
void construct(pointer p, const_reference t) {
new((void *)p) T(t);
}
#endif
template <typename C>
void destroy(C* c) {
c->~C();
}
bool operator == (aligned_allocator const& a2) const NOEXCEPT_SPEC {
return true;
}
bool operator != (aligned_allocator const& a2) const NOEXCEPT_SPEC {
return false;
}
template <typename U, unsigned int UAlignment>
bool operator == (aligned_allocator<U,UAlignment> const& b) const NOEXCEPT_SPEC {
return false;
}
template <typename U, unsigned int UAlignment>
bool operator != (aligned_allocator<U,UAlignment> const& b) const NOEXCEPT_SPEC {
return true;
}
};
}
#undef NOEXPECT_SPEC
#endif //HPCSE_ALIGNED_ALLOCATOR_HPP
#include <vector>
#include <numeric>
#include <iostream>
#include <x86intrin.h>
#include "timer.hpp"
#include "aligned_allocator.hpp"
int main( int argc, char** argv )
{
// repetitions
const int nrep = 10000;
// vector size
const int n = 1<<18;
// initialize 64 byte aligned vectors
std::vector< float, hpcse::aligned_allocator<float,64> > x(n,-1.2), y(n,3.4), z(n);
hpcse::timer<> tim;
tim.start();
for (int k = 0; k < nrep; ++k)
{
const __m256 f2 = _mm256_set1_ps( 2.f );
for( int i = 0; i < n; i += 8 )
{
// z[i] = x[i]*x[i] + y[i]*y[i] + 2.*x[i]*y[i];
const __m256 xx = _mm256_load_ps( &x[i] );
const __m256 yy = _mm256_load_ps( &y[i] );
const __m256 z1 = _mm256_mul_ps( xx, xx );
const __m256 z2 = _mm256_mul_ps( yy, yy );
const __m256 z3 = _mm256_mul_ps( xx, yy );
const __m256 z4 = _mm256_mul_ps( f2, z3 );
const __m256 zz = _mm256_add_ps( z1, z2 );
const __m256 res = _mm256_add_ps( zz, z4 );
// Store back the result.
// Since the result won't be used any time soon it is useless to
// store it in the caches. It would only pollute the caches. Hence
// we could store it back to the RAM bypassing the cache-hierarchy by
// using _mm256_stream_ps instead of the regular _mm256_store_ps.
// However, _mm256_store_ps is faster for such small sizes.
_mm256_store_ps( &z[i], res );
}
}
tim.stop();
// print result checksum
std::cout << std::accumulate(z.begin(), z.end(), 0.) << std::endl;
std::cout << "Task took " << tim.get_timing() << " seconds." << std::endl;
}
#include <vector>
#include <numeric>
#include <iostream>
#include "timer.hpp"
#include "aligned_allocator.hpp"
int main( int argc, char** argv )
{
// repetitions
const int nrep = 10000;
// vector size
const int n = 1<<18;
// initialize 64 byte aligned vectors
std::vector< float, hpcse::aligned_allocator<float,64> > x(n,-1.2), y(n,3.4), z(n);
hpcse::timer<> tim;
tim.start();
for (int k = 0; k < nrep; ++k)
{
for( int i = 0; i < n; ++i )
{
z[i] = x[i]*x[i] + y[i]*y[i] + 2.*x[i]*y[i];
}
}
tim.stop();
// print result checksum
std::cout << std::accumulate(z.begin(), z.end(), 0.) << std::endl;
std::cout << "Task took " << tim.get_timing() << " seconds." << std::endl;
}
// Example codes for HPC course
// (c) 2012-2013 Andreas Hehn, ETH Zurich
#ifndef HPCSE_TIMER_HPP
#define HPCSE_TIMER_HPP
#include <chrono>
namespace hpcse
{
template <typename Clock = std::chrono::high_resolution_clock>
class timer {
public:
inline void start() {
start_time = Clock::now();
}
inline void stop() {
end_time = Clock::now();
}
double get_timing() const {
return std::chrono::nanoseconds(end_time - start_time).count()*1e-9;
}
private:
std::chrono::time_point<Clock> start_time, end_time;
};
}
#endif // HPCSE_TIMER_HPP
/* File: LennardJones.h */
/* Copyright 2015 ETH Zurich. All Rights Reserved. */
#ifndef LENNARD_JONES_H_AHQBPV5S
#define LENNARD_JONES_H_AHQBPV5S
#include <cassert>
#include <numeric>
#include <utility>
#include "types.h"
/* TODO: Step 0: Include appropriate header to access AVX intrinsics */
#include <x86intrin.h>
class LennardJones
{
public:
LennardJones(const value_type rm, const value_type epsilon)
: rm_sq_(rm*rm), eps_(epsilon)
{ }
// sequential
inline value_type diff(configuration_type const& c, std::pair<value_type,value_type> const& newpos) const
{
assert( c.first.size() == c.second.size() );
const std::size_t N = c.first.size()-1;
value_type dE = 0.0;
for(std::size_t i=0; i < N; ++i){
const value_type dx = c.first[N] - c.first[i];
const value_type new_dx = newpos.first - c.first[i];
const value_type dy = c.second[N] - c.second[i];
const value_type new_dy = newpos.second - c.second[i];
const value_type rm_d_sq = rm_sq_ / (dx*dx + dy*dy);
const value_type new_rm_d_sq = rm_sq_ / (new_dx*new_dx + new_dy*new_dy);
const value_type rm_d_6th = rm_d_sq*rm_d_sq*rm_d_sq;
const value_type new_rm_d_6th = new_rm_d_sq*new_rm_d_sq*new_rm_d_sq;
dE += eps_*((new_rm_d_6th*new_rm_d_6th - 2*new_rm_d_6th) - (rm_d_6th*rm_d_6th - 2*rm_d_6th));
}
return dE;
}
#if defined(_AVX256_)
inline value_type diff_avx(configuration_type const& c, std::pair<value_type,value_type> const& newpos) const
{
assert( c.first.size() == c.second.size() );
const std::size_t N = c.first.size()-1;
/* TODO: Step 1: */
const __m256 mmrm_sq = _mm256_set1_ps(rm_sq_);
const __m256 mmeps = _mm256_set1_ps(eps_);
const __m256 mmxp = _mm256_set1_ps(c.first[N]);
const __m256 mmyp = _mm256_set1_ps(c.second[N]);
const __m256 mmxnew = _mm256_set1_ps(newpos.first);
const __m256 mmynew = _mm256_set1_ps(newpos.second);
const __m256 mm2 = _mm256_set1_ps(-2.0f);
/* TODO: Step 2: */
__m256 mmdE = _mm256_setzero_ps();
/* TODO: Step 3: */
const unsigned r_width = 8;
for(std::size_t i=0; i < N/r_width; ++i){
const __m256 mmxi = _mm256_load_ps(&c.first[i*r_width]);
const __m256 mmyi = _mm256_load_ps(&c.second[i*r_width]);
const __m256 dx = _mm256_sub_ps(mmxp,mmxi);
const __m256 new_dx = _mm256_sub_ps(mmxnew,mmxi);
const __m256 dy = _mm256_sub_ps(mmyp,mmyi);
const __m256 new_dy = _mm256_sub_ps(mmynew,mmyi);
#ifdef _ACCURATE_DIV_
const __m256 rm_d_sq =
_mm256_div_ps(mmrm_sq,_mm256_add_ps(_mm256_mul_ps(dx,dx),_mm256_mul_ps(dy,dy)));
const __m256 new_rm_d_sq =
_mm256_div_ps(mmrm_sq,_mm256_add_ps(_mm256_mul_ps(new_dx,new_dx),_mm256_mul_ps(new_dy,new_dy)));
#else
const __m256 rm_d_sq =
_mm256_mul_ps(mmrm_sq,_mm256_rcp_ps(_mm256_add_ps(_mm256_mul_ps(dx,dx),_mm256_mul_ps(dy,dy))));
const __m256 new_rm_d_sq =
_mm256_mul_ps(mmrm_sq,_mm256_rcp_ps(_mm256_add_ps(_mm256_mul_ps(new_dx,new_dx),_mm256_mul_ps(new_dy,new_dy))));
#endif /* _ACCURATE_DIV_ */
const __m256 rm_d_6th =
_mm256_mul_ps(_mm256_mul_ps(rm_d_sq,rm_d_sq),rm_d_sq);
const __m256 new_rm_d_6th =
_mm256_mul_ps(_mm256_mul_ps(new_rm_d_sq,new_rm_d_sq),new_rm_d_sq);
mmdE = _mm256_add_ps(mmdE,_mm256_mul_ps(mmeps,_mm256_sub_ps(
_mm256_add_ps(_mm256_mul_ps(new_rm_d_6th,new_rm_d_6th),_mm256_mul_ps(mm2,new_rm_d_6th)),
_mm256_add_ps(_mm256_mul_ps(rm_d_6th,rm_d_6th),_mm256_mul_ps(mm2,rm_d_6th)))));
}
/* TODO: Step 4: */
float sums[r_width];
_mm256_store_ps(sums,mmdE);
float dE = std::accumulate(sums,sums+r_width,0.0);
/* TODO: Step 5: */
for(std::size_t i=(N/r_width)*r_width; i < N; ++i){
const float dx = c.first[N] - c.first[i];
const float new_dx = newpos.first - c.first[i];
const float dy = c.second[N] - c.second[i];
const float new_dy = newpos.second - c.second[i];
const float rm_d_sq = rm_sq_ / (dx*dx + dy*dy);
const float new_rm_d_sq = rm_sq_ / (new_dx*new_dx + new_dy*new_dy);
const float rm_d_6th = rm_d_sq*rm_d_sq*rm_d_sq;
const float new_rm_d_6th = new_rm_d_sq*new_rm_d_sq*new_rm_d_sq;
dE += eps_*((new_rm_d_6th*new_rm_d_6th - 2*new_rm_d_6th) - (rm_d_6th*rm_d_6th - 2*rm_d_6th));
}
return dE;
}
#elif defined(_ANY_)
// adapts to chosen register type in types.h (SSE or AVX)
inline value_type diff_any(configuration_type const& c, std::pair<value_type,value_type> const& newpos) const
{
assert( c.first.size() == c.second.size() );
const std::size_t N = c.first.size()-1;
const register_type mmrm_sq(rm_sq_);
const register_type mmeps(eps_);
const register_type mmxp(c.first[N]);
const register_type mmyp(c.second[N]);
const register_type mmxnew(newpos.first);
const register_type mmynew(newpos.second);
register_type mmdE;
const unsigned r_width = mmdE.get_register_width();
for(std::size_t i=0; i < N/r_width; ++i){
const register_type mmxi(&c.first[i*r_width]);
const register_type mmyi(&c.second[i*r_width]);
const register_type dx = mmxp - mmxi;
const register_type new_dx = mmxnew - mmxi;
const register_type dy = mmyp - mmyi;
const register_type new_dy = mmynew - mmyi;
const register_type rm_d_sq = mmrm_sq / (dx*dx + dy*dy);
const register_type new_rm_d_sq = mmrm_sq / (new_dx*new_dx + new_dy*new_dy);
const register_type rm_d_6th = rm_d_sq*rm_d_sq*rm_d_sq;
const register_type new_rm_d_6th = new_rm_d_sq*new_rm_d_sq*new_rm_d_sq;
mmdE += mmeps*((new_rm_d_6th*new_rm_d_6th - 2*new_rm_d_6th) - (rm_d_6th*rm_d_6th - 2*rm_d_6th));
}
value_type dE = mmdE.sum();
for(std::size_t i=(N/r_width)*r_width; i < N; ++i){
const value_type dx = c.first[N] - c.first[i];
const value_type new_dx = newpos.first - c.first[i];
const value_type dy = c.second[N] - c.second[i];
const value_type new_dy = newpos.second - c.second[i];
const value_type rm_d_sq = rm_sq_ / (dx*dx + dy*dy);
const value_type new_rm_d_sq = rm_sq_ / (new_dx*new_dx + new_dy*new_dy);
const value_type rm_d_6th = rm_d_sq*rm_d_sq*rm_d_sq;
const value_type new_rm_d_6th = new_rm_d_sq*new_rm_d_sq*new_rm_d_sq;
dE += eps_*((new_rm_d_6th*new_rm_d_6th - 2*new_rm_d_6th) - (rm_d_6th*rm_d_6th - 2*rm_d_6th));
}
return dE;
}
#endif
inline value_type operator()(configuration_type const& c) const
{
assert( c.first.size() == c.second.size() );
std::size_t const n = c.first.size();
value_type energy = 0.0;
for(std::size_t i=0; i < n; ++i) {
value_type energy_part = 0.0;
for(std::size_t j=0; j < i; ++j)
{
value_type const dx = c.first[i] - c.first[j];
value_type const dy = c.second[i] - c.second[j];
value_type const rm_d_sq = rm_sq_ / (dx*dx + dy*dy);
value_type const rm_d_6th = rm_d_sq*rm_d_sq*rm_d_sq;
energy_part += eps_*(rm_d_6th*rm_d_6th - 2*rm_d_6th);
}
energy += energy_part;
}
return energy;
}
private:
const value_type rm_sq_;
const value_type eps_;
};
#endif /* LENNARD_JONES_H_AHQBPV5S */
CC = g++
.PHONY = clean
debug ?= 0
avx ?= 0
auto ?= 0
print ?= 0
prec ?= float
accdiv ?= 1
any ?= 0
CXXFLAGS += -std=c++11
ifeq "$(debug)" "0"
CXXFLAGS += -O3 -DNDEBUG
else ifeq "$(debug)" "1"
CXXFLAGS += -O1 -DNDEBUG
else
CXXFLAGS += -O0
endif
ifeq "$(avx)" "1"
CXXFLAGS += -mavx -D_AVX256_
CXXFLAGS += -funsafe-math-optimizations -falign-functions=64
else ifeq "$(any)" "1"
CXXFLAGS += -mavx -D_ANY_ -DREG=0
CXXFLAGS += -funsafe-math-optimizations -falign-functions=64
else ifeq "$(any)" "2"
CXXFLAGS += -mavx -D_ANY_ -DREG=1
CXXFLAGS += -funsafe-math-optimizations -falign-functions=64
else ifeq "$(any)" "3"
CXXFLAGS += -msse4.2 -D_ANY_ -DREG=2
CXXFLAGS += -funsafe-math-optimizations -falign-functions=64
else ifeq "$(any)" "4"
CXXFLAGS += -msse4.2 -D_ANY_ -DREG=3
CXXFLAGS += -funsafe-math-optimizations -falign-functions=64
else
ifeq "$(prec)" "float"
CXXFLAGS += -D_SINGLE_PRECISION_
endif
endif
ifeq "$(accdiv)" "1"
CXXFLAGS += -D_ACCURATE_DIV_
endif
ifeq "$(auto)" "1"
CXXFLAGS += -msse -funsafe-math-optimizations -falign-functions=64 -ftree-vectorize -ftree-vectorizer-verbose=2 -fopt-info-vec
else ifeq "$(auto)" "2"
CXXFLAGS += -mavx -funsafe-math-optimizations -falign-functions=64 -ftree-vectorize -ftree-vectorizer-verbose=2 -fopt-info-vec
endif
ifeq "$(print)" "1"
CXXFLAGS += -DPRINT_CONFIG
endif
all: main
main: main.cpp
$(CC) $(CXXFLAGS) $(extra) main.cpp -o LennardJones
clean:
rm -rf LennardJones *~
// Example codes for HPC course
// (c) 2012/15 Andreas Hehn, ETH Zurich
#ifndef HPC12_ALIGNED_ALLOCATOR_HPP
#define HPC12_ALIGNED_ALLOCATOR_HPP
#ifdef _WIN32
#include <malloc.h>
#else
#include <cstdlib>
#endif
#if __cplusplus >= 201103L
#define NOEXCEPT_SPEC noexcept
#else
#define NOEXCEPT_SPEC
#endif
namespace hpc15 {
// Alignment must be a power of 2 !
template <typename T, unsigned int Alignment>
class aligned_allocator {
public:
typedef T* pointer;
typedef T const* const_pointer;
typedef T& reference;
typedef T const& const_reference;
typedef T value_type;
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;
template <typename U>
struct rebind {
typedef aligned_allocator<U,Alignment> other;
};
aligned_allocator() NOEXCEPT_SPEC {
}
aligned_allocator(aligned_allocator const& a) NOEXCEPT_SPEC {
}
template <typename U>
aligned_allocator(aligned_allocator<U,Alignment> const& b) NOEXCEPT_SPEC {
}
pointer allocate(size_type n) {
pointer p;
#ifdef _WIN32
p = _aligned_malloc(n*sizeof(T), Alignment);
if(p == 0)
throw std::bad_alloc();
#else
if(posix_memalign(reinterpret_cast<void**>(&p), Alignment, n * sizeof(T) ))
throw std::bad_alloc();
#endif
return p;
}
void deallocate(pointer p, size_type n) NOEXCEPT_SPEC {
std::free(p);
}
size_type max_size() const NOEXCEPT_SPEC {