Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
E
exercise
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Incidents
Environments
Packages & Registries
Packages & Registries
Container Registry
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ahuegli
exercise
Commits
b5f5b370
Commit
b5f5b370
authored
Jun 10, 2020
by
ahuegli
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'master' of gitlab.ethz.ch:hpcse20/exercise
parents
9e899ac3
c821ebab
Changes
27
Hide whitespace changes
Inline
Side-by-side
Showing
27 changed files
with
1852 additions
and
0 deletions
+1852
-0
hw6/grade.py
hw6/grade.py
+86
-0
hw6/solution/p1/.gitignore
hw6/solution/p1/.gitignore
+2
-0
hw6/solution/p1/Makefile
hw6/solution/p1/Makefile
+15
-0
hw6/solution/p1/reduction_argmax.cu
hw6/solution/p1/reduction_argmax.cu
+138
-0
hw6/solution/p1/reduction_argmax.h
hw6/solution/p1/reduction_argmax.h
+126
-0
hw6/solution/p1/reduction_sum.cu
hw6/solution/p1/reduction_sum.cu
+122
-0
hw6/solution/p1/reduction_sum.h
hw6/solution/p1/reduction_sum.h
+135
-0
hw6/solution/p1/utils.h
hw6/solution/p1/utils.h
+45
-0
hw6/solution/p2/.gitignore
hw6/solution/p2/.gitignore
+7
-0
hw6/solution/p2/CMakeLists.txt
hw6/solution/p2/CMakeLists.txt
+26
-0
hw6/solution/p2/README.md
hw6/solution/p2/README.md
+27
-0
hw6/solution/p2/python/plot_output.py
hw6/solution/p2/python/plot_output.py
+26
-0
hw6/solution/p2/src/argument_parser.h
hw6/solution/p2/src/argument_parser.h
+36
-0
hw6/solution/p2/src/kernels.cu
hw6/solution/p2/src/kernels.cu
+226
-0
hw6/solution/p2/src/kernels.h
hw6/solution/p2/src/kernels.h
+19
-0
hw6/solution/p2/src/main.cpp
hw6/solution/p2/src/main.cpp
+35
-0
hw6/solution/p2/src/ssa.cu
hw6/solution/p2/src/ssa.cu
+126
-0
hw6/solution/p2/src/ssa.h
hw6/solution/p2/src/ssa.h
+38
-0
hw6/solution/p2/src/test.cu
hw6/solution/p2/src/test.cu
+56
-0
hw6/solution/p2/src/test.h
hw6/solution/p2/src/test.h
+3
-0
hw6/solution/p2/src/utils.cpp
hw6/solution/p2/src/utils.cpp
+54
-0
hw6/solution/p2/src/utils.h
hw6/solution/p2/src/utils.h
+38
-0
hw6/solution/p3/.gitignore
hw6/solution/p3/.gitignore
+3
-0
hw6/solution/p3/Makefile
hw6/solution/p3/Makefile
+12
-0
hw6/solution/p3/overlap.cu
hw6/solution/p3/overlap.cu
+195
-0
hw6/solution/p3/overlap_profile.cu
hw6/solution/p3/overlap_profile.cu
+202
-0
hw6/solution/p3/utils.h
hw6/solution/p3/utils.h
+54
-0
No files found.
hw6/grade.py
0 → 100644
View file @
b5f5b370
#!/usr/bin/env python
# File : grade.py
# Description: Generate grading submission file
# Copyright 2020 ETH Zurich. All Rights Reserved.
'''
Example:
python grade.py -q1 7 -c1 "This is why I scored 7 points" -q2 20 -q3 15 -c3 "Second comment here" -c3 "More comments"
'''
#Modify this, according to a given homework
exercise_conf
=
{
'Name'
:
'Homework 6'
,
'Questions'
:
{
'Question 1'
:
{
'Total Points'
:
60
},
'Question 2'
:
{
'Total Points'
:
40
},
'Question 3'
:
{
'Total Points'
:
40
},
}
}
'''
==========================================
Do not modify anything below this comment
==========================================
'''
import
argparse
import
datetime
import
sys
def parse_args():
    """Build and evaluate the command line parser.

    For every question k declared in ``exercise_conf`` two options are
    registered:
      -qk / --questionk  int, default 0 — points scored for question k
      -ck / --commentk   str, repeatable — free-form comments for question k

    Returns the parsed arguments as a plain dict.
    """
    parser = argparse.ArgumentParser()
    num_questions = len(exercise_conf['Questions'])
    for i in range(1, num_questions + 1, 1):
        parser.add_argument(
            '-q{:d}'.format(i),
            '--question{:d}'.format(i),
            type=int,
            default=0,
            help='Scored points for Question {:d}'.format(i))
        # action='append' + nargs='*' yields a list of lists of strings,
        # flattened later by the caller.
        parser.add_argument(
            '-c{:d}'.format(i),
            '--comment{:d}'.format(i),
            type=str,
            action='append',
            nargs='*',
            help='Comments for Question {:d} (you can add multiple comments)'.format(i))
    return vars(parser.parse_args())
if __name__ == "__main__":
    args = parse_args()
    # Map a raw score s out of m onto the Swiss 2.0..6.0 grading scale.
    grade = lambda s, m: 2.0 + (6.0 - 2.0) * float(s) / m

    summary = {}
    score = 0
    maxpoints = 0

    header = '{name:s}: {date:s}\n'.format(
        name=exercise_conf['Name'], date=str(datetime.datetime.now()))
    # `width` tracks the longest line written so far; used for the '*' rulers.
    width = len(header.rstrip())
    summary[0] = [header]

    for i in range(1, len(exercise_conf['Questions']) + 1, 1):
        content = []
        qscore = args['question{:d}'.format(i)]
        qmax = exercise_conf['Questions']['Question {:d}'.format(i)]['Total Points']
        # Clamp the reported score into [0, qmax].
        qscore = max(0, min(qscore, qmax))
        content.append('Question {id:d}: {score:d}/{max:d}\n'.format(
            id=i, score=qscore, max=qmax))

        comments = args['comment{:d}'.format(i)]
        if comments is not None:
            # argparse produced a list of lists; flatten before numbering.
            flat = [s for x in comments for s in x]
            for j, comment in enumerate(flat):
                content.append(' -Comment {id:d}: {issue:s}\n'.format(
                    id=j + 1, issue=comment.strip()))

        for line in content:
            width = max(width, len(line.rstrip()))

        score += qscore
        maxpoints += qmax
        summary[i] = content

    assert maxpoints > 0

    with open('grade.txt', 'w') as out:
        out.write(width * '*' + '\n')
        for lines in summary.values():
            for line in lines:
                out.write(line)
        out.write(width * '*' + '\n')
        out.write('Grade: {:.2f}'.format(grade(score, maxpoints)))
hw6/solution/p1/.gitignore
0 → 100644
View file @
b5f5b370
reduction_sum
reduction_argmax
hw6/solution/p1/Makefile
0 → 100644
View file @
b5f5b370
CUFLAGS
=
-O3
-std
=
c++14
--compiler-options
"-Wall -Wextra -fopenmp"
.PHONY
:
all clean
all
:
reduction_sum reduction_argmax
@
true
clean
:
rm
-f
reduction_sum reduction_argmax
reduction_sum
:
reduction_sum.cu utils.h reduction_sum.h
nvcc
$(CUFLAGS)
$<
-o
$@
reduction_argmax
:
reduction_argmax.cu utils.h reduction_argmax.h
nvcc
$(CUFLAGS)
$<
-o
$@
hw6/solution/p1/reduction_argmax.cu
0 → 100644
View file @
b5f5b370
#include "utils.h"

#include <cassert>
#include <limits>

/// Value/index pair produced by the argmax reductions.
struct Pair {
    double max;  // maximum value found in the reduced range
    int idx;     // index of that maximum (warp-, block-, or global-level
                 // depending on which reduction stage produced it)
};
/*
__device__ Pair shfl_xor_sync(Pair value, unsigned delta) {
return Pair{
__shfl_xor_sync(0xFFFFFFFF, value.max, delta),
__shfl_xor_sync(0xFFFFFFFF, value.idx, delta),
};
}
__device__ Pair argMaxOp(Pair a, Pair b) {
return a.max > b.max ? a : b;
}
__device__ Pair argMaxWarp(double a) {
Pair t{a, (int)threadIdx.x & 31};
t = argMaxOp(t, shfl_xor_sync(t, 1));
t = argMaxOp(t, shfl_xor_sync(t, 2));
t = argMaxOp(t, shfl_xor_sync(t, 4));
t = argMaxOp(t, shfl_xor_sync(t, 8));
t = argMaxOp(t, shfl_xor_sync(t, 16));
return t;
}
*/
/// Find the maximum value `a` among all warps and return {max value, index of
/// the max}. The result must be correct on at least the 0th thread of each warp.
__device__ Pair argMaxWarp(double a) {
    // Butterfly (xor) reduction: after the loop every lane holds the warp max.
    double best = a;
    for (int delta = 1; delta < 32; delta <<= 1)
        best = max(best, __shfl_xor_sync(0xFFFFFFFF, best, delta));

    // Lanes whose original value equals the max vote; the lowest such lane
    // (first set bit of the ballot) is taken as the argmax index.
    unsigned voters = __ballot_sync(0xFFFFFFFF, a == best);
    int lane = __ffs(voters) - 1;
    return {best, lane};
}
/// Second stage of argMaxBlock.
/// Returns {max value of a.value, thread index 0..1023 with the max value}.
__device__ Pair argMaxWarp(Pair a) {
    // Reduce the values first; `winner.idx` is then the *lane* holding the
    // maximum, from which we fetch that lane's stored block-level index.
    Pair winner = argMaxWarp(a.max);
    int origIdx = __shfl_sync(0xFFFFFFFF, a.idx, winner.idx);
    return {winner.max, origIdx};
}
/// Returns the argmax of all values `a` within a block,
/// with the correct answer returned at least by the 0th thread of a block.
/// The returned index is a block-level thread index (0..blockDim.x-1).
__device__ Pair argMaxBlock(double a) {
    // One slot per warp; a block has at most 1024 / 32 == 32 warps.
    __shared__ Pair partials[32];
    int warpIdx = threadIdx.x / warpSize;

    Pair partial = argMaxWarp(a);
    if (threadIdx.x % warpSize == 0) {
        partials[warpIdx].max = partial.max;
        // partial.idx is a lane index; threadIdx.x (a multiple of warpSize
        // here) is the warp base, so the sum is the block-level index.
        partials[warpIdx].idx = partial.idx + threadIdx.x;
    }
    __syncthreads();

    if (warpIdx == 0) {
        // Fix: only ceil(blockDim.x / warpSize) slots were written above.
        // The original read partials[threadIdx.x] unconditionally, i.e.
        // uninitialized shared memory whenever blockDim.x < 1024.
        // Pad the unused lanes with -inf so they can never win.
        int numWarps = (blockDim.x + warpSize - 1) / warpSize;
        Pair v = (int)threadIdx.x < numWarps
                     ? partials[threadIdx.x]
                     : Pair{-std::numeric_limits<double>::infinity(), -1};
        return argMaxWarp(v);
    }
    // Threads outside warp 0 return a dummy value (callers only use thread 0).
    return Pair{0, 0};
}
/// Returns the argmax of all values `a` within a block,
/// with the correct answer returned only by the 0th thread of a block.
/// `a.idx` is carried through unchanged, so it may be any caller-level index.
__device__ Pair argMaxBlock(Pair a) {
    // One slot per warp; a block has at most 1024 / 32 == 32 warps.
    __shared__ Pair partials[32];
    int warpIdx = threadIdx.x / warpSize;

    Pair partial = argMaxWarp(a);
    if (threadIdx.x % warpSize == 0)
        partials[warpIdx] = partial;
    __syncthreads();

    if (warpIdx == 0) {
        // Fix: only ceil(blockDim.x / warpSize) slots were written above.
        // The original read partials[threadIdx.x] unconditionally, i.e.
        // uninitialized shared memory whenever blockDim.x < 1024.
        // Pad the unused lanes with -inf so they can never win.
        int numWarps = (blockDim.x + warpSize - 1) / warpSize;
        Pair v = (int)threadIdx.x < numWarps
                     ? partials[threadIdx.x]
                     : Pair{-std::numeric_limits<double>::infinity(), -1};
        return argMaxWarp(v);
    }
    // Threads outside warp 0 return a dummy value (callers only use thread 0).
    return Pair{-1, -1};
}
/// First stage: one Pair per block, index converted to a global index.
/// Out-of-range threads contribute 0.0 as padding.
__global__ void argMax1MKernel1(const double *a, Pair *tmp, int N) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    const double value = (gid < N) ? a[gid] : 0.0;

    const Pair best = argMaxBlock(value);
    if (threadIdx.x == 0) {
        // Shift the block-level index by the block's base offset to make it
        // a global index into `a`.
        tmp[blockIdx.x].max = best.max;
        tmp[blockIdx.x].idx = best.idx + blockIdx.x * blockDim.x;
    }
}
/// Second stage: reduce the per-block partials (at most 1024 of them) with a
/// single block and store the final answer to *b.
__global__ void argMax1MKernel2(const Pair *tmp, Pair *b, int numBlocks) {
    int idx = threadIdx.x;
    // Fix (resolves the in-source TODO): pad unused lanes with true -infinity
    // instead of -1e100, so inputs more negative than -1e100 cannot
    // out-compete the padding.
    Pair partial = idx < numBlocks
                       ? tmp[idx]
                       : Pair{-std::numeric_limits<double>::infinity(), -1};
    Pair result = argMaxBlock(partial);
    if (threadIdx.x == 0)
        *b = result;
}
/// Compute the argmax of aDev[0]..aDev[N-1] for N <= 1024^2 and store the
/// {max, index} result to bDev[0]. Launches a two-stage block reduction.
void argMax1M(const double *aDev, Pair *bDev, int N) {
    assert(N <= 1024 * 1024);

    int blocks = (N + 1023) / 1024;
    Pair *tmpDev;
    // Fix: the original allocated blocks * sizeof(double), but tmpDev holds
    // `Pair` elements (sizeof(Pair) > sizeof(double)), so kernel 1 wrote past
    // the end of the allocation.
    CUDA_CHECK(cudaMalloc(&tmpDev, blocks * sizeof(Pair)));

    argMax1MKernel1<<<blocks, 1024>>>(aDev, tmpDev, N);
    argMax1MKernel2<<<1, 1024>>>(tmpDev, bDev, blocks);

    CUDA_CHECK(cudaFree(tmpDev));
}
#include "reduction_argmax.h"

/// Exercise each argmax reduction stage on a range of sizes, including sizes
/// that do not divide the warp/block size evenly.
int main() {
    for (int n : {3, 32, 320, 1023123})
        testSmallArgMax(argMaxWarpTestKernel, argMaxWarpCheck, 32, n);
    printf("argMaxWarp OK.\n");

    for (int n : {32, 1024, 12341, 1012311})
        testSmallArgMax(argMaxBlockTestKernel, argMaxBlockCheck, 1024, n);
    printf("argMaxBlock OK.\n");

    for (int n : {32, 1024, 12341, 1012311})
        testLargeArgMax("argMax1M", argMax1M, n);
    printf("argMax1M OK.\n");
}
hw6/solution/p1/reduction_argmax.h
0 → 100644
View file @
b5f5b370
// Fix: header guard was missing (reduction_sum.h has one); without it a
// double include would redefine kWarpSize and every function below.
#pragma once

#include <algorithm>
#include <cstdio>
#include <numeric>
#include <random>

// Threads per warp; mirrors the device-side `warpSize` for host-side checks.
constexpr int kWarpSize = 32;
// Kernel for testing `argMaxWarp`. Do not edit.
// Pads out-of-range threads with 0.0; warp leader k writes result k.
__global__ void argMaxWarpTestKernel(const double *a, Pair *b, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // All threads of a warp must call `argMaxWarp`!
    Pair argMax = argMaxWarp(idx < N ? a[idx] : 0.0);
    // Only lane 0 of each warp stores, and only for warps that own data.
    if (threadIdx.x % warpSize == 0 && idx / warpSize < (N + warpSize - 1) / warpSize)
        b[idx / warpSize] = argMax;
}

/// Check results of `argMaxWarp`. Do not edit.
/// N = number of inputs, K = number of warp results in bHost.
/// Compares each warp result against a CPU argmax over the same 32-wide slice.
void argMaxWarpCheck(int N, int K, const double *aHost, const Pair *bHost) {
    for (int k = 0; k < K; ++k) {
        // CPU reference: argmax over slice [k*32, min((k+1)*32, N)).
        int expectedIdx = std::max_element(aHost + k * kWarpSize,
                                           aHost + std::min((k + 1) * kWarpSize, N))
                          - (aHost + k * kWarpSize);
        Pair expected{aHost[k * kWarpSize + expectedIdx], expectedIdx};
        Pair received = bHost[k];
        // Exact comparison is valid here: values are copied, not recomputed.
        if (expected.idx != received.idx || expected.max != received.max) {
            printf("argMaxWarp incorrect result: k=%d expected=%d %f received=%d %f\n",
                   k, expected.idx, expected.max, received.idx, received.max);
            exit(1);
        }
    }
}
// Do not edit.
// Kernel for testing `argMaxBlock`: pads out-of-range threads with 0.0;
// thread 0 of each block writes that block's result.
__global__ void argMaxBlockTestKernel(const double *a, Pair *b, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    Pair out = argMaxBlock(idx < N ? a[idx] : 0.0);
    if (threadIdx.x == 0)
        b[blockIdx.x] = out;
}

// Do not edit.
// Check results of `argMaxBlock`: N = number of inputs, K = number of
// 1024-wide block results in bHost; each is compared to a CPU argmax.
void argMaxBlockCheck(int N, int K, const double *aHost, const Pair *bHost) {
    for (int k = 0; k < K; ++k) {
        // CPU reference: argmax over slice [k*1024, min((k+1)*1024, N)).
        int expectedIdx = std::max_element(aHost + k * 1024,
                                           aHost + std::min((k + 1) * 1024, N))
                          - (aHost + k * 1024);
        Pair expected{aHost[k * 1024 + expectedIdx], expectedIdx};
        Pair received = bHost[k];
        // Exact comparison is valid here: values are copied, not recomputed.
        if (expected.idx != received.idx || expected.max != received.max) {
            printf("argMaxBlock incorrect result: k=%d expected=%d %f received=%d %f\n",
                   k, expected.idx, expected.max, received.idx, received.max);
            exit(1);
        }
    }
}
/// Test small argmax reductions (warp-level and block-level).
/// `div` is the reduction width (32 for warp, 1024 for block); one result is
/// produced per `div`-wide slice of the N shuffled inputs.
template <typename Kernel, typename CheckFunc>
void testSmallArgMax(Kernel kernel, CheckFunc checkFunc, int div, int N) {
    int K = (N + div - 1) / div;  // number of expected results
    double *aHost;
    Pair *bHost;
    double *aDev;
    Pair *bDev;
    // Pinned host buffers (cudaMallocHost) + plain device buffers.
    CUDA_CHECK(cudaMallocHost(&aHost, N * sizeof(double)));
    CUDA_CHECK(cudaMallocHost(&bHost, K * sizeof(Pair)));
    CUDA_CHECK(cudaMalloc(&aDev, N * sizeof(double)));
    CUDA_CHECK(cudaMalloc(&bDev, K * sizeof(Pair)));
    // Distinct values 0, 10, 20, ... so the argmax is unique, then shuffled
    // (default-seeded mt19937 -> deterministic across runs).
    for (int i = 0; i < N; ++i) {
        // aHost[i] = (long long)i * i % 12345;
        aHost[i] = 10 * i;
    }
    std::mt19937 gen;
    std::shuffle(aHost, aHost + N, gen);
    CUDA_CHECK(cudaMemcpy(aDev, aHost, N * sizeof(double), cudaMemcpyHostToDevice));
    const int threads = 1024;
    const int blocks = (N + threads - 1) / threads;
    kernel<<<blocks, threads>>>(aDev, bDev, N);
    // Blocking memcpy also synchronizes with the kernel launch above.
    CUDA_CHECK(cudaMemcpy(bHost, bDev, K * sizeof(Pair), cudaMemcpyDeviceToHost));
    checkFunc(N, K, aHost, bHost);
    CUDA_CHECK(cudaFree(bDev));
    CUDA_CHECK(cudaFree(aDev));
    CUDA_CHECK(cudaFreeHost(bHost));
    CUDA_CHECK(cudaFreeHost(aHost));
}
/// Test large reductions (up to 1024^3 and larger).
/// `func(aDev, bDev, N)` must write the single {max, index} result to bDev[0];
/// it is compared against a CPU argmax over the shuffled input.
template <typename Func>
void testLargeArgMax(const char *name, Func func, int N) {
    double *aHost;
    double *aDev;
    Pair *bDev;
    CUDA_CHECK(cudaMallocHost(&aHost, N * sizeof(double)));
    CUDA_CHECK(cudaMalloc(&aDev, N * sizeof(double)));
    CUDA_CHECK(cudaMalloc(&bDev, 1 * sizeof(Pair)));
    // Distinct values 0, 10, 20, ... so the argmax is unique, then shuffled
    // (default-seeded mt19937 -> deterministic across runs).
    for (int i = 0; i < N; ++i) {
        // aHost[i] = (N + 13241LL * i * i) % 432141;
        aHost[i] = 10 * i;
    }
    std::mt19937 gen;
    std::shuffle(aHost, aHost + N, gen);
    CUDA_CHECK(cudaMemcpy(aDev, aHost, N * sizeof(double), cudaMemcpyHostToDevice));
    func(aDev, bDev, N);
    // CPU reference argmax over the full array.
    int expectedIdx = std::max_element(aHost, aHost + N) - aHost;
    Pair expected{aHost[expectedIdx], expectedIdx};
    Pair received;
    // Blocking memcpy also synchronizes with the kernels launched by `func`.
    CUDA_CHECK(cudaMemcpy(&received, bDev, 1 * sizeof(Pair), cudaMemcpyDeviceToHost));
    if (expected.idx != received.idx || expected.max != received.max) {
        printf("large %s incorrect result: N=%d expected=%d %f received=%d %f\n",
               name, N, expected.idx, expected.max, received.idx, received.max);
        exit(1);
    }
    CUDA_CHECK(cudaFree(bDev));
    CUDA_CHECK(cudaFree(aDev));
    CUDA_CHECK(cudaFreeHost(aHost));
}
hw6/solution/p1/reduction_sum.cu
0 → 100644
View file @
b5f5b370
#include "utils.h"
#include <cassert>
#include <algorithm>
/// Returns the sum of all values `a` within a warp,
/// with the correct answer returned only by the 0th thread of a warp.
__device__ double sumWarp(double a) {
    // Tree reduction via shfl-down: after the loop lane 0 holds the full sum.
    for (int offset = 1; offset < 32; offset <<= 1)
        a += __shfl_down_sync(0xFFFFFFFF, a, offset);
    return a;
}
/// Returns the sum of all values `a` within a warp,
/// with the correct answer returned by all threads of a warp.
__device__ double sumWarpAll(double a) {
    // Butterfly (xor) reduction: every lane ends up with the full warp sum.
    for (int offset = 1; offset < 32; offset <<= 1)
        a += __shfl_xor_sync(0xFFFFFFFF, a, offset);
    return a;
}
/// Returns the sum of all values `a` within a block,
/// with the correct answer returned only by the 0th thread of a block.
__device__ double sumBlock(double a) {
    // One slot per warp; a block has at most 1024 / 32 == 32 warps.
    __shared__ double warpSums[32];
    int warpIdx = threadIdx.x / warpSize;

    double warpSum = sumWarp(a);
    if (threadIdx.x % warpSize == 0)
        warpSums[warpIdx] = warpSum;
    __syncthreads();

    double blockSum = 0;
    if (warpIdx == 0) {
        // Fix: only ceil(blockDim.x / warpSize) slots were written above.
        // The original read warpSums[threadIdx.x] unconditionally, i.e.
        // uninitialized shared memory whenever blockDim.x < 1024.
        // Unused lanes contribute a neutral 0.0 instead.
        int numWarps = (blockDim.x + warpSize - 1) / warpSize;
        double v = (int)threadIdx.x < numWarps ? warpSums[threadIdx.x] : 0.0;
        blockSum = sumWarp(v);
    }
    // Threads outside warp 0 return 0 (callers only use thread 0).
    return blockSum;
}
/// One partial sum per block; out-of-range threads contribute 0.0.
__global__ void sum1MKernel(const double *a, double *b, int N) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    const double contribution = (gid < N) ? a[gid] : 0.0;

    const double total = sumBlock(contribution);
    if (threadIdx.x == 0)
        b[blockIdx.x] = total;
}
/// Compute the sum of all values aDev[0]..aDev[N-1] for N <= 1024^2 and store the result to bDev[0].
void sum1M(const double *aDev, double *bDev, int N) {
    assert(N <= 1024 * 1024);

    // Stage 1: one partial sum per 1024-thread block.
    // Stage 2: a single block folds the (<= 1024) partials into bDev[0].
    const int numBlocks = (N + 1023) / 1024;
    double *partialsDev;
    CUDA_CHECK(cudaMalloc(&partialsDev, numBlocks * sizeof(double)));

    sum1MKernel<<<numBlocks, 1024>>>(aDev, partialsDev, N);
    sum1MKernel<<<1, 1024>>>(partialsDev, bDev, numBlocks);

    CUDA_CHECK(cudaFree(partialsDev));
}
/// One partial sum per block; a grid-stride loop lets a fixed-size grid
/// cover arbitrarily large N.
__global__ void sumVeryLargeKernel(const double *a, double *b, int N) {
    const int stride = blockDim.x * gridDim.x;
    double acc = 0.0;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += stride)
        acc += a[i];

    const double total = sumBlock(acc);
    if (threadIdx.x == 0)
        b[blockIdx.x] = total;
}
// Implementation of (1e) (note: it was not required to implement it).
// Two-stage reduction for arbitrary N: stage 1 produces up to 1024 per-block
// partials, stage 2 folds them into bDev[0] with a single block.
void sumVeryLarge(const double *aDev, double *bDev, int N) {
    int blocks = std::min(1024, (N + 1023) / 1024);
    double *tmpDev;
    CUDA_CHECK(cudaMalloc(&tmpDev, blocks * sizeof(double)));

    sumVeryLargeKernel<<<blocks, 1024>>>(aDev, tmpDev, N);
    // Fix: the original launched the second stage on (aDev, bDev, N), which
    // re-summed the entire input with one block and discarded the first
    // pass entirely (tmpDev was written but never read). The second stage
    // must consume the per-block partials, mirroring sum1M.
    sumVeryLargeKernel<<<1, 1024>>>(tmpDev, bDev, blocks);

    CUDA_CHECK(cudaFree(tmpDev));
}
#include "reduction_sum.h"

/// Exercise each sum reduction stage on a range of sizes, including sizes
/// that do not divide the warp/block size evenly.
int main() {
    for (int n : {3, 32, 320, 1023123})
        testSmallSum(sumWarpTestKernel, sumWarpCheck, 32, n);
    printf("sumWarp OK.\n");

    for (int n : {3, 32, 320, 1023123})
        testSmallSum(sumWarpAllTestKernel, sumWarpAllCheck, 1, n);
    printf("sumWarpAll OK.\n");

    for (int n : {32, 1024, 12341, 1012311})
        testSmallSum(sumBlockTestKernel, sumBlockCheck, 1024, n);
    printf("sumBlock OK.\n");

    for (int n : {32, 1024, 12341, 1012311})
        testLargeSum("sum1M", sum1M, n);
    printf("sum1M OK.\n");

    for (int n : {32, 1024, 12341, 1012311, 1001002003})
        testLargeSum("sumVeryLarge", sumVeryLarge, n);
    printf("sumVeryLarge OK.\n");
}
hw6/solution/p1/reduction_sum.h
0 → 100644
View file @
b5f5b370
#pragma once

#include "utils.h"

#include <algorithm>
#include <numeric>

// Threads per warp; mirrors the device-side `warpSize` for host-side checks.
constexpr int kWarpSize = 32;
/// Kernel for testing `sumWarp`. Do not edit.
/// Pads out-of-range threads with 0.0; warp leader k writes result k.
__global__ void sumWarpTestKernel(const double *a, double *b, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // All threads of a warp must call `sumWarp`!
    double sum = sumWarp(idx < N ? a[idx] : 0.0);
    // Only lane 0 of each warp stores, and only for warps that own data.
    if (threadIdx.x % warpSize == 0 && idx / warpSize < (N + warpSize - 1) / warpSize)
        b[idx / warpSize] = sum;
}

/// Check results of `sumWarp`. Do not edit.
/// N = number of inputs, K = number of warp results in bHost.
/// Each result is compared against a CPU sum of the same 32-wide slice.
inline void sumWarpCheck(int N, int K, const double *aHost, const double *bHost) {
    for (int k = 0; k < K; ++k) {
        // CPU reference sum over slice [k*32, min((k+1)*32, N)).
        double expected = std::accumulate(aHost + k * kWarpSize,
                                          aHost + std::min((k + 1) * kWarpSize, N),
                                          0.0);
        double received = bHost[k];
        // Exact equality is intended: inputs are small integers stored in
        // doubles, so both sides sum without rounding.
        if (expected != received) {
            printf("sumWarp incorrect result: k=%d expected=%f received=%f\n",
                   k, expected, received);
            exit(1);
        }
    }
}
/// Kernel for testing `sumWarpAll`. Do not edit.
/// Pads out-of-range threads with 0.0; every in-range thread stores its copy
/// of the warp sum (the all-lanes variant must be correct on every lane).
__global__ void sumWarpAllTestKernel(const double *a, double *b, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // All threads of a warp must call `sumWarpAll`!
    double sum = sumWarpAll(idx < N ? a[idx] : 0.0);
    if (idx < N)
        b[idx] = sum;
}

/// Check result of `sumWarpAll`. Do not edit.
/// Here K == N (one result per thread); every lane of each 32-wide slice must
/// report the same CPU-computed slice sum.
inline void sumWarpAllCheck(int N, int K, const double *aHost, const double *bHost) {
    for (int k = 0; k < K; k += kWarpSize) {
        // CPU reference sum over slice [k, min(k+32, N)).
        double expected = std::accumulate(aHost + k,
                                          aHost + std::min(k + kWarpSize, N),
                                          0.0);
        for (int j = k; j < std::min(k + kWarpSize, N); ++j) {
            double received = bHost[j];
            // Exact equality is intended: inputs are small integers stored in
            // doubles, so both sides sum without rounding.
            if (expected != received) {
                printf("sumWarpAll incorrect result: k=%d j=%d expected=%f received=%f\n",
                       k, j, expected, received);
                exit(1);
            }
        }
    }
}
// Do not edit.
// Kernel for testing `sumBlock`: pads out-of-range threads with 0.0;
// thread 0 of each block writes that block's sum.
__global__ void sumBlockTestKernel(const double *a, double *b, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    double sum = sumBlock(idx < N ? a[idx] : 0.0);
    if (threadIdx.x == 0)
        b[blockIdx.x] = sum;
}

// Do not edit.
// Check results of `sumBlock`: N = number of inputs, K = number of 1024-wide
// block results in bHost; each is compared to a CPU sum of its slice.
inline void sumBlockCheck(int N, int K, const double *aHost, const double *bHost) {
    for (int k = 0; k < K; ++k) {
        // CPU reference sum over slice [k*1024, min((k+1)*1024, N)).
        double expected = std::accumulate(aHost + k * 1024,
                                          aHost + std::min((k + 1) * 1024, N),
                                          0.0);
        double received = bHost[k];
        // Exact equality is intended: inputs are small integers stored in
        // doubles, so both sides sum without rounding.
        if (expected != received) {
            printf("sumBlock incorrect result: k=%d expected=%f received=%f\n",
                   k, expected, received);
            exit(1);
        }
    }
}
/// Test small reductions (warp-level and block-level).
/// `div` is the reduction width (32 for warp, 1 for the all-lanes variant,
/// 1024 for block); one result is expected per `div`-wide slice of the input.
template <typename Kernel, typename CheckFunc>
void testSmallSum(Kernel kernel, CheckFunc checkFunc, int div, int N) {
    int K = (N + div - 1) / div;  // number of expected results
    double *aHost;
    double *bHost;
    double *aDev;
    double *bDev;
    // Pinned host buffers (cudaMallocHost) + plain device buffers.
    CUDA_CHECK(cudaMallocHost(&aHost, N * sizeof(double)));
    CUDA_CHECK(cudaMallocHost(&bHost, K * sizeof(double)));
    CUDA_CHECK(cudaMalloc(&aDev, N * sizeof(double)));
    CUDA_CHECK(cudaMalloc(&bDev, K * sizeof(double)));
    // Deterministic input 0..N-1 (small integers -> sums are exact in double).
    for (int i = 0; i < N; ++i)
        aHost[i] = i;
    CUDA_CHECK(cudaMemcpy(aDev, aHost, N * sizeof(double), cudaMemcpyHostToDevice));
    const int threads = 1024;
    const int blocks = (N + threads - 1) / threads;
    kernel<<<blocks, threads>>>(aDev, bDev, N);
    // Blocking memcpy also synchronizes with the kernel launch above.
    CUDA_CHECK(cudaMemcpy(bHost, bDev, K * sizeof(double), cudaMemcpyDeviceToHost));
    checkFunc(N, K, aHost, bHost);
    CUDA_CHECK(cudaFree(bDev));
    CUDA_CHECK(cudaFree(aDev));
    CUDA_CHECK(cudaFreeHost(bHost));
    CUDA_CHECK(cudaFreeHost(aHost));
}
/// Test large reductions (up to 1024^3 and larger).
template
<
typename
Func
>
void
testLargeSum
(
const
char
*
name
,
Func
func
,
int
N
)
{
double
*
aHost
;
double
*
aDev
;
double
*
bDev
;