Commit 5072927e authored by Yaman Umuroglu

Merge branch 'feature/support_1d_swg_simple' of https://github.com/fpjentzsch/finn into fpjentzsch-feature/support_1d_swg_simple
parents 918ba008 a975909c
@@ -127,8 +127,12 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h)
ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w)
assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
wf = int((k_h * k_w * ifm_ch) // simd)
folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd)
if self.use_parallel_window_output():
wf = int((ifm_ch) // simd)
folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd)
else:
wf = int((k_h * k_w * ifm_ch) // simd)
folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd)
return folded_oshape
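For intuition, a minimal sketch (illustrative values, not read from the node attributes) of how the folded output shape differs between the two branches above:

# Sketch: folded output shape, parallel-window vs. default variant (illustrative values).
ifm_ch, simd, k_h, k_w = 16, 16, 1, 4
ofm_dim_h, ofm_dim_w = 1, 61

# Parallel variant: SIMD == IFMChannels, so the fold factor is 1 and all
# k_h * k_w window pixels are packed into one wide output word.
wf_parallel = ifm_ch // simd                                               # 1
shape_parallel = (1, ofm_dim_h, ofm_dim_w, wf_parallel, k_h * k_w * simd)  # (1, 1, 61, 1, 64)

# Default variant: window pixels are serialized over the fold dimension.
wf_default = (k_h * k_w * ifm_ch) // simd                                  # 4
shape_default = (1, ofm_dim_h, ofm_dim_w, wf_default, simd)                # (1, 1, 61, 4, 16)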
def make_shape_compatible_op(self, model):
@@ -156,8 +160,6 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
return DataType[self.get_nodeattr("outputDataType")]
def get_instream_width(self):
"""Returns stream width, input and output stream width are equal for
the sliding window function"""
ibits = self.get_input_datatype().bitwidth()
simd = self.get_nodeattr("SIMD")
ifm_ch = self.get_nodeattr("IFMChannels")
@@ -166,10 +168,13 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
return in_width
def get_outstream_width(self):
"""Returns stream width, input and output stream width are equal for
the sliding window function, so the function to determine the input
stream width can be reused."""
return self.get_instream_width()
if self.use_parallel_window_output():
# feed all window pixels in parallel
k_h, k_w = self.get_nodeattr("ConvKernelDim")
return self.get_instream_width() * k_h * k_w
else:
# if parallel variant not in use: same width for output and input stream
return self.get_instream_width()
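A short sketch of the resulting stream widths in bits, assuming an 8-bit input datatype and the same illustrative SIMD/kernel values as above:

# Sketch: input vs. output stream width in bits (illustrative values).
input_bits, simd, k_h, k_w = 8, 16, 1, 4
in_width = input_bits * simd                 # 128-bit input stream
out_width_parallel = in_width * k_h * k_w    # 512 bits: whole window per cycle
out_width_default = in_width                 # 128 bits: same as the input stream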
def get_number_output_values(self):
folded_oshape = self.get_folded_output_shape()
@@ -205,6 +210,22 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
return (ifm_ch, ifm_dim, ofm_dim, k, stride, dilation)
def use_parallel_window_output(self):
# Check if the simple "ConvolutionInputGenerator_1D_parallel" variant can be used to
# feed the window in parallel to the following layer, enabling full SIMD unfolding.
stride = self.get_nodeattr("Stride")
dilation = self.get_nodeattr("Dilation")
stride_h, stride_w = stride
dilation_h, dilation_w = dilation
if self.get_nodeattr("SIMD") == self.get_nodeattr("IFMChannels"):
if self.get_nodeattr("depthwise") == 0:
if stride_h == 1 and stride_w == 1:
if dilation_h == 1 and dilation_w == 1:
return True
return False
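The nested checks above can be read as a single predicate; a hypothetical stand-alone version (attribute values passed in directly) is sketched below:

# Sketch: conditions under which the parallel-window variant is selected.
def can_use_parallel_window_output(simd, ifm_ch, depthwise, stride, dilation):
    stride_h, stride_w = stride
    dilation_h, dilation_w = dilation
    return (
        simd == ifm_ch                  # full SIMD unfolding along the channel dim
        and depthwise == 0              # only the non-depthwise generator supports it
        and stride_h == 1 and stride_w == 1
        and dilation_h == 1 and dilation_w == 1
    )

assert can_use_parallel_window_output(16, 16, 0, (1, 1), (1, 1))
assert not can_use_parallel_window_output(16, 16, 0, (1, 2), (1, 1))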
def get_exp_cycles(self):
simd = self.get_nodeattr("SIMD")
(
@@ -224,12 +245,15 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
# since mmv != 1 is not supported yet, we set mmv to 1 for now
mmv = 1
# see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h
cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv
cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd)
max_cycles = max(cycles_write_block, cycles_read_block)
exp_cycles = (
ifm_dim_w * k_h * dilation_h * (ifm_ch / simd) + ofm_dim_h * max_cycles
)
if self.use_parallel_window_output():
exp_cycles = k_w + ofm_dim_w
else:
cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv
cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd)
max_cycles = max(cycles_write_block, cycles_read_block)
exp_cycles = (
ifm_dim_w * k_h * dilation_h * (ifm_ch / simd) + ofm_dim_h * max_cycles
)
return int(exp_cycles)
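A worked example of both cycle estimates, using hypothetical 1D parameters (1x4 kernel, 64 input pixels, SIMD == IFMChannels):

# Sketch: expected cycle count for both branches (illustrative parameters).
simd, ifm_ch = 16, 16
ifm_dim_w, ofm_dim_h, ofm_dim_w = 64, 1, 61
k_h, k_w = 1, 4
stride_w, dilation_h, mmv = 1, 1, 1

# Parallel variant: roughly one output window per cycle after a k_w-deep fill.
exp_cycles_parallel = k_w + ofm_dim_w                                  # 65

# Default variant: bounded by the slower of writing window blocks and reading input.
cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv   # 244.0
cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd)             # 64.0
exp_cycles_default = int(
    ifm_dim_w * k_h * dilation_h * (ifm_ch / simd)
    + ofm_dim_h * max(cycles_write_block, cycles_read_block)
)                                                                      # 308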
@@ -522,39 +546,49 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
"ultra": "ap_resource_uram()",
}
hls_ram_style = map_to_hls_ram_style[ram_style]
hls_call = "ConvolutionInputGenerator"
# check which ConvolutionInputGenerator is needed
dilation_h, dilation_w = self.get_nodeattr("Dilation")
hls_call += "_NonSquare"
if dilation_h > 1 or dilation_w > 1:
hls_call += "_Dilated"
if self.get_nodeattr("depthwise") == 1:
hls_call += "_dws"
self.code_gen_dict["$DOCOMPUTE$"] = [
"""{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1, Input_precision1,
IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y, SIMD1, Stride1_x, Stride1_y,
Dilation1_x, Dilation1_y> (in0, out, numReps, {});""".format(
hls_call, hls_ram_style
)
]
elif self.get_nodeattr("depthwise") == 1:
hls_call += "_dws"
# check which ConvolutionInputGenerator is needed
if self.use_parallel_window_output():
hls_call = "ConvolutionInputGenerator_1D_parallel"
self.code_gen_dict["$DOCOMPUTE$"] = [
"""{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1, Input_precision1,
IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y, SIMD1, Stride1_x, Stride1_y>
"""{}<ConvKernelDim1_x, IFMChannels1, Input_precision1,
IFMDim1_x, OFMDim1_x, SIMD1, Stride1_x>
(in0, out, numReps, {});""".format(
hls_call, hls_ram_style
)
]
else:
self.code_gen_dict["$DOCOMPUTE$"] = [
"""{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1, Input_precision1,
IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y, SIMD1, Stride1_x, Stride1_y>
(in0, out, numReps, {});""".format(
hls_call, hls_ram_style
)
]
hls_call = "ConvolutionInputGenerator_NonSquare"
dilation_h, dilation_w = self.get_nodeattr("Dilation")
if dilation_h > 1 or dilation_w > 1:
hls_call += "_Dilated"
if self.get_nodeattr("depthwise") == 1:
hls_call += "_dws"
self.code_gen_dict["$DOCOMPUTE$"] = [
"""{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1,
Input_precision1, IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y,
SIMD1, Stride1_x, Stride1_y, Dilation1_x, Dilation1_y>
(in0, out, numReps, {});""".format(
hls_call, hls_ram_style
)
]
elif self.get_nodeattr("depthwise") == 1:
hls_call += "_dws"
self.code_gen_dict["$DOCOMPUTE$"] = [
"""{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1,
Input_precision1, IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y,
SIMD1, Stride1_x, Stride1_y> (in0, out, numReps, {});""".format(
hls_call, hls_ram_style
)
]
else:
self.code_gen_dict["$DOCOMPUTE$"] = [
"""{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1,
Input_precision1, IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y,
SIMD1, Stride1_x, Stride1_y> (in0, out, numReps, {});""".format(
hls_call, hls_ram_style
)
]
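The template selection above can be summarized by a small helper; the sketch below is a simplified restatement of that decision logic, not part of the generated code:

# Sketch: which finn-hlslib generator template $DOCOMPUTE$ instantiates.
def select_hls_call(parallel_window, dilation, depthwise):
    if parallel_window:
        return "ConvolutionInputGenerator_1D_parallel"
    name = "ConvolutionInputGenerator_NonSquare"
    if max(dilation) > 1:
        name += "_Dilated"
    if depthwise:
        name += "_dws"
    return name

assert select_hls_call(True, (1, 1), 0) == "ConvolutionInputGenerator_1D_parallel"
assert select_hls_call(False, (1, 2), 1) == "ConvolutionInputGenerator_NonSquare_Dilated_dws"
assert select_hls_call(False, (1, 1), 1) == "ConvolutionInputGenerator_NonSquare_dws"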
def dataoutstrm(self):
code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
@@ -570,9 +604,16 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
npy_out = "%s/output.npy" % code_gen_dir
oshape = self.get_folded_output_shape()
oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
if self.use_parallel_window_output():
# pass the number of pixels in the folded output to apintstream2npy, needed
# to unpack the output correctly and reverse only the inner SIMD dimension
k_h, k_w = self.get_nodeattr("ConvKernelDim")
multi_pixel_out = k_h * k_w
else:
multi_pixel_out = 1
self.code_gen_dict["$DATAOUTSTREAM$"] = [
'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");'
'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", true, 1, %d);'
% (
packed_hls_type,
elem_hls_type,
@@ -580,6 +621,7 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
npy_type,
oshape_cpp_str,
npy_out,
multi_pixel_out,
)
]
@@ -587,12 +629,21 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
self.code_gen_dict["$SAVEASCNPY$"] = []
def blackboxfunction(self):
self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
"""void {}(hls::stream<ap_uint<SIMD1*Input_precision1>> &in0,
hls::stream<ap_uint<SIMD1*Input_precision1>> &out)""".format(
self.onnx_node.name
)
]
if self.use_parallel_window_output():
self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
"""void {}(hls::stream<ap_uint<SIMD1*Input_precision1>> &in0,
hls::stream<ap_uint<ConvKernelDim1_x*SIMD1*Input_precision1>>
&out)""".format(
self.onnx_node.name
)
]
else:
self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
"""void {}(hls::stream<ap_uint<SIMD1*Input_precision1>> &in0,
hls::stream<ap_uint<SIMD1*Input_precision1>> &out)""".format(
self.onnx_node.name
)
]
def pragmas(self):
self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
@@ -45,29 +45,34 @@ void npy2apintstream(const char * npy_path, hls::stream<PackedT> & out_stream, b
}
template <typename PackedT, typename ElemT, int ElemBits, typename NpyT>
void apintstream2npy(hls::stream<PackedT> & in_stream, const std::vector<size_t> & shape, const char * npy_path, bool reverse_inner = true, size_t numReps = 1) {
void apintstream2npy(hls::stream<PackedT> & in_stream, const std::vector<size_t> & shape, const char * npy_path, bool reverse_inner = true, size_t numReps = 1, size_t multi_pixel_out = 1) {
for(size_t rep = 0; rep < numReps; rep++) {
std::vector<NpyT> data_to_save;
size_t outer_dim_elems = 1;
for(size_t dim = 0; dim < shape.size()-1; dim++) {
outer_dim_elems *= shape[dim];
}
size_t inner_dim_elems = shape[shape.size()-1];
DEBUG_APINTSTREAM2NPY("n_outer " << outer_dim_elems << " n_inner " << inner_dim_elems)
size_t inner_dim_elems = shape[shape.size()-1] / multi_pixel_out;
DEBUG_APINTSTREAM2NPY("n_outer " << outer_dim_elems << " n_inner " << inner_dim_elems << " n_multi_pixel_out " << multi_pixel_out)
for(size_t outer_elem = 0; outer_elem < outer_dim_elems; outer_elem++) {
PackedT packed_elem;
in_stream >> packed_elem;
DEBUG_APINTSTREAM2NPY("packed hls elem " << std::hex << packed_elem << std::dec)
for(size_t ii = 0; ii < inner_dim_elems; ii++) {
size_t i = reverse_inner ? inner_dim_elems-ii-1 : ii;
ap_uint<ElemBits> tmp_elem = packed_elem((i+1)*ElemBits-1, i*ElemBits);
// important: don't init elem = reinterpret_cast.. directly here
// this causes weird behavior for conversion to NpyT afterwards
ElemT elem;
elem = reinterpret_cast<ElemT&>(tmp_elem);
NpyT npyt = (NpyT) elem;
DEBUG_APINTSTREAM2NPY("elem " << elem << " NpyT " << npyt)
data_to_save.push_back(npyt);
for(size_t ii_multi_pixel_out = 0; ii_multi_pixel_out < multi_pixel_out; ii_multi_pixel_out++) {
// loop over multi_pixel_out blocks of inner_dim_elems separately,
// so that reverse_inner is not applied across multiple pixels
for(size_t ii = 0; ii < inner_dim_elems; ii++) {
size_t i = ii_multi_pixel_out*inner_dim_elems;
i += reverse_inner ? inner_dim_elems-ii-1 : ii;
ap_uint<ElemBits> tmp_elem = packed_elem((i+1)*ElemBits-1, i*ElemBits);
// important: don't init elem = reinterpret_cast.. directly here
// this causes weird behavior for conversion to NpyT afterwards
ElemT elem;
elem = reinterpret_cast<ElemT&>(tmp_elem);
NpyT npyt = (NpyT) elem;
DEBUG_APINTSTREAM2NPY("elem " << elem << " NpyT " << npyt)
data_to_save.push_back(npyt);
}
}
}
cnpy::npy_save(npy_path, &data_to_save[0], shape, "w");
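To illustrate the new unpacking order (in Python rather than C++, for brevity): with multi_pixel_out > 1, reverse_inner is applied within each pixel's SIMD block, not across the whole packed word. The helper name below is hypothetical.

# Sketch: element index order produced by the loops above.
def unpack_indices(inner_dim_elems, multi_pixel_out, reverse_inner=True):
    order = []
    for block in range(multi_pixel_out):
        for ii in range(inner_dim_elems):
            i = block * inner_dim_elems
            i += inner_dim_elems - ii - 1 if reverse_inner else ii
            order.append(i)
    return order

# Two window pixels of four SIMD elements each: each block is reversed
# independently, never across the pixel boundary.
assert unpack_indices(4, 2) == [3, 2, 1, 0, 7, 6, 5, 4]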