Commit b5ca2f70 authored by Felix Jentzsch

Initial support for new 1D SWG in case of full SIMD unfolding

parent f98cc214
@@ -127,8 +127,12 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h)
         ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w)
         assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
-        wf = int((k_h * k_w * ifm_ch) // simd)
-        folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd)
+        if self.use_parallel_window_output():
+            wf = int((ifm_ch) // simd)
+            folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd)
+        else:
+            wf = int((k_h * k_w * ifm_ch) // simd)
+            folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd)
         return folded_oshape

     def make_shape_compatible_op(self, model):
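
For illustration, a minimal standalone sketch of the folded-shape arithmetic in this hunk; the parameter values below are hypothetical, not taken from the commit:

    # Hypothetical 1D conv: 1x4 kernel, 16 channels, SIMD fully unfolded.
    k_h, k_w = 1, 4
    ifm_ch, simd = 16, 16
    ofm_dim_h, ofm_dim_w = 1, 125

    # Parallel variant: each folded element carries a whole k_h*k_w window.
    wf = ifm_ch // simd                                                # 1
    parallel_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd)
    assert parallel_oshape == (1, 1, 125, 1, 64)

    # Default variant: the window is folded into k_h*k_w*ifm_ch/simd elements.
    wf = (k_h * k_w * ifm_ch) // simd                                  # 4
    default_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd)
    assert default_oshape == (1, 1, 125, 4, 16)
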
@@ -156,8 +160,6 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         return DataType[self.get_nodeattr("outputDataType")]

     def get_instream_width(self):
-        """Returns stream width, input and output stream width are equal for
-        the sliding window function"""
         ibits = self.get_input_datatype().bitwidth()
         simd = self.get_nodeattr("SIMD")
         ifm_ch = self.get_nodeattr("IFMChannels")
@@ -166,10 +168,13 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         return in_width

     def get_outstream_width(self):
-        """Returns stream width, input and output stream width are equal for
-        the sliding window function, so the function to determine the input
-        stream width can be reused."""
-        return self.get_instream_width()
+        if self.use_parallel_window_output():
+            # feed all window pixels in parallel
+            k_h, k_w = self.get_nodeattr("ConvKernelDim")
+            return self.get_instream_width() * k_h * k_w
+        else:
+            # if parallel variant not in use: same width for output and input stream
+            return self.get_instream_width()

     def get_number_output_values(self):
         folded_oshape = self.get_folded_output_shape()
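
A short sketch of the resulting stream widths, again with hypothetical values (SIMD == IFMChannels, so the parallel variant applies):

    ibits = 8                  # bitwidth of the input datatype (assumed)
    simd = 16
    k_h, k_w = 1, 4

    in_width = simd * ibits                    # 128-bit input stream
    out_width_parallel = in_width * k_h * k_w  # 512 bits: full window per cycle
    out_width_default = in_width               # 128 bits: same as the input
    assert (in_width, out_width_parallel) == (128, 512)
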
@@ -205,6 +210,22 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         return (ifm_ch, ifm_dim, ofm_dim, k, stride, dilation)

+    def use_parallel_window_output(self):
+        # Check if simple "ConvolutionInputGenerator_1D_parallel" variant can be used to
+        # feed window in parallel to the following layer, enabling full SIMD unfolding.
+        stride = self.get_nodeattr("Stride")
+        dilation = self.get_nodeattr("Dilation")
+        stride_h, stride_w = stride
+        dilation_h, dilation_w = dilation
+
+        if self.get_nodeattr("SIMD") == self.get_nodeattr("IFMChannels"):
+            if self.get_nodeattr("depthwise") == 0:
+                if stride_h == 1 and stride_w == 1:
+                    if dilation_h == 1 and dilation_w == 1:
+                        return True
+                    else:
+                        return False
+
     def get_exp_cycles(self):
         simd = self.get_nodeattr("SIMD")
         (
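
The nested checks above amount to a single conjunction (the nesting returns a falsy value whenever a condition fails). A sketch of an equivalent boolean-normalizing standalone predicate; the function name and arguments are made up for illustration:

    def can_use_parallel_window_output(simd, ifm_ch, depthwise, stride, dilation):
        # Parallel variant: full SIMD unfolding, non-depthwise, unit stride/dilation.
        stride_h, stride_w = stride
        dilation_h, dilation_w = dilation
        return (
            simd == ifm_ch
            and depthwise == 0
            and (stride_h, stride_w) == (1, 1)
            and (dilation_h, dilation_w) == (1, 1)
        )

    assert can_use_parallel_window_output(16, 16, 0, (1, 1), (1, 1))
    assert not can_use_parallel_window_output(8, 16, 0, (1, 1), (1, 1))
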
@@ -224,12 +245,15 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         # since mmv != 1 is not supported yet, we set mmv for now to 1
         mmv = 1
         # see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h
-        cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv
-        cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd)
-        max_cycles = max(cycles_write_block, cycles_read_block)
-        exp_cycles = (
-            ifm_dim_w * k_h * dilation_h * (ifm_ch / simd) + ofm_dim_h * max_cycles
-        )
+        if self.use_parallel_window_output():
+            exp_cycles = k_w + ofm_dim_w
+        else:
+            cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv
+            cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd)
+            max_cycles = max(cycles_write_block, cycles_read_block)
+            exp_cycles = (
+                ifm_dim_w * k_h * dilation_h * (ifm_ch / simd) + ofm_dim_h * max_cycles
+            )

         return int(exp_cycles)
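
A worked example of the two cycle estimates, reusing the same hypothetical 1x4-kernel, 16-channel configuration as above:

    k_h, k_w = 1, 4
    ifm_ch = simd = 16
    ifm_dim_w, ofm_dim_h, ofm_dim_w = 128, 1, 125
    stride_w = dilation_h = mmv = 1

    # Parallel variant: roughly one window per cycle after a short ramp-up.
    exp_cycles_parallel = k_w + ofm_dim_w  # 4 + 125 = 129

    # Default variant: bounded by the slower of the write/read phases per row.
    cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv  # 500.0
    cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd)            # 128.0
    max_cycles = max(cycles_write_block, cycles_read_block)               # 500.0
    exp_cycles_default = (
        ifm_dim_w * k_h * dilation_h * (ifm_ch / simd) + ofm_dim_h * max_cycles
    )  # 128 + 500 = 628
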
@@ -522,39 +546,49 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
             "ultra": "ap_resource_uram()",
         }
         hls_ram_style = map_to_hls_ram_style[ram_style]
-        hls_call = "ConvolutionInputGenerator"
-        # check which ConvolutionInputGenerator is needed
-        dilation_h, dilation_w = self.get_nodeattr("Dilation")
-        hls_call += "_NonSquare"
-        if dilation_h > 1 or dilation_w > 1:
-            hls_call += "_Dilated"
-            if self.get_nodeattr("depthwise") == 1:
-                hls_call += "_dws"
-            self.code_gen_dict["$DOCOMPUTE$"] = [
-                """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1, Input_precision1,
-                IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y, SIMD1, Stride1_x, Stride1_y,
-                Dilation1_x, Dilation1_y> (in0, out, numReps, {});""".format(
-                    hls_call, hls_ram_style
-                )
-            ]
-        elif self.get_nodeattr("depthwise") == 1:
-            hls_call += "_dws"
-            self.code_gen_dict["$DOCOMPUTE$"] = [
-                """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1, Input_precision1,
-                IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y, SIMD1, Stride1_x, Stride1_y>
-                (in0, out, numReps, {});""".format(
-                    hls_call, hls_ram_style
-                )
-            ]
-        else:
-            self.code_gen_dict["$DOCOMPUTE$"] = [
-                """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1, Input_precision1,
-                IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y, SIMD1, Stride1_x, Stride1_y>
-                (in0, out, numReps, {});""".format(
-                    hls_call, hls_ram_style
-                )
-            ]
+        # check which ConvolutionInputGenerator is needed
+        if self.use_parallel_window_output():
+            hls_call = "ConvolutionInputGenerator_1D_parallel"
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """{}<ConvKernelDim1_x, IFMChannels1, Input_precision1,
+                IFMDim1_x, OFMDim1_x, SIMD1, Stride1_x>
+                (in0, out, numReps, {});""".format(
+                    hls_call, hls_ram_style
+                )
+            ]
+        else:
+            hls_call = "ConvolutionInputGenerator_NonSquare"
+            dilation_h, dilation_w = self.get_nodeattr("Dilation")
+            if dilation_h > 1 or dilation_w > 1:
+                hls_call += "_Dilated"
+                if self.get_nodeattr("depthwise") == 1:
+                    hls_call += "_dws"
+                self.code_gen_dict["$DOCOMPUTE$"] = [
+                    """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1,
+                    Input_precision1, IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y,
+                    SIMD1, Stride1_x, Stride1_y, Dilation1_x, Dilation1_y>
+                    (in0, out, numReps, {});""".format(
+                        hls_call, hls_ram_style
+                    )
+                ]
+            elif self.get_nodeattr("depthwise") == 1:
+                hls_call += "_dws"
+                self.code_gen_dict["$DOCOMPUTE$"] = [
+                    """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1,
+                    Input_precision1, IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y,
+                    SIMD1, Stride1_x, Stride1_y> (in0, out, numReps, {});""".format(
+                        hls_call, hls_ram_style
+                    )
+                ]
+            else:
+                self.code_gen_dict["$DOCOMPUTE$"] = [
+                    """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1,
+                    Input_precision1, IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y,
+                    SIMD1, Stride1_x, Stride1_y> (in0, out, numReps, {});""".format(
+                        hls_call, hls_ram_style
+                    )
+                ]

     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
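
To see what the parallel branch emits, here is a sketch of its template string being rendered outside the class; note the 1D-specialized template drops the *_y parameters. The ConvKernelDim1_x etc. defines are produced elsewhere by the code generator:

    # Standalone rendering of the parallel branch's $DOCOMPUTE$ template.
    hls_call = "ConvolutionInputGenerator_1D_parallel"
    hls_ram_style = "ap_resource_dflt()"  # the "auto" ram_style mapping
    docompute = """{}<ConvKernelDim1_x, IFMChannels1, Input_precision1,
    IFMDim1_x, OFMDim1_x, SIMD1, Stride1_x>
    (in0, out, numReps, {});""".format(hls_call, hls_ram_style)
    print(docompute)
    # ConvolutionInputGenerator_1D_parallel<ConvKernelDim1_x, IFMChannels1, ...>
    # (in0, out, numReps, ap_resource_dflt());
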
@@ -587,12 +621,21 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         self.code_gen_dict["$SAVEASCNPY$"] = []

     def blackboxfunction(self):
-        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
-            """void {}(hls::stream<ap_uint<SIMD1*Input_precision1>> &in0,
-            hls::stream<ap_uint<SIMD1*Input_precision1>> &out)""".format(
-                self.onnx_node.name
-            )
-        ]
+        if self.use_parallel_window_output():
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                """void {}(hls::stream<ap_uint<SIMD1*Input_precision1>> &in0,
+                hls::stream<ap_uint<ConvKernelDim1_x*SIMD1*Input_precision1>>
+                &out)""".format(
+                    self.onnx_node.name
+                )
+            ]
+        else:
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                """void {}(hls::stream<ap_uint<SIMD1*Input_precision1>> &in0,
+                hls::stream<ap_uint<SIMD1*Input_precision1>> &out)""".format(
+                    self.onnx_node.name
+                )
+            ]

     def pragmas(self):
         self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]