Commit b5ca2f70 authored by Felix Jentzsch

Initial support for the new 1D SWG (sliding window generator) in the case of full SIMD unfolding

parent f98cc214
@@ -127,8 +127,12 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h)
         ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w)
         assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
-        wf = int((k_h * k_w * ifm_ch) // simd)
-        folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd)
+        if self.use_parallel_window_output():
+            wf = int((ifm_ch) // simd)
+            folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd)
+        else:
+            wf = int((k_h * k_w * ifm_ch) // simd)
+            folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd)
         return folded_oshape

     def make_shape_compatible_op(self, model):
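
Note: for intuition, here is a small worked example of the two folded output shapes. All parameter values below are hypothetical, not taken from this commit.

# Hypothetical 1D conv layer: kernel (1, 4), 16 input channels,
# output feature map of size 1 x 125.
k_h, k_w = 1, 4
ifm_ch = 16
ofm_dim_h, ofm_dim_w = 1, 125

# Parallel-window variant (requires SIMD == IFMChannels): the whole
# k_h * k_w window is emitted as one folded element.
simd = 16
wf = ifm_ch // simd                                     # 1
print((1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd))  # (1, 1, 125, 1, 64)

# Default variant, e.g. SIMD == 8: window pixels are streamed sequentially.
simd = 8
wf = (k_h * k_w * ifm_ch) // simd                       # 8
print((1, ofm_dim_h, ofm_dim_w, wf, simd))              # (1, 1, 125, 8, 8)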
@@ -156,8 +160,6 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         return DataType[self.get_nodeattr("outputDataType")]

     def get_instream_width(self):
-        """Returns stream width, input and output stream width are equal for
-        the sliding window function"""
         ibits = self.get_input_datatype().bitwidth()
         simd = self.get_nodeattr("SIMD")
         ifm_ch = self.get_nodeattr("IFMChannels")
@@ -166,10 +168,13 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         return in_width

     def get_outstream_width(self):
-        """Returns stream width, input and output stream width are equal for
-        the sliding window function, so the function to determine the input
-        stream width can be reused."""
-        return self.get_instream_width()
+        if self.use_parallel_window_output():
+            # feed all window pixels in parallel
+            k_h, k_w = self.get_nodeattr("ConvKernelDim")
+            return self.get_instream_width() * k_h * k_w
+        else:
+            # if parallel variant not in use: same width for output and input stream
+            return self.get_instream_width()

     def get_number_output_values(self):
         folded_oshape = self.get_folded_output_shape()
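
Note: a quick width check, again with hypothetical values (16 channels of 8-bit input, SIMD = 16, kernel (1, 4)).

# In-stream width is SIMD * input bitwidth; the parallel variant widens
# the out-stream by the number of window pixels.
simd, ibits = 16, 8
k_h, k_w = 1, 4

in_width = simd * ibits                    # 128-bit input stream
out_width_parallel = in_width * k_h * k_w  # 512-bit: full window per cycle
out_width_default = in_width               # 128-bit: one window pixel per cycle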
@@ -205,6 +210,22 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         return (ifm_ch, ifm_dim, ofm_dim, k, stride, dilation)

+    def use_parallel_window_output(self):
+        # Check if simple "ConvolutionInputGenerator_1D_parallel" variant can be used to
+        # feed window in parallel to the following layer, enabling full SIMD unfolding.
+        stride = self.get_nodeattr("Stride")
+        dilation = self.get_nodeattr("Dilation")
+        stride_h, stride_w = stride
+        dilation_h, dilation_w = dilation
+
+        if self.get_nodeattr("SIMD") == self.get_nodeattr("IFMChannels"):
+            if self.get_nodeattr("depthwise") == 0:
+                if stride_h == 1 and stride_w == 1:
+                    if dilation_h == 1 and dilation_w == 1:
+                        return True
+
+        return False
+
     def get_exp_cycles(self):
         simd = self.get_nodeattr("SIMD")
         (
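
Note: in words, the parallel variant is only selected when the generator is fully unfolded along the channel dimension (SIMD == IFMChannels) and the convolution is non-depthwise with unit stride and unit dilation. A minimal standalone sketch of that decision, with hypothetical attribute values:

# Hypothetical attribute values for a node that qualifies:
simd, ifm_ch = 16, 16
depthwise = 0
stride_h, stride_w = 1, 1
dilation_h, dilation_w = 1, 1

use_parallel = (
    simd == ifm_ch
    and depthwise == 0
    and (stride_h, stride_w) == (1, 1)
    and (dilation_h, dilation_w) == (1, 1)
)
print(use_parallel)  # True; violating any condition falls back to the default SWG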
@@ -224,12 +245,15 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         # since mmv != 1 is not supported yet, we set mmv for now to 1
         mmv = 1
         # see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h
-        cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv
-        cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd)
-        max_cycles = max(cycles_write_block, cycles_read_block)
-        exp_cycles = (
-            ifm_dim_w * k_h * dilation_h * (ifm_ch / simd) + ofm_dim_h * max_cycles
-        )
+        if self.use_parallel_window_output():
+            exp_cycles = k_w + ofm_dim_w
+        else:
+            cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv
+            cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd)
+            max_cycles = max(cycles_write_block, cycles_read_block)
+            exp_cycles = (
+                ifm_dim_w * k_h * dilation_h * (ifm_ch / simd) + ofm_dim_h * max_cycles
+            )

         return int(exp_cycles)
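
Note: plugging the same hypothetical layer into both branches shows the latency gap the parallel variant buys.

# Hypothetical layer: kernel (1, 4), 16 channels, SIMD = 16,
# ifm_dim_w = 128 -> ofm_dim_w = 125 (stride 1, no padding).
k_h, k_w = 1, 4
ifm_ch, simd, mmv = 16, 16, 1
ifm_dim_w, ofm_dim_w = 128, 125
ofm_dim_h, stride_w, dilation_h = 1, 1, 1

# Parallel variant: one output window per cycle after a short fill phase.
exp_parallel = k_w + ofm_dim_w                                        # 129

# Default variant: the slower of the read/write blocks dominates.
cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv  # 500.0
cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd)            # 128.0
exp_default = int(
    ifm_dim_w * k_h * dilation_h * (ifm_ch / simd)
    + ofm_dim_h * max(cycles_write_block, cycles_read_block)
)                                                                     # 628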
@@ -522,39 +546,49 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
             "ultra": "ap_resource_uram()",
         }
         hls_ram_style = map_to_hls_ram_style[ram_style]
-        hls_call = "ConvolutionInputGenerator"
-        # check which ConvolutionInputGenerator is needed
-        dilation_h, dilation_w = self.get_nodeattr("Dilation")
-        hls_call += "_NonSquare"
-        if dilation_h > 1 or dilation_w > 1:
-            hls_call += "_Dilated"
-            if self.get_nodeattr("depthwise") == 1:
-                hls_call += "_dws"
-            self.code_gen_dict["$DOCOMPUTE$"] = [
-                """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1, Input_precision1,
-                IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y, SIMD1, Stride1_x, Stride1_y,
-                Dilation1_x, Dilation1_y> (in0, out, numReps, {});""".format(
-                    hls_call, hls_ram_style
-                )
-            ]
-        elif self.get_nodeattr("depthwise") == 1:
-            hls_call += "_dws"
-            self.code_gen_dict["$DOCOMPUTE$"] = [
-                """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1, Input_precision1,
-                IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y, SIMD1, Stride1_x, Stride1_y>
-                (in0, out, numReps, {});""".format(
-                    hls_call, hls_ram_style
-                )
-            ]
-        else:
-            self.code_gen_dict["$DOCOMPUTE$"] = [
-                """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1, Input_precision1,
-                IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y, SIMD1, Stride1_x, Stride1_y>
-                (in0, out, numReps, {});""".format(
-                    hls_call, hls_ram_style
-                )
-            ]
+
+        # check which ConvolutionInputGenerator is needed
+        if self.use_parallel_window_output():
+            hls_call = "ConvolutionInputGenerator_1D_parallel"
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """{}<ConvKernelDim1_x, IFMChannels1, Input_precision1,
+                IFMDim1_x, OFMDim1_x, SIMD1, Stride1_x>
+                (in0, out, numReps, {});""".format(
+                    hls_call, hls_ram_style
+                )
+            ]
+        else:
+            hls_call = "ConvolutionInputGenerator_NonSquare"
+            dilation_h, dilation_w = self.get_nodeattr("Dilation")
+            if dilation_h > 1 or dilation_w > 1:
+                hls_call += "_Dilated"
+                if self.get_nodeattr("depthwise") == 1:
+                    hls_call += "_dws"
+                self.code_gen_dict["$DOCOMPUTE$"] = [
+                    """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1,
+                    Input_precision1, IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y,
+                    SIMD1, Stride1_x, Stride1_y, Dilation1_x, Dilation1_y>
+                    (in0, out, numReps, {});""".format(
+                        hls_call, hls_ram_style
+                    )
+                ]
+            elif self.get_nodeattr("depthwise") == 1:
+                hls_call += "_dws"
+                self.code_gen_dict["$DOCOMPUTE$"] = [
+                    """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1,
+                    Input_precision1, IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y,
+                    SIMD1, Stride1_x, Stride1_y> (in0, out, numReps, {});""".format(
+                        hls_call, hls_ram_style
+                    )
+                ]
+            else:
+                self.code_gen_dict["$DOCOMPUTE$"] = [
+                    """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1,
+                    Input_precision1, IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y,
+                    SIMD1, Stride1_x, Stride1_y> (in0, out, numReps, {});""".format(
+                        hls_call, hls_ram_style
+                    )
+                ]

     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
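
Note: for reference, this is roughly what the parallel-variant $DOCOMPUTE$ entry renders to after .format(). The ram style value "ap_resource_dflt()" is an assumed example; the actual value comes from the node's ram_style attribute. Relative to the NonSquare call, the parallel template drops the _y kernel/IFM/OFM/stride parameters and the dilation parameters.

# Sketch of the rendered HLS call (Python side of the codegen):
hls_call = "ConvolutionInputGenerator_1D_parallel"
hls_ram_style = "ap_resource_dflt()"  # assumed example value
docompute = """{}<ConvKernelDim1_x, IFMChannels1, Input_precision1,
IFMDim1_x, OFMDim1_x, SIMD1, Stride1_x>
(in0, out, numReps, {});""".format(hls_call, hls_ram_style)
print(docompute)
# ConvolutionInputGenerator_1D_parallel<ConvKernelDim1_x, IFMChannels1, Input_precision1,
# IFMDim1_x, OFMDim1_x, SIMD1, Stride1_x>
# (in0, out, numReps, ap_resource_dflt());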
@@ -587,12 +621,21 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         self.code_gen_dict["$SAVEASCNPY$"] = []

     def blackboxfunction(self):
-        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
-            """void {}(hls::stream<ap_uint<SIMD1*Input_precision1>> &in0,
-            hls::stream<ap_uint<SIMD1*Input_precision1>> &out)""".format(
-                self.onnx_node.name
-            )
-        ]
+        if self.use_parallel_window_output():
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                """void {}(hls::stream<ap_uint<SIMD1*Input_precision1>> &in0,
+                hls::stream<ap_uint<ConvKernelDim1_x*SIMD1*Input_precision1>>
+                &out)""".format(
+                    self.onnx_node.name
+                )
+            ]
+        else:
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                """void {}(hls::stream<ap_uint<SIMD1*Input_precision1>> &in0,
+                hls::stream<ap_uint<SIMD1*Input_precision1>> &out)""".format(
+                    self.onnx_node.name
+                )
+            ]

     def pragmas(self):
         self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
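
Note: the widened output stream type mirrors get_outstream_width() above. For a 1D kernel laid out along the x axis, k_h * k_w equals ConvKernelDim1_x, so ConvKernelDim1_x*SIMD1*Input_precision1 is exactly the parallel out-stream width. A tiny consistency check under the earlier hypothetical values:

# Hypothetical values: 1D kernel (1, 4) along x, SIMD = 16, 8-bit input.
simd, ibits = 16, 8
k_h, k_w = 1, 4                       # ConvKernelDim1_x == 4 in the defines
out_stream_bits = k_w * simd * ibits  # 512
assert out_stream_bits == (simd * ibits) * k_h * k_w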