diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
index c4cf804126328b27fd56091d70f0b6e658b5b3c1..d167f2312ff240ed3bace1ef6bdd69d243cc399e 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
@@ -127,8 +127,12 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h)
         ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w)
         assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
-        wf = int((k_h * k_w * ifm_ch) // simd)
-        folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd)
+        if self.use_parallel_window_output():
+            wf = int(ifm_ch // simd)
+            folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd)
+        else:
+            wf = int((k_h * k_w * ifm_ch) // simd)
+            folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd)
         return folded_oshape
 
     def make_shape_compatible_op(self, model):
@@ -156,8 +160,6 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         return DataType[self.get_nodeattr("outputDataType")]
 
     def get_instream_width(self):
-        """Returns stream width, input and output stream width are equal for
-        the sliding window function"""
         ibits = self.get_input_datatype().bitwidth()
         simd = self.get_nodeattr("SIMD")
         ifm_ch = self.get_nodeattr("IFMChannels")
@@ -166,10 +168,13 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         return in_width
 
     def get_outstream_width(self):
-        """Returns stream width, input and output stream width are equal for
-        the sliding window function, so the function to determine the input
-        stream width can be reused."""
-        return self.get_instream_width()
+        if self.use_parallel_window_output():
+            # feed all k_h * k_w window pixels in parallel
+            k_h, k_w = self.get_nodeattr("ConvKernelDim")
+            return self.get_instream_width() * k_h * k_w
+        else:
+            # non-parallel variant: output stream width equals input stream width
+            return self.get_instream_width()
 
     def get_number_output_values(self):
         folded_oshape = self.get_folded_output_shape()
@@ -205,6 +210,22 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
 
         return (ifm_ch, ifm_dim, ofm_dim, k, stride, dilation)
 
+    def use_parallel_window_output(self):
+        # Check whether the simple "ConvolutionInputGenerator_1D_parallel" variant can
+        # feed the window in parallel to the following layer, enabling full SIMD unfolding.
+        stride = self.get_nodeattr("Stride")
+        dilation = self.get_nodeattr("Dilation")
+        stride_h, stride_w = stride
+        dilation_h, dilation_w = dilation
+
+        if self.get_nodeattr("SIMD") == self.get_nodeattr("IFMChannels"):
+            if self.get_nodeattr("depthwise") == 0:
+                if stride_h == 1 and stride_w == 1:
+                    if dilation_h == 1 and dilation_w == 1:
+                        return True
+        # in all other cases the parallel variant is not applicable
+        return False
+
     def get_exp_cycles(self):
         simd = self.get_nodeattr("SIMD")
         (
@@ -224,12 +245,15 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         # since mmv != 1 is not supported yet, we set mmv for now to 1
         mmv = 1
         # see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h
-        cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv
-        cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd)
-        max_cycles = max(cycles_write_block, cycles_read_block)
-        exp_cycles = (
-            ifm_dim_w * k_h * dilation_h * (ifm_ch / simd) + ofm_dim_h * max_cycles
-        )
+        if self.use_parallel_window_output():
+            exp_cycles = k_w + ofm_dim_w
+        else:
+            cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv
+            cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd)
+            max_cycles = max(cycles_write_block, cycles_read_block)
+            exp_cycles = (
+                ifm_dim_w * k_h * dilation_h * (ifm_ch / simd) + ofm_dim_h * max_cycles
+            )
 
         return int(exp_cycles)
 
@@ -522,39 +546,49 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
             "ultra": "ap_resource_uram()",
         }
         hls_ram_style = map_to_hls_ram_style[ram_style]
 
-        hls_call = "ConvolutionInputGenerator"
-        # check which ConvolutionInputGenerator is needed
-        dilation_h, dilation_w = self.get_nodeattr("Dilation")
-        hls_call += "_NonSquare"
-        if dilation_h > 1 or dilation_w > 1:
-            hls_call += "_Dilated"
-            if self.get_nodeattr("depthwise") == 1:
-                hls_call += "_dws"
-            self.code_gen_dict["$DOCOMPUTE$"] = [
-                """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1, Input_precision1,
-                IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y, SIMD1, Stride1_x, Stride1_y,
-                Dilation1_x, Dilation1_y> (in0, out, numReps, {});""".format(
-                    hls_call, hls_ram_style
-                )
-            ]
-        elif self.get_nodeattr("depthwise") == 1:
-            hls_call += "_dws"
+        # check which ConvolutionInputGenerator is needed
+        if self.use_parallel_window_output():
+            hls_call = "ConvolutionInputGenerator_1D_parallel"
             self.code_gen_dict["$DOCOMPUTE$"] = [
-                """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1, Input_precision1,
-                IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y, SIMD1, Stride1_x, Stride1_y>
+                """{}<ConvKernelDim1_x, IFMChannels1, Input_precision1,
+                IFMDim1_x, OFMDim1_x, SIMD1, Stride1_x>
                 (in0, out, numReps, {});""".format(
                     hls_call, hls_ram_style
                 )
             ]
         else:
-            self.code_gen_dict["$DOCOMPUTE$"] = [
-                """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1, Input_precision1,
-                IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y, SIMD1, Stride1_x, Stride1_y>
-                (in0, out, numReps, {});""".format(
-                    hls_call, hls_ram_style
-                )
-            ]
+            hls_call = "ConvolutionInputGenerator_NonSquare"
+            dilation_h, dilation_w = self.get_nodeattr("Dilation")
+            if dilation_h > 1 or dilation_w > 1:
+                hls_call += "_Dilated"
+                if self.get_nodeattr("depthwise") == 1:
+                    hls_call += "_dws"
+                self.code_gen_dict["$DOCOMPUTE$"] = [
+                    """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1,
+                    Input_precision1, IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y,
+                    SIMD1, Stride1_x, Stride1_y, Dilation1_x, Dilation1_y>
+                    (in0, out, numReps, {});""".format(
+                        hls_call, hls_ram_style
+                    )
+                ]
+            elif self.get_nodeattr("depthwise") == 1:
+                hls_call += "_dws"
+                self.code_gen_dict["$DOCOMPUTE$"] = [
+                    """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1,
"""{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1, + Input_precision1, IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y, + SIMD1, Stride1_x, Stride1_y> (in0, out, numReps, {});""".format( + hls_call, hls_ram_style + ) + ] + else: + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1, + Input_precision1, IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y, + SIMD1, Stride1_x, Stride1_y> (in0, out, numReps, {});""".format( + hls_call, hls_ram_style + ) + ] def dataoutstrm(self): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -587,12 +621,21 @@ class ConvolutionInputGenerator1D(HLSCustomOp): self.code_gen_dict["$SAVEASCNPY$"] = [] def blackboxfunction(self): - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream<ap_uint<SIMD1*Input_precision1>> &in0, - hls::stream<ap_uint<SIMD1*Input_precision1>> &out)""".format( - self.onnx_node.name - ) - ] + if self.use_parallel_window_output(): + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream<ap_uint<SIMD1*Input_precision1>> &in0, + hls::stream<ap_uint<ConvKernelDim1_x*SIMD1*Input_precision1>> + &out)""".format( + self.onnx_node.name + ) + ] + else: + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream<ap_uint<SIMD1*Input_precision1>> &in0, + hls::stream<ap_uint<SIMD1*Input_precision1>> &out)""".format( + self.onnx_node.name + ) + ] def pragmas(self): self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]