diff --git a/docker/Dockerfile.finn b/docker/Dockerfile.finn
index e42aebbe489c4e381c08f02905326eb12134291f..84078aec59856305062b9c0f50d17a2450fe3bbf 100644
--- a/docker/Dockerfile.finn
+++ b/docker/Dockerfile.finn
@@ -97,7 +97,7 @@ ARG FINN_EXP_COMMIT="af6102769226b82b639f243dc36f065340991513"
 ARG BREVITAS_COMMIT="a5b71d6de1389d3e7db898fef72e014842670f03"
 ARG PYVERILATOR_COMMIT="0c3eb9343500fc1352a02c020a736c8c2db47e8e"
 ARG CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4"
-ARG HLSLIB_COMMIT="f1eecde2d894a6d8971555924c0df147dc5ba033"
+ARG HLSLIB_COMMIT="da7b47cd65a967b76554a0dda74c097803c5e550"
 ARG OMX_COMMIT="1dfc4aa2f2895632742cd5751520c6b472feb74e"
 ARG AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b"
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
index 8d401c40dc3efbfef5c387bdfc2f777a2cfb553b..97ec89daae388109d78c1d5b21aa8239e2f0dc9d 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
@@ -264,7 +264,7 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         dilation_h, dilation_w = dilation
 
         # since mmv != 1 is not supported yet, we set mmv for now to 1
-        # mmv = 1
+        mmv = 1
         # see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h
         if self.use_parallel_window_output():
             exp_cycles = k_w + ofm_dim_w
@@ -272,10 +272,21 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
             cycles_read_block = ifm_dim_w * ifm_ch / simd
             cycles_write_block = ofm_dim_w * k_w * ifm_ch / simd
             exp_cycles = cycles_read_block + cycles_write_block
+        elif self.get_nodeattr("depthwise") == 1:
+            if stride_h > 1 or stride_w > 1:
+                cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv
+                cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd)
+                max_cycles = max(cycles_write_block, cycles_read_block)
+                exp_cycles = (
+                    ifm_dim_w * k_h * dilation_h * (ifm_ch / simd)
+                    + ofm_dim_h * max_cycles
+                )
+            else:
+                cycles_read_block = ifm_ch / simd * (k_w - 1) - (k_w - 1)
+                cycles_write_block = ofm_dim_w * k_w * ifm_ch / simd
+                exp_cycles = cycles_read_block + cycles_write_block
         else:
-            cycles_read_block = ifm_ch / simd * (k_w - 1) - (k_w - 1)
-            cycles_write_block = ofm_dim_w * k_w * ifm_ch / simd
-            exp_cycles = cycles_read_block + cycles_write_block
+            exp_cycles = 1 + ofm_dim_w * k_w * ifm_ch / simd
 
         return int(exp_cycles)
 
@@ -561,6 +572,15 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
             "ultra": "ap_resource_uram()",
         }
         hls_ram_style = map_to_hls_ram_style[ram_style]
+        (
+            ifm_ch,
+            ifm_dim,
+            ofm_dim,
+            k,
+            stride,
+            dilation,
+        ) = self.get_1d_conv_attrs_normalized()
+        stride_x = np.prod(stride)
 
         # check which ConvolutionInputGenerator is needed
         if self.use_parallel_window_output():
@@ -586,20 +606,34 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
                    )
                ]
            else:
-                hls_call = "ConvolutionInputGenerator_1D_dws_lowbuffer"
-                self.code_gen_dict["$DOCOMPUTE$"] = [
-                    """{}<ConvKernelDim1_x, IFMChannels1,
-                    Input_precision1, IFMDim1_x, OFMDim1_x,
-                    SIMD1> (in0, out, numReps, {});""".format(
-                        hls_call, hls_ram_style
-                    )
-                ]
+                if stride_x > 1:
+                    # temporarily use old ConvolutionInputGenerator_NonSquare_dws
+                    # for depthwise with stride > 1
+                    # note that both x and y stride are set to same (hlslib bug)
+                    hls_call = "ConvolutionInputGenerator_NonSquare_dws"
+                    self.code_gen_dict["$DOCOMPUTE$"] = [
+                        """{}<ConvKernelDim1_x, 1, IFMChannels1,
+                        Input_precision1, IFMDim1_x, 1, OFMDim1_x, 1,
+                        SIMD1, Stride1_x, Stride1_x
+                        > (in0, out, numReps, {});""".format(
+                            hls_call, hls_ram_style
+                        )
+                    ]
+                else:
+                    hls_call = "ConvolutionInputGenerator_1D_dws_lowbuffer"
+                    self.code_gen_dict["$DOCOMPUTE$"] = [
+                        """{}<ConvKernelDim1_x, IFMChannels1,
+                        Input_precision1, IFMDim1_x, OFMDim1_x,
+                        SIMD1> (in0, out, numReps, {});""".format(
+                            hls_call, hls_ram_style
+                        )
+                    ]
        else:
            hls_call = "ConvolutionInputGenerator_1D_lowbuffer"
            self.code_gen_dict["$DOCOMPUTE$"] = [
                """{}<ConvKernelDim1_x, IFMChannels1,
                Input_precision1, IFMDim1_x, OFMDim1_x,
-                SIMD1> (in0, out, numReps, {});""".format(
+                Stride1_x, SIMD1> (in0, out, numReps, {});""".format(
                    hls_call, hls_ram_style
                )
            ]
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index f7cb0a16b9d3893066eaeaae416ea0259ab6a915..6cd555c52c17e86a7d0e0da7040322636c951f7f 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -213,6 +213,10 @@ class InferConvInpGen(Transformation):
                    1D depthwise separable convolutions"""
                        % n.name
                    )
+                if stride_h > 1 or stride_w > 1:
+                    assert (
+                        stride_h < k_h and stride_w < k_w
+                    ), """%s: Stride value must be smaller than kernel dim"""
                ConvInpGen_node = helper.make_node(
                    "ConvolutionInputGenerator1D",
                    [ConvInpGen_input],
diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py
index f97b247dd654a1ab393b0181571c63465053491b..27e1907508f94e6d65ac57d313a9b3e9dd824f5a 100644
--- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py
@@ -156,8 +156,7 @@ def prepare_inputs(input_tensor):
 # input channels
 @pytest.mark.parametrize("ifm_ch", [1, 4])
 # Stride
-# @pytest.mark.parametrize("stride", [[1, 1], [2, 1]])
-@pytest.mark.parametrize("stride", [[1, 1]])
+@pytest.mark.parametrize("stride", [[1, 1], [2, 1]])
 # Dilation
 @pytest.mark.parametrize("dilation", [[1, 1], [2, 1]])
 # execution mode