Merge pull request #506 from mmrahorovic/feature/vitis-swu-1d

Support for 1D SWU optimized variant

Merge pull request #506 from mmrahorovic/feature/vitis-swu-1d
Support for 1D SWU optimized variant
e7ca8464 · Yaman Umuroglu · GitHub · f9d6e7ea · 2bcc816f · e7ca8464
Unverified commit e7ca8464, authored 3 years ago by Yaman Umuroglu; committed by GitHub 3 years ago.
--- a/docker/Dockerfile.finn
+++ b/docker/Dockerfile.finn
@@ -97,7 +97,7 @@ ARG FINN_EXP_COMMIT="af6102769226b82b639f243dc36f065340991513"
 ARG BREVITAS_COMMIT="a5b71d6de1389d3e7db898fef72e014842670f03"
 ARG PYVERILATOR_COMMIT="0c3eb9343500fc1352a02c020a736c8c2db47e8e"
 ARG CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4"
-ARG HLSLIB_COMMIT="f1eecde2d894a6d8971555924c0df147dc5ba033"
+ARG HLSLIB_COMMIT="da7b47cd65a967b76554a0dda74c097803c5e550"
 ARG OMX_COMMIT="1dfc4aa2f2895632742cd5751520c6b472feb74e"
 ARG AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b"

--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
@@ -264,7 +264,7 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
        dilation_h, dilation_w = dilation
        # since mmv != 1 is not supported yet, we set mmv for now to 1
-        # mmv = 1
+        mmv = 1
        # see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h
        if self.use_parallel_window_output():
            exp_cycles = k_w + ofm_dim_w
@@ -272,10 +272,21 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
            cycles_read_block = ifm_dim_w * ifm_ch / simd
            cycles_write_block = ofm_dim_w * k_w * ifm_ch / simd
            exp_cycles = cycles_read_block + cycles_write_block
+        elif self.get_nodeattr("depthwise") == 1:
+            if stride_h > 1 or stride_w > 1:
+                cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv
+                cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd)
+                max_cycles = max(cycles_write_block, cycles_read_block)
+                exp_cycles = (
+                    ifm_dim_w * k_h * dilation_h * (ifm_ch / simd)
+                    + ofm_dim_h * max_cycles
+                )
+            else:
+                cycles_read_block = ifm_ch / simd * (k_w - 1) - (k_w - 1)
+                cycles_write_block = ofm_dim_w * k_w * ifm_ch / simd
+                exp_cycles = cycles_read_block + cycles_write_block
        else:
-            cycles_read_block = ifm_ch / simd * (k_w - 1) - (k_w - 1)
+            exp_cycles = 1 + ofm_dim_w * k_w * ifm_ch / simd
-            cycles_write_block = ofm_dim_w * k_w * ifm_ch / simd
-            exp_cycles = cycles_read_block + cycles_write_block
        return int(exp_cycles)
@@ -561,6 +572,15 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
            "ultra": "ap_resource_uram()",
        }
        hls_ram_style = map_to_hls_ram_style[ram_style]
+        (
+            ifm_ch,
+            ifm_dim,
+            ofm_dim,
+            k,
+            stride,
+            dilation,
+        ) = self.get_1d_conv_attrs_normalized()
+        stride_x = np.prod(stride)
        # check which ConvolutionInputGenerator is needed
        if self.use_parallel_window_output():
@@ -586,20 +606,34 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
                        )
                    ]
                else:
-                    hls_call = "ConvolutionInputGenerator_1D_dws_lowbuffer"
+                    if stride_x > 1:
-                    self.code_gen_dict["$DOCOMPUTE$"] = [
+                        # temporarily use old ConvolutionInputGenerator_NonSquare_dws
-                        """{}<ConvKernelDim1_x, IFMChannels1,
+                        # for depthwise with stride > 1
-                        Input_precision1, IFMDim1_x, OFMDim1_x,
+                        # note that both x and y stride are set to same (hlslib bug)
-                        SIMD1> (in0, out, numReps, {});""".format(
+                        hls_call = "ConvolutionInputGenerator_NonSquare_dws"
-                            hls_call, hls_ram_style
+                        self.code_gen_dict["$DOCOMPUTE$"] = [
-                        )
+                            """{}<ConvKernelDim1_x, 1, IFMChannels1,
-                    ]
+                            Input_precision1, IFMDim1_x, 1, OFMDim1_x, 1,
+                            SIMD1, Stride1_x, Stride1_x
+                            > (in0, out, numReps, {});""".format(
+                                hls_call, hls_ram_style
+                            )
+                        ]
+                    else:
+                        hls_call = "ConvolutionInputGenerator_1D_dws_lowbuffer"
+                        self.code_gen_dict["$DOCOMPUTE$"] = [
+                            """{}<ConvKernelDim1_x, IFMChannels1,
+                            Input_precision1, IFMDim1_x, OFMDim1_x,
+                            SIMD1> (in0, out, numReps, {});""".format(
+                                hls_call, hls_ram_style
+                            )
+                        ]
            else:
                hls_call = "ConvolutionInputGenerator_1D_lowbuffer"
                self.code_gen_dict["$DOCOMPUTE$"] = [
                    """{}<ConvKernelDim1_x, IFMChannels1,
                    Input_precision1, IFMDim1_x, OFMDim1_x,
-                    SIMD1> (in0, out, numReps, {});""".format(
+                    Stride1_x, SIMD1> (in0, out, numReps, {});""".format(
                        hls_call, hls_ram_style
                    )
                ]

--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -213,6 +213,10 @@ class InferConvInpGen(Transformation):
                                1D depthwise separable convolutions"""
                                % n.name
                            )
+                        if stride_h > 1 or stride_w > 1:
+                            assert (
+                                stride_h < k_h and stride_w < k_w
+                            ), """%s: Stride value must be smaller than kernel dim"""
                        ConvInpGen_node = helper.make_node(
                            "ConvolutionInputGenerator1D",
                            [ConvInpGen_input],

--- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py
@@ -156,8 +156,7 @@ def prepare_inputs(input_tensor):
 # input channels
 @pytest.mark.parametrize("ifm_ch", [1, 4])
 # Stride
-# @pytest.mark.parametrize("stride", [[1, 1], [2, 1]])
+@pytest.mark.parametrize("stride", [[1, 1], [2, 1]])
-@pytest.mark.parametrize("stride", [[1, 1]])
 # Dilation
 @pytest.mark.parametrize("dilation", [[1, 1], [2, 1]])
 # execution mode