From 048faaf6603e2b51e8d2dd8c10da834de89aed49 Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Thu, 3 Feb 2022 19:53:17 +0000
Subject: [PATCH] [custom_op]: support for optimized 1D dws SWU

---
 .../convolutioninputgenerator1d.py            | 210 +++++++++---------
 .../fpgadataflow/convert_to_hls_layers.py     |  10 -
 2 files changed, 111 insertions(+), 109 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
index 97ec89daa..7e084fb9f 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
@@ -183,18 +183,36 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         num_output_elems = np.prod(folded_oshape[:-1])
         return num_output_elems
 
+    def get_swu_variant(self):
+        # checks which variant of the 1D ConvolutionInputGenerator (SWU) can be used
+        # We have 5 variants: ConvolutionInputGenerator_1D_parallel,
+        # ConvolutionInputGenerator_1D_dws_naive, ConvolutionInputGenerator_1D,
+        # ConvolutionInputGenerator_1D_dws, ConvolutionInputGenerator_1D_dws_stride
+        is_dws = self.get_nodeattr("depthwise")
+        is_strided = np.prod(self.get_nodeattr("Stride")) > 1
+        is_stride_2 = np.prod(self.get_nodeattr("Stride")) == 2
+        is_dilated = np.prod(self.get_nodeattr("Dilation")) > 1
+        if self.use_parallel_window_output():
+            return "ConvolutionInputGenerator_1D_parallel"
+        if not is_dws:
+            return "ConvolutionInputGenerator_1D"
+        if is_dws:
+            if (is_strided and not is_stride_2) or (is_dilated):
+                return "ConvolutionInputGenerator_1D_dws_naive"
+            elif is_stride_2:
+                return "ConvolutionInputGenerator_1D_dws_stride"
+            else:
+                return "ConvolutionInputGenerator_1D_dws"
+
     def get_1d_conv_attrs_normalized(self):
         # support both (1, D) and (D, 1) cases transparently:
         # For the kernel, presenting the input data of size D as
         # [H, W] = [Y, X] = [1, D] or [D, 1]
-        # effectively gives the same result. Because the
-        # ConvolutionInputGenerator_NonSquare_Dilated(_dws) kernel currently only
-        # supports dilation>1 along the X-axis and the
-        # ConvolutionInputGenerator_NonSquare only works for stride>1 along the
-        # X-axis, we are working with the following assumption:
-        # the dummy ('1') dimension is the Y-dimension, i.e.
-        # images and kernels (and their attributes) of dimension
-        # [H, W] = [Y, X] = [D, 1] or [1, D] are always mapped to [1, D]
+        # effectively gives the same result.
+        # For consistency and ease of programming, this function
+        # returns the attributes of the layer as follows:
+        # [H, W] = [Y, X] = [1, D] or [D, 1] are always mapped to [1, D].
+        # The dummy ('1') dimension is the Y-dimension.
         ifm_ch = self.get_nodeattr("IFMChannels")
         k = self.get_nodeattr("ConvKernelDim")
         ifm_dim = self.get_nodeattr("IFMDim")
@@ -264,29 +282,27 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         dilation_h, dilation_w = dilation
 
         # since mmv != 1 is not supported yet, we set mmv for now to 1
-        mmv = 1
+        # mmv = 1
         # see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h
-        if self.use_parallel_window_output():
+        swu_variant = self.get_swu_variant()
+        if swu_variant == "ConvolutionInputGenerator_1D_parallel":
             exp_cycles = k_w + ofm_dim_w
-        elif dilation_h > 1 or dilation_w > 1:
+        elif swu_variant == "ConvolutionInputGenerator_1D":
+            exp_cycles = 1 + ofm_dim_w * k_w * ifm_ch / simd
+        elif swu_variant in [
+            "ConvolutionInputGenerator_1D_dws",
+            "ConvolutionInputGenerator_1D_dws_stride",
+        ]:
+            exp_cycles = (
+                1
+                + ofm_dim_w * k_w * ifm_ch / simd
+                + (ifm_ch / simd) * (k_w - 1)
+                - (k_w - 1)
+            )
+        elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive":
             cycles_read_block = ifm_dim_w * ifm_ch / simd
             cycles_write_block = ofm_dim_w * k_w * ifm_ch / simd
             exp_cycles = cycles_read_block + cycles_write_block
-        elif self.get_nodeattr("depthwise") == 1:
-            if stride_h > 1 or stride_w > 1:
-                cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv
-                cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd)
-                max_cycles = max(cycles_write_block, cycles_read_block)
-                exp_cycles = (
-                    ifm_dim_w * k_h * dilation_h * (ifm_ch / simd)
-                    + ofm_dim_h * max_cycles
-                )
-            else:
-                cycles_read_block = ifm_ch / simd * (k_w - 1) - (k_w - 1)
-                cycles_write_block = ofm_dim_w * k_w * ifm_ch / simd
-                exp_cycles = cycles_read_block + cycles_write_block
-        else:
-            exp_cycles = 1 + ofm_dim_w * k_w * ifm_ch / simd
 
         return int(exp_cycles)
 
@@ -480,15 +496,14 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         ofm_dim_y, ofm_dim_x = ofm_dim
         k_y, k_x = k
         dilation_y, dilation_x = dilation
-        # For a 1d convolution with stride=[S,1] or [1,S], the finn-hlslib function
-        # of ConvInpGen must be created with [stride_y, stride_x] = [S, S].
-        # TODO: changes in finn-hlslib (slidingwindow.h)
-        stride_x = np.prod(stride)
+        stride_y, stride_x = stride
+        swu_variant = self.get_swu_variant()
 
-        if dilation_x > 1:
-            assert (
-                dilation_y == 1
-            ), "Dilation value greater than 1 along y-axis is not yet supported"
+        if swu_variant in [
+            "ConvolutionInputGenerator_1D_parallel",
+            "ConvolutionInputGenerator_1D",
+            "ConvolutionInputGenerator_1D_dws_stride",
+        ]:
             self.code_gen_dict["$DEFINES$"] = [
                 """
             #define ConvKernelDim1_x {}\n
@@ -496,9 +511,8 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
             #define Input_precision1 {}\n
             #define IFMDim1_x {}\n
             #define OFMDim1_x {}\n
-            #define SIMD1 {}\n
             #define Stride1_x {}\n
-            #define Dilation1_x {}\n
+            #define SIMD1 {}\n
             #define numReps {}
             """.format(
                     k_x,
@@ -506,14 +520,12 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
                     ifm_precision,
                     ifm_dim_x,
                     ofm_dim_x,
-                    simd,
                     stride_x,
-                    dilation_x,
+                    simd,
                     numReps,
                 )
             ]
-        else:
-            ofm_dim = self.get_nodeattr("OFMDim")
+        if swu_variant == "ConvolutionInputGenerator_1D_dws":
             self.code_gen_dict["$DEFINES$"] = [
                 """
             #define ConvKernelDim1_x {}\n
@@ -522,7 +534,6 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
             #define IFMDim1_x {}\n
             #define OFMDim1_x {}\n
             #define SIMD1 {}\n
-            #define Stride1_x {}\n
             #define numReps {}
             """.format(
                     k_x,
@@ -531,7 +542,30 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
                     ifm_dim_x,
                     ofm_dim_x,
                     simd,
+                    numReps,
+                )
+            ]
+        if swu_variant == "ConvolutionInputGenerator_1D_dws_naive":
+            self.code_gen_dict["$DEFINES$"] = [
+                """
+            #define ConvKernelDim1_x {}\n
+            #define IFMChannels1 {}\n
+            #define Input_precision1 {}\n
+            #define IFMDim1_x {}\n
+            #define OFMDim1_x {}\n
+            #define Stride1_x {}\n
+            #define Dilation1_x {}\n
+            #define SIMD1 {}\n
+            #define numReps {}
+            """.format(
+                    k_x,
+                    ifm_ch,
+                    ifm_precision,
+                    ifm_dim_x,
+                    ofm_dim_x,
                     stride_x,
+                    dilation_x,
+                    simd,
                     numReps,
                 )
             ]
@@ -572,71 +606,49 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
             "ultra": "ap_resource_uram()",
         }
         hls_ram_style = map_to_hls_ram_style[ram_style]
-        (
-            ifm_ch,
-            ifm_dim,
-            ofm_dim,
-            k,
-            stride,
-            dilation,
-        ) = self.get_1d_conv_attrs_normalized()
-        stride_x = np.prod(stride)
+        swu_variant = self.get_swu_variant()
 
         # check which ConvolutionInputGenerator is needed
-        if self.use_parallel_window_output():
-            hls_call = "ConvolutionInputGenerator_1D_parallel"
+        if swu_variant == "ConvolutionInputGenerator_1D_parallel":
             self.code_gen_dict["$DOCOMPUTE$"] = [
                 """{}<ConvKernelDim1_x, IFMChannels1, Input_precision1,
-                IFMDim1_x, OFMDim1_x, SIMD1, Stride1_x>
+                IFMDim1_x, OFMDim1_x, Stride1_x, SIMD1>
                 (in0, out, numReps, {});""".format(
-                    hls_call, hls_ram_style
+                    swu_variant, hls_ram_style
+                )
+            ]
+        if swu_variant == "ConvolutionInputGenerator_1D":
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """{}<ConvKernelDim1_x, IFMChannels1, Input_precision1,
+                IFMDim1_x, OFMDim1_x, Stride1_x, SIMD1>
+                (in0, out, numReps, {});""".format(
+                    swu_variant, hls_ram_style
+                )
+            ]
+        if swu_variant == "ConvolutionInputGenerator_1D_dws":
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """{}<ConvKernelDim1_x, IFMChannels1, Input_precision1,
+                IFMDim1_x, OFMDim1_x, SIMD1>
+                (in0, out, numReps, {});""".format(
+                    swu_variant, hls_ram_style
+                )
+            ]
+        if swu_variant == "ConvolutionInputGenerator_1D_dws_stride":
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """{}<ConvKernelDim1_x, IFMChannels1, Input_precision1,
+                IFMDim1_x, OFMDim1_x, Stride1_x, SIMD1>
+                (in0, out, numReps, {});""".format(
+                    swu_variant, hls_ram_style
+                )
+            ]
+        if swu_variant == "ConvolutionInputGenerator_1D_dws_naive":
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """{}<ConvKernelDim1_x, IFMChannels1, Input_precision1,
+                IFMDim1_x, OFMDim1_x, Stride1_x, Dilation1_x, SIMD1>
+                (in0, out, numReps, {});""".format(
+                    swu_variant, hls_ram_style
                 )
             ]
-        else:
-            dilation_h, dilation_w = self.get_nodeattr("Dilation")
-            if self.get_nodeattr("depthwise") == 1:
-                if dilation_h > 1 or dilation_w > 1:
-                    hls_call = "ConvolutionInputGenerator_1D_dilated_dws"
-                    self.code_gen_dict["$DOCOMPUTE$"] = [
-                        """{}<ConvKernelDim1_x, IFMChannels1,
-                        Input_precision1, IFMDim1_x, OFMDim1_x,
-                        SIMD1, Stride1_x, Dilation1_x>
-                        (in0, out, numReps, {});""".format(
-                            hls_call, hls_ram_style
-                        )
-                    ]
-                else:
-                    if stride_x > 1:
-                        # temporarily use old ConvolutionInputGenerator_NonSquare_dws
-                        # for depthwise with stride > 1
-                        # note that both x and y stride are set to same (hlslib bug)
-                        hls_call = "ConvolutionInputGenerator_NonSquare_dws"
-                        self.code_gen_dict["$DOCOMPUTE$"] = [
-                            """{}<ConvKernelDim1_x, 1, IFMChannels1,
-                            Input_precision1, IFMDim1_x, 1, OFMDim1_x, 1,
-                            SIMD1, Stride1_x, Stride1_x
-                            > (in0, out, numReps, {});""".format(
-                                hls_call, hls_ram_style
-                            )
-                        ]
-                    else:
-                        hls_call = "ConvolutionInputGenerator_1D_dws_lowbuffer"
-                        self.code_gen_dict["$DOCOMPUTE$"] = [
-                            """{}<ConvKernelDim1_x, IFMChannels1,
-                            Input_precision1, IFMDim1_x, OFMDim1_x,
-                            SIMD1> (in0, out, numReps, {});""".format(
-                                hls_call, hls_ram_style
-                            )
-                        ]
-            else:
-                hls_call = "ConvolutionInputGenerator_1D_lowbuffer"
-                self.code_gen_dict["$DOCOMPUTE$"] = [
-                    """{}<ConvKernelDim1_x, IFMChannels1,
-                    Input_precision1, IFMDim1_x, OFMDim1_x,
-                    Stride1_x, SIMD1> (in0, out, numReps, {});""".format(
-                        hls_call, hls_ram_style
-                    )
-                ]
 
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index 2f83f23cb..c16bd00eb 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -203,21 +203,11 @@ class InferConvInpGen(Transformation):
                             % n.name
                         )
                         if dilation_h > 1 or dilation_w > 1:
-                            assert stride_h == 1 and stride_w == 1, (
-                                """%s: Stride value of greater than 1 is not supported for convolutions
-                                with dilation value greater than 1"""
-                                % n.name
-                            )
                             assert depthwise == 1, (
                                 """%s: Dilation value > 1 is only supported for
                                 1D depthwise separable convolutions"""
                                 % n.name
                             )
-                        if (stride_h > 1 or stride_w > 1) and (not depthwise):
-                            assert (
-                                stride_h < k_h and stride_w < k_w
-                            ), """%s: Stride value must be smaller than kernel dim
-                            for non-depthwise (dense) convolutions"""
                         ConvInpGen_node = helper.make_node(
                             "ConvolutionInputGenerator1D",
                             [ConvInpGen_input],
-- 
GitLab