Commit 5072927e authored by Yaman Umuroglu

Merge branch 'feature/support_1d_swg_simple' of https://github.com/fpjentzsch/finn into fpjentzsch-feature/support_1d_swg_simple
parents 918ba008 a975909c
@@ -127,8 +127,12 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h)
ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w)
assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
wf = int((k_h * k_w * ifm_ch) // simd)
folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd)
if self.use_parallel_window_output():
wf = int((ifm_ch) // simd)
folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd)
else:
wf = int((k_h * k_w * ifm_ch) // simd)
folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd)
return folded_oshape
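For intuition, a minimal sketch (illustrative values, not read from the node attributes) of how the folded output shape differs between the two branches above:

# Sketch: folded output shape, parallel-window vs. default variant (illustrative values).
ifm_ch, simd, k_h, k_w = 16, 16, 1, 4
ofm_dim_h, ofm_dim_w = 1, 61

# Parallel variant: SIMD == IFMChannels, so the fold factor is 1 and all
# k_h * k_w window pixels are packed into one wide output word.
wf_parallel = ifm_ch // simd                                               # 1
shape_parallel = (1, ofm_dim_h, ofm_dim_w, wf_parallel, k_h * k_w * simd)  # (1, 1, 61, 1, 64)

# Default variant: window pixels are serialized over the fold dimension.
wf_default = (k_h * k_w * ifm_ch) // simd                                  # 4
shape_default = (1, ofm_dim_h, ofm_dim_w, wf_default, simd)                # (1, 1, 61, 4, 16)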
def make_shape_compatible_op(self, model):
@@ -156,8 +160,6 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
return DataType[self.get_nodeattr("outputDataType")]
def get_instream_width(self):
"""Returns stream width, input and output stream width are equal for
the sliding window function"""
ibits = self.get_input_datatype().bitwidth()
simd = self.get_nodeattr("SIMD")
ifm_ch = self.get_nodeattr("IFMChannels")
@@ -166,10 +168,13 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
return in_width
def get_outstream_width(self):
"""Returns stream width, input and output stream width are equal for
the sliding window function, so the function to determine the input
stream width can be reused."""
return self.get_instream_width()
if self.use_parallel_window_output():
# feed all window pixels in parallel
k_h, k_w = self.get_nodeattr("ConvKernelDim")
return self.get_instream_width() * k_h * k_w
else:
# if parallel variant not in use: same width for output and input stream
return self.get_instream_width()
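A short sketch of the resulting stream widths in bits, assuming an 8-bit input datatype and the same illustrative SIMD/kernel values as above:

# Sketch: input vs. output stream width in bits (illustrative values).
input_bits, simd, k_h, k_w = 8, 16, 1, 4
in_width = input_bits * simd                 # 128-bit input stream
out_width_parallel = in_width * k_h * k_w    # 512 bits: whole window per cycle
out_width_default = in_width                 # 128 bits: same as the input stream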
def get_number_output_values(self):
folded_oshape = self.get_folded_output_shape()
@@ -205,6 +210,22 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
return (ifm_ch, ifm_dim, ofm_dim, k, stride, dilation)
def use_parallel_window_output(self):
# Check if the simple "ConvolutionInputGenerator_1D_parallel" variant can be used to
# feed the window in parallel to the following layer, enabling full SIMD unfolding.
stride = self.get_nodeattr("Stride")
dilation = self.get_nodeattr("Dilation")
stride_h, stride_w = stride
dilation_h, dilation_w = dilation
if self.get_nodeattr("SIMD") == self.get_nodeattr("IFMChannels"):
if self.get_nodeattr("depthwise") == 0:
if stride_h == 1 and stride_w == 1:
if dilation_h == 1 and dilation_w == 1:
return True
return False
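The nested checks above can be read as a single predicate; a hypothetical stand-alone version (attribute values passed in directly) is sketched below:

# Sketch: conditions under which the parallel-window variant is selected.
def can_use_parallel_window_output(simd, ifm_ch, depthwise, stride, dilation):
    stride_h, stride_w = stride
    dilation_h, dilation_w = dilation
    return (
        simd == ifm_ch                  # full SIMD unfolding along the channel dim
        and depthwise == 0              # only the non-depthwise generator supports it
        and stride_h == 1 and stride_w == 1
        and dilation_h == 1 and dilation_w == 1
    )

assert can_use_parallel_window_output(16, 16, 0, (1, 1), (1, 1))
assert not can_use_parallel_window_output(16, 16, 0, (1, 2), (1, 1))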
def get_exp_cycles(self):
simd = self.get_nodeattr("SIMD")
(
@@ -224,12 +245,15 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
# since mmv != 1 is not supported yet, we set mmv to 1 for now
mmv = 1
# see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h
cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv
cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd)
max_cycles = max(cycles_write_block, cycles_read_block)
exp_cycles = (
ifm_dim_w * k_h * dilation_h * (ifm_ch / simd) + ofm_dim_h * max_cycles
)
if self.use_parallel_window_output():
exp_cycles = k_w + ofm_dim_w
else:
cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv
cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd)
max_cycles = max(cycles_write_block, cycles_read_block)
exp_cycles = (
ifm_dim_w * k_h * dilation_h * (ifm_ch / simd) + ofm_dim_h * max_cycles
)
return int(exp_cycles)
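A worked example of both cycle estimates, using hypothetical 1D parameters (1x4 kernel, 64 input pixels, SIMD == IFMChannels):

# Sketch: expected cycle count for both branches (illustrative parameters).
simd, ifm_ch = 16, 16
ifm_dim_w, ofm_dim_h, ofm_dim_w = 64, 1, 61
k_h, k_w = 1, 4
stride_w, dilation_h, mmv = 1, 1, 1

# Parallel variant: roughly one output window per cycle after a k_w-deep fill.
exp_cycles_parallel = k_w + ofm_dim_w                                  # 65

# Default variant: bounded by the slower of writing window blocks and reading input.
cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv   # 244.0
cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd)             # 64.0
exp_cycles_default = int(
    ifm_dim_w * k_h * dilation_h * (ifm_ch / simd)
    + ofm_dim_h * max(cycles_write_block, cycles_read_block)
)                                                                      # 308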
@@ -522,39 +546,49 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
"ultra": "ap_resource_uram()",
}
hls_ram_style = map_to_hls_ram_style[ram_style]
hls_call = "ConvolutionInputGenerator"
# check which ConvolutionInputGenerator is needed
dilation_h, dilation_w = self.get_nodeattr("Dilation")
hls_call += "_NonSquare"
if dilation_h > 1 or dilation_w > 1:
hls_call += "_Dilated"
if self.get_nodeattr("depthwise") == 1:
hls_call += "_dws"
self.code_gen_dict["$DOCOMPUTE$"] = [
"""{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1, Input_precision1,
IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y, SIMD1, Stride1_x, Stride1_y,
Dilation1_x, Dilation1_y> (in0, out, numReps, {});""".format(
hls_call, hls_ram_style
)
]
elif self.get_nodeattr("depthwise") == 1:
hls_call += "_dws"
# check which ConvolutionInputGenerator is needed
if self.use_parallel_window_output():
hls_call = "ConvolutionInputGenerator_1D_parallel"
self.code_gen_dict["$DOCOMPUTE$"] = [
"""{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1, Input_precision1,
IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y, SIMD1, Stride1_x, Stride1_y>
"""{}<ConvKernelDim1_x, IFMChannels1, Input_precision1,
IFMDim1_x, OFMDim1_x, SIMD1, Stride1_x>
(in0, out, numReps, {});""".format(
hls_call, hls_ram_style
)
]
else:
self.code_gen_dict["$DOCOMPUTE$"] = [
"""{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1, Input_precision1,
IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y, SIMD1, Stride1_x, Stride1_y>
(in0, out, numReps, {});""".format(
hls_call, hls_ram_style
)
]
hls_call = "ConvolutionInputGenerator_NonSquare"
dilation_h, dilation_w = self.get_nodeattr("Dilation")
if dilation_h > 1 or dilation_w > 1:
hls_call += "_Dilated"
if self.get_nodeattr("depthwise") == 1:
hls_call += "_dws"
self.code_gen_dict["$DOCOMPUTE$"] = [
"""{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1,
Input_precision1, IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y,
SIMD1, Stride1_x, Stride1_y, Dilation1_x, Dilation1_y>
(in0, out, numReps, {});""".format(
hls_call, hls_ram_style
)
]
elif self.get_nodeattr("depthwise") == 1:
hls_call += "_dws"
self.code_gen_dict["$DOCOMPUTE$"] = [
"""{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1,
Input_precision1, IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y,
SIMD1, Stride1_x, Stride1_y> (in0, out, numReps, {});""".format(
hls_call, hls_ram_style
)
]
else:
self.code_gen_dict["$DOCOMPUTE$"] = [
"""{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1,
Input_precision1, IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y,
SIMD1, Stride1_x, Stride1_y> (in0, out, numReps, {});""".format(
hls_call, hls_ram_style
)
]
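The template selection above can be summarized by a small helper; the sketch below is a simplified restatement of that decision logic, not part of the generated code:

# Sketch: which finn-hlslib generator template $DOCOMPUTE$ instantiates.
def select_hls_call(parallel_window, dilation, depthwise):
    if parallel_window:
        return "ConvolutionInputGenerator_1D_parallel"
    name = "ConvolutionInputGenerator_NonSquare"
    if max(dilation) > 1:
        name += "_Dilated"
    if depthwise:
        name += "_dws"
    return name

assert select_hls_call(True, (1, 1), 0) == "ConvolutionInputGenerator_1D_parallel"
assert select_hls_call(False, (1, 2), 1) == "ConvolutionInputGenerator_NonSquare_Dilated_dws"
assert select_hls_call(False, (1, 1), 1) == "ConvolutionInputGenerator_NonSquare_dws"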
def dataoutstrm(self):
code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
@@ -570,9 +604,16 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
npy_out = "%s/output.npy" % code_gen_dir
oshape = self.get_folded_output_shape()
oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
if self.use_parallel_window_output():
# pass the number of pixels in the folded output to apintstream2npy, needed
# to unpack the output correctly and reverse only the inner SIMD dimension
k_h, k_w = self.get_nodeattr("ConvKernelDim")
multi_pixel_out = k_h * k_w
else:
multi_pixel_out = 1
self.code_gen_dict["$DATAOUTSTREAM$"] = [
'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");'
'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", true, 1, %d);'
% (
packed_hls_type,
elem_hls_type,
@@ -580,6 +621,7 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
npy_type,
oshape_cpp_str,
npy_out,
multi_pixel_out,
)
]
@@ -587,12 +629,21 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
self.code_gen_dict["$SAVEASCNPY$"] = []
def blackboxfunction(self):
self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
"""void {}(hls::stream<ap_uint<SIMD1*Input_precision1>> &in0,
hls::stream<ap_uint<SIMD1*Input_precision1>> &out)""".format(
self.onnx_node.name
)
]
if self.use_parallel_window_output():
self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
"""void {}(hls::stream<ap_uint<SIMD1*Input_precision1>> &in0,
hls::stream<ap_uint<ConvKernelDim1_x*SIMD1*Input_precision1>>
&out)""".format(
self.onnx_node.name
)
]
else:
self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
"""void {}(hls::stream<ap_uint<SIMD1*Input_precision1>> &in0,
hls::stream<ap_uint<SIMD1*Input_precision1>> &out)""".format(
self.onnx_node.name
)
]
def pragmas(self):
self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
@@ -45,29 +45,34 @@ void npy2apintstream(const char * npy_path, hls::stream<PackedT> & out_stream, b
}
template <typename PackedT, typename ElemT, int ElemBits, typename NpyT>
void apintstream2npy(hls::stream<PackedT> & in_stream, const std::vector<size_t> & shape, const char * npy_path, bool reverse_inner = true, size_t numReps = 1) {
void apintstream2npy(hls::stream<PackedT> & in_stream, const std::vector<size_t> & shape, const char * npy_path, bool reverse_inner = true, size_t numReps = 1, size_t multi_pixel_out = 1) {
for(size_t rep = 0; rep < numReps; rep++) {
std::vector<NpyT> data_to_save;
size_t outer_dim_elems = 1;
for(size_t dim = 0; dim < shape.size()-1; dim++) {
outer_dim_elems *= shape[dim];
}
size_t inner_dim_elems = shape[shape.size()-1];
DEBUG_APINTSTREAM2NPY("n_outer " << outer_dim_elems << " n_inner " << inner_dim_elems)
size_t inner_dim_elems = shape[shape.size()-1] / multi_pixel_out;
DEBUG_APINTSTREAM2NPY("n_outer " << outer_dim_elems << " n_inner " << inner_dim_elems << " n_multi_pixel_out " << multi_pixel_out)
for(size_t outer_elem = 0; outer_elem < outer_dim_elems; outer_elem++) {
PackedT packed_elem;
in_stream >> packed_elem;
DEBUG_APINTSTREAM2NPY("packed hls elem " << std::hex << packed_elem << std::dec)
for(size_t ii = 0; ii < inner_dim_elems; ii++) {
size_t i = reverse_inner ? inner_dim_elems-ii-1 : ii;
ap_uint<ElemBits> tmp_elem = packed_elem((i+1)*ElemBits-1, i*ElemBits);
// important: don't init elem = reinterpret_cast.. directly here
// this causes weird behavior for conversion to NpyT afterwards
ElemT elem;
elem = reinterpret_cast<ElemT&>(tmp_elem);
NpyT npyt = (NpyT) elem;
DEBUG_APINTSTREAM2NPY("elem " << elem << " NpyT " << npyt)
data_to_save.push_back(npyt);
for(size_t ii_multi_pixel_out = 0; ii_multi_pixel_out < multi_pixel_out; ii_multi_pixel_out++) {
// loop over multi_pixel_out blocks of inner_dim_elems separately,
// so that reverse_inner is not applied across multiple pixels
for(size_t ii = 0; ii < inner_dim_elems; ii++) {
size_t i = ii_multi_pixel_out*inner_dim_elems;
i += reverse_inner ? inner_dim_elems-ii-1 : ii;
ap_uint<ElemBits> tmp_elem = packed_elem((i+1)*ElemBits-1, i*ElemBits);
// important: don't init elem = reinterpret_cast.. directly here
// this causes weird behavior for conversion to NpyT afterwards
ElemT elem;
elem = reinterpret_cast<ElemT&>(tmp_elem);
NpyT npyt = (NpyT) elem;
DEBUG_APINTSTREAM2NPY("elem " << elem << " NpyT " << npyt)
data_to_save.push_back(npyt);
}
}
}
cnpy::npy_save(npy_path, &data_to_save[0], shape, "w");
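To illustrate the new unpacking order (in Python rather than C++, for brevity): with multi_pixel_out > 1, reverse_inner is applied within each pixel's SIMD block, not across the whole packed word. The helper name below is hypothetical.

# Sketch: element index order produced by the loops above.
def unpack_indices(inner_dim_elems, multi_pixel_out, reverse_inner=True):
    order = []
    for block in range(multi_pixel_out):
        for ii in range(inner_dim_elems):
            i = block * inner_dim_elems
            i += inner_dim_elems - ii - 1 if reverse_inner else ii
            order.append(i)
    return order

# Two window pixels of four SIMD elements each: each block is reversed
# independently, never across the pixel boundary.
assert unpack_indices(4, 2) == [3, 2, 1, 0, 7, 6, 5, 4]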