Commit 748049db authored by Felix Jentzsch

Basic MMV in/out implementation for a special 1D case

parent 888d69c3
@@ -8,8 +8,9 @@
`timescale 1 ns / 1 ps
module $TOP_MODULE_NAME$_wb
#(
parameter IN_WIDTH = 1, //c*bit-width
parameter OUT_WIDTH = 1, //c*bit-width*MMV_out
parameter IN_WIDTH = 1, //bit-width*C*MMV_in
parameter OUT_ELEM_WIDTH = 1, //bit-width*C
parameter OUT_WIDTH = 1, //bit-width*C*MMV_out
parameter BUFFER_ELEM_TOTAL = 1
)
(
@@ -65,10 +66,11 @@ module $TOP_MODULE_NAME$ (
//parameters
parameter BIT_WIDTH = $BIT_WIDTH$;
parameter SIMD = $SIMD$; //assuming SIMD=C for now
parameter MMV_IN = $MMV_IN$; //assuming MMV_IN=1 for now
parameter MMV_OUT = $MMV_OUT$; //assuming MMV_OUT=K for now
parameter BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN; //c*bit-width
parameter BUF_OUT_WIDTH = BUF_IN_WIDTH * MMV_OUT; //c*bit-width*MMV_out
parameter MMV_IN = $MMV_IN$; //assuming MMV_IN=1*M for now
parameter MMV_OUT = $MMV_OUT$; //assuming MMV_OUT=K*M for now
parameter BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN; //bit-width*C*MMV_in
parameter BUF_OUT_ELEM_WIDTH = BIT_WIDTH * SIMD; //bit-width*C
parameter BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT; //bit-width*C*MMV_out
parameter CYCLES_TOTAL = $CYCLES_TOTAL$;
parameter BUF_ELEM_TOTAL = $BUF_ELEM_TOTAL$;
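
As a quick width check (with a hypothetical BIT_WIDTH of 4, SIMD = 2 and M = 2, so MMV_IN = 2 and, for k = [3, 1], MMV_OUT = 3*2 = 6): BUF_IN_WIDTH = 4*2*2 = 16, BUF_OUT_ELEM_WIDTH = 4*2 = 8 and BUF_OUT_WIDTH = 4*2*6 = 48 bits.
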
@@ -92,6 +94,7 @@ wire window_buffer_shift_enable;
$TOP_MODULE_NAME$_wb
#(
.IN_WIDTH(BUF_IN_WIDTH),
.OUT_ELEM_WIDTH(BUF_OUT_ELEM_WIDTH),
.OUT_WIDTH(BUF_OUT_WIDTH),
.BUFFER_ELEM_TOTAL(BUF_ELEM_TOTAL)
)
@@ -79,6 +79,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
"IFMDim": ("ints", True, []), # [H, W] = [Y, X]
"OFMDim": ("ints", True, []), # [H, W] = [Y, X]
"SIMD": ("i", True, 0),
"M": ("i", True, 1),
"Stride": ("ints", True, []), # [H, W] = [Y, X]
"Dilation": ("ints", True, []), # [H, W] = [Y, X]
# FINN DataTypes for inputs, weights, outputs
@@ -111,9 +112,15 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
ifm_ch = self.get_nodeattr("IFMChannels")
simd = self.get_nodeattr("SIMD")
M = self.get_nodeattr("M")
assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
wf = int(ifm_ch / simd)
folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd)
#folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd)
#round up to support ifm_dim % M != 0
if ifm_dim_w == 1:
folded_ishape = (1, math.ceil(ifm_dim_h/M), ifm_dim_w, wf, int(simd*M))
else:
folded_ishape = (1, ifm_dim_h, math.ceil(ifm_dim_w/M), wf, int(simd*M))
return folded_ishape
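
For reference, a worked sketch of this folding, assuming the 1D test configuration added at the bottom of this diff (IFMDim=[10,1], IFMChannels=SIMD=2):

import math
ifm_dim_h, ifm_dim_w, ifm_ch, simd = 10, 1, 2, 2
wf = ifm_ch // simd
for M in (1, 2, 4):
    # width is 1, so the fold (and the round-up) happens along the height dimension
    print(M, (1, math.ceil(ifm_dim_h / M), ifm_dim_w, wf, simd * M))
# M=1 -> (1, 10, 1, 1, 2); M=2 -> (1, 5, 1, 1, 4); M=4 -> (1, 3, 1, 1, 8)
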
def get_normal_output_shape(self):
@@ -135,13 +142,18 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
stride_h, stride_w = self.get_nodeattr("Stride")
dilation_h, dilation_w = self.get_nodeattr("Dilation")
simd = self.get_nodeattr("SIMD")
M = self.get_nodeattr("M")
pad = 0
ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h)
ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w)
assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
if self.use_parallel_window_output():
wf = int((ifm_ch) // simd)
folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd)
#folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd)
if ofm_dim_w == 1:
folded_oshape = (1, int(ofm_dim_h/M), ofm_dim_w, wf, k_h * k_w * int(simd*M))
else:
folded_oshape = (1, ofm_dim_h, int(ofm_dim_w/M), wf, k_h * k_w * int(simd*M))
else:
wf = int((k_h * k_w * ifm_ch) // simd)
folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd)
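
Worked example for the parallel-window branch, using the test parameters below: k=[3,1] on IFMDim=[10,1] (stride 1, dilation 1) gives OFMDim=[8,1], so wf=1 and, since ofm_dim_w == 1, folded_oshape = (1, 8/M, 1, 1, 3*1*2*M), i.e. (1, 8, 1, 1, 6) for M=1, (1, 4, 1, 1, 12) for M=2 and (1, 2, 1, 1, 24) for M=4.
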
@@ -175,8 +187,9 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
ibits = self.get_input_datatype().bitwidth()
simd = self.get_nodeattr("SIMD")
ifm_ch = self.get_nodeattr("IFMChannels")
M = self.get_nodeattr("M")
assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
in_width = simd * ibits
in_width = simd * ibits * M
return in_width
def get_outstream_width(self):
@@ -377,6 +390,15 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
export_idt = DataType["BINARY"]
else:
export_idt = self.get_input_datatype()
# pad test input stream to work when IFMdim % M != 0
# during normal operation the AXI Stream should not care: in the last cycle, garbage elements are read but not used
# ToDo: only works for 1D case
mmv_stream_padding_px = int((np.prod(folded_ishape) - np.prod(exp_ishape)) / exp_ishape[-1])
if exp_ishape[2] == 1:
inp = np.pad(inp, ((0,0),(0,mmv_stream_padding_px),(0,0),(0,0)), 'constant')
else:
inp = np.pad(inp, ((0,0),(0,0),(0,mmv_stream_padding_px),(0,0)), 'constant')
# reshape input into folded form
inp = inp.reshape(folded_ishape)
# make copy before saving array
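
A minimal numeric sketch of this padding, assuming the M=4 case of the test below (normal input shape (1, 10, 1, 2), folded shape (1, 3, 1, 1, 8)):

import numpy as np
exp_ishape, folded_ishape = (1, 10, 1, 2), (1, 3, 1, 1, 8)
pad_px = int((np.prod(folded_ishape) - np.prod(exp_ishape)) / exp_ishape[-1])   # 2 extra pixels
inp = np.zeros(exp_ishape)
inp = np.pad(inp, ((0, 0), (0, pad_px), (0, 0), (0, 0)), 'constant')             # W == 1 -> pad along H
print(pad_px, inp.reshape(folded_ishape).shape)                                  # 2 (1, 3, 1, 1, 8)
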
@@ -460,22 +482,23 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
n = 1
h, w = ifm_dim
c = 1#ifm_ch not considered atm (always parallelize across c)
c = 1 # ifm_ch not considered atm (always parallelize across c)
k_h, k_w = k
pad = [0,0,0,0]
pad = [0,0,0,0] # padding happens in separate padding node
pad_val = 0
stride_h, stride_w = stride
dilation_h, dilation_w = dilation
conv_c = 99
# init folding config
M = self.get_nodeattr("M")
simd = self.get_nodeattr("SIMD")
mmv_in = 1
mmv_out = k_h*k_w
mmv_in = 1*M
mmv_out = k_h*k_w*M
assert simd==ifm_ch, "Constraint violated: SIMD = C"
assert mmv_in==1, "Constraint violated: MMV_IN = 1"
assert mmv_out==k_h*k_w, "Constraint violated: mmv_out = K"
assert mmv_in==1*M, "Constraint violated: MMV_IN = 1*M"
assert mmv_out==k_h*k_w*M, "Constraint violated: MMV_OUT = K*M"
# how many "unused" registers are allowed between buffer positions that will be accessed in parallel
# example:
@@ -552,7 +575,18 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
f_debug.write("\n"+str(idx_w))
idx_px = idx_h*w+idx_w
f_debug.write("\n"+"sequential pixel indices")
f_debug.write("\n"+"sequential pixel indices (shape %s" % str(idx_px.shape))
f_debug.write("\n"+str(idx_px))
output_elem, output_cycles = idx_px.shape
# ToDo: what happens when output_cycles (= OFMdim) % M != 0?
# ...try to support IFMdim % M != 0 first, so we can work with the usual k=3 where OFMdim = IFMdim - 2
# the additional garbage input elements that are read in the last cycle are not read by any window anyway
idx_px = idx_px.transpose()
idx_px = idx_px.reshape((int(output_cycles/M), int(output_elem*M)))
idx_px = idx_px.transpose()
f_debug.write("\n"+"sequential pixel indices, MMV_out grouping (shape %s" % str(idx_px.shape))
f_debug.write("\n"+str(idx_px))
buffer = []
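
To illustrate the regrouping a few lines above for the 1D test case (k=[3,1], IFMDim=[10,1], stride 1, no padding), here is a sketch with hand-built pixel indices (the real idx_px comes from the im2col step not shown in this hunk):

import numpy as np
idx_px = np.array([[i, i + 1, i + 2] for i in range(8)]).transpose()   # shape (3, 8): one window per column
M = 2
output_elem, output_cycles = idx_px.shape
idx_px = idx_px.transpose().reshape((output_cycles // M, output_elem * M)).transpose()
print(idx_px.shape)    # (6, 4): each cycle now serves M=2 windows, i.e. 6 pixel reads in parallel
print(idx_px[:, 0])    # [0 1 2 1 2 3] -> windows 0 and 1 are grouped into the first output cycle
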
@@ -565,23 +599,29 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
idx_px_relative = idx_px.copy()
# compute schedule and buffer read pattern
Y, X = idx_px_relative.shape
for x in range(X):
output_elem, output_cycles = idx_px_relative.shape
for x in range(output_cycles):
# load missing inputs into buffer
for y in range(Y):
for y in range(output_elem):
while int(idx_px_relative[y,x]) not in buffer:
buffer.append(next_in_px)
next_in_px += 1
# load M inputs at once (keep "buffer" list 1D for now, handle actual 2D buffer generation later)
for m in range(M):
buffer.append(next_in_px)
next_in_px += 1
schedule_write.append(1)
schedule_read.append(0)
# discard unused buffer elements (assumes in-order access)
oldest_px = min(idx_px_relative[:,x])
while buffer[0] < oldest_px:
buffer.pop(0)
#while buffer[0] < oldest_px:
#check whether M elements can be shifted out, not just the single oldest one
while all([buffer[i] < oldest_px for i in range(M)]):
# M buffer elements are shifted out at once
for m in range(M):
buffer.pop(0)
# adjust relative buffer index
for y in range(Y):
for y in range(output_elem):
idx_px_relative[y,x] -= oldest_px
# record max needed buffer depth
@@ -595,14 +635,16 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
if next_in_px > (h_padded*w_padded-1):
schedule_write.append(0)
else:
buffer.append(next_in_px)
next_in_px += 1
# load M inputs at once
for m in range(M):
buffer.append(next_in_px)
next_in_px += 1
schedule_write.append(1)
# find buffer access patterns
buffer_access_patterns = []
for x in range(X):
for x in range(output_cycles):
if idx_px_relative[:,x].tolist() not in buffer_access_patterns:
buffer_access_patterns.append(idx_px_relative[:,x].tolist())
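
In the illustrative M=2 trace from the sketch above, every column of idx_px_relative reduces to the same pattern [0, 1, 2, 1, 2, 3] after subtracting the oldest pixel, so buffer_access_patterns ends up with a single entry, as asserted below; note that this pattern is no longer non-decreasing, which is the limitation the M>1 ToDo in the partitioning code refers to.
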
@@ -636,10 +678,13 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_max_size)]
# determine buffer partitioning into REG FIFOs (parallel access) and BRAM FIFOs (line buffers)
# ToDo: this part doesn't fully account for M (2D buffer) yet
assert len(buffer_access_patterns) == 1, "ERROR: Buffer access pattern is not static"
buf_static_access_pattern = buffer_access_patterns[0]
reg_fifos = []
reg_fifos_depth = []
bram_fifos = []
bram_fifos_depth = []
current = []
for i in range(len(buf_static_access_pattern)):
access_idx = buf_static_access_pattern[i]
@@ -647,6 +692,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
current.append(access_idx)
else:
# assume non-decreasing index order in access pattern
# ToDo: this assumption does not hold for M>1 case (2D buffer)
distance = access_idx - max(current)
if not (distance-1 > REG_BRAM_THRESHOLD):
for i in range(distance-1):
@@ -657,11 +703,14 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
else:
# assign skipped accesses to new BRAM FIFO
bram_fifos.append([-1]*(distance-1))
bram_fifos_depth.append((distance-1)/M)
# start with new REG FIFO
reg_fifos.append(current)
reg_fifos_depth.append(math.ceil((max(current)+1)/M))
current = []
current.append(access_idx)
reg_fifos.append(current)
reg_fifos_depth.append(math.ceil((max(current)+1)/M))
f_debug.write("\n"+"Buffer partitioning using REG_BRAM_THRESHOLD=%d" % REG_BRAM_THRESHOLD)
f_debug.write("\n"+"%d REG FIFOs (parallel read access):" % len(reg_fifos))
@@ -674,7 +723,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
code_gen_dict["$GENERATE_REG_FIFOS$"].append(
"""parameter reg_fifo_{id}_len = {len};
reg [IN_WIDTH-1:0] reg_fifo_{id} [reg_fifo_{id}_len-1:0];
""".format(id=i, len=len(reg_fifos[i])))
""".format(id=i, len=reg_fifos_depth[i]))
# ToDo: generate actual BRAM shift buffers instead of regs
code_gen_dict["$GENERATE_BRAM_FIFOS$"] = []
@@ -682,16 +731,23 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
code_gen_dict["$GENERATE_BRAM_FIFOS$"].append(
"""parameter bram_fifo_{id}_len = {len};
reg [IN_WIDTH-1:0] bram_fifo_{id} [bram_fifo_{id}_len-1:0];
""".format(id=i, len=len(bram_fifos[i])))
""".format(id=i, len=bram_fifos_depth[i]))
code_gen_dict["$GENERATE_OUTPUT_MAPPING$"] = []
out_idx = mmv_out-1
for fifo_id, reg_fifo in enumerate(reg_fifos):
for fifo_idx, access_idx in enumerate(reg_fifo):
if(access_idx != -1):
#code_gen_dict["$GENERATE_OUTPUT_MAPPING$"].append(
# "assign data_out[OUT_ELEM_WIDTH*{out_idx}+:OUT_ELEM_WIDTH] = reg_fifo_{fifo_id}[{fifo_idx}]; //{access_idx}".format(
# out_idx=out_idx, fifo_id=fifo_id, fifo_idx=fifo_idx, access_idx=access_idx
# )
#)
code_gen_dict["$GENERATE_OUTPUT_MAPPING$"].append(
"assign data_out[IN_WIDTH*{out_idx}+:IN_WIDTH] = reg_fifo_{fifo_id}[{fifo_idx}]; //{access_idx}".format(
out_idx=out_idx, fifo_id=fifo_id, fifo_idx=fifo_idx, access_idx=access_idx
"assign data_out[OUT_ELEM_WIDTH*{out_idx}+:OUT_ELEM_WIDTH] = reg_fifo_{fifo_id}[{access_idx}][OUT_ELEM_WIDTH*{mmv_idx}+:OUT_ELEM_WIDTH];".format(
out_idx=out_idx, fifo_id=fifo_id,
access_idx=reg_fifos_depth[fifo_id]-1-int((max(reg_fifo)-access_idx)/M),
mmv_idx=(max(reg_fifo)-access_idx)%M
)
)
# reversal: out_idx=0 -> oldest buffer element -> highest access_idx
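
A quick worked check of the new index arithmetic, using an illustrative register-FIFO access list [0, 1, 2, 3] with M = 2 (the real lists come from the schedule computed above):

import math
M, reg_fifo = 2, [0, 1, 2, 3]
depth = math.ceil((max(reg_fifo) + 1) / M)                     # 2 packed buffer entries
for access_idx in reg_fifo:
    slot = depth - 1 - int((max(reg_fifo) - access_idx) / M)
    mmv_idx = (max(reg_fifo) - access_idx) % M
    print(access_idx, slot, mmv_idx)
# access 0 -> slot 0, mmv 1; access 1 -> slot 0, mmv 0; access 2 -> slot 1, mmv 1; access 3 -> slot 1, mmv 0
# i.e. accesses 0 and 1 read the first packed register entry, accesses 2 and 3 the second
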
@@ -762,9 +818,6 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp):
code_gen_dict["$GENERATE_WRITE_SCHEDULE$"].append(
"assign write_state = WRITE_SCHEDULE[cycle_last];"
)
#code_gen_dict["$GENERATE_WRITE_SCHEDULE$"].append(
# "assign write_state_next = WRITE_SCHEDULE[cycle_next];"
#)
with open("/workspace/finn/finn-rtllib/swg/swg_hdl_template.v", "r") as f:
template = f.read()
@@ -90,7 +90,7 @@ def make_single_im2col_modelwrapper(
def make_single_slidingwindow_modelwrapper(
k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt, dw=0
k, ifm_ch, ifm_dim, ofm_dim, simd, m, stride, dilation, idt, dw=0
):
k_h, k_w = k
ifm_dim_h, ifm_dim_w = ifm_dim
@@ -117,6 +117,7 @@ def make_single_slidingwindow_modelwrapper(
IFMDim=[ifm_dim_h, ifm_dim_w],
OFMDim=[ofm_dim_h, ofm_dim_w],
SIMD=simd,
M=m,
Stride=[stride_h, stride_w],
Dilation=[dilation_h, dilation_w],
inputDataType=idt.name,
@@ -153,7 +154,7 @@ def prepare_inputs(input_tensor):
# kernel size
@pytest.mark.parametrize("k", [[3, 1]])
# input dimension
@pytest.mark.parametrize("ifm_dim", [[8, 1]])
@pytest.mark.parametrize("ifm_dim", [[10, 1]])
# input channels
@pytest.mark.parametrize("ifm_ch", [2])
# Stride
@@ -164,6 +165,8 @@ def prepare_inputs(input_tensor):
@pytest.mark.parametrize("exec_mode", ["rtlsim"])
# input channel parallelism ("SIMD")
@pytest.mark.parametrize("simd", [2])
# in/out MMV ("M")
@pytest.mark.parametrize("m", [1, 2, 4])
# depthwise
@pytest.mark.parametrize("dw", [0])
# Flip dimensions
@@ -171,7 +174,7 @@ def prepare_inputs(input_tensor):
@pytest.mark.slow
@pytest.mark.vivado
def test_fpgadataflow_slidingwindow_rtl(
idt, k, ifm_dim, ifm_ch, stride, dilation, exec_mode, simd, dw, flip
idt, k, ifm_dim, ifm_ch, stride, dilation, exec_mode, simd, m, dw, flip
):
if flip:
k = k[::-1]
@@ -203,6 +206,7 @@ def test_fpgadataflow_slidingwindow_rtl(
ifm_dim=ifm_dim,
ofm_dim=ofm_dim,
simd=simd,
m=m,
stride=stride,
dilation=dilation,
idt=idt,