diff --git a/finn-rtllib/swg/swg_hdl_template.v b/finn-rtllib/swg/swg_hdl_template.v index 195075724565081d11c055351f6199153d37746f..44fd41abab9253cdbf7427fe343f836092109246 100755 --- a/finn-rtllib/swg/swg_hdl_template.v +++ b/finn-rtllib/swg/swg_hdl_template.v @@ -8,8 +8,9 @@ `timescale 1 ns / 1 ps module $TOP_MODULE_NAME$_wb #( - parameter IN_WIDTH = 1, //c*bit-width - parameter OUT_WIDTH = 1, //c*bit-width*MMV_out + parameter IN_WIDTH = 1, //bit-width*C*MMV_in + parameter OUT_ELEM_WIDTH = 1, //bit-width*C + parameter OUT_WIDTH = 1, //bit-width*C*MMV_out parameter BUFFER_ELEM_TOTAL = 1 ) ( @@ -65,10 +66,11 @@ module $TOP_MODULE_NAME$ ( //parameters parameter BIT_WIDTH = $BIT_WIDTH$; parameter SIMD = $SIMD$; //assuming SIMD=C for now -parameter MMV_IN = $MMV_IN$; //assuming MMV_IN=1 for now -parameter MMV_OUT = $MMV_OUT$; //assuming MMV_OUT=K for now -parameter BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN; //c*bit-width -parameter BUF_OUT_WIDTH = BUF_IN_WIDTH * MMV_OUT; //c*bit-width*MMV_out +parameter MMV_IN = $MMV_IN$; //assuming MMV_IN=1*M for now +parameter MMV_OUT = $MMV_OUT$; //assuming MMV_OUT=K*M for now +parameter BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN; //bit-width*C*MMV_in +parameter BUF_OUT_ELEM_WIDTH = BIT_WIDTH * SIMD; //bit-width*C +parameter BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT; //bit-width*C*MMV_out parameter CYCLES_TOTAL = $CYCLES_TOTAL$; parameter BUF_ELEM_TOTAL = $BUF_ELEM_TOTAL$; @@ -92,6 +94,7 @@ wire window_buffer_shift_enable; $TOP_MODULE_NAME$_wb #( .IN_WIDTH(BUF_IN_WIDTH), + .OUT_ELEM_WIDTH(BUF_OUT_ELEM_WIDTH), .OUT_WIDTH(BUF_OUT_WIDTH), .BUFFER_ELEM_TOTAL(BUF_ELEM_TOTAL) ) diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py index 2e8e8ec75e95440f2d5131fcaff4724792711219..55687aa5d21344aab5bbefe69b6a074b39240190 100755 --- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py @@ -79,6 +79,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): "IFMDim": ("ints", True, []), # [H, W] = [Y, X] "OFMDim": ("ints", True, []), # [H, W] = [Y, X] "SIMD": ("i", True, 0), + "M": ("i", True, 1), "Stride": ("ints", True, []), # [H, W] = [Y, X] "Dilation": ("ints", True, []), # [H, W] = [Y, X] # FINN DataTypes for inputs, weights, outputs @@ -111,9 +112,15 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") simd = self.get_nodeattr("SIMD") + M = self.get_nodeattr("M") assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" wf = int(ifm_ch / simd) - folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd) + #folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd) + #round up to support ifm_dim % M != 0 + if ifm_dim_w == 1: + folded_ishape = (1, math.ceil(ifm_dim_h/M), ifm_dim_w, wf, int(simd*M)) + else: + folded_ishape = (1, ifm_dim_h, math.ceil(ifm_dim_w/M), wf, int(simd*M)) return folded_ishape def get_normal_output_shape(self): @@ -135,13 +142,18 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): stride_h, stride_w = self.get_nodeattr("Stride") dilation_h, dilation_w = self.get_nodeattr("Dilation") simd = self.get_nodeattr("SIMD") + M = self.get_nodeattr("M") pad = 0 ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h) ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w) assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" if self.use_parallel_window_output(): wf = int((ifm_ch) // simd) - folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd) + #folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd) + if ofm_dim_w == 1: + folded_oshape = (1, int(ofm_dim_h/M), ofm_dim_w, wf, k_h * k_w * int(simd*M)) + else: + folded_oshape = (1, ofm_dim_h, int(ofm_dim_w/M), wf, k_h * k_w * int(simd*M)) else: wf = int((k_h * k_w * ifm_ch) // simd) folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd) @@ -175,8 +187,9 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): ibits = self.get_input_datatype().bitwidth() simd = self.get_nodeattr("SIMD") ifm_ch = self.get_nodeattr("IFMChannels") + M = self.get_nodeattr("M") assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" - in_width = simd * ibits + in_width = simd * ibits * M return in_width def get_outstream_width(self): @@ -377,6 +390,15 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): export_idt = DataType["BINARY"] else: export_idt = self.get_input_datatype() + + # pad test input stream to work when IFMdim % M != 0 + # during normal operation, the AXI Stream should not care, in the last cycle garbage elements are read but not used + # ToDo: only works for 1D case + mmv_stream_padding_px = int((np.prod(folded_ishape) - np.prod(exp_ishape)) / exp_ishape[-1]) + if exp_ishape [2] == 1: + inp = np.pad(inp, ((0,0),(0,mmv_stream_padding_px),(0,0),(0,0)), 'constant') + else: + inp = np.pad(inp, ((0,0),(0,0),(0,mmv_stream_padding_px),(0,0)), 'constant') # reshape input into folded form inp = inp.reshape(folded_ishape) # make copy before saving array @@ -460,22 +482,23 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): n = 1 h, w = ifm_dim - c = 1#ifm_ch not considered atm (always parallelize across c) + c = 1 # ifm_ch not considered atm (always parallelize across c) k_h, k_w = k - pad = [0,0,0,0] + pad = [0,0,0,0] # padding happens in separate padding node pad_val = 0 stride_h, stride_w = stride dilation_h, dilation_w = dilation conv_c = 99 # init folding config + M = self.get_nodeattr("M") simd = self.get_nodeattr("SIMD") - mmv_in = 1 - mmv_out = k_h*k_w + mmv_in = 1*M + mmv_out = k_h*k_w*M assert simd==ifm_ch, "Constraint violated: SIMD = C" - assert mmv_in==1, "Constraint violated: MMV_IN = 1" - assert mmv_out==k_h*k_w, "Constraint violated: mmv_out = K" + assert mmv_in==1*M, "Constraint violated: MMV_IN = 1" # *M + assert mmv_out==k_h*k_w*M, "Constraint violated: mmv_out = K" # *M # how many "unused" registers are allowed between buffer positions that will be accessed in parallel # example: @@ -552,7 +575,18 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): f_debug.write("\n"+str(idx_w)) idx_px = idx_h*w+idx_w - f_debug.write("\n"+"sequential pixel indices") + f_debug.write("\n"+"sequential pixel indices (shape %s" % str(idx_px.shape)) + f_debug.write("\n"+str(idx_px)) + + output_elem, output_cycles = idx_px.shape + # ToDo: what happens when output_cycles=OFMdim % M != 0 + # ...try to support IFMdim % M != 0 first, so we can work with the usual k=3 where OFMdim = IFMdim - -2 + # the additional garbage input elements that are read in the last cycle are not read by any window anyway + idx_px = idx_px.transpose() + idx_px = idx_px.reshape((int(output_cycles/M), int(output_elem*M))) + idx_px = idx_px.transpose() + + f_debug.write("\n"+"sequential pixel indices, MMV_out grouping (shape %s" % str(idx_px.shape)) f_debug.write("\n"+str(idx_px)) buffer = [] @@ -565,23 +599,29 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): idx_px_relative = idx_px.copy() # compute schedule and buffer read pattern - Y, X = idx_px_relative.shape - for x in range(X): + output_elem, output_cycles = idx_px_relative.shape + for x in range(output_cycles): # load missing inputs into buffer - for y in range(Y): + for y in range(output_elem): while int(idx_px_relative[y,x]) not in buffer: - buffer.append(next_in_px) - next_in_px += 1 + # load M inputs at once (keep "buffer" list 1D for now, handle actual 2D buffer generation later) + for m in range(M): + buffer.append(next_in_px) + next_in_px += 1 schedule_write.append(1) schedule_read.append(0) # discard unused buffer elements (assumes in-order access) oldest_px = min(idx_px_relative[:,x]) - while buffer[0] < oldest_px: - buffer.pop(0) + #while buffer[0] < oldest_px: + #check whether M elements can be shifted out, not just the single oldest one + while all([buffer[i] < oldest_px for i in range(M)]): + # M buffer elements are shifted out at once + for m in range(M): + buffer.pop(0) # adjust relative buffer index - for y in range(Y): + for y in range(output_elem): idx_px_relative[y,x] -= oldest_px # record max needed buffer depth @@ -595,14 +635,16 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): if next_in_px > (h_padded*w_padded-1): schedule_write.append(0) else: - buffer.append(next_in_px) - next_in_px += 1 + # load M inputs at once + for m in range(M): + buffer.append(next_in_px) + next_in_px += 1 schedule_write.append(1) # find buffer access patterns buffer_access_patterns = [] - for x in range(X): + for x in range(output_cycles): if idx_px_relative[:,x].tolist() not in buffer_access_patterns: buffer_access_patterns.append(idx_px_relative[:,x].tolist()) @@ -636,10 +678,13 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_max_size)] # determine buffer partitioning into REG FIFOs (parallel access) and BRAM FIFOs (line buffers) + # ToDo: this part doesn't fully account for M (2D buffer) yet assert len(buffer_access_patterns) == 1, "ERROR: Buffer access pattern is not static" buf_static_access_pattern = buffer_access_patterns[0] reg_fifos = [] + reg_fifos_depth = [] bram_fifos = [] + bram_fifos_depth = [] current = [] for i in range(len(buf_static_access_pattern)): access_idx = buf_static_access_pattern[i] @@ -647,6 +692,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): current.append(access_idx) else: # assume non-decreasing index order in access pattern + # ToDo: this assumption does not hold for M>1 case (2D buffer) distance = access_idx - max(current) if not (distance-1 > REG_BRAM_THRESHOLD): for i in range(distance-1): @@ -657,11 +703,14 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): else: # assign skipped accesses to new BRAM FIFO bram_fifos.append([-1]*(distance-1)) + bram_fifos_depth.append((distance-1)/M) # start with new REG FIFO reg_fifos.append(current) + reg_fifos_depth.append(math.ceil((max(current)+1)/M)) current = [] current.append(access_idx) reg_fifos.append(current) + reg_fifos_depth.append(math.ceil((max(current)+1)/M)) f_debug.write("\n"+"Buffer partitioning using REG_BRAM_THRESHOLD=%d" % REG_BRAM_THRESHOLD) f_debug.write("\n"+"%d REG FIFOs (parallel read access):" % len(reg_fifos)) @@ -674,7 +723,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): code_gen_dict["$GENERATE_REG_FIFOS$"].append( """parameter reg_fifo_{id}_len = {len}; reg [IN_WIDTH-1:0] reg_fifo_{id} [reg_fifo_{id}_len-1:0]; - """.format(id=i, len=len(reg_fifos[i]))) + """.format(id=i, len=reg_fifos_depth[i])) #todo: generate actual bram shift buffers instead of regs code_gen_dict["$GENERATE_BRAM_FIFOS$"] = [] @@ -682,16 +731,23 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): code_gen_dict["$GENERATE_BRAM_FIFOS$"].append( """parameter bram_fifo_{id}_len = {len}; reg [IN_WIDTH-1:0] bram_fifo_{id} [bram_fifo_{id}_len-1:0]; - """.format(id=i, len=len(bram_fifos[i]))) + """.format(id=i, len=bram_fifos_depth[i])) code_gen_dict["$GENERATE_OUTPUT_MAPPING$"] = [] out_idx = mmv_out-1 for fifo_id, reg_fifo in enumerate(reg_fifos): for fifo_idx, access_idx in enumerate(reg_fifo): if(access_idx != -1): + #code_gen_dict["$GENERATE_OUTPUT_MAPPING$"].append( + # "assign data_out[OUT_ELEM_WIDTH*{out_idx}+:OUT_ELEM_WIDTH] = reg_fifo_{fifo_id}[{fifo_idx}]; //{access_idx}".format( + # out_idx=out_idx, fifo_id=fifo_id, fifo_idx=fifo_idx, access_idx=access_idx + # ) + #) code_gen_dict["$GENERATE_OUTPUT_MAPPING$"].append( - "assign data_out[IN_WIDTH*{out_idx}+:IN_WIDTH] = reg_fifo_{fifo_id}[{fifo_idx}]; //{access_idx}".format( - out_idx=out_idx, fifo_id=fifo_id, fifo_idx=fifo_idx, access_idx=access_idx + "assign data_out[OUT_ELEM_WIDTH*{out_idx}+:OUT_ELEM_WIDTH] = reg_fifo_{fifo_id}[{access_idx}][OUT_ELEM_WIDTH*{mmv_idx}+:OUT_ELEM_WIDTH];".format( + out_idx=out_idx, fifo_id=fifo_id, + access_idx=reg_fifos_depth[fifo_id]-1-int((max(reg_fifo)-access_idx)/M), + mmv_idx=(max(reg_fifo)-access_idx)%M ) ) # reversal: out_idx=0 -> oldest buffer element -> highest access_idx @@ -762,9 +818,6 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): code_gen_dict["$GENERATE_WRITE_SCHEDULE$"].append( "assign write_state = WRITE_SCHEDULE[cycle_last];" ) - #code_gen_dict["$GENERATE_WRITE_SCHEDULE$"].append( - # "assign write_state_next = WRITE_SCHEDULE[cycle_next];" - #) with open("/workspace/finn/finn-rtllib/swg/swg_hdl_template.v", "r") as f: template = f.read() diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py index 0845dc2fcad42257336027e1e03bdee9c17a946f..ef1fda8e31eab93c8a79167a51d152400f317bc0 100755 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py @@ -90,7 +90,7 @@ def make_single_im2col_modelwrapper( def make_single_slidingwindow_modelwrapper( - k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt, dw=0 + k, ifm_ch, ifm_dim, ofm_dim, simd, m, stride, dilation, idt, dw=0 ): k_h, k_w = k ifm_dim_h, ifm_dim_w = ifm_dim @@ -117,6 +117,7 @@ def make_single_slidingwindow_modelwrapper( IFMDim=[ifm_dim_h, ifm_dim_w], OFMDim=[ofm_dim_h, ofm_dim_w], SIMD=simd, + M=m, Stride=[stride_h, stride_w], Dilation=[dilation_h, dilation_w], inputDataType=idt.name, @@ -153,7 +154,7 @@ def prepare_inputs(input_tensor): # kernel size @pytest.mark.parametrize("k", [[3, 1]]) # input dimension -@pytest.mark.parametrize("ifm_dim", [[8, 1]]) +@pytest.mark.parametrize("ifm_dim", [[10, 1]]) # input channels @pytest.mark.parametrize("ifm_ch", [2]) # Stride @@ -164,6 +165,8 @@ def prepare_inputs(input_tensor): @pytest.mark.parametrize("exec_mode", ["rtlsim"]) # input channel parallelism ("SIMD") @pytest.mark.parametrize("simd", [2]) +# in/out MMV ("M") +@pytest.mark.parametrize("m", [1, 2, 4]) # depthwise @pytest.mark.parametrize("dw", [0]) # Flip dimensions @@ -171,7 +174,7 @@ def prepare_inputs(input_tensor): @pytest.mark.slow @pytest.mark.vivado def test_fpgadataflow_slidingwindow_rtl( - idt, k, ifm_dim, ifm_ch, stride, dilation, exec_mode, simd, dw, flip + idt, k, ifm_dim, ifm_ch, stride, dilation, exec_mode, simd, m, dw, flip ): if flip: k = k[::-1] @@ -203,6 +206,7 @@ def test_fpgadataflow_slidingwindow_rtl( ifm_dim=ifm_dim, ofm_dim=ofm_dim, simd=simd, + m=m, stride=stride, dilation=dilation, idt=idt,