diff --git a/finn-rtllib/swg/swg_template_default.sv b/finn-rtllib/swg/swg_template_default.sv index 2d255a35edf97e28053545b89512a4b1415b6f57..0aa309f890140143be75123840708b6811d47a48 100644 --- a/finn-rtllib/swg/swg_template_default.sv +++ b/finn-rtllib/swg/swg_template_default.sv @@ -36,28 +36,19 @@ module $TOP_MODULE_NAME$_controller #( logic signed [$clog2(LOOP_KW_ITERATIONS +2)+1-1:0] Counter_loop_kw = LOOP_KW_ITERATIONS-1; logic signed [$clog2(LOOP_SIMD_ITERATIONS+2)+1-1:0] Counter_loop_simd = LOOP_SIMD_ITERATIONS-1; - logic [INCR_BITWIDTH-1:0] tail_incr_reg = 'x; assign addr_incr = ADDR_INCREMENT_MAP[State]; - assign tail_incr = tail_incr_reg; // combinational logic for tail_incr generation - uwire tail_incr_inner_condition; - generate - if (IS_DEPTHWISE) - assign tail_incr_inner_condition = (Counter_loop_kh >= 0); - else - assign tail_incr_inner_condition = 0; - endgenerate - - always @ (tail_incr_inner_condition, Counter_loop_w, Counter_loop_h) begin + uwire tail_incr_inner_condition = IS_DEPTHWISE? (Counter_loop_kh >= 0) : 0; + always_comb begin : blkTail if (tail_incr_inner_condition) - tail_incr_reg = 1; + tail_incr = 1; else if (Counter_loop_w >= 0) - tail_incr_reg = $TAIL_INCR_W$; + tail_incr = $TAIL_INCR_W$; else if (Counter_loop_h >= 0) - tail_incr_reg = $TAIL_INCR_H$; + tail_incr = $TAIL_INCR_H$; else - tail_incr_reg = $TAIL_INCR_LAST$; + tail_incr = $TAIL_INCR_LAST$; end // combinational next state logic @@ -132,13 +123,8 @@ module $TOP_MODULE_NAME$_cyclic_buffer_addressable #( $RAM_STYLE$ logic [WIDTH-1:0] Ram[DEPTH]; logic [WIDTH-1:0] Out = 'x; always_ff @(posedge clk) begin - if (!rst_n) begin - Out <= 'x; - end - else begin - if (read_enable) Out <= Ram[read_addr]; - if (write_enable) Ram[write_addr] <= data_in; - end + if (read_enable) Out <= Ram[read_addr]; + if (write_enable) Ram[write_addr] <= data_in; end assign data_out = Out; @@ -213,7 +199,7 @@ module $TOP_MODULE_NAME$_impl #( logic signed [$clog2(LAST_READ_ELEM+1)+1-1:0] Newest_buffered_elem = -1; logic [$clog2(LAST_READ_ELEM+1)+1-1:0] Current_elem = 0; logic [$clog2(LAST_READ_ELEM+1)+1-1:0] First_elem_next_window = 0; - logic [$clog2(ELEM_PER_WINDOW) -1:0] K = 0; + logic [$clog2(ELEM_PER_WINDOW) -1:0] Position_in_window = 0; logic [$clog2(BUF_ELEM_TOTAL)+1 -1:0] Window_buffer_read_addr_reg = 0; logic [$clog2(BUF_ELEM_TOTAL)-1:0] Window_buffer_write_addr_reg = 0; @@ -255,7 +241,7 @@ module $TOP_MODULE_NAME$_impl #( Newest_buffered_elem <= -1; Current_elem <= 0; First_elem_next_window <= 0; - K <= 0; + Position_in_window <= 0; Window_buffer_read_addr_reg <= 0; Window_buffer_write_addr_reg <= 0; Fetching_done <= 0; @@ -295,10 +281,10 @@ module $TOP_MODULE_NAME$_impl #( Window_buffer_read_addr_reg <= ra + ra_correct; //keep track where we are within a window - K <= (K != ELEM_PER_WINDOW - 1)? K+1 : 0; + Position_in_window <= (Position_in_window != ELEM_PER_WINDOW - 1)? 
Position_in_window+1 : 0; //update first element of next window to allow buffer overwrite up until that point - if (K == 0) + if (Position_in_window == 0) First_elem_next_window <= First_elem_next_window + tail_incr; //check if this is the last write cycle (Writing_done will be true afterwards) diff --git a/finn-rtllib/swg/swg_template_wrapper.v b/finn-rtllib/swg/swg_template_wrapper.v index 1b470817d61af4140141d8478c4ae0d538678ad7..4411348beb7c338151955810021803f140c27b25 100644 --- a/finn-rtllib/swg/swg_template_wrapper.v +++ b/finn-rtllib/swg/swg_template_wrapper.v @@ -1,14 +1,16 @@ `timescale 1 ns / 1 ps module $TOP_MODULE_NAME$ ( - ap_clk, - ap_rst_n, - in0_V_TDATA, - in0_V_TVALID, - in0_V_TREADY, - out_V_TDATA, - out_V_TVALID, - out_V_TREADY +(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *) +input ap_clk, +(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *) +input ap_rst_n, +input [BUF_IN_WIDTH-1:0] in0_V_TDATA, +input in0_V_TVALID, +output in0_V_TREADY, +output [BUF_OUT_WIDTH-1:0] out_V_TDATA, +output out_V_TVALID, +input out_V_TREADY ); // top-level parameters (set via code-generation) @@ -21,17 +23,6 @@ parameter MMV_OUT = $MMV_OUT$; parameter BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN; parameter BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT; -(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *) -input ap_clk; -(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *) -input ap_rst_n; -input [BUF_IN_WIDTH-1:0] in0_V_TDATA; -input in0_V_TVALID; -output in0_V_TREADY; -output [BUF_OUT_WIDTH-1:0] out_V_TDATA; -output out_V_TVALID; -input out_V_TREADY; - $TOP_MODULE_NAME$_impl #( .BIT_WIDTH(BIT_WIDTH), diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py index 98351942b9b4abd1568c9d465710a181e9cab86c..366dd396d199e02846046b00d4b829d0871060dc 100755 --- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2022, Xilinx # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -72,7 +72,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): "OFMDim": ("ints", True, []), # [H, W] = [Y, X] "SIMD": ("i", True, 0), "M": ("i", False, 1), - "parallel_window": ("i", False, 0, {0, 1}), + "parallel_window": ("i", False, 0, {0}), "Stride": ("ints", True, []), # [H, W] = [Y, X] "Dilation": ("ints", True, []), # [H, W] = [Y, X] # FINN DataTypes for inputs, weights, outputs @@ -212,6 +212,49 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): return (ifm_ch, ifm_dim, ofm_dim, k, stride, dilation) + def get_buffer_depth(self): + ifm_ch = self.get_nodeattr("IFMChannels") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim = self.get_nodeattr("IFMDim") + stride = self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + simd = self.get_nodeattr("SIMD") + + k_h, k_w = k + h, w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + mmv_in = 1 + mmv_out = 1 + channel_factor = int(ifm_ch / simd) + + impl_style = self.select_impl_style() + if impl_style == "default": + # compute minimal buffer length (assuming it holds 1 complete window) + buffer_min_size = ( + (k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + 1 + ) * channel_factor + + # add additional buffer space in case of stride > 1 + # this minimizes cycle count as it allows an earlier pre-load of inputs + buffer_depth = ( + buffer_min_size + + max( + 0, + ((stride_w - 1) - (int(mmv_out * k_h * k_w / mmv_in))) + * channel_factor, + ) + + max( + 0, + ((stride_h - 1) * w - (int(mmv_out * k_h * k_w / mmv_in))) + * channel_factor, + ) + ) + else: + buffer_depth = 0 + raise Exception("Requested impl. style not implemented") + return buffer_depth + def get_exp_cycles(self): simd = self.get_nodeattr("SIMD") ifm_ch = self.get_nodeattr("IFMChannels") @@ -268,17 +311,11 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): def bram_estimation(self): simd = self.get_nodeattr("SIMD") ram_style = self.get_nodeattr("ram_style") - impl_style = self.select_impl_style() - # call codegen preparation to populate self.buffer_depth - if impl_style == "default": - self.prepare_codegen_default() - else: - raise Exception("Requested impl. style not implemented") # NOTE: Actual BRAM usage might be lower in some cases. # This does not account for the exact Vivado behavior yet. buffer_width = simd * self.get_input_datatype().bitwidth() - buffer_depth = self.buffer_depth + buffer_depth = self.get_buffer_depth() if ram_style == "block" or ram_style == "auto": if buffer_depth <= 512: ram_width = 36 @@ -321,15 +358,8 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): def lut_estimation(self): simd = self.get_nodeattr("SIMD") ram_style = self.get_nodeattr("ram_style") - impl_style = self.select_impl_style() - # call codegen preparation to populate self.buffer_depth - if impl_style == "default": - self.prepare_codegen_default() - else: - raise Exception("Requested impl. 
style not implemented") - buffer_width = simd * self.get_input_datatype().bitwidth() - buffer_depth = self.buffer_depth + buffer_depth = self.get_buffer_depth() if ram_style == "distributed": ram_luts = int(buffer_width * math.ceil(buffer_depth / 38)) else: @@ -339,15 +369,8 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): def uram_estimation(self): simd = self.get_nodeattr("SIMD") ram_style = self.get_nodeattr("ram_style") - impl_style = self.select_impl_style() - # call codegen preparation to populate self.buffer_depth - if impl_style == "default": - self.prepare_codegen_default() - else: - raise Exception("Requested impl. style not implemented") - buffer_width = simd * self.get_input_datatype().bitwidth() - buffer_depth = self.buffer_depth + buffer_depth = self.get_buffer_depth() if ram_style == "ultra": ram_depth = 4096 @@ -460,21 +483,7 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): (k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + 1 ) * channel_factor - # add additional buffer space in case of stride > 1 - # this minimizes cycle count as it allows an earlier pre-load of input elements - buffer_actual_size = ( - buffer_min_size - + max( - 0, - ((stride_w - 1) - (int(mmv_out * k_h * k_w / mmv_in))) * channel_factor, - ) - + max( - 0, - ((stride_h - 1) * w - (int(mmv_out * k_h * k_w / mmv_in))) - * channel_factor, - ) - ) - self.buffer_depth = buffer_actual_size # for resource estimation + buffer_actual_size = self.get_buffer_depth() code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_actual_size)] # compute some intermediate values, e.g., kernel "width" = k_w incl. dilation @@ -643,9 +652,6 @@ class ConvolutionInputGenerator_rtl(HLSCustomOp): and stride_w <= ifm_dim_w ), "Illegal conv configuration: kernel or stride > FM dimension" - if k_h == 1 and k_w == 1: - assert simd == ifm_ch, "1x1 Kernel only supported in parallel mode (SIMD=C)" - # init folding config if self.get_nodeattr("parallel_window"): # mmv_in = M * 1 diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py index 850bcf6616aebc251ebd12f9d766b89c43ee4208..540c217cbca8c47243a080ac493f19bd1c72abc8 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py @@ -132,7 +132,26 @@ class InferConvInpGen(Transformation): ) graph.node.insert(node_ind, padding_node) - if self.use_rtl_variant: + is_kernel_pointwise = k_h == 1 and k_w == 1 + is_square_image = ConvInpGen_idim_h == ConvInpGen_idim_w + is_square_kernel = k_h == k_w + is_equal_stride = stride_h == stride_w + is_1d_convolution = (k_h == 1 and k_w > 1 and ifm_dim_h == 1) or ( + k_h > 1 and k_w == 1 and ifm_dim_w == 1 + ) + + # Ensure that RTL variant is not inserted for unsupported configuration + is_rtl_variant_compatible = True + if is_kernel_pointwise: + is_rtl_variant_compatible = False + if self.use_rtl_variant: + warnings.warn( + """%s : RTL ConvInpGen requested for unsupported + configuration. 
Falling back to HLS implementation.""" + % n.name + ) + + if self.use_rtl_variant and is_rtl_variant_compatible: ConvInpGen_node = helper.make_node( "ConvolutionInputGenerator_rtl", [ConvInpGen_input], @@ -151,19 +170,11 @@ class InferConvInpGen(Transformation): inputDataType=dt.name, outputDataType=dt.name, depthwise=depthwise, - name="ConvolutionInputGenerator_rtl" + n.name, + name="ConvolutionInputGenerator_rtl_" + n.name, ) graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) else: # Ensure that only supported HLS nodes are inserted - is_square_image = ConvInpGen_idim_h == ConvInpGen_idim_w - is_square_kernel = k_h == k_w - is_kernel_pointwise = k_h == 1 and k_w == 1 - is_equal_stride = stride_h == stride_w - is_1d_convolution = (k_h == 1 and k_w > 1 and ifm_dim_h == 1) or ( - k_h > 1 and k_w == 1 and ifm_dim_w == 1 - ) - if (stride_h > 1 or stride_w > 1) and is_kernel_pointwise: assert is_square_image, ( """%s : DownSampler currently only supports square diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py index 5c94272bad52fd6265a9bb0054fae87a3d77b93b..e24e24f1f8ebb2873c81617884cd333311d8aea9 100644 --- a/src/finn/transformation/fpgadataflow/set_folding.py +++ b/src/finn/transformation/fpgadataflow/set_folding.py @@ -172,11 +172,7 @@ class SetFolding(Transformation): "Expected SWU on DW op input, found " + swu_node.op_type ) elif op_type in simd_ops: - if op_type in [ - "ConvolutionInputGenerator", - "ConvolutionInputGenerator1D", - "ConvolutionInputGenerator_rtl", - ]: + if op_type.startswith("ConvolutionInputGenerator"): depthwise = node_inst.get_nodeattr("depthwise") if depthwise == 0: max_simd = node_inst.get_nodeattr("IFMChannels") diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py index 56438ac6b6c5ac835ca35d9e66073042e467224f..8c9f110c315089ec03354863bf2213963197217a 100644 --- a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py +++ b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py @@ -164,14 +164,10 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mod inp_dict = {model.graph.input[0].name: x} assert oxe.compare_execution(model, new_model, inp_dict) - if use_rtl_swg: - downsampler_op_type = "ConvolutionInputGenerator_rtl" - else: - downsampler_op_type = "DownSampler" if kernel_size == 1 and stride > 1 and pad == 0: - assert new_model.graph.node[1].op_type == downsampler_op_type + assert new_model.graph.node[1].op_type == "DownSampler" if exec_mode == "rtlsim": - node = new_model.get_nodes_by_op_type(downsampler_op_type)[0] + node = new_model.get_nodes_by_op_type("DownSampler")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = new_model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py index eeeb09329448f546f4a668fde3d32ffaa36f5aaf..5da1fa6eb1f63251769b9e88e06087cf51e863a1 100755 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py @@ -142,7 +142,7 @@ def prepare_inputs(input_tensor): # kernel size @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 3]]) # input dimension -@pytest.mark.parametrize("ifm_dim", [[24, 24], [13, 13], [1, 14]]) +@pytest.mark.parametrize("ifm_dim", [[24, 24], [15, 6], [13, 13], [1, 14]]) # input channels 
 @pytest.mark.parametrize("ifm_ch", [6])
 # Stride
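
For reference, the buffer sizing that the new get_buffer_depth() method computes for the "default" implementation style can be reproduced standalone. The sketch below mirrors the formula from the patch; the function name, default mmv values, and the example parameters are illustrative only and are not part of the patch itself.

# Standalone sketch of the "default" impl-style buffer sizing used by
# ConvolutionInputGenerator_rtl.get_buffer_depth(). Function name and the
# example call below are illustrative, not part of the FINN API.
def swg_default_buffer_depth(ifm_ch, k, ifm_dim, stride, dilation, simd,
                             mmv_in=1, mmv_out=1):
    k_h, k_w = k
    h, w = ifm_dim
    stride_h, stride_w = stride
    dilation_h, dilation_w = dilation
    channel_factor = ifm_ch // simd

    # minimal buffer length: one complete (dilated) window plus one element
    buffer_min_size = (
        (k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + 1
    ) * channel_factor

    # extra space for stride > 1, allowing an earlier pre-load of inputs
    extra_w = max(
        0, ((stride_w - 1) - (mmv_out * k_h * k_w // mmv_in)) * channel_factor
    )
    extra_h = max(
        0, ((stride_h - 1) * w - (mmv_out * k_h * k_w // mmv_in)) * channel_factor
    )
    return buffer_min_size + extra_w + extra_h

# Example: 3x3 kernel, 13x13 input, stride 1, dilation 1, 6 channels, SIMD=6
# -> ((3-1)*1*13 + (3-1)*1 + 1) * 1 = 29 buffer elements
print(swg_default_buffer_depth(6, (3, 3), (13, 13), (1, 1), (1, 1), 6))  # 29

Because bram_estimation(), lut_estimation(), and uram_estimation() now call get_buffer_depth() directly, this single formula is the one source of truth for both code generation and resource estimation.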