diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
index 3f400053df8de6ec1e53e39fb5a3edee15f3ab30..a97267e7b1c3f368da078d6c231efe431ccf4eb8 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
@@ -61,12 +61,13 @@ class ConvolutionInputGenerator(HLSCustomOp):
 
     def get_nodeattr_types(self):
         my_attrs = {
-            "ConvKernelDim": ("i", True, 0),
+            "ConvKernelDim": ("ints", True, []),  # [H, W] = [Y, X]
             "IFMChannels": ("i", True, 0),
-            "IFMDim": ("i", True, 0),
-            "OFMDim": ("i", True, 0),
+            "IFMDim": ("ints", True, []),  # [H, W] = [Y, X]
+            "OFMDim": ("ints", True, []),  # [H, W] = [Y, X]
             "SIMD": ("i", True, 0),
-            "Stride": ("i", True, 0),
+            "Stride": ("ints", True, [1, 1]),  # [H, W] = [Y, X]
+            "Dilation": ("ints", True, [1, 1]),  # [H, W] = [Y, X]
             # FINN DataTypes for inputs, weights, outputs
             "inputDataType": ("s", True, ""),
             "outputDataType": ("s", True, ""),
@@ -87,43 +88,45 @@ class ConvolutionInputGenerator(HLSCustomOp):
         return my_attrs
 
     def get_normal_input_shape(self):
-
-        ifm_dim = self.get_nodeattr("IFMDim")
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
-
-        ishape = (1, ifm_dim, ifm_dim, ifm_ch)
+        ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch)
         return ishape
 
     def get_folded_input_shape(self):
-        ifm_dim = self.get_nodeattr("IFMDim")
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
         simd = self.get_nodeattr("SIMD")
         assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
         wf = int(ifm_ch / simd)
-        folded_ishape = (1, ifm_dim, ifm_dim, wf, simd)
+        folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd)
         return folded_ishape
 
     def get_normal_output_shape(self):
-        k = self.get_nodeattr("ConvKernelDim")
-        ifm_dim = self.get_nodeattr("IFMDim")
+        k_h, k_w = self.get_nodeattr("ConvKernelDim")
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
-        stride = self.get_nodeattr("Stride")
+        stride_h, stride_w = self.get_nodeattr("Stride")
+        dilation_h, dilation_w = self.get_nodeattr("Dilation")
         pad = 0
-        ofm_dim = compute_conv_output_dim(ifm_dim, k, stride, pad)
-        oshape = (1, ofm_dim, ofm_dim, k * k * ifm_ch)
+        ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h)
+        ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w)
+        oshape = (1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch)
         return oshape
 
     def get_folded_output_shape(self):
-        k = self.get_nodeattr("ConvKernelDim")
-        ifm_dim = self.get_nodeattr("IFMDim")
+        k_h, k_w = self.get_nodeattr("ConvKernelDim")
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
-        stride = self.get_nodeattr("Stride")
+        stride_h, stride_w = self.get_nodeattr("Stride")
+        dilation_h, dilation_w = self.get_nodeattr("Dilation")
         simd = self.get_nodeattr("SIMD")
         pad = 0
-        ofm_dim = compute_conv_output_dim(ifm_dim, k, stride, pad)
+        ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h)
+        ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w)
         assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
-        wf = int((k * k * ifm_dim) // simd)
-        wf = int((k * k * ifm_ch) // simd)
-        folded_oshape = (1, ofm_dim, ofm_dim, wf, simd)
+        wf = int((k_h * k_w * ifm_ch) // simd)
+        folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd)
         return folded_oshape
 
     def make_shape_compatible_op(self, model):
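Note: the compute_conv_output_dim calls above now pass a dilation argument. As a minimal sketch for reference, the standard dilated-convolution output-size formula this relies on looks as follows (conv_out_dim is a hypothetical name for illustration; the real helper lives in finn.custom_op.general.im2col):

# Minimal sketch of the standard dilated-convolution output-size formula
# assumed by the compute_conv_output_dim(ifm_dim, k, stride, pad, dilation)
# calls above. A kernel of size k with dilation d spans d*(k-1)+1 pixels.
def conv_out_dim(ifm_dim, k, stride, pad=0, dilation=1):
    return (ifm_dim + 2 * pad - dilation * (k - 1) - 1) // stride + 1

# e.g. a 32-pixel edge, 3-wide kernel, stride 1: (32 - 3) + 1 = 30;
# with dilation 2 the kernel spans 5 pixels, so the edge shrinks to 28.
assert conv_out_dim(32, 3, 1) == 30
assert conv_out_dim(32, 3, 1, dilation=2) == 28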
@@ -186,26 +189,31 @@ class ConvolutionInputGenerator(HLSCustomOp):
     def get_exp_cycles(self):
         simd = self.get_nodeattr("SIMD")
         ifm_ch = self.get_nodeattr("IFMChannels")
-        k = self.get_nodeattr("ConvKernelDim")
-        ifm_dim = self.get_nodeattr("IFMDim")
-        ofm_dim = self.get_nodeattr("OFMDim")
-        stride = self.get_nodeattr("Stride")
+        k_h, k_w = self.get_nodeattr("ConvKernelDim")
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
+        ofm_dim_h, ofm_dim_w = self.get_nodeattr("OFMDim")
+        stride_h, stride_w = self.get_nodeattr("Stride")
+        dilation_h, dilation_w = self.get_nodeattr("Dilation")
+
         # since mmv != 1 is not supported yet, we set mmv for now to 1
         mmv = 1
         # see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h
-        cycles_write_block = (ofm_dim * k * k * (ifm_ch / simd)) / mmv
-        cycles_read_block = stride * ifm_dim * (ifm_ch / simd)
+        cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv
+        cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd)
         max_cycles = max(cycles_write_block, cycles_read_block)
-        exp_cycles = ifm_dim * k * (ifm_ch / simd) + ofm_dim * max_cycles
+        exp_cycles = (
+            ifm_dim_w * k_h * dilation_h * (ifm_ch / simd) + ofm_dim_h * max_cycles
+        )
 
         return int(exp_cycles)
 
     def bram_estimation(self):
+        # NOTE: only tested with a square convolution
         simd = self.get_nodeattr("SIMD")
         ifm_ch = self.get_nodeattr("IFMChannels")
-        ifm_dim = self.get_nodeattr("IFMDim")
-        k = self.get_nodeattr("ConvKernelDim")
-        stride = self.get_nodeattr("Stride")
+        ifm_dim = self.get_nodeattr("IFMDim")[0]
+        k = self.get_nodeattr("ConvKernelDim")[0]
+        stride = self.get_nodeattr("Stride")[0]
         ram_style = self.get_nodeattr("ram_style")
         if ram_style == "block" or ram_style == "auto":
             ram_depth = ifm_dim * ifm_ch / simd
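For sanity-checking the cycle model above outside the node class, it can be restated as a free function (swg_exp_cycles is a hypothetical helper mirroring the hunk; the authoritative model is finn-hlslib's slidingwindow.h, linked in the comment):

# Hypothetical free-function restatement of get_exp_cycles() above, with mmv
# fixed to 1 as in the hunk. The SWG alternates between reading input rows
# into its buffer and writing k_h*k_w windows out per output row, so the
# slower of the two blocks dominates each row.
def swg_exp_cycles(k_h, k_w, ifm_ch, ifm_dim_w, ofm_dim_h, ofm_dim_w,
                   stride_w, dilation_h, simd, mmv=1):
    cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv
    cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd)
    max_cycles = max(cycles_write_block, cycles_read_block)
    # initial buffer fill, then one read/write block per output row
    return int(ifm_dim_w * k_h * dilation_h * (ifm_ch / simd)
               + ofm_dim_h * max_cycles)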
@@ -232,11 +240,12 @@ class ConvolutionInputGenerator(HLSCustomOp):
         return 0
 
     def lut_estimation(self):
+        # NOTE: only tested with a square convolution
         simd = self.get_nodeattr("SIMD")
         ifm_ch = self.get_nodeattr("IFMChannels")
-        ifm_dim = self.get_nodeattr("IFMDim")
-        k = self.get_nodeattr("ConvKernelDim")
-        stride = self.get_nodeattr("Stride")
+        ifm_dim = self.get_nodeattr("IFMDim")[0]
+        k = self.get_nodeattr("ConvKernelDim")[0]
+        stride = self.get_nodeattr("Stride")[0]
         ram_style = self.get_nodeattr("ram_style")
         if ram_style == "distributed":
             ram_luts = int(
@@ -252,11 +261,12 @@ class ConvolutionInputGenerator(HLSCustomOp):
         return 300 + ram_luts
 
     def uram_estimation(self):
+        # NOTE: only tested with a square convolution
         simd = self.get_nodeattr("SIMD")
         ifm_ch = self.get_nodeattr("IFMChannels")
-        ifm_dim = self.get_nodeattr("IFMDim")
-        k = self.get_nodeattr("ConvKernelDim")
-        stride = self.get_nodeattr("Stride")
+        ifm_dim = self.get_nodeattr("IFMDim")[0]
+        k = self.get_nodeattr("ConvKernelDim")[0]
+        stride = self.get_nodeattr("Stride")[0]
         ram_style = self.get_nodeattr("ram_style")
         if ram_style == "ultra":
             return int(
@@ -295,7 +305,7 @@ class ConvolutionInputGenerator(HLSCustomOp):
         assert (
             inp.shape == exp_ishape
         ), """Input shape doesn't
-        match expected shape (1, ifm_dim, ifm_dim, ifm_ch)."""
+        match expected shape (1, ifm_dim_h, ifm_dim_w, ifm_ch)."""
         if self.get_input_datatype() == DataType.BIPOLAR:
             # store bipolar activations as binary
             inp = (inp + 1) / 2
@@ -354,25 +364,33 @@ class ConvolutionInputGenerator(HLSCustomOp):
         assert (
             context[node.output[0]].shape == exp_oshape
         ), """Output
-        shape doesn't match expected shape (1, ofm_dim, ofm_dim, k*k*ifm_ch)."""
+        shape doesn't match expected shape (1, ofm_dim_h, ofm_dim_w, k_h*k_w*ifm_ch)."""
 
     def global_includes(self):
         self.code_gen_dict["$GLOBALS$"] = ['#include "slidingwindow.h"']
 
     def defines(self, var):
         numReps = 1
+        ifm_dim = self.get_nodeattr("IFMDim")[0]
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        ofm_dim = self.get_nodeattr("OFMDim")[0]
+        k = self.get_nodeattr("ConvKernelDim")[0]
+        stride = self.get_nodeattr("Stride")[0]
+        simd = self.get_nodeattr("SIMD")
+        ifm_precision = self.get_input_datatype().bitwidth()
+
         self.code_gen_dict["$DEFINES$"] = [
             """#define ConvKernelDim1 {}\n #define IFMChannels1 {}\n
            #define Input_precision1 {}\n #define IFMDim1 {}\n
            #define OFMDim1 {}\n #define SIMD1 {}\n
            #define Stride1 {}\n #define numReps {}""".format(
-                self.get_nodeattr("ConvKernelDim"),
-                self.get_nodeattr("IFMChannels"),
-                self.get_input_datatype().bitwidth(),
-                self.get_nodeattr("IFMDim"),
-                self.get_nodeattr("OFMDim"),
-                self.get_nodeattr("SIMD"),
-                self.get_nodeattr("Stride"),
+                k,
+                ifm_ch,
+                ifm_precision,
+                ifm_dim,
+                ofm_dim,
+                simd,
+                stride,
                 numReps,
             )
         ]
@@ -415,9 +433,11 @@ class ConvolutionInputGenerator(HLSCustomOp):
         }
         hls_ram_style = map_to_hls_ram_style[ram_style]
         hls_call = node.op_type
-        # check if non optimized ConvolutionInputGenerator is needed
-        k = self.get_nodeattr("ConvKernelDim")
-        stride = self.get_nodeattr("Stride")
+
+        # check which ConvolutionInputGenerator is needed
+        k = self.get_nodeattr("ConvKernelDim")[0]
+        stride = self.get_nodeattr("Stride")[0]
+
         if k % stride != 0:
             hls_call += "_kernel_stride"
 
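The hls_call selection at the end of the hunk above reduces to the following sketch (select_swg_variant is a hypothetical standalone form, for illustration only): when the kernel dimension is not a multiple of the stride, the non-optimized "_kernel_stride" variant of the finn-hlslib sliding window generator is emitted.

# Hypothetical standalone form of the hls_call selection in docompute above.
def select_swg_variant(op_type, k, stride):
    return op_type + ("_kernel_stride" if k % stride != 0 else "")

# e.g. k=3, stride=2 -> "ConvolutionInputGenerator_kernel_stride"
#      k=4, stride=2 -> "ConvolutionInputGenerator"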
diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
index 4e0e8c7c35a8fc8a30e0ba4c27a7c0d637e24d1f..1ec12263e22a199ac7da55fdf3418185cd38e555 100644
--- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
@@ -47,7 +47,9 @@ from finn.custom_op.registry import getCustomOp
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 
 
-def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt):
+def make_single_im2col_modelwrapper(
+    k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt
+):
     odt = idt
     inp = helper.make_tensor_value_info(
         "inp", TensorProto.FLOAT, [1, ifm_dim, ifm_dim, ifm_ch]
@@ -61,12 +63,12 @@ def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, simd, stride, i
         ["inp"],
         ["outp"],
         domain="finn.custom_op.general",
-        backend="fpgadataflow",
-        stride=stride,
+        stride=[stride, stride],
         kernel_size=[k, k],
         input_shape=str((1, ifm_dim, ifm_dim, ifm_ch)),
         pad_amount=[0, 0, 0, 0],
         pad_value=0,
+        dilations=[dilation, dilation],
     )
     graph = helper.make_graph(
         nodes=[im2col_node], name="im2col_graph", inputs=[inp], outputs=[outp]
     )
@@ -82,7 +84,7 @@ def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, simd, stride, i
 
 
 def make_single_slidingwindow_modelwrapper(
-    k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt, dw=0
+    k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt, dw=0
 ):
     odt = idt
     inp = helper.make_tensor_value_info(
@@ -98,12 +100,13 @@ def make_single_slidingwindow_modelwrapper(
         ["outp"],
         domain="finn.custom_op.fpgadataflow",
         backend="fpgadataflow",
-        ConvKernelDim=k,
+        ConvKernelDim=[k, k],
         IFMChannels=ifm_ch,
-        IFMDim=ifm_dim,
-        OFMDim=ofm_dim,
+        IFMDim=[ifm_dim, ifm_dim],
+        OFMDim=[ofm_dim, ofm_dim],
         SIMD=simd,
-        Stride=stride,
+        Stride=[stride, stride],
+        Dilation=[dilation, dilation],
         inputDataType=idt.name,
         outputDataType=odt.name,
         depthwise=dw,
@@ -138,6 +141,9 @@ def prepare_inputs(input_tensor):
 @pytest.mark.parametrize("ifm_ch", [2, 4])
 # Stride
 @pytest.mark.parametrize("stride", [1, 2])
+# Dilation
+# Currently only dilation value of 1 is supported
+@pytest.mark.parametrize("dilation", [1])
 # execution mode
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
 # input channel parallelism ("SIMD")
@@ -147,13 +153,13 @@ def prepare_inputs(input_tensor):
 @pytest.mark.slow
 @pytest.mark.vivado
 def test_fpgadataflow_slidingwindow(
-    idt, k, ifm_dim, ifm_ch, stride, exec_mode, simd, dw
+    idt, k, ifm_dim, ifm_ch, stride, dilation, exec_mode, simd, dw
 ):
     ofm_dim = int(((ifm_dim - k) / stride) + 1)
 
     x = gen_finn_dt_tensor(idt, (1, ifm_dim, ifm_dim, ifm_ch))
     model = make_single_slidingwindow_modelwrapper(
-        k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt, dw
+        k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt, dw
     )
 
     if exec_mode == "cppsim":
@@ -174,9 +180,10 @@ def test_fpgadataflow_slidingwindow(
     # execute model
     y_produced = oxe.execute_onnx(model, input_dict)["outp"]
     golden = make_single_im2col_modelwrapper(
-        k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt
+        k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt
     )
     y_expected = oxe.execute_onnx(golden, input_dict)["outp"]
+
     if dw == 0:
         assert (y_produced == y_expected).all()
     else: