diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh index 5d7806701e6664b860167175448001a3d0b54a0a..fcd6f9d788d0af1cad6de5259e5e181e76ac96bc 100644 --- a/docker/finn_entrypoint.sh +++ b/docker/finn_entrypoint.sh @@ -15,7 +15,7 @@ gecho () { # the repos themselves are cloned in the Dockerfile BREVITAS_COMMIT=989cdfdba4700fdd900ba0b25a820591d561c21a CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4 -HLSLIB_COMMIT=1893584c83dc4500fd92733d500e80903bab1d5d +HLSLIB_COMMIT=13e9b0772a27a3a1efc40c878d8e78ed09efb716 PYVERILATOR_COMMIT=1d89cb0d4e0c97469cc6352c611f876ec13edfa6 PYNQSHELL_COMMIT=0c82a61b0ec1a07fa275a14146233824ded7a13d diff --git a/run-docker.sh b/run-docker.sh index 186efc322a8f437be0371b5a142a9dd524d1abf3..e07556716db335421f57a390f1e6a17168ac058b 100755 --- a/run-docker.sh +++ b/run-docker.sh @@ -96,7 +96,7 @@ gecho "Port-forwarding for Netron $NETRON_PORT:$NETRON_PORT" gecho "Vivado IP cache dir is at $VIVADO_IP_CACHE" gecho "Using default PYNQ board $PYNQ_BOARD" -DOCKER_INTERACTIVE = "" +DOCKER_INTERACTIVE="" if [ "$1" = "test" ]; then gecho "Running test suite (all tests)" diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py index 03119b3fbea12cf9065e561089ca5875a8f622b0..3e40ad70208909551365c51324153859ccc79ceb 100644 --- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py @@ -41,10 +41,19 @@ from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy # output 0 is the output tensor, shape NHWC: # = (1, OFMDim, OFMDim, (ConvKernelDim^2)*IFMChannels) +# note: the actual data layout produced by the hlslib kernels is different +# for depthwise and non-depthwise ops. 
+# * non-depthwise SWG: (1, OFMDim, OFMDim, K, K, IFMChannels/SIMD, SIMD) +# * depthwise SWG: (1, OFMDim, OFMDim, IFMChannels/SIMD, K, K, SIMD) +# see test_fpgadataflow_slidingwindow in test_fpgadataflow_convinputgenerator.py +# for an example of how to transform between the two layouts + class ConvolutionInputGenerator(HLSCustomOp): - """Class that corresponds to finn-hlslib ConvolutionInputGenerator - (sliding window) function.""" + """Class that corresponds to one of the finn-hlslib ConvolutionInputGenerator + (sliding window) function variants. Depending on the combination of + attributes (e.g. depthwise or not, whether k % stride is 0) a different + variant will be picked for the actual HLS implementation.""" def __init__(self, onnx_node): super().__init__(onnx_node) @@ -60,6 +69,7 @@ class ConvolutionInputGenerator(HLSCustomOp): # FINN DataTypes for inputs, weights, outputs "inputDataType": ("s", True, ""), "outputDataType": ("s", True, ""), + "depthwise": ("i", False, 0), # FPGA resource type for ConvolutionInputGenerator input buffer # auto -- let Vivado HLS decide # block -- use BRAM @@ -106,7 +116,6 @@ class ConvolutionInputGenerator(HLSCustomOp): pad = 0 ofm_dim = compute_conv_output_dim(ifm_dim, k, stride, pad) assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" - assert k % stride == 0, "stride must divide kernel size k" wf = int((k * k * ifm_ch) // simd) folded_oshape = (1, ofm_dim, ofm_dim, wf, simd) return folded_oshape @@ -313,12 +322,27 @@ class ConvolutionInputGenerator(HLSCustomOp): "ultra": "ap_resource_uram()", } hls_ram_style = map_to_hls_ram_style[ram_style] - self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}<ConvKernelDim1, IFMChannels1, Input_precision1, IFMDim1, - OFMDim1, SIMD1, Stride1> (in0, out, numReps, {});""".format( - node.op_type, hls_ram_style - ) - ] + hls_call = node.op_type + # check if non-optimized ConvolutionInputGenerator is needed + k = self.get_nodeattr("ConvKernelDim") + stride = self.get_nodeattr("Stride") + if k % stride != 0: + hls_call += 
"_kernel_stride" + + if self.get_nodeattr("depthwise") == 1: + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{}_dws<ConvKernelDim1, IFMChannels1, Input_precision1, IFMDim1, + OFMDim1, SIMD1, Stride1> (in0, out, numReps, {});""".format( + hls_call, hls_ram_style + ) + ] + else: + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{}<ConvKernelDim1, IFMChannels1, Input_precision1, IFMDim1, + OFMDim1, SIMD1, Stride1> (in0, out, numReps, {});""".format( + hls_call, hls_ram_style + ) + ] def dataoutstrm(self): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") diff --git a/src/finn/custom_op/im2col.py b/src/finn/custom_op/im2col.py index 16446c15d46ee7996162f864708f7fde6cfedaf3..82a6b140f7af1be4e5c0f429d077b99c7865383e 100644 --- a/src/finn/custom_op/im2col.py +++ b/src/finn/custom_op/im2col.py @@ -21,8 +21,6 @@ def get_im2col_indices_nchw( """Returns im2col indices.""" # First figure out what the size of the output should be N, C, H, W = x_shape - assert (H + 2 * padding - field_height) % stride_y == 0 - assert (W + 2 * padding - field_width) % stride_x == 0 out_height = compute_conv_output_dim(H, field_height, stride_y, padding) out_width = compute_conv_output_dim(W, field_width, stride_x, padding) @@ -70,6 +68,9 @@ def im2col_indices_nchw( # * ifm is the number of input channels # * k is the convolutional kernel size +# note: for the innermost (dot product) dimension of k*k*ifm, we +# assume an internal ordering (k, k, ifm) + class Im2Col(CustomOp): def get_nodeattr_types(self): diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py index 5051bf34dc690daf8b6186859d3717cc8e217eee..b5fc85caf274edc9e7afc52df962862fa8a99ba3 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py @@ -78,7 +78,7 @@ def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, simd, stride, i def make_single_slidingwindow_modelwrapper( - 
k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt + k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt, dw=0 ): odt = idt inp = helper.make_tensor_value_info( @@ -102,6 +102,7 @@ def make_single_slidingwindow_modelwrapper( Stride=stride, inputDataType=idt.name, outputDataType=odt.name, + depthwise=dw, ) graph = helper.make_graph( nodes=[SlidingWindow_node], @@ -126,25 +127,29 @@ def prepare_inputs(input_tensor): # input datatype @pytest.mark.parametrize("idt", [DataType.BIPOLAR, DataType.INT2]) # kernel size -@pytest.mark.parametrize("k", [2, 4]) +@pytest.mark.parametrize("k", [2, 3]) # input dimension -@pytest.mark.parametrize("ifm_dim", [4, 6, 8]) +@pytest.mark.parametrize("ifm_dim", [6, 8]) # input channels -@pytest.mark.parametrize("ifm_ch", [2, 4]) # , 2, 3, 4]) +@pytest.mark.parametrize("ifm_ch", [2, 4]) # Stride @pytest.mark.parametrize("stride", [1, 2]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) # input channel parallelism ("SIMD") @pytest.mark.parametrize("simd", [1, 2]) +# depthwise +@pytest.mark.parametrize("dw", [0, 1]) @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_slidingwindow(idt, k, ifm_dim, ifm_ch, stride, exec_mode, simd): +def test_fpgadataflow_slidingwindow( + idt, k, ifm_dim, ifm_ch, stride, exec_mode, simd, dw +): ofm_dim = int(((ifm_dim - k) / stride) + 1) x = gen_finn_dt_tensor(idt, (1, ifm_dim, ifm_dim, ifm_ch)) model = make_single_slidingwindow_modelwrapper( - k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt + k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt, dw ) if exec_mode == "cppsim": @@ -168,6 +173,12 @@ def test_fpgadataflow_slidingwindow(idt, k, ifm_dim, ifm_ch, stride, exec_mode, k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt ) y_expected = oxe.execute_onnx(golden, input_dict)["outp"] - # if idt == DataType.BIPOLAR: - # y_expected = 2 * y_expected - 1 - assert (y_produced == y_expected).all() + if dw == 0: + assert (y_produced == y_expected).all() + else: + y_expected = y_expected.reshape( + 
1, ofm_dim, ofm_dim, k * k, ifm_ch // simd, simd + ) + y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5) + y_expected = y_expected.reshape(1, ofm_dim, ofm_dim, ifm_ch * k * k) + assert (y_produced == y_expected).all()