diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh index bfb09f86b8a493a44182450d09029de6486b8fbd..8117a251c4c92a0d19ce8f5fbeba6849a93f8e8f 100644 --- a/docker/finn_entrypoint.sh +++ b/docker/finn_entrypoint.sh @@ -12,11 +12,11 @@ gecho () { # checkout the correct dependency repo commits # the repos themselves are cloned in the Dockerfile -FINN_BASE_COMMIT=8908c6a3f6674c4fa790954bd41c23ee5bf053df +FINN_BASE_COMMIT=2c08044c5e9011c19911e731a18ac20d775bbf46 FINN_EXP_COMMIT=e9f97dcdb4db2f889b0f36af079a6a1792b7d4de BREVITAS_COMMIT=14abbe1e7ef82485d79415871fcf5766b0a40a00 CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4 -HLSLIB_COMMIT=2e49322d1bbc4969ca293843bda1f3f9c05456fc +HLSLIB_COMMIT=4d74baefa79df48b5a0348d63f39a26df075de51 PYVERILATOR_COMMIT=e2ff74030de3992dcac54bf1b6aad2915946e8cb OMX_COMMIT=1bae737669901e762f581af73348332b5c4b2ada diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 068950b89ae543f5a37c28d83d87ecfa605eaab4..b20b652254028c4ee5dc2edd1f1302ea3359019b 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -29,6 +29,9 @@ from finn.custom_op.fpgadataflow.convolutioninputgenerator import ( ConvolutionInputGenerator, ) +from finn.custom_op.fpgadataflow.convolutioninputgenerator1d import ( + ConvolutionInputGenerator1D, +) from finn.custom_op.fpgadataflow.downsampler import DownSampler from finn.custom_op.fpgadataflow.streamingfclayer_batch import StreamingFCLayer_Batch from finn.custom_op.fpgadataflow.streamingmaxpool_batch import StreamingMaxPool_Batch @@ -49,6 +52,9 @@ from finn.custom_op.fpgadataflow.vector_vector_activate_batch import ( ) from finn.custom_op.fpgadataflow.channelwise_op_batch import ChannelwiseOp_Batch from finn.custom_op.fpgadataflow.iodma import IODMA +from finn.custom_op.fpgadataflow.streamingdataflowpartition import ( + StreamingDataflowPartition, +) custom_op = dict() @@ -58,6 +64,7 @@ 
custom_op["DownSampler"] = DownSampler custom_op["StreamingMaxPool_Batch"] = StreamingMaxPool_Batch custom_op["StreamingFCLayer_Batch"] = StreamingFCLayer_Batch custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator +custom_op["ConvolutionInputGenerator1D"] = ConvolutionInputGenerator1D custom_op["TLastMarker"] = TLastMarker custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch custom_op["StreamingFIFO"] = StreamingFIFO @@ -71,3 +78,4 @@ custom_op["DuplicateStreams_Batch"] = DuplicateStreams_Batch custom_op["Vector_Vector_Activate_Batch"] = Vector_Vector_Activate_Batch custom_op["ChannelwiseOp_Batch"] = ChannelwiseOp_Batch custom_op["IODMA"] = IODMA +custom_op["StreamingDataflowPartition"] = StreamingDataflowPartition diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py index 3f400053df8de6ec1e53e39fb5a3edee15f3ab30..6e77cd3da7328fd81dccc2ff171a9ae84723d165 100644 --- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py @@ -61,12 +61,14 @@ class ConvolutionInputGenerator(HLSCustomOp): def get_nodeattr_types(self): my_attrs = { - "ConvKernelDim": ("i", True, 0), + "ConvKernelDim": ("ints", True, []), # [H, W] = [Y, X] "IFMChannels": ("i", True, 0), - "IFMDim": ("i", True, 0), - "OFMDim": ("i", True, 0), + "IFMDim": ("ints", True, []), # [H, W] = [Y, X] + "OFMDim": ("ints", True, []), # [H, W] = [Y, X] "SIMD": ("i", True, 0), - "Stride": ("i", True, 0), + "Stride": ("ints", True, [1, 1]), # [H, W] = [Y, X] + # note: only dilation=1 supported for now + "Dilation": ("ints", True, [1, 1]), # [H, W] = [Y, X] # FINN DataTypes for inputs, weights, outputs "inputDataType": ("s", True, ""), "outputDataType": ("s", True, ""), @@ -86,44 +88,59 @@ class ConvolutionInputGenerator(HLSCustomOp): my_attrs.update(super().get_nodeattr_types()) return my_attrs - def 
get_normal_input_shape(self): + def get_nodeattr(self, name): + # overriding get_nodeattr to check for square kernel/img.. requirement + # since this can't be done with the attribute restriction in nodeattr_types + # TODO non-square can be enabled in theory but needs testing + ret = super().get_nodeattr(name) + props_to_check = ["ConvKernelDim", "IFMDim", "OFMDim", "Stride", "Dilation"] + if name in props_to_check: + is_square = ret[0] == ret[1] + assert is_square, "Only square %s supported" % name + if name == "Dilation": + assert ret[0] == ret[1] == 1, "Only dilation=1 supported" + return ret - ifm_dim = self.get_nodeattr("IFMDim") + def get_normal_input_shape(self): + ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") - - ishape = (1, ifm_dim, ifm_dim, ifm_ch) + ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch) return ishape def get_folded_input_shape(self): - ifm_dim = self.get_nodeattr("IFMDim") + ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") simd = self.get_nodeattr("SIMD") assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" wf = int(ifm_ch / simd) - folded_ishape = (1, ifm_dim, ifm_dim, wf, simd) + folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd) return folded_ishape def get_normal_output_shape(self): - k = self.get_nodeattr("ConvKernelDim") - ifm_dim = self.get_nodeattr("IFMDim") + k_h, k_w = self.get_nodeattr("ConvKernelDim") + ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") - stride = self.get_nodeattr("Stride") + stride_h, stride_w = self.get_nodeattr("Stride") + dilation_h, dilation_w = self.get_nodeattr("Dilation") pad = 0 - ofm_dim = compute_conv_output_dim(ifm_dim, k, stride, pad) - oshape = (1, ofm_dim, ofm_dim, k * k * ifm_ch) + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w) + oshape = (1, ofm_dim_h, 
ofm_dim_w, k_h * k_w * ifm_ch) return oshape def get_folded_output_shape(self): - k = self.get_nodeattr("ConvKernelDim") - ifm_dim = self.get_nodeattr("IFMDim") + k_h, k_w = self.get_nodeattr("ConvKernelDim") + ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") - stride = self.get_nodeattr("Stride") + stride_h, stride_w = self.get_nodeattr("Stride") + dilation_h, dilation_w = self.get_nodeattr("Dilation") simd = self.get_nodeattr("SIMD") pad = 0 - ofm_dim = compute_conv_output_dim(ifm_dim, k, stride, pad) + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w) assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" - wf = int((k * k * ifm_ch) // simd) - folded_oshape = (1, ofm_dim, ofm_dim, wf, simd) + wf = int((k_h * k_w * ifm_ch) // simd) + folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd) return folded_oshape def make_shape_compatible_op(self, model): @@ -186,26 +203,31 @@ class ConvolutionInputGenerator(HLSCustomOp): def get_exp_cycles(self): simd = self.get_nodeattr("SIMD") ifm_ch = self.get_nodeattr("IFMChannels") - k = self.get_nodeattr("ConvKernelDim") - ifm_dim = self.get_nodeattr("IFMDim") - ofm_dim = self.get_nodeattr("OFMDim") - stride = self.get_nodeattr("Stride") + k_h, k_w = self.get_nodeattr("ConvKernelDim") + ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") + ofm_dim_h, ofm_dim_w = self.get_nodeattr("OFMDim") + stride_h, stride_w = self.get_nodeattr("Stride") + dilation_h, dilation_w = self.get_nodeattr("Dilation") + # since mmv != 1 is not supported yet, we set mmv for now to 1 mmv = 1 # see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h - cycles_write_block = (ofm_dim * k * k * (ifm_ch / simd)) / mmv - cycles_read_block = stride * ifm_dim * (ifm_ch / simd) + cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv + cycles_read_block = stride_w * ifm_dim_w * 
(ifm_ch / simd) max_cycles = max(cycles_write_block, cycles_read_block) - exp_cycles = ifm_dim * k * (ifm_ch / simd) + ofm_dim * max_cycles + exp_cycles = ( + ifm_dim_w * k_h * dilation_h * (ifm_ch / simd) + ofm_dim_h * max_cycles + ) return int(exp_cycles) def bram_estimation(self): + # NOTE: only tested with a square convolution simd = self.get_nodeattr("SIMD") ifm_ch = self.get_nodeattr("IFMChannels") - ifm_dim = self.get_nodeattr("IFMDim") - k = self.get_nodeattr("ConvKernelDim") - stride = self.get_nodeattr("Stride") + ifm_dim = self.get_nodeattr("IFMDim")[0] + k = self.get_nodeattr("ConvKernelDim")[0] + stride = self.get_nodeattr("Stride")[0] ram_style = self.get_nodeattr("ram_style") if ram_style == "block" or ram_style == "auto": ram_depth = ifm_dim * ifm_ch / simd @@ -232,11 +254,12 @@ class ConvolutionInputGenerator(HLSCustomOp): return 0 def lut_estimation(self): + # NOTE: only tested with a square convolution simd = self.get_nodeattr("SIMD") ifm_ch = self.get_nodeattr("IFMChannels") - ifm_dim = self.get_nodeattr("IFMDim") - k = self.get_nodeattr("ConvKernelDim") - stride = self.get_nodeattr("Stride") + ifm_dim = self.get_nodeattr("IFMDim")[0] + k = self.get_nodeattr("ConvKernelDim")[0] + stride = self.get_nodeattr("Stride")[0] ram_style = self.get_nodeattr("ram_style") if ram_style == "distributed": ram_luts = int( @@ -252,11 +275,12 @@ class ConvolutionInputGenerator(HLSCustomOp): return 300 + ram_luts def uram_estimation(self): + # NOTE: only tested with a square convolution simd = self.get_nodeattr("SIMD") ifm_ch = self.get_nodeattr("IFMChannels") - ifm_dim = self.get_nodeattr("IFMDim") - k = self.get_nodeattr("ConvKernelDim") - stride = self.get_nodeattr("Stride") + ifm_dim = self.get_nodeattr("IFMDim")[0] + k = self.get_nodeattr("ConvKernelDim")[0] + stride = self.get_nodeattr("Stride")[0] ram_style = self.get_nodeattr("ram_style") if ram_style == "ultra": return int( @@ -295,7 +319,7 @@ class ConvolutionInputGenerator(HLSCustomOp): assert ( 
inp.shape == exp_ishape ), """Input shape doesn't - match expected shape (1, ifm_dim, ifm_dim, ifm_ch).""" + match expected shape (1, ifm_dim_h, ifm_dim_w, ifm_ch).""" if self.get_input_datatype() == DataType.BIPOLAR: # store bipolar activations as binary inp = (inp + 1) / 2 @@ -354,26 +378,27 @@ class ConvolutionInputGenerator(HLSCustomOp): assert ( context[node.output[0]].shape == exp_oshape ), """Output - shape doesn't match expected shape (1, ofm_dim, ofm_dim, k*k*ifm_ch).""" + shape doesn't match expected shape (1, ofm_dim_h, ofm_dim_w, k_h*k_w*ifm_ch).""" def global_includes(self): self.code_gen_dict["$GLOBALS$"] = ['#include "slidingwindow.h"'] def defines(self, var): numReps = 1 + ifm_dim = self.get_nodeattr("IFMDim")[0] + ifm_ch = self.get_nodeattr("IFMChannels") + ofm_dim = self.get_nodeattr("OFMDim")[0] + k = self.get_nodeattr("ConvKernelDim")[0] + stride = self.get_nodeattr("Stride")[0] + simd = self.get_nodeattr("SIMD") + ifm_precision = self.get_input_datatype().bitwidth() + self.code_gen_dict["$DEFINES$"] = [ """#define ConvKernelDim1 {}\n #define IFMChannels1 {}\n #define Input_precision1 {}\n #define IFMDim1 {}\n #define OFMDim1 {}\n #define SIMD1 {}\n #define Stride1 {}\n #define numReps {}""".format( - self.get_nodeattr("ConvKernelDim"), - self.get_nodeattr("IFMChannels"), - self.get_input_datatype().bitwidth(), - self.get_nodeattr("IFMDim"), - self.get_nodeattr("OFMDim"), - self.get_nodeattr("SIMD"), - self.get_nodeattr("Stride"), - numReps, + k, ifm_ch, ifm_precision, ifm_dim, ofm_dim, simd, stride, numReps ) ] @@ -415,9 +440,11 @@ class ConvolutionInputGenerator(HLSCustomOp): } hls_ram_style = map_to_hls_ram_style[ram_style] hls_call = node.op_type - # check if non optimized ConvolutionInputGenerator is needed - k = self.get_nodeattr("ConvKernelDim") - stride = self.get_nodeattr("Stride") + + # check which ConvolutionInputGenerator is needed + k = self.get_nodeattr("ConvKernelDim")[0] + stride = self.get_nodeattr("Stride")[0] + if k % stride 
!= 0: hls_call += "_kernel_stride" diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py new file mode 100644 index 0000000000000000000000000000000000000000..782655b31b7f4add4c886a46845506af875190bc --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py @@ -0,0 +1,616 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import os + +import math +import numpy as np + +from finn.core.datatype import DataType +from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.general.im2col import compute_conv_output_dim +from onnx import TensorProto, helper +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + +# This operation should only be used for 1D convolutions. Either the +# IFMDim_H or IFMDim_W should be '1', which represents the so-called +# dummy-dimension + +# ONNX i/o tensor shape assumptions for ConvolutionInputGenerator1D: +# input 0 is the input tensor, shape NHWC = (1, IFMDim_H, IFMDim_W, IFMChannels) +# output 0 is the output tensor, shape NHWC: +# = (1, OFMDim_H, OFMDim_W, (ConvKernelDim_H*ConvKernelDim_W)*IFMChannels) + +# note: the actual data layout produced by the hlslib kernels is different +# for depthwise and non-depthwise ops. +# * non-depthwise SWG: (1, OFMDim_H, OFMDim_W, K_H, K_W, IFMChannels/SIMD, SIMD) +# * depthwise SWG: (1, OFMDim_H, OFMDim_W, IFMChannels/SIMD, K_H, K_W, SIMD) +# see test_fpgadataflow_slidingwindow.py for an example of how to transform +# between the two layouts + + +class ConvolutionInputGenerator1D(HLSCustomOp): + """Class that corresponds to one of the 1D finn-hlslib ConvolutionInputGenerator + (sliding window) function variants. Depending on the combination of + attributes (e.g. 
depthwise or not, whether dilation is 0) a different + variant will be picked for the actual HLS implementation.""" + + def __init__(self, onnx_node): + super().__init__(onnx_node) + + def get_nodeattr_types(self): + my_attrs = { + "ConvKernelDim": ("ints", True, []), # [H, W] = [Y, X] + "IFMChannels": ("i", True, 0), + "IFMDim": ("ints", True, []), # [H, W] = [Y, X] + "OFMDim": ("ints", True, []), # [H, W] = [Y, X] + "SIMD": ("i", True, 0), + "Stride": ("ints", True, []), # [H, W] = [Y, X] + "Dilation": ("ints", True, []), # [H, W] = [Y, X] + # FINN DataTypes for inputs, weights, outputs + "inputDataType": ("s", True, ""), + "outputDataType": ("s", True, ""), + "depthwise": ("i", False, 0, {0, 1}), + # FPGA resource type for ConvolutionInputGenerator input buffer + # auto -- let Vivado HLS decide + # block -- use BRAM + # distributed -- use LUTRAM + # ultra -- use URAM + "ram_style": ( + "s", + False, + "distributed", + {"auto", "block", "distributed", "ultra"}, + ), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_normal_input_shape(self): + ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") + ifm_ch = self.get_nodeattr("IFMChannels") + ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch) + return ishape + + def get_folded_input_shape(self): + ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") + ifm_ch = self.get_nodeattr("IFMChannels") + simd = self.get_nodeattr("SIMD") + assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" + wf = int(ifm_ch / simd) + folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd) + return folded_ishape + + def get_normal_output_shape(self): + k_h, k_w = self.get_nodeattr("ConvKernelDim") + ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") + ifm_ch = self.get_nodeattr("IFMChannels") + stride_h, stride_w = self.get_nodeattr("Stride") + dilation_h, dilation_w = self.get_nodeattr("Dilation") + pad = 0 + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h) + ofm_dim_w = 
compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w) + oshape = (1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch) + return oshape + + def get_folded_output_shape(self): + k_h, k_w = self.get_nodeattr("ConvKernelDim") + ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") + ifm_ch = self.get_nodeattr("IFMChannels") + stride_h, stride_w = self.get_nodeattr("Stride") + dilation_h, dilation_w = self.get_nodeattr("Dilation") + simd = self.get_nodeattr("SIMD") + pad = 0 + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w) + assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" + wf = int((k_h * k_w * ifm_ch) // simd) + folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd) + return folded_oshape + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpect input shape for ConvInpGen." 
+ # implement tensor with correct shape + values = np.random.randn(*oshape).astype(np.float32) + return helper.make_node( + "Constant", + inputs=[], + outputs=[self.onnx_node.output[0]], + value=helper.make_tensor( + name="const_tensor", + data_type=TensorProto.FLOAT, + dims=values.shape, + vals=values.flatten().astype(float), + ), + ) + + def infer_node_datatype(self, model): + node = self.onnx_node + # data type stays the same + dtype = model.get_tensor_datatype(node.input[0]) + model.set_tensor_datatype(node.output[0], dtype) + + def verify_node(self): + pass + + def get_input_datatype(self): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType")] + + def get_output_datatype(self): + """Returns FINN DataType of output.""" + return DataType[self.get_nodeattr("outputDataType")] + + def get_instream_width(self): + """Returns stream width, input and output stream width are equal for + the sliding window function""" + ibits = self.get_input_datatype().bitwidth() + simd = self.get_nodeattr("SIMD") + ifm_ch = self.get_nodeattr("IFMChannels") + assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" + in_width = simd * ibits + return in_width + + def get_outstream_width(self): + """Returns stream width, input and output stream width are equal for + the sliding window function, so the function to determine the input + stream width can be reused.""" + return self.get_instream_width() + + def get_number_output_values(self): + folded_oshape = self.get_folded_output_shape() + num_output_elems = np.prod(folded_oshape[:-1]) + return num_output_elems + + def get_1d_conv_attrs_normalized(self): + # support both (1, D) and (D, 1) cases transparently: + # For the kernel, presenting the input data of size D as + # [H, W] = [Y, X] = [1, D] or [D, 1] + # effectively gives the same result. 
Because the + # ConvolutionInputGenerator_NonSquare_Dilated(_dws) kernel currently only + # supports dilation>1 along the X-axis and the + # ConvolutionInputGenerator_NonSquare only works for stride>1 along the + # X-axis, we are working with the following assumption: + # the dummy ('1') dimension is the Y-dimension, i.e. + # images and kernels (and their attributes) of dimension + # [H, W] = [Y, X] = [D, 1] or [1, D] are always mapped to [1, D] + ifm_ch = self.get_nodeattr("IFMChannels") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim = self.get_nodeattr("IFMDim") + ofm_dim = self.get_nodeattr("OFMDim") + stride = self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + + # see defines() for an explanation + if ifm_dim[1] == 1: + ifm_dim = ifm_dim[::-1] + ofm_dim = ofm_dim[::-1] + k = k[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + return (ifm_ch, ifm_dim, ofm_dim, k, stride, dilation) + + def get_exp_cycles(self): + simd = self.get_nodeattr("SIMD") + ( + ifm_ch, + ifm_dim, + ofm_dim, + k, + stride, + dilation, + ) = self.get_1d_conv_attrs_normalized() + ifm_dim_h, ifm_dim_w = ifm_dim + ofm_dim_h, ofm_dim_w = ofm_dim + k_h, k_w = k + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + # since mmv != 1 is not supported yet, we set mmv for now to 1 + mmv = 1 + # see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h + cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv + cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd) + max_cycles = max(cycles_write_block, cycles_read_block) + exp_cycles = ( + ifm_dim_w * k_h * dilation_h * (ifm_ch / simd) + ofm_dim_h * max_cycles + ) + + return int(exp_cycles) + + def bram_estimation(self): + # NOTE: not tested for correctness + simd = self.get_nodeattr("SIMD") + ifm_ch = self.get_nodeattr("IFMChannels") + ifm_dim = np.prod(self.get_nodeattr("IFMDim")) + k = np.prod(self.get_nodeattr("ConvKernelDim")) + stride = 
np.prod(self.get_nodeattr("Stride")) + ram_style = self.get_nodeattr("ram_style") + if ram_style == "block" or ram_style == "auto": + ram_depth = ifm_dim * ifm_ch / simd + if ram_depth <= 512: + ram_width = 36 + elif ram_depth <= 1024: + ram_width = 18 + elif ram_depth <= 2048: + ram_width = 9 + elif ram_depth <= 4096: + ram_width = 4 + elif ram_depth <= 8192: + ram_width = 2 + else: + ram_width = 1 + return int( + (k + stride) + * ( + math.ceil(simd * self.get_input_datatype().bitwidth() / ram_width) + * math.ceil(ifm_dim * ifm_ch / simd / ram_depth) + ) + ) + else: + return 0 + + def lut_estimation(self): + # NOTE: not tested for correctness + simd = self.get_nodeattr("SIMD") + ifm_ch = self.get_nodeattr("IFMChannels") + ifm_dim = np.prod(self.get_nodeattr("IFMDim")) + k = np.prod(self.get_nodeattr("ConvKernelDim")) + stride = np.prod(self.get_nodeattr("Stride")) + ram_style = self.get_nodeattr("ram_style") + if ram_style == "distributed": + ram_luts = int( + (k + stride) + * ( + simd + * self.get_input_datatype().bitwidth() + * math.ceil(ifm_dim * ifm_ch / simd / 64) + ) + ) + else: + ram_luts = 0 + return 300 + ram_luts + + def uram_estimation(self): + # NOTE: not tested for correctness + simd = self.get_nodeattr("SIMD") + ifm_ch = self.get_nodeattr("IFMChannels") + ifm_dim = np.prod(self.get_nodeattr("IFMDim")) + k = np.prod(self.get_nodeattr("ConvKernelDim")) + stride = np.prod(self.get_nodeattr("Stride")) + ram_style = self.get_nodeattr("ram_style") + if ram_style == "ultra": + return int( + (k + stride) + * ( + math.ceil(simd * self.get_input_datatype().bitwidth() / 64) + * math.ceil(ifm_dim * ifm_ch / simd / 4096) + ) + ) + else: + return 0 + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + folded_oshape = self.get_folded_output_shape() + + # TODO ensure codegen 
dir exists + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input shape doesn't + match expected shape (1, ifm_dim, ifm_dim, ifm_ch).""" + if self.get_input_datatype() == DataType.BIPOLAR: + # store bipolar activations as binary + inp = (inp + 1) / 2 + export_idt = DataType.BINARY + else: + export_idt = self.get_input_datatype() + # reshape input into folded form + inp = inp.reshape(folded_ishape) + # make copy before saving array + reshaped_input = inp.copy() + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == folded_oshape + ), "cppsim \ + did not produce expected ofolded utput shape" + context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output 
= np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + # binary -> bipolar if needed + if self.get_output_datatype() == DataType.BIPOLAR: + out = context[node.output[0]] + out = 2 * out - 1 + context[node.output[0]] = out + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output + shape doesn't match expected shape (1, ofm_dim_h, ofm_dim_w, k_h*k_w*ifm_ch).""" + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "slidingwindow.h"'] + + def defines(self, var): + numReps = 1 + ( + ifm_ch, + ifm_dim, + ofm_dim, + k, + stride, + dilation, + ) = self.get_1d_conv_attrs_normalized() + simd = self.get_nodeattr("SIMD") + ifm_precision = self.get_input_datatype().bitwidth() + ifm_dim_y, ifm_dim_x = ifm_dim + ofm_dim_y, ofm_dim_x = ofm_dim + k_y, k_x = k + dilation_y, dilation_x = dilation + # For a 1d convolution with stride=[S,1] or [1,S], the finn-hlslib function + # of ConvInpGen must be created with [stride_y, stride_x] = [S, S]. 
+ # TODO: changes in finn-hlslib (slidingwindow.h) + stride_y = np.prod(stride) + stride_x = np.prod(stride) + + if dilation_x > 1: + assert ( + dilation_y == 1 + ), "Dilation value greater than 1 along y-axis is not yet supported" + self.code_gen_dict["$DEFINES$"] = [ + """ + #define ConvKernelDim1_x {}\n + #define ConvKernelDim1_y {}\n + #define IFMChannels1 {}\n + #define Input_precision1 {}\n + #define IFMDim1_x {}\n + #define IFMDim1_y {}\n + #define OFMDim1_x {}\n + #define OFMDim1_y {}\n + #define SIMD1 {}\n + #define Stride1_x {}\n + #define Stride1_y {}\n + #define Dilation1_x {}\n + #define Dilation1_y {}\n + #define numReps {} + """.format( + k_x, + k_y, + ifm_ch, + ifm_precision, + ifm_dim_x, + ifm_dim_y, + ofm_dim_x, + ofm_dim_y, + simd, + stride_x, + stride_y, + dilation_x, + dilation_y, + numReps, + ) + ] + else: + ofm_dim = self.get_nodeattr("OFMDim") + self.code_gen_dict["$DEFINES$"] = [ + """ + #define ConvKernelDim1_x {}\n + #define ConvKernelDim1_y {}\n + #define IFMChannels1 {}\n + #define Input_precision1 {}\n + #define IFMDim1_x {}\n + #define IFMDim1_y {}\n + #define OFMDim1_x {}\n + #define OFMDim1_y {}\n + #define SIMD1 {}\n + #define Stride1_x {}\n + #define Stride1_y {}\n + #define numReps {} + """.format( + k_x, + k_y, + ifm_ch, + ifm_precision, + ifm_dim_x, + ifm_dim_y, + ofm_dim_x, + ofm_dim_y, + simd, + stride_x, + stride_y, + numReps, + ) + ] + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + if dtype == DataType.BIPOLAR: + # use binary for bipolar storage + dtype = DataType.BINARY + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0);' + % 
(packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + ) + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width()) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width()) + ) + + def docompute(self): + ram_style = self.get_nodeattr("ram_style") + map_to_hls_ram_style = { + "auto": "ap_resource_dflt()", + "block": "ap_resource_bram()", + "distributed": "ap_resource_lutram()", + "ultra": "ap_resource_uram()", + } + hls_ram_style = map_to_hls_ram_style[ram_style] + hls_call = "ConvolutionInputGenerator" + # check which ConvolutionInputGenerator is needed + dilation_h, dilation_w = self.get_nodeattr("Dilation") + + hls_call += "_NonSquare" + if dilation_h > 1 or dilation_w > 1: + hls_call += "_Dilated" + if self.get_nodeattr("depthwise") == 1: + hls_call += "_dws" + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1, Input_precision1, + IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y, SIMD1, Stride1_x, Stride1_y, + Dilation1_x, Dilation1_y> (in0, out, numReps, {});""".format( + hls_call, hls_ram_style + ) + ] + elif self.get_nodeattr("depthwise") == 1: + hls_call += "_dws" + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1, Input_precision1, + IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y, SIMD1, Stride1_x, Stride1_y> + (in0, out, numReps, {});""".format( + hls_call, hls_ram_style + ) + ] + else: + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{}<ConvKernelDim1_x, ConvKernelDim1_y, IFMChannels1, Input_precision1, + IFMDim1_x, IFMDim1_y, OFMDim1_x, OFMDim1_y, SIMD1, Stride1_x, Stride1_y> + (in0, out, numReps, {});""".format( + hls_call, hls_ram_style + ) + ] + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = 
self.get_output_datatype() + if dtype == DataType.BIPOLAR: + # use binary for bipolar storage + dtype = DataType.BINARY + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + oshape = self.get_folded_output_shape() + oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") + + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + oshape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream<ap_uint<SIMD1*Input_precision1>> &in0, + hls::stream<ap_uint<SIMD1*Input_precision1>> &out)""".format( + self.onnx_node.name + ) + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out") + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE ap_ctrl_none port=return" + ) diff --git a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py index 27dfab54ec6d483d948dd383e54a44117d7c1a65..99f959bf59f06d6ab5b71dd7245d657f4964cca4 100644 --- a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py +++ b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py @@ -17,9 +17,17 @@ class FMPadding_Batch(HLSCustomOp): def get_nodeattr_types(self): my_attrs = { # spatial size of input images - "ImgDim": ("i", True, 0), + "ImgDim": ("ints", True, []), # [H, W] = [Y, X] # total padding (per dimension) to apply - "Padding": ("i", True, 2), + # NOTE: Current padding scheme that is applied tries to pad the same + # amount of zeros in front and behind the image for each dimension. 
+ # As an example, a padding scheme such as [1, x, 3, x] is equal + # to [2, x, 2, x] + "Padding": ( + "ints", + True, + [1, 1, 1, 1], + ), # [H_begin, W_begin, H_end, W_end] = [Y_begin, X_begin, Y_end, X_end] # number of channels in input image "NumChannels": ("i", True, 0), # SIMD Input parallelism @@ -38,31 +46,33 @@ class FMPadding_Batch(HLSCustomOp): def get_padded_odim(self): "Return the padded spatial size of the output." - - idim = self.get_nodeattr("ImgDim") + idim_h, idim_w = self.get_nodeattr("ImgDim") pad = self.get_nodeattr("Padding") - return idim + pad + pad_h = pad[0] + pad[2] + pad_w = pad[1] + pad[3] + odim_h = idim_h + pad_h + odim_w = idim_w + pad_w + return [odim_h, odim_w] def get_exp_cycles(self): - odim = self.get_padded_odim() + odim_h, odim_w = self.get_padded_odim() channels = self.get_nodeattr("NumChannels") simd = self.get_nodeattr("SIMD") batch_size = self.get_nodeattr("numInputVectors") - exp_cycles = (channels / simd) * batch_size * odim * odim + exp_cycles = (channels / simd) * batch_size * odim_h * odim_w return int(exp_cycles) def get_normal_input_shape(self): - idim = self.get_nodeattr("ImgDim") + idim_h, idim_w = self.get_nodeattr("ImgDim") num_ch = self.get_nodeattr("NumChannels") - - ishape = (1, idim, idim, num_ch) + ishape = (1, idim_h, idim_w, num_ch) return ishape def get_normal_output_shape(self): - odim = self.get_padded_odim() + odim_h, odim_w = self.get_padded_odim() num_ch = self.get_nodeattr("NumChannels") - oshape = (1, odim, odim, num_ch) + oshape = (1, odim_h, odim_w, num_ch) return oshape def get_folded_input_shape(self): @@ -148,20 +158,53 @@ class FMPadding_Batch(HLSCustomOp): self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"'] def defines(self, var): - self.code_gen_dict["$DEFINES$"] = [ - """#define ImgDim1 {}\n#define OutputDim1 {}\n - #define Padding1 {}\n#define NumChannels1 {}\n - #define PaddingStyle1 {}\n#define numReps {} - #define SIMD1 {}\n""".format( - self.get_nodeattr("ImgDim"), - 
self.get_padded_odim(), - self.get_nodeattr("Padding"), - self.get_nodeattr("NumChannels"), - self.get_nodeattr("PaddingStyle"), - self.get_nodeattr("numInputVectors"), - self.get_nodeattr("SIMD"), - ) - ] + idim_h, idim_w = self.get_nodeattr("ImgDim") + odim_h, odim_w = self.get_padded_odim() + pad = self.get_nodeattr("Padding") + pad_h = pad[0] + pad[2] + pad_w = pad[1] + pad[3] + is_square = idim_h == idim_w + + if is_square: + assert ( + pad_h == pad_w + ), "Only equal padding along the dimensions for square images is supported" + self.code_gen_dict["$DEFINES$"] = [ + """#define ImgDim1 {}\n#define OutputDim1 {}\n + #define Padding1 {}\n#define NumChannels1 {}\n + #define SIMD1 {}\n#define PaddingStyle1 {}\n + #define numReps {}\n""".format( + idim_h, + odim_h, + pad_h, + self.get_nodeattr("NumChannels"), + self.get_nodeattr("SIMD"), + self.get_nodeattr("PaddingStyle"), + self.get_nodeattr("numInputVectors"), + ) + ] + else: + self.code_gen_dict["$DEFINES$"] = [ + """ + #define OutputDim1_x {}\n + #define OutputDim1_y {}\n + #define Padding1_x {}\n + #define Padding1_y {}\n + #define NumChannels1 {}\n + #define SIMD1 {}\n + #define PaddingStyle1 {}\n + #define numReps {}\n + """.format( + odim_w, + odim_h, + pad_w, + pad_h, + self.get_nodeattr("NumChannels"), + self.get_nodeattr("SIMD"), + self.get_nodeattr("PaddingStyle"), + self.get_nodeattr("numInputVectors"), + ) + ] def read_npy_data(self): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -193,12 +236,26 @@ class FMPadding_Batch(HLSCustomOp): def docompute(self): in_t = self.get_input_datatype().get_hls_datatype_str() node = self.onnx_node - self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}<ImgDim1, OutputDim1, Padding1, NumChannels1,SIMD1, - {}, PaddingStyle1> (in0, out, numReps);""".format( - node.op_type, in_t - ) - ] + + idim_h, idim_w = self.get_nodeattr("ImgDim") + is_square = idim_h == idim_w + + if is_square: + hls_call = node.op_type + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{}<ImgDim1, 
OutputDim1, Padding1, NumChannels1,SIMD1, + {}, PaddingStyle1> (in0, out, numReps);""".format( + hls_call, in_t + ) + ] + else: + hls_call = "FMPadding_nonsquare_Batch" + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{}<OutputDim1_x, OutputDim1_y, Padding1_x, Padding1_y, NumChannels1, + SIMD1, {}, PaddingStyle1> (in0, out, numReps);""".format( + hls_call, in_t + ) + ] def dataoutstrm(self): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -270,7 +327,7 @@ class FMPadding_Batch(HLSCustomOp): assert ( inp.shape == exp_ishape ), """Input shape doesn't - match expected shape (1, ImgDim, ImgDim, NumChannels).""" + match expected shape (1, ImgDim_h, ImgDim_w, NumChannels).""" export_idt = self.get_input_datatype() reshaped_input = inp.reshape(folded_ishape) @@ -316,4 +373,4 @@ class FMPadding_Batch(HLSCustomOp): assert ( context[node.output[0]].shape == exp_oshape ), """Output shape doesn't match expected shape - (1, OutputDim, OutputDim, NumChannels).""" + (1, OutputDim_H, OutputDim_W, NumChannels).""" diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py index 2ab070b2fdc059a554930345a81abc368c29bfa7..c07188430244b635ab6b1ec192337da74550d57d 100644 --- a/src/finn/custom_op/fpgadataflow/hlscustomop.py +++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py @@ -38,11 +38,11 @@ from finn.util.basic import ( roundup_to_integer_multiple, get_rtlsim_trace_depth, ) -from finn.util.fpgadataflow import ( - IPGenBuilder, +from finn.util.pyverilator import ( pyverilate_get_liveness_threshold_cycles, rtlsim_multi_io, ) +from finn.util.hls import CallHLS from . 
import templates try: @@ -310,11 +310,11 @@ class HLSCustomOp(CustomOp): return [] def ipgen_singlenode_code(self): - """Builds the bash script for ip generation using the IPGenBuilder from - finn.util.fpgadataflow.""" + """Builds the bash script for ip generation using the CallHLS from + finn.util.hls.""" node = self.onnx_node code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - builder = IPGenBuilder() + builder = CallHLS() builder.append_tcl(code_gen_dir + "/hls_syn_{}.tcl".format(node.name)) builder.set_ipgen_path(code_gen_dir + "/project_{}".format(node.name)) builder.build(code_gen_dir) diff --git a/src/finn/custom_op/fpgadataflow/streamingdataflowpartition.py b/src/finn/custom_op/fpgadataflow/streamingdataflowpartition.py new file mode 100644 index 0000000000000000000000000000000000000000..53446ff1f2aba30e69bf188c1673c738440567fb --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/streamingdataflowpartition.py @@ -0,0 +1,94 @@ +# Copyright (c) 2020 Xilinx, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of Xilinx nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from finn.custom_op.base import CustomOp + +# TODO move StreamingDataflowPartition to HLSCustomOp base class + + +class StreamingDataflowPartition(CustomOp): + """Class that corresponds to the meta/container node StreamingDataflowPartition + which is a placeholder for a group of fpgadataflow nodes that have been separated + out into a FINN-ONNX model of its own. Note that it does not produce any HLS or + bitfile by itself.""" + + def get_nodeattr_types(self): + return { + "model": ("s", True, ""), + "res_estimate": ("s", False, ""), + "res_hls": ("s", False, ""), + "res_synth": ("s", False, ""), + "slr": ("i", False, -1), + "partition_id": ("i", False, 0), + "device_id": ("i", False, 0), + "mem_port": ("s", False, ""), + } + + def make_shape_compatible_op(self, model): + pass + + def infer_node_datatype(self, model): + pass + + def execute_node(self, context, graph): + # TODO add RPC execution with synthesized bitfile?
+ # whole-design rtlsim with PyVerilator may also be an alternative + pass + + def verify_node(self): + info_messages = [] + + # verify number of attributes + num_of_attr = 1 + if len(self.onnx_node.attribute) == num_of_attr: + info_messages.append("The number of attributes is correct") + else: + info_messages.append( + """The number of attributes is incorrect, + {} should have {} attributes""".format( + self.onnx_node.op_type, num_of_attr + ) + ) + # verify that all necessary attributes exist + try: + self.get_nodeattr("model") + info_messages.append("All necessary attributes exist") + except Exception: + info_messages.append( + """The necessary attributes do not exist. + StreamingDataflowPartition needs the following attribute(s): + model""" + ) + + # verify the number of inputs + if len(self.onnx_node.input) >= 1: + info_messages.append("The number of inputs is correct") + else: + info_messages.append("StreamingDataflowPartition needs 1 data input") + + return info_messages diff --git a/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py b/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py index 9a897d9fa16064017dfc02f500d2360ae8431b4a..fead30650c60d38f9cd70de8f1515f847e15276f 100644 --- a/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py +++ b/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py @@ -26,9 +26,9 @@ class Vector_Vector_Activate_Batch(HLSCustomOp): def get_nodeattr_types(self): my_attrs = { "PE": ("i", True, 0), - "Dim": ("i", True, 0), + "Dim": ("ints", True, []), # [H, W] "Channels": ("i", True, 0), - "Kernel": ("i", True, 0), + "Kernel": ("ints", True, []), # [H, W] "resType": ("s", False, "auto", {"auto", "lut", "dsp"}), "ActVal": ("i", False, 0), # FINN DataTypes for inputs, weights, outputs @@ -45,10 +45,10 @@ class Vector_Vector_Activate_Batch(HLSCustomOp): def minimize_accumulator_width(self, model): weights = model.get_initializer(self.onnx_node.input[1]) - k = self.get_nodeattr("Kernel") + 
k_h, k_w = self.get_nodeattr("Kernel") fm = self.get_nodeattr("Channels") # put weights into the shape expected by calculate_matvec_accumulator_range - weights = weights.reshape(fm, k * k).transpose() + weights = weights.reshape(fm, k_h * k_w).transpose() if len(self.onnx_node.input) > 2: thresholds = model.get_initializer(self.onnx_node.input[2]) else: @@ -85,9 +85,11 @@ class Vector_Vector_Activate_Batch(HLSCustomOp): tdt = DataType.get_smallest_possible(0 - tdt_max) else: tdt = DataType.get_smallest_possible(tdt_max) - assert np.vectorize(tdt.allowed)(threshold_tensor).all(), ( - "Thresholds in %s can't be expressed with type %s" - % (self.onnx_node.name, str(tdt)) + assert np.vectorize(tdt.allowed)( + threshold_tensor + ).all(), "Thresholds in %s can't be expressed with type %s" % ( + self.onnx_node.name, + str(tdt), ) self.set_nodeattr("accDataType", tdt.name) else: @@ -110,9 +112,9 @@ class Vector_Vector_Activate_Batch(HLSCustomOp): def calc_wmem(self): """Calculates and returns WMEM.""" ch = self.get_nodeattr("Channels") - k = self.get_nodeattr("Kernel") + k_h, k_w = self.get_nodeattr("Kernel") pe = self.get_nodeattr("PE") - wmem = k * k * ch // pe + wmem = k_h * k_w * ch // pe return wmem def calc_tmem(self): @@ -181,34 +183,34 @@ class Vector_Vector_Activate_Batch(HLSCustomOp): return out_width def get_folded_input_shape(self): - k = self.get_nodeattr("Kernel") - sf = k * k - dim = self.get_nodeattr("Dim") + k_h, k_w = self.get_nodeattr("Kernel") + sf = k_h * k_w + dim_h, dim_w = self.get_nodeattr("Dim") ch = self.get_nodeattr("Channels") pe = self.get_nodeattr("PE") nf = ch // pe - folded_input_shape = tuple([1, dim, dim, sf * nf, pe]) + folded_input_shape = tuple([1, dim_h, dim_w, sf * nf, pe]) return folded_input_shape def get_folded_output_shape(self): ch = self.get_nodeattr("Channels") pe = self.get_nodeattr("PE") nf = ch // pe - dim = self.get_nodeattr("Dim") - folded_output_shape = tuple([1, dim, dim, nf, pe]) + dim_h, dim_w = 
self.get_nodeattr("Dim") + folded_output_shape = tuple([1, dim_h, dim_w, nf, pe]) return folded_output_shape def get_normal_input_shape(self): - dim = self.get_nodeattr("Dim") + dim_h, dim_w = self.get_nodeattr("Dim") ch = self.get_nodeattr("Channels") - k = self.get_nodeattr("Kernel") - normal_input_shape = tuple([1, dim, dim, k * k * ch]) + k_h, k_w = self.get_nodeattr("Kernel") + normal_input_shape = tuple([1, dim_h, dim_w, k_h * k_w * ch]) return normal_input_shape def get_normal_output_shape(self): ch = self.get_nodeattr("Channels") - dim = self.get_nodeattr("Dim") - normal_output_shape = tuple([1, dim, dim, ch]) + dim_h, dim_w = self.get_nodeattr("Dim") + normal_output_shape = tuple([1, dim_h, dim_w, ch]) return normal_output_shape def get_number_output_values(self): @@ -218,13 +220,13 @@ class Vector_Vector_Activate_Batch(HLSCustomOp): def get_exp_cycles(self): pe = self.get_nodeattr("PE") ch = self.get_nodeattr("Channels") - dim = self.get_nodeattr("Dim") - k = self.get_nodeattr("Kernel") + dim_h, dim_w = self.get_nodeattr("Dim") + k_h, k_w = self.get_nodeattr("Kernel") # currently FINN supports for vvau a batch size of 1 batch_size = 1 # since mmv != 1 is not supported yet, we set mmv for now to 1 mmv = 1 - exp_cycles = ((ch * k * k) / pe) * batch_size * (dim * dim) / mmv + exp_cycles = ((ch * k_h * k_w) / pe) * batch_size * (dim_h * dim_w) / mmv return int(exp_cycles) def get_template_param_values(self): @@ -251,17 +253,17 @@ class Vector_Vector_Activate_Batch(HLSCustomOp): def get_hls_compatible_weight_tensor(self, orig_weight_matrix): pe = self.get_nodeattr("PE") ch = self.get_nodeattr("Channels") - k = self.get_nodeattr("Kernel") + k_h, k_w = self.get_nodeattr("Kernel") wmem = self.calc_wmem() assert orig_weight_matrix.shape == ( ch, 1, - k, - k, + k_h, + k_w, ), """Weights matrix doesn't have expected shape (channels, 1, kernel_size, kernel_size)""" ret = orig_weight_matrix - ret = ret.reshape(ch, k * k) + ret = ret.reshape(ch, k_h * k_w) # distribute 
rows between PEs ret = interleave_matrix_outer_dim_from_partitions(ret, pe) ret = ret.reshape(1, pe, wmem, 1) @@ -338,9 +340,11 @@ class Vector_Vector_Activate_Batch(HLSCustomOp): threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) # get computed threshold datatype from attribute tdt = DataType[self.get_nodeattr("accDataType")] - assert np.vectorize(tdt.allowed)(threshold_tensor).all(), ( - "Thresholds in %s can't be expressed with type %s" - % (self.onnx_node.name, str(tdt)) + assert np.vectorize(tdt.allowed)( + threshold_tensor + ).all(), "Thresholds in %s can't be expressed with type %s" % ( + self.onnx_node.name, + str(tdt), ) thresholds_hls_code = numpy_to_hls_code( threshold_tensor, tdt, "thresholds", False, True @@ -455,10 +459,10 @@ class Vector_Vector_Activate_Batch(HLSCustomOp): self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] def defines(self, var): - dim = self.get_nodeattr("Dim") - numReps = 1 * dim * dim - kernel = self.get_nodeattr("Kernel") - innerProdDim = kernel * kernel + dim_h, dim_w = self.get_nodeattr("Dim") + numReps = 1 * dim_h * dim_w + k_h, k_w = self.get_nodeattr("Kernel") + innerProdDim = k_h * k_w self.code_gen_dict["$DEFINES$"] = [ """#define Channels1 {}\n #define InnerProdDim {}\n #define SIMD1 1\n #define PE1 {}\n #define numReps {}""".format( @@ -664,8 +668,8 @@ class Vector_Vector_Activate_Batch(HLSCustomOp): else: mult_luts = (2 * math.ceil((W + A) / 6) - 1) * (W + A) # accumulator - k = self.get_nodeattr("Kernel") - acc_bits = W + A + math.ceil(math.log(k * k, 2)) + k_h, k_w = self.get_nodeattr("Kernel") + acc_bits = W + A + math.ceil(math.log(k_h * k_w, 2)) acc_luts = acc_bits # thresholds and threshold comparators thr_luts = 0 @@ -694,20 +698,20 @@ class Vector_Vector_Activate_Batch(HLSCustomOp): return int(mult_dsp) def get_op_and_param_counts(self): - k = self.get_nodeattr("Kernel") + k_h, k_w = self.get_nodeattr("Kernel") fm = self.get_nodeattr("Channels") - dim = self.get_nodeattr("Dim") + 
dim_h, dim_w = self.get_nodeattr("Dim") weight_bits = self.get_weight_datatype().bitwidth() inp_bits = self.get_input_datatype().bitwidth() - num_repetitions = int(dim * dim) - mac_count = k * k * fm * num_repetitions + num_repetitions = int(dim_h * dim_w) + mac_count = k_h * k_w * fm * num_repetitions # cannonicalize op type: highest bitwidth operand first s.t. # e.g. mac_8bx4b and mac_4bx8b don't appear as two different op types bw1 = min(inp_bits, weight_bits) bw2 = max(inp_bits, weight_bits) mac_op_type = "op_mac_%dbx%db" % (bw1, bw2) weight_param_type = "param_weight_%db" % (weight_bits) - weight_count = k * k * fm + weight_count = k_h * k_w * fm ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count} if self.get_nodeattr("noActivation") == 0: tdt = DataType[self.get_nodeattr("accDataType")] diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py index a221b510ab8d22f4daca1c32e717a9b482246712..1f3d40e929e29d16790a491bbfd7a4a5033f866f 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py @@ -64,30 +64,28 @@ class InferConvInpGen(Transformation): warnings.warn("Input is not int. Can't infer ConvInpGen") continue i2c_inst = getCustomOp(n) - stride = i2c_inst.get_nodeattr("stride") - k_attr = i2c_inst.get_nodeattr("kernel_size") - k_h = k_attr[0] - k_w = k_attr[1] + stride_h, stride_w = i2c_inst.get_nodeattr("stride") + k_h, k_w = i2c_inst.get_nodeattr("kernel_size") pad_attr = i2c_inst.get_nodeattr("pad_amount") pad_h = pad_attr[0] + pad_attr[2] pad_w = pad_attr[1] + pad_attr[3] + dilation_h, dilation_w = i2c_inst.get_nodeattr("dilations") # temporary checks until non-square conv support is finalized - assert pad_h == pad_w, "Non-square images not yet supported." - assert k_h == k_w, "Non-square kernels not yet supported." 
- k = k_h - pad = pad_attr[0] pad_val = i2c_inst.get_nodeattr("pad_value") depthwise = i2c_inst.get_nodeattr("depthwise") ifm_ch = i2c_in_shape[-1] - ifm_dim = i2c_in_shape[1] - ofm_dim = i2c_out_shape[1] + ifm_dim_h = i2c_in_shape[1] + ifm_dim_w = i2c_in_shape[2] + ofm_dim_h = i2c_out_shape[1] + ofm_dim_w = i2c_out_shape[2] # default params for ConvolutionInputGenerator ConvInpGen_node_idx = node_ind ConvInpGen_input = i2c_input - ConvInpGen_idim = ifm_dim + ConvInpGen_idim_h = ifm_dim_h + ConvInpGen_idim_w = ifm_dim_w - if pad > 0: + if pad_h > 0 or pad_w > 0: # if padding enabled, ensure pad_val supported by DataType # assert dt.allowed(pad_val),"""FMPadding_Batch DataType # must support pad_val""" @@ -95,12 +93,13 @@ class InferConvInpGen(Transformation): pad_val == 0 ), "FMPadding_Batch doesn't currently support pad_val!= 0" - odim_padding = ifm_dim + 2 * pad + odim_padding_h = ifm_dim_h + pad_h + odim_padding_w = ifm_dim_w + pad_w padding_out = helper.make_tensor_value_info( model.make_new_valueinfo_name(), TensorProto.FLOAT, - (1, odim_padding, odim_padding, ifm_ch), + (1, odim_padding_h, odim_padding_w, ifm_ch), ) graph.value_info.append(padding_out) padding_out = padding_out.name @@ -108,7 +107,8 @@ class InferConvInpGen(Transformation): ConvInpGen_node_idx += 1 ConvInpGen_input = padding_out - ConvInpGen_idim = odim_padding + ConvInpGen_idim_h = odim_padding_h + ConvInpGen_idim_w = odim_padding_w padding_node = helper.make_node( "FMPadding_Batch", @@ -116,15 +116,31 @@ class InferConvInpGen(Transformation): [padding_out], domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", - ImgDim=ifm_dim, - Padding=2 * pad, + ImgDim=[ifm_dim_h, ifm_dim_w], + Padding=pad_attr, NumChannels=ifm_ch, inputDataType=dt.name, SIMD=ifm_ch, ) graph.node.insert(node_ind, padding_node) - if stride > 1 and k == 1: + # Ensure that only supported HLS nodes are inserted + is_square_image = ConvInpGen_idim_h == ConvInpGen_idim_w + is_square_kernel = k_h == k_w + 
is_kernel_pointwise = k_h == 1 and k_w == 1 + is_equal_stride = stride_h == stride_w + is_1d_convolution = (k_h == 1 and k_w > 1 and ifm_dim_h == 1) or ( + k_h > 1 and k_w == 1 and ifm_dim_w == 1 + ) + + if (stride_h > 1 or stride_w > 1) and is_kernel_pointwise: + assert ( + is_square_image + ), "DownSampler currently only supports square input images." + assert is_equal_stride, """DownSampler currently only supports equal stride value + along different axes.""" + ConvInpGen_idim = ConvInpGen_idim_h + stride = stride_h # create DownSampler node ConvInpGen_node = helper.make_node( "DownSampler", @@ -141,22 +157,58 @@ class InferConvInpGen(Transformation): graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) else: # create equivalent ConvolutionInputGenerator node - ConvInpGen_node = helper.make_node( - "ConvolutionInputGenerator", - [ConvInpGen_input], - [i2c_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - ConvKernelDim=k, - IFMChannels=ifm_ch, - IFMDim=ConvInpGen_idim, - OFMDim=ofm_dim, - SIMD=ifm_ch, - Stride=stride, - inputDataType=dt.name, - outputDataType=dt.name, - depthwise=depthwise, - ) + if ( + is_square_image and is_square_kernel + ): # square images and square kernels + assert is_equal_stride, """Non-equal strides along different axes is not supported + for (non-)square convolutions""" + assert ( + dilation_h == 1 and dilation_w == 1 + ), """Dilation value != 1 is not supported + for square convolutions""" + ConvInpGen_node = helper.make_node( + "ConvolutionInputGenerator", + [ConvInpGen_input], + [i2c_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + ConvKernelDim=[k_h, k_w], + IFMChannels=ifm_ch, + IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w], + OFMDim=[ofm_dim_h, ofm_dim_w], + SIMD=ifm_ch, + Stride=[stride_h, stride_w], + Dilation=[dilation_h, dilation_w], + inputDataType=dt.name, + outputDataType=dt.name, + depthwise=depthwise, + ) + else: # non-square images and/or kernels + assert ( + 
is_1d_convolution + ), "ConvultionInputGenerator1D works only for 1D convolutions" + if dilation_h > 1 or dilation_w > 1: + assert ( + stride_h == 1 and stride_w == 1 + ), """Stride value of greater than 1 is not supported for convolutions + with dilation value greater than 1""" + ConvInpGen_node = helper.make_node( + "ConvolutionInputGenerator1D", + [ConvInpGen_input], + [i2c_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + ConvKernelDim=[k_h, k_w], + IFMChannels=ifm_ch, + IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w], + OFMDim=[ofm_dim_h, ofm_dim_w], + SIMD=ifm_ch, + Stride=[stride_h, stride_w], + Dilation=[dilation_h, dilation_w], + inputDataType=dt.name, + outputDataType=dt.name, + depthwise=depthwise, + ) graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) # remove old nodes graph.node.remove(n) @@ -338,7 +390,7 @@ class InferPool_Batch(Transformation): [im2col_in], [im2col_out], domain="finn.custom_op.general", - stride=stride, + stride=[stride, stride], kernel_size=[k, k], pad_amount=[pad, pad, pad, pad], pad_value=pad_value, @@ -684,7 +736,7 @@ class InferVVAU(Transformation): ): sparsity = model.get_tensor_sparsity(n.input[1]) try: - k = sparsity["dw"]["kernel_shape"] + k_h, k_w = sparsity["dw"]["kernel_shape"] except KeyError: raise Exception( """Sparsity doesn't indicate that MatMul @@ -702,25 +754,25 @@ class InferVVAU(Transformation): mm_output = n.output[0] W = model.get_initializer(mm_weight) # infer dense weight tensor from sparse weight matrix - # kernel size k which was extracted above and the value of + # kernel size (k_h, k_w) which was extracted above and the value of # the channels is used. 
- # the weight matrix has a shape of (k * k * Channels, Channels) + # the weight matrix has a shape of (k_h * k_w * Channels, Channels) # we need to reverse the creation of the sparse weight matrix - # to achieve a weight tensor of shape (Channels, 1, k, k) + # to achieve a weight tensor of shape (Channels, 1, k_h, k_w) channels = int(W.shape[1]) - # transpose to achieve a shape of (k * k * Channels, Channels) + # transpose to achieve a shape of (k_h * k_w * Channels, Channels) W = W.T - # reshape to (Channels, k, k, Channels) to transpose afterwards - # to (Channels, Channels, k, k) - W = W.reshape(channels, k, k, channels) + # reshape to (Channels, k_h, k_w, Channels) to transpose afterwards + # to (Channels, Channels, k_h, k_w) + W = W.reshape(channels, k_h, k_w, channels) W = W.transpose(0, 3, 1, 2) # now we can extract the values using a for loop over the channels # and fill a zero numpy array in the correct shape - w_tensor = np.zeros((channels, 1, k, k)) + w_tensor = np.zeros((channels, 1, k_h, k_w)) for ch in range(channels): w_tensor[ch][0] = W[ch][ch] model.set_initializer(mm_weight, w_tensor) - model.set_tensor_shape(mm_weight, (channels, 1, k, k)) + model.set_tensor_shape(mm_weight, (channels, 1, k_h, k_w)) # create node with pe=channels as default pe = channels assert ( @@ -762,9 +814,9 @@ class InferVVAU(Transformation): backend="fpgadataflow", resType="lut", PE=pe, - Dim=mm_in_shape[1], + Dim=[mm_in_shape[1], mm_in_shape[2]], Channels=channels, - Kernel=k, + Kernel=[k_h, k_w], inputDataType=idt.name, weightDataType=wdt.name, outputDataType=odt.name, @@ -790,9 +842,9 @@ class InferVVAU(Transformation): backend="fpgadataflow", resType="lut", PE=pe, - Dim=mm_in_shape[1], + Dim=[mm_in_shape[1], mm_in_shape[2]], Channels=channels, - Kernel=k, + Kernel=[k_h, k_w], inputDataType=idt.name, weightDataType=wdt.name, outputDataType=odt.name, @@ -1345,7 +1397,11 @@ class InferGlobalAccPoolLayer(Transformation): ) model.graph.value_info.append(mul_value) 
model.set_initializer(mul_value.name, np.array(1 / (vecs[1] * vecs[2]))) - new_mul = helper.make_node("Mul", [pool_out, mul_value.name], [result],) + new_mul = helper.make_node( + "Mul", + [pool_out, mul_value.name], + [result], + ) graph.node.insert(insert_point, new_pool) graph.node.insert(insert_point + 1, new_mul) node_ind += 1 diff --git a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py index 56bfb4306e555c716a9156d6f0949c339193eb38..419a6d8c494651862f55e63e6829a61fe8040599 100644 --- a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py +++ b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py @@ -148,7 +148,7 @@ class CreateDataflowPartition(Transformation): [df_out], # use the model attribute to mark the df model model=df_model_filename, - domain="finn.custom_op.general", + domain="finn.custom_op.fpgadataflow", partition_id=target_partition_id, slr=slr, mem_port=mem_port, diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py index f7d59978d8f8866aefb3028d570bb6b434df33b4..ea27eee04db6f90b50a58296ceaf6f6ed58602ac 100644 --- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py +++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py @@ -39,8 +39,8 @@ from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames -from finn.util.fpgadataflow import pyverilate_stitched_ip, is_fpgadataflow_node -from finn.util.pyverilator import reset_rtlsim, toggle_clk +from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.pyverilator import pyverilate_stitched_ip, reset_rtlsim, toggle_clk def reset_implementation(node): diff --git 
a/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..dfdb21fa72cbbeeb503f7ecc447b659ef7934fb9 --- /dev/null +++ b/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py @@ -0,0 +1,189 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from onnx import TensorProto, helper +import numpy as np +import pytest + +from finn.core.datatype import DataType +from finn.transformation.infer_shapes import InferShapes +from finn.transformation.infer_datatypes import InferDataTypes +from finn.transformation.general import GiveUniqueNodeNames +from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul + +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +import finn.core.onnx_exec as oxe +from finn.core.modelwrapper import ModelWrapper +from finn.util.basic import gen_finn_dt_tensor +import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls + +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.custom_op.general.im2col import compute_conv_output_dim +from finn.custom_op.registry import getCustomOp +from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer + + +# conv_config: +# [pad_h_begin, pad_w_begin, pad_h_end, pad_w_end] +# [kernel_size_h, kernel_size_w] +# [stride_h, stride_w] +# [dilation_h, dilation_w] +@pytest.mark.parametrize( + "conv_config", + [ + [[0, 0, 0, 0], [4, 1], [1, 1], [1, 1]], + [[1, 0, 1, 0], [4, 1], [1, 1], [1, 1]], + [[1, 0, 1, 0], [4, 1], [2, 1], [1, 1]], + # [[1, 0, 1, 0], [4, 1], [1, 1], [2, 1]] + ], +) +@pytest.mark.parametrize("depthwise", [False, True]) +@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +@pytest.mark.slow +@pytest.mark.vivado +def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, exec_mode): + pad, kernel_size, stride, dilation = conv_config + np.random.seed(0) + idt = DataType.UINT4 + + in_feature_dim_h, in_feature_dim_w = [10, 1] + in_chn = 16 + + k_h, k_w = 
kernel_size + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + pad_h = pad[0] + pad[2] + pad_w = pad[1] + pad[3] + + if depthwise is True: + group = out_chn = in_chn + conv_param_shape = [out_chn, 1, k_h, k_w] + else: + group = 1 + out_chn = 20 + conv_param_shape = [out_chn, in_chn, k_h, k_w] + + out_feature_dim_h = compute_conv_output_dim( + in_feature_dim_h, k_h, stride_h, pad_h, dilation_h + ) + out_feature_dim_w = compute_conv_output_dim( + in_feature_dim_w, k_w, stride_w, pad_w, dilation_w + ) + + input_shape = [1, in_chn, in_feature_dim_h, in_feature_dim_w] + output_shape = [1, out_chn, out_feature_dim_h, out_feature_dim_w] + + conv_weight_dt = DataType.UINT4 + + conv_config = {} + conv_config["dilations"] = [dilation_h, dilation_w] + conv_config["group"] = group + conv_config["kernel_shape"] = [k_h, k_w] + conv_config["pads"] = pad + conv_config["strides"] = [stride_h, stride_w] + + top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape) + top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape) + value_info = [ + helper.make_tensor_value_info("p1", TensorProto.FLOAT, conv_param_shape) + ] + + modelproto = helper.make_model( + helper.make_graph( + name="conv_test", + inputs=[top_in], + outputs=[top_out], + value_info=value_info, + nodes=[ + helper.make_node("Conv", ["top_in", "p1"], ["top_out"], **conv_config) + ], + ) + ) + + model = ModelWrapper(modelproto) + model.set_tensor_datatype("top_in", idt) + model.set_tensor_datatype("top_out", idt) + model.set_tensor_datatype("p1", conv_weight_dt) + model.set_initializer("p1", gen_finn_dt_tensor(conv_weight_dt, conv_param_shape)) + + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + + new_model = model.transform(LowerConvsToMatMul()) + new_model = new_model.transform(to_hls.InferConvInpGen()) + if depthwise is True: + new_model = new_model.transform(to_hls.InferVVAU()) + else: + new_model = 
new_model.transform(to_hls.InferQuantizedStreamingFCLayer()) + fc_node = new_model.get_nodes_by_op_type("StreamingFCLayer_Batch")[0] + fc_inst = getCustomOp(fc_node) + mw = fc_inst.get_nodeattr("MW") + mh = fc_inst.get_nodeattr("MH") + pe_cands = list(filter(lambda x: mh % x == 0, range(2, mh + 1))) + simd_cands = list(filter(lambda x: mw % x == 0, range(2, mw + 1))) + fc_inst.set_nodeattr("PE", pe_cands[0]) + fc_inst.set_nodeattr("SIMD", simd_cands[0]) + + new_model = new_model.transform(GiveUniqueNodeNames()) + new_model = new_model.transform(InferShapes()) + new_model = new_model.transform(InferDataTypes()) + + if exec_mode == "cppsim": + new_model = new_model.transform(PrepareCppSim()) + new_model = new_model.transform(CompileCppSim()) + new_model = new_model.transform(SetExecMode("cppsim")) + elif exec_mode == "rtlsim": + new_model = new_model.transform(SetExecMode("rtlsim")) + new_model = new_model.transform(GiveUniqueNodeNames()) + new_model = new_model.transform(PrepareIP("xc7z020clg400-1", 5)) + new_model = new_model.transform(HLSSynthIP()) + new_model = new_model.transform(PrepareRTLSim()) + else: + raise Exception("Unknown exec_mode") + + x = gen_finn_dt_tensor(idt, input_shape) + inp_dict = {model.graph.input[0].name: x} + assert oxe.compare_execution(model, new_model, inp_dict) + + if pad_h == 1 and pad_w == 1: + padding_node = new_model.get_nodes_by_op_type("FMPadding_Batch")[0] + padding_inst = getCustomOp(padding_node) + assert padding_inst.get_nodeattr("SIMD") == in_chn + + if depthwise is True and exec_mode == "rtlsim": + node = new_model.get_nodes_by_op_type("Vector_Vector_Activate_Batch")[0] + inst = getCustomOp(node) + cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") + exp_cycles_dict = new_model.analysis(exp_cycles_per_layer) + exp_cycles = exp_cycles_dict[node.name] + assert np.isclose(exp_cycles, cycles_rtlsim, atol=11) + assert exp_cycles != 0 diff --git a/tests/fpgadataflow/test_depthwise_convolution.py 
b/tests/fpgadataflow/test_depthwise_convolution.py index c406d78158c52226fea881c48bc178139653fea5..3efeacb6e6875c6defa799eb7154e02ce880e16a 100644 --- a/tests/fpgadataflow/test_depthwise_convolution.py +++ b/tests/fpgadataflow/test_depthwise_convolution.py @@ -98,7 +98,7 @@ def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding): inputs=["inp"], outputs=["im2col_out"], kernel_size=[k, k], - stride=stride, + stride=[stride, stride], pad_amount=[padding, padding, padding, padding], input_shape="(1, {}, {}, {})".format(ifm_dim, ifm_dim, ifm_ch), depthwise=1, @@ -142,7 +142,7 @@ def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding): W_matrix = W_matrix.reshape(ofm_ch, ifm_ch * k * k) model.set_initializer("W_sparse", W_matrix.T) - sparsity = {"dw": {"kernel_shape": k}} + sparsity = {"dw": {"kernel_shape": [k, k]}} model.set_tensor_sparsity("W_sparse", sparsity) if act is not None: diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py index 4e0e8c7c35a8fc8a30e0ba4c27a7c0d637e24d1f..1ec12263e22a199ac7da55fdf3418185cd38e555 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py @@ -47,7 +47,9 @@ from finn.custom_op.registry import getCustomOp from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer -def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt): +def make_single_im2col_modelwrapper( + k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt +): odt = idt inp = helper.make_tensor_value_info( "inp", TensorProto.FLOAT, [1, ifm_dim, ifm_dim, ifm_ch] @@ -61,12 +63,12 @@ def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, simd, stride, i ["inp"], ["outp"], domain="finn.custom_op.general", - backend="fpgadataflow", - stride=stride, + stride=[stride, stride], kernel_size=[k, k], input_shape=str((1, 
ifm_dim, ifm_dim, ifm_ch)), pad_amount=[0, 0, 0, 0], pad_value=0, + dilations=[dilation, dilation], ) graph = helper.make_graph( nodes=[im2col_node], name="im2col_graph", inputs=[inp], outputs=[outp] @@ -82,7 +84,7 @@ def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, simd, stride, i def make_single_slidingwindow_modelwrapper( - k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt, dw=0 + k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt, dw=0 ): odt = idt inp = helper.make_tensor_value_info( @@ -98,12 +100,13 @@ def make_single_slidingwindow_modelwrapper( ["outp"], domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", - ConvKernelDim=k, + ConvKernelDim=[k, k], IFMChannels=ifm_ch, - IFMDim=ifm_dim, - OFMDim=ofm_dim, + IFMDim=[ifm_dim, ifm_dim], + OFMDim=[ofm_dim, ofm_dim], SIMD=simd, - Stride=stride, + Stride=[stride, stride], + Dilation=[dilation, dilation], inputDataType=idt.name, outputDataType=odt.name, depthwise=dw, @@ -138,6 +141,9 @@ def prepare_inputs(input_tensor): @pytest.mark.parametrize("ifm_ch", [2, 4]) # Stride @pytest.mark.parametrize("stride", [1, 2]) +# Dilation +# Currently only dilation value of 1 is supported +@pytest.mark.parametrize("dilation", [1]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) # input channel parallelism ("SIMD") @@ -147,13 +153,13 @@ def prepare_inputs(input_tensor): @pytest.mark.slow @pytest.mark.vivado def test_fpgadataflow_slidingwindow( - idt, k, ifm_dim, ifm_ch, stride, exec_mode, simd, dw + idt, k, ifm_dim, ifm_ch, stride, dilation, exec_mode, simd, dw ): ofm_dim = int(((ifm_dim - k) / stride) + 1) x = gen_finn_dt_tensor(idt, (1, ifm_dim, ifm_dim, ifm_ch)) model = make_single_slidingwindow_modelwrapper( - k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt, dw + k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt, dw ) if exec_mode == "cppsim": @@ -174,9 +180,10 @@ def test_fpgadataflow_slidingwindow( # execute model y_produced = oxe.execute_onnx(model, 
input_dict)["outp"] golden = make_single_im2col_modelwrapper( - k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt + k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt ) y_expected = oxe.execute_onnx(golden, input_dict)["outp"] + if dw == 0: assert (y_produced == y_expected).all() else: diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py new file mode 100644 index 0000000000000000000000000000000000000000..6c83aab0d683cdb3888aca3c46bb339bd6330917 --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py @@ -0,0 +1,256 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest +import numpy as np + +from onnx import TensorProto, helper + +import finn.core.onnx_exec as oxe +from finn.core.datatype import DataType +from finn.core.modelwrapper import ModelWrapper +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.general import GiveUniqueNodeNames +from finn.util.basic import gen_finn_dt_tensor + +from finn.custom_op.registry import getCustomOp +from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer +from finn.custom_op.general.im2col import compute_conv_output_dim + + +def make_single_im2col_modelwrapper( + k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt +): + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + ofm_dim_h, ofm_dim_w = ofm_dim + + odt = idt + inp = helper.make_tensor_value_info( + "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch] + ) + outp = helper.make_tensor_value_info( + "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] + ) + + im2col_node = 
helper.make_node( + "Im2Col", + ["inp"], + ["outp"], + domain="finn.custom_op.general", + stride=[stride_h, stride_w], + kernel_size=[k_h, k_w], + input_shape=str((1, ifm_dim_h, ifm_dim_w, ifm_ch)), + dilations=[dilation_h, dilation_w], + pad_amount=[0, 0, 0, 0], + pad_value=0, + ) + graph = helper.make_graph( + nodes=[im2col_node], name="im2col_graph", inputs=[inp], outputs=[outp] + ) + + model = helper.make_model(graph, producer_name="im2col-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", odt) + + return model + + +def make_single_slidingwindow_modelwrapper( + k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt, dw=0 +): + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + ofm_dim_h, ofm_dim_w = ofm_dim + + odt = idt + inp = helper.make_tensor_value_info( + "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch] + ) + outp = helper.make_tensor_value_info( + "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] + ) + + SlidingWindow_node = helper.make_node( + "ConvolutionInputGenerator1D", + ["inp"], + ["outp"], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + ConvKernelDim=[k_h, k_w], + IFMChannels=ifm_ch, + IFMDim=[ifm_dim_h, ifm_dim_w], + OFMDim=[ofm_dim_h, ofm_dim_w], + SIMD=simd, + Stride=[stride_h, stride_w], + Dilation=[dilation_h, dilation_w], + inputDataType=idt.name, + outputDataType=odt.name, + depthwise=dw, + ) + graph = helper.make_graph( + nodes=[SlidingWindow_node], + name="slidingwindow_graph", + inputs=[inp], + outputs=[outp], + ) + + model = helper.make_model(graph, producer_name="slidingwindow-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", odt) + + return model + + +def prepare_inputs(input_tensor): + return {"inp": input_tensor} + + +# input datatype +# @pytest.mark.parametrize("idt", 
[DataType.BIPOLAR, DataType.INT8]) +@pytest.mark.parametrize("idt", [DataType.INT8]) +# kernel size +@pytest.mark.parametrize("k", [[4, 1]]) +# input dimension +@pytest.mark.parametrize("ifm_dim", [[10, 1]]) +# input channels +@pytest.mark.parametrize("ifm_ch", [1, 4]) +# Stride +@pytest.mark.parametrize("stride", [[1, 1], [2, 1]]) +# Dilation +# @pytest.mark.parametrize("dilation", [[1, 1], [2, 1]]) +@pytest.mark.parametrize("dilation", [[1, 1]]) +# execution mode +@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +# input channel parallelism ("SIMD") +@pytest.mark.parametrize("simd", [1, 4]) +# depthwise +@pytest.mark.parametrize("dw", [0, 1]) +# Flip dimensions +@pytest.mark.parametrize("flip", [False, True]) +@pytest.mark.slow +@pytest.mark.vivado +def test_fpgadataflow_slidingwindow_1d( + idt, k, ifm_dim, ifm_ch, stride, dilation, exec_mode, simd, dw, flip +): + if flip: + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + if (dilation_h > 1 or dilation_w > 1) and (stride_h > 1 or stride_w > 1): + pytest.skip( + """Dilation value greater than 1 and stride greater than 1 + currently not supported for 1D convolutions""" + ) + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + x = gen_finn_dt_tensor(idt, (1, ifm_dim_h, ifm_dim_w, ifm_ch)) + model = make_single_slidingwindow_modelwrapper( + k=k, + ifm_ch=ifm_ch, + ifm_dim=ifm_dim, + ofm_dim=ofm_dim, + simd=simd, + stride=stride, + dilation=dilation, + idt=idt, + dw=dw, + ) + + if exec_mode == "cppsim": + model = model.transform(SetExecMode("cppsim")) + model = model.transform(PrepareCppSim()) + model = 
model.transform(CompileCppSim()) + elif exec_mode == "rtlsim": + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + else: + raise Exception("Unknown exec_mode in test_fpgadataflow_slidingwindow") + + # prepare input data + input_dict = prepare_inputs(x) + # execute model + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + golden = make_single_im2col_modelwrapper( + k=k, + ifm_ch=ifm_ch, + ifm_dim=ifm_dim, + ofm_dim=ofm_dim, + simd=simd, + stride=stride, + dilation=dilation, + idt=idt, + ) + y_expected = oxe.execute_onnx(golden, input_dict)["outp"] + + if dw == 0: + assert (y_produced == y_expected).all() + else: + y_expected = y_expected.reshape( + 1, ofm_dim_h, ofm_dim_w, k_h * k_w, ifm_ch // simd, simd + ) + y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5) + y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, ifm_ch * k_h * k_w) + assert (y_produced == y_expected).all() + + if exec_mode == "rtlsim": + node = model.get_nodes_by_op_type("ConvolutionInputGenerator1D")[0] + inst = getCustomOp(node) + cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") + exp_cycles_dict = model.analysis(exp_cycles_per_layer) + exp_cycles = exp_cycles_dict[node.name] + assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) + assert exp_cycles != 0 diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py index b2835d578b03ee689330d53a9a7b233c9b9f4222..ab47b300136bd95622f064b30e3bbaae76a61597 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py +++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py @@ -54,15 +54,20 @@ target_clk_ns = 10 def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_style): + pad_h = padding[0] + padding[2] + pad_w = padding[1] + padding[3] + idim_h, idim_w = idim + assert 
pad_style == 2, "only pad_style == 2 supported in hlslib" - assert padding > 0, "Output dim should be greater than input dim" - odim = idim + padding + assert pad_h > 0 or pad_w > 0, "Output dim should be greater than input dim" + odim_h = idim_h + pad_h + odim_w = idim_w + pad_w inp = helper.make_tensor_value_info( - "inp", TensorProto.FLOAT, [1, idim, idim, num_ch] + "inp", TensorProto.FLOAT, [1, idim_h, idim_w, num_ch] ) outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, odim, odim, num_ch] + "outp", TensorProto.FLOAT, [1, odim_h, odim_w, num_ch] ) FMPadding = helper.make_node( @@ -94,9 +99,9 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_sty # input image dimension -@pytest.mark.parametrize("idim", [8]) +@pytest.mark.parametrize("idim", [[8, 8], [10, 8]]) # number of rows and number of cols to add -@pytest.mark.parametrize("pad", [2, 3]) +@pytest.mark.parametrize("pad", [[1, 1, 1, 1], [1, 1, 2, 2], [1, 3, 2, 3]]) # number of channels @pytest.mark.parametrize("num_ch", [2, 4]) # Input parallelism @@ -112,10 +117,22 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_sty def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, pad_style, idt, mode): if num_ch % simd != 0: pytest.skip(" num_ch % simd != 0, skipping") + + idim_h, idim_w = idim + pad_h = pad[0] + pad[2] + pad_w = pad[1] + pad[3] + + if idim_h == idim_w and pad_h != pad_w: + pytest.skip( + """Only equal padding along the dimensions for square images + is supported, skipping""" + ) + # generate input data - x = gen_finn_dt_tensor(idt, [1, idim, idim, num_ch]) + x = gen_finn_dt_tensor(idt, [1, idim_h, idim_w, num_ch]) input_dict = {"inp": x} - odim = idim + pad + odim_h = idim_h + pad_h + odim_w = idim_w + pad_w model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, simd, idt, pad_style) model = model.transform(InferShapes()) @@ -129,24 +146,26 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, pad_style, 
idt, mode): model = model.transform(HLSSynthIP()) model = model.transform(PrepareRTLSim()) y_produced = oxe.execute_onnx(model, input_dict)["outp"] - expected_oshape = (1, odim, odim, num_ch) + expected_oshape = (1, odim_h, odim_w, num_ch) assert y_produced.shape == expected_oshape # calculate reference # calculate correct pad according to parameters if pad_style == 2: - if pad % 2 == 0: - pad_up = pad // 2 - pad_left = pad // 2 + if pad_h % 2 == 0: + pad_up = pad_h // 2 + else: + pad_up = pad_h // 2 + 1 + if pad_w % 2 == 0: + pad_left = pad_w // 2 else: - pad_up = pad // 2 + 1 - pad_left = pad // 2 + 1 + pad_left = pad_w // 2 + 1 else: - pad_up = pad // 2 - pad_left = pad // 2 + pad_up = pad_h // 2 + pad_left = pad_w // 2 - pad_down = pad - pad_up - pad_right = pad - pad_left + pad_down = pad_h - pad_up + pad_right = pad_w - pad_left y_expected = np.pad( x, ((0, 0), (pad_up, pad_down), (pad_left, pad_right), (0, 0)), "constant" diff --git a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py index 23d7610dfdf434602f326e1117b072f312962295..4fa780548a544d92e02b28486ae1e325ff1f9a9b 100644 --- a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py +++ b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py @@ -52,7 +52,7 @@ from finn.util.basic import ( alveo_part_map, alveo_default_platform, ) -from finn.util.fpgadataflow import pyverilate_stitched_ip +from finn.util.pyverilator import pyverilate_stitched_ip from finn.util.test import load_test_checkpoint_or_skip from finn.transformation.fpgadataflow.synth_ooc import SynthOutOfContext from finn.transformation.infer_data_layouts import InferDataLayouts diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau.py b/tests/fpgadataflow/test_fpgadataflow_vvau.py new file mode 100644 index 0000000000000000000000000000000000000000..4756d4fe18ccd4934b4041c70bf2f3a1bb577ec7 --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_vvau.py @@ -0,0 +1,242 @@ +# Copyright (c) 2020, Xilinx +# 
All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import pytest + +import numpy as np +from onnx import TensorProto, helper + +import finn.core.onnx_exec as oxe +from finn.core.datatype import DataType +from finn.core.modelwrapper import ModelWrapper +from finn.util.basic import gen_finn_dt_tensor +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.general import GiveUniqueNodeNames +from finn.custom_op.general.multithreshold import multithreshold + +from finn.custom_op.registry import getCustomOp +from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer + + +def _infer_sparse_weight_tensor(W_conv, k_h, k_w, channels): + W_sparse = np.zeros((channels, channels, k_h, k_w)) + for ch in range(channels): + W_sparse[ch][ch] = W_conv[ch][0] + W_conv = W_sparse.astype(np.float32) + W_matmul = W_conv.transpose(0, 2, 3, 1) + W_matmul = W_matmul.reshape(channels, channels * k_h * k_w) + W_matmul = W_matmul.T + + return W_matmul + + +def _calculate_dot_prod_range(dt_a, dt_b, len): + """Returns the (min,max) values a dot product between two (un)signed vectors of + types dt_a and dt_b of len elements can take.""" + min_prod = 2 ** 30 + max_prod = -(2 ** 30) + for a_val in [dt_a.min(), dt_a.max()]: + for b_val in [dt_b.min(), dt_b.max()]: + prod = a_val * b_val * len + if prod < min_prod: + min_prod = prod + if prod > max_prod: + max_prod = prod + return (min_prod, max_prod) + + +def _make_single_vvau_modelwrapper( + W, pe, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T=None, tdt=None +): + in_shape = [1, dim_h, dim_w, k_h * k_w * channels] # [N, H, W, K*K*CH] + out_shape = [ + 1, + dim_h, + dim_w, 
+ channels, + ] # [N, H, W, OFM_CH] (OFM_CH=IFM_CH because depthwise convolution) + + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, in_shape) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, out_shape) + + if T is not None: + no_act = 0 + node_inp_list = ["inp", "weights", "thresh"] + actval = odt.min() + else: + no_act = 1 + node_inp_list = ["inp", "weights"] + actval = 0 + + VVAU_node = helper.make_node( + "Vector_Vector_Activate_Batch", + node_inp_list, + ["outp"], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + PE=pe, + Dim=[dim_h, dim_w], + Channels=channels, + Kernel=[k_h, k_w], + resType="lut", + ActVal=actval, + inputDataType=idt.name, + weightDataType=wdt.name, + outputDataType=odt.name, + noActivation=no_act, + ) + + graph = helper.make_graph( + nodes=[VVAU_node], name="vvau_graph", inputs=[inp], outputs=[outp] + ) + + model = helper.make_model(graph, producer_name="vvau-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", odt) + model.set_tensor_datatype("weights", wdt) + + model.set_initializer("weights", W) + model.set_tensor_shape("weights", (channels, 1, k_h, k_w)) + + if T is not None: + model.set_tensor_datatype("thresh", tdt) + model.set_initializer("thresh", T) + + return model + + +def prepare_inputs(input_tensor): + return {"inp": input_tensor} + + +# mem_mode: const or decoupled +@pytest.mark.parametrize("idt", [DataType.UINT4, DataType.UINT8]) +# weight datatype +@pytest.mark.parametrize("wdt", [DataType.INT4]) +# activation: None or DataType +@pytest.mark.parametrize("act", [DataType.UINT4, None]) +# PE +@pytest.mark.parametrize("pe", [1, "channels"]) +# Input image shape +@pytest.mark.parametrize("dim_h", [10]) +@pytest.mark.parametrize("dim_w", [10, 1]) +# Kernel shape +@pytest.mark.parametrize("k_h", [3]) +@pytest.mark.parametrize("k_w", [3, 1]) +# Number of input and output channels +@pytest.mark.parametrize("channels", [3, 
4]) +# execution mode +@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +@pytest.mark.slow +@pytest.mark.vivado +def test_fpgadataflow_vvau( + idt, wdt, act, pe, dim_h, dim_w, k_h, k_w, channels, exec_mode +): + if pe == "channels": + pe = channels + + if dim_w == 1 and k_w != 1: + pytest.skip("1D image requires 1D kernel, skipping.") + + if channels % pe != 0: + pytest.skip("Requirement Channels divisable by PE is violated.") + + # Generate weights in expected shape for ONNX and HLS node + W = gen_finn_dt_tensor(wdt, (channels, 1, k_h, k_w)) # shape: [channels, 1, k, k] + W_onnx = _infer_sparse_weight_tensor( + W, k_h, k_w, channels + ) # shape: [k*k*channels, channels] + + # Generate inputs in expected format for ONNX and HLS node + x = gen_finn_dt_tensor(idt, (1, dim_h, dim_w, k_h * k_w * channels)) + x_vvau = x.reshape(1, dim_h, dim_w, k_h * k_w, channels // pe, pe) + x_vvau = x_vvau.transpose(0, 1, 2, 4, 3, 5) + x_vvau = x_vvau.reshape(1, dim_h, dim_w, channels * k_h * k_w) + + if act is None: + T = None + tdt = None + odt = DataType.INT32 + else: + odt = act + (min_v, max_v) = _calculate_dot_prod_range(idt, wdt, k_h * k_w * channels) + n_steps = act.get_num_possible_values() - 1 + T = np.random.randint(min_v, max_v - 1, (channels, n_steps)).astype(np.float32) + T = np.sort(T, axis=1) + tdt = DataType.INT32 + + model = _make_single_vvau_modelwrapper( + W, pe, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T, tdt + ) + + if exec_mode == "cppsim": + model = model.transform(SetExecMode("cppsim")) + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + elif exec_mode == "rtlsim": + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + else: + raise Exception("Unknown exec_mode in test_fpgadataflow_vvau") + + input_dict = 
prepare_inputs(x_vvau) + + # Calculate output + y_expected = np.matmul(x, W_onnx) # Y is in [N, H, W, C] format + if T is not None: + # Reshape Y, as multithreshold expects Y to be in [N, C, H, W] format + y_expected = np.transpose(y_expected, (0, 3, 1, 2)) + y_expected = multithreshold(y_expected, T) + y_expected = np.transpose(y_expected, (0, 2, 3, 1)) + # signed offset + y_expected += act.min() + + y_produced = oxe.execute_onnx(model, input_dict, return_full_exec_context=False)[ + "outp" + ] + + assert (y_produced == y_expected).all(), "cppsim failed" + + if exec_mode == "rtlsim": + node = model.get_nodes_by_op_type("Vector_Vector_Activate_Batch")[0] + inst = getCustomOp(node) + cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") + exp_cycles_dict = model.analysis(exp_cycles_per_layer) + exp_cycles = exp_cycles_dict[node.name] + assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) + assert exp_cycles != 0