diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh
index 132d5bdaa286ba3e50bbd06971e9139f5859ef11..b312737c317517ca0ab19c74cf22284b5977b661 100644
--- a/docker/finn_entrypoint.sh
+++ b/docker/finn_entrypoint.sh
@@ -15,7 +15,7 @@ gecho () {
 # the repos themselves are cloned in the Dockerfile
 BREVITAS_COMMIT=f9a27226d4acf1661dd38bc449f71f89e0983cce
 CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4
-HLSLIB_COMMIT=8aed899c278c36c977a249558d71795086cf852c
+HLSLIB_COMMIT=8f9f2018762f654f196b666838aeaf6fc730ad9a
 PYVERILATOR_COMMIT=c97a5ba41bbc7c419d6f25c74cdf3bdc3393174f
 PYNQSHELL_COMMIT=0c82a61b0ec1a07fa275a14146233824ded7a13d
 OMX_COMMIT=1bae737669901e762f581af73348332b5c4b2ada
diff --git a/src/finn/core/onnx_exec.py b/src/finn/core/onnx_exec.py
index 7c3123cd5eb29a54dc5cbfb912225ad3fdb0f219..61c5f26192965f1b3ebf81c35a25417c43e9d59a 100644
--- a/src/finn/core/onnx_exec.py
+++ b/src/finn/core/onnx_exec.py
@@ -103,6 +103,7 @@ def execute_node(node, context, graph):
                 raise Exception(
                     """Output shapes disagree after node execution:
                     found %s vs expected %s"""
+                    % (str(output_list[list_ind].shape), str(context[outp].shape))
                 )
             context[outp] = output_list[list_ind]
diff --git a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad68a4bde29123b2498ac7789048bcd2e13bf3bc
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
@@ -0,0 +1,576 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from math import ceil
+import os
+
+import numpy as np
+
+from onnx import TensorProto, helper
+from finn.core.datatype import DataType
+from finn.custom_op.fpgadataflow import HLSCustomOp
+from finn.util.data_packing import (
+    npy_to_rtlsim_input,
+    numpy_to_hls_code,
+    rtlsim_output_to_npy,
+)
+from . import templates
+
+# ONNX i/o tensor shape assumptions for channelwise ops:
+# input 0 is the input tensor, shape (..., NumChannels)
+# input 1 is the channelwise parameter tensor, shape (NumChannels, params_per_channel)
+# output 0 is the output tensor, shape (..., NumChannels) - same as input
+# the ... here can be any shape (representing groups of vectors)
+
+
+class ChannelwiseOp_Batch(HLSCustomOp):
+    """Class that corresponds to the finn-hlslib Thresholding_Batch function.
+    It can implement a variety of channel-wise parametrized operations,
+    including Add, Mul and multi-thresholding.
+    """
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+        self.decoupled_wrapper = templates.decoupled_wrapper
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            # channelwise "map" function to apply:
+            # one of cmp_le, cmp_ge, add, mul
+            "Func": ("s", False, "cmp_le"),
+            "PE": ("i", True, 0),
+            "NumChannels": ("i", True, 0),
+            # string defining memory resource type for parameters
+            "ram_style": ("s", False, "distributed"),
+            # FINN DataTypes for inputs, parameters, outputs
+            "inputDataType": ("s", True, ""),
+            "paramDataType": ("s", True, ""),
+            "outputDataType": ("s", True, ""),
+            # input and output FIFO depths
+            "inFIFODepth": ("i", False, 0),
+            "outFIFODepth": ("i", False, 0),
+            # number of input vectors, examples:
+            # [1] is a single vector (like a FC layer with batch=1)
+            # [4] is four vectors (like a FC layer with batch=4)
+            # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
+            "numInputVectors": ("ints", False, [1]),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def calc_tmem(self):
+        """Calculates and returns TMEM, the depth of the memory used
+        to store the channelwise op parameters."""
+        chn = self.get_nodeattr("NumChannels")
+        pe = self.get_nodeattr("PE")
+        return chn // pe
+
+    def make_shape_compatible_op(self, model):
+        oshape = self.get_normal_output_shape()
+        # implement tensor with correct shape
+        values = np.random.randn(*oshape).astype(np.float32)
+        return helper.make_node(
+            "Constant",
+            inputs=[],
+            outputs=[self.onnx_node.output[0]],
+            value=helper.make_tensor(
+                name="const_tensor",
+                data_type=TensorProto.FLOAT,
+                dims=values.shape,
+                vals=values.flatten().astype(float),
+            ),
+        )
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        # check input datatype against property
+        idt_name = self.get_input_datatype().name
+        exp_idt_name = self.get_nodeattr("inputDataType")
+        assert exp_idt_name == idt_name, "Bad input DataType for ChannelwiseOp layer"
+        # TODO: dynamically infer/update odt based on idt as done in ConvertToHLSLayers?
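+        # (hypothetical sketch of that TODO, not active code: for Func=add,
+        # odt could be derived from idt plus the parameter range, e.g. an INT4
+        # input added to INT4 params always fits in an INT5 output, which is
+        # the same bound InferChannelwiseLinearLayer.get_smallest_possible
+        # computes in convert_to_hls_layers.py)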
+        # set output datatype from property
+        odt = self.get_output_datatype()
+        model.set_tensor_datatype(node.output[0], odt)
+
+    def verify_node(self):
+        info_messages = []
+        # verify that "domain" is set to "finn"
+        domain_value = self.onnx_node.domain
+        if domain_value == "finn":
+            info_messages.append("Attribute domain is set correctly")
+        else:
+            info_messages.append('Attribute domain should be set to "finn"')
+
+        # verify that "backend" is set to "fpgadataflow"
+        backend_value = self.get_nodeattr("backend")
+        if backend_value == "fpgadataflow":
+            info_messages.append("Attribute backend is set correctly")
+        else:
+            info_messages.append('Attribute backend should be set to "fpgadataflow"')
+
+        # verify that all necessary attributes exist
+        # TODO collect automatically from get_nodeattr_types
+        try:
+            self.get_nodeattr("code_gen_dir_cppsim")
+            self.get_nodeattr("executable_path")
+            self.get_nodeattr("NumChannels")
+            self.get_nodeattr("PE")
+            self.get_nodeattr("inputDataType")
+            self.get_nodeattr("paramDataType")
+            self.get_nodeattr("outputDataType")
+            info_messages.append("All necessary attributes exist")
+        except Exception:
+            info_messages.append(
+                """The required ChannelwiseOp_Batch attributes do not exist."""
+            )
+
+        return info_messages
+
+    def bram_estimation(self):
+        """Calculates BRAM cost if resource set to BRAM"""
+        style = self.get_nodeattr("ram_style")
+        P = self.get_nodeattr("PE")
+        idt = self.get_input_datatype()
+        A = idt.bitwidth()
+        tmem = self.calc_tmem()
+
+        if style == "block" and tmem > 1:
+            return int(ceil(A * P / 16)) * int(ceil(tmem / 1024))
+        else:
+            return 0
+
+    def lut_estimation(self):
+        """Calculates LUT cost, taking memory resource type into account"""
+        # TODO add in/out FIFO contributions
+        style = self.get_nodeattr("ram_style")
+        P = self.get_nodeattr("PE")
+        idt = self.get_input_datatype()
+        A = idt.bitwidth()
+        tmem = self.calc_tmem()
+        # cost of comparators
+        comparator_cost = A * P
+        # cost of LUTRAM
+        if style == "distributed" and tmem > 1:
+            lutram_cost = P * A * int(ceil(tmem / 64))
+        else:
+            lutram_cost = 0
+        # total cost
+        return comparator_cost + lutram_cost
+
+    def get_input_datatype(self):
+        """Returns FINN DataType of input."""
+        return DataType[self.get_nodeattr("inputDataType")]
+
+    def get_output_datatype(self):
+        """Returns FINN DataType of output."""
+        return DataType[self.get_nodeattr("outputDataType")]
+
+    def get_instream_width(self):
+        i_bits = self.get_input_datatype().bitwidth()
+        return i_bits * self.get_nodeattr("PE")
+
+    def get_outstream_width(self):
+        o_bits = self.get_output_datatype().bitwidth()
+        return o_bits * self.get_nodeattr("PE")
+
+    def get_folded_input_shape(self):
+        ich = self.get_nodeattr("NumChannels")
+        pe = self.get_nodeattr("PE")
+        fold = ich // pe
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        folded_input_shape = tuple(vecs + [fold, pe])
+        return folded_input_shape
+
+    def get_folded_output_shape(self):
+        # same shape as input
+        return self.get_folded_input_shape()
+
+    def get_normal_input_shape(self):
+        ich = self.get_nodeattr("NumChannels")
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        normal_input_shape = tuple(vecs + [ich])
+        return normal_input_shape
+
+    def get_normal_output_shape(self):
+        # same shape as input
+        return self.get_normal_input_shape()
+
+    def get_number_output_values(self):
+        nf = np.prod(self.get_folded_output_shape()[:-1])
+        return nf
+
+    def get_template_param_values(self):
+        """Returns the template parameter values according to input, output and
+        parameter data types."""
+        ret = dict()
+        inp_hls_str = self.get_input_datatype().get_hls_datatype_str()
+        out_hls_str = self.get_output_datatype().get_hls_datatype_str()
+        # fill in TSrcI
+        ret["TSrcI"] = "Slice<%s>" % inp_hls_str
+        # fill in TDstI
+        ret["TDstI"] = "Slice<%s>" % out_hls_str
+
+        return ret
+
+    def get_hls_compatible_parameter_tensor(self, orig_param_vector):
+        """Convert the original numpy parameter vector orig_param_vector into
+        a form suitable for passing to the hlslib call:
+        * ensure chn % PE == 0
+        * interleave rows between PEs
+        * reshape into (PE, TMEM) and return
+        """
+        chn = self.get_nodeattr("NumChannels")
+        pe = self.get_nodeattr("PE")
+        tmem = chn // pe
+        assert chn % pe == 0, "Requirement NumChannels divisible by PE is violated."
+        assert (
+            orig_param_vector.ndim == 1
+        ), """Parameter vector dimension is {}.
+        Expected dimension: 1.""".format(
+            orig_param_vector.ndim
+        )
+
+        # if not self.get_input_datatype().signed():
+        #     # ensure all thresholds are nonnegative
+        #     assert (orig_param_vector >= 0).all()
+
+        # ensure all parameters are integer
+        assert (orig_param_vector.astype(np.int32) == orig_param_vector).all()
+        ret = orig_param_vector
+
+        assert (
+            ret.shape[0] == chn
+        ), "Cardinality of parameter vector is not as expected (chn)"
+
+        # distribute rows between PEs
+        ret = ret.reshape(tmem, pe).transpose()
+        assert (
+            ret.shape[0] == pe
+        ), """First dimension after distribution of the
+        rows between PEs is not as expected (pe)"""
+        assert (
+            ret.shape[1] == tmem
+        ), """Second dimension after distribution of the
+        rows between PEs is not as expected (tmem)"""
+
+        return ret.reshape(1, pe, tmem)
+
+    def generate_params(self, model, path):
+        code_gen_dir = path
+        # save parameters in params.h
+        parameters = model.get_initializer(self.onnx_node.input[1])
+        parameter_tensor = self.get_hls_compatible_parameter_tensor(parameters)
+        pdt = DataType[self.get_nodeattr("paramDataType")]
+
+        parameters_hls_code = numpy_to_hls_code(
+            parameter_tensor, pdt, "parameters", False, True
+        )
+        # get input data type
+        export_idt = self.get_input_datatype()
+        if self.get_input_datatype() == DataType.BIPOLAR:
+            export_idt = DataType.BINARY
+        idt_hls = export_idt.get_hls_datatype_str()
+
+        # write parameters into params.h
+        f_params = open("{}/params.h".format(code_gen_dir), "w")
+        pdt_hls = pdt.get_hls_datatype_str()
+        # use binary to export bipolar activations
+        export_odt = self.get_output_datatype()
+        if self.get_output_datatype() == DataType.BIPOLAR:
+            export_odt = DataType.BINARY
+        odt_hls = export_odt.get_hls_datatype_str()
+        # get desired function
+        func = self.get_nodeattr("Func")
+        if func == "cmp_le":
+            func_str = "std::less_equal"
+        elif func == "cmp_ge":
+            func_str = "std::greater_equal"
+        elif func == "add":
+            func_str = "std::plus"
+        elif func == "mul":
+            func_str = "std::multiplies"
+        else:
+            raise Exception(
+                """Invalid value for attribute Func! Is currently set to: {}
+                has to be set to one of the following values
+                ("cmp_le", "cmp_ge", "add", "mul")""".format(
+                    func
+                )
+            )
+        f_params.write(
+            "static ChannelWiseOperation<{},{},{},{},{},{}> threshs \
+            = ".format(
+                self.calc_tmem(),
+                self.get_nodeattr("PE"),
+                idt_hls,
+                pdt_hls,
+                odt_hls,
+                "%s<%s>" % (func_str, odt_hls),
+            )
+        )
+        f_params.write(parameters_hls_code)
+        f_params.close()
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+
+        # TODO ensure codegen dir exists
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+                has to be set to one of the following values
+                ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        # create a npy file for each input of the node (in_ind is input index)
+        in_ind = 0
+        for inputs in node.input:
+            # it is assumed that the first input of the node is the data input
+            # and the second input is the channelwise parameter tensor
+            if in_ind == 0:
+                assert (
+                    str(context[inputs].dtype) == "float32"
+                ), """Input datatype is
+                not float32 as expected."""
+                expected_inp_shape = self.get_folded_input_shape()
+                reshaped_input = context[inputs].reshape(expected_inp_shape)
+                export_idt = self.get_input_datatype()
+                # make copy before saving the array
+                reshaped_input = reshaped_input.copy()
+                np.save(
+                    os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)),
+                    reshaped_input,
+                )
+            elif in_ind > 2:
+                raise Exception("Unexpected input found for ChannelwiseOp_Batch")
+            in_ind += 1
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            # reinterpret binary output as bipolar where needed
+            if self.get_output_datatype() == DataType.BIPOLAR:
+                out = context[node.output[0]]
+                out = 2 * out - 1
+                context[node.output[0]] = out
+            assert (
+                context[node.output[0]].shape == self.get_folded_output_shape()
+            ), """Output shape is not as expected"""
+            # reshape output to have expected shape
+            oshape = self.get_normal_output_shape()
+            context[node.output[0]] = context[node.output[0]].reshape(*oshape)
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+            )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            output = self.rtlsim(sim, inp)
+            odt = self.get_output_datatype()
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+
+            # load and reshape output
+            output = np.load(out_npy_path)
+            oshape = self.get_normal_output_shape()
+            output = np.asarray([output], dtype=np.float32).reshape(*oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+                has to be set to one of the following values
+                ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "activations.hpp"']
+        self.code_gen_dict["$GLOBALS$"] += ['#include "params.h"']
+
+    # TODO check and add whatever missing
+    def defines(self, var):
+        numInputVectors = list(self.get_nodeattr("numInputVectors"))
+        numReps = numInputVectors[0]
+        self.code_gen_dict["$DEFINES$"] = [
+            """#define NumChannels1 {}\n#define PE1 {}\n#define numReps {}""".format(
+                self.get_nodeattr("NumChannels"), self.get_nodeattr("PE"), numReps
+            )
+        ]
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_input_datatype()
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"] = []
+        # note: the innermost dim is reversed for the input
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0, false);'
+            % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+        )
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+        )
+
+    def docompute(self):
+        tmpl_args = self.get_template_param_values()
+        # TODO: why put some template parameters into defines and not others?
+        # should ImgDim be defined or just filled in here like we do now?
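+        # clarifying note (assuming the finn-hlslib convention for ImgDim):
+        # imgdim below is the spatial dimension of the (assumed square)
+        # feature map, e.g. a folded input shape (1, 4, 4, fold, pe) gives
+        # imgdim = 4, while an FC-style shape (1, fold, pe) gives imgdim = 1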
+        ishape = self.get_folded_input_shape()
+        if len(ishape) == 3:
+            imgdim = 1
+        elif len(ishape) == 5:
+            imgdim = ishape[1]
+        else:
+            raise Exception("""Unexpected input shape""")
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            """Thresholding_Batch<{}, NumChannels1, PE1, {}, {}>
+            (in0, out, threshs, numReps);""".format(
+                imgdim, tmpl_args["TSrcI"], tmpl_args["TDstI"]
+            )
+        ]
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        if dtype == DataType.BIPOLAR:
+            # use binary for bipolar storage
+            dtype = DataType.BINARY
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        shape = self.get_folded_output_shape()
+        shape_cpp_str = str(shape).replace("(", "{").replace(")", "}")
+
+        # note: the innermost dim is not reversed for the output
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", false);'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                shape_cpp_str,
+                npy_out,
+            )
+        ]
+
+    def save_as_npy(self):
+        self.code_gen_dict["$SAVEASCNPY$"] = []
+
+    def blackboxfunction(self):
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            """void {}(hls::stream<ap_uint<{}>> &in0,
+            hls::stream<ap_uint<{}>> &out
+            )""".format(
+                self.onnx_node.name,
+                self.get_instream_width(),
+                self.get_outstream_width(),
+            )
+        ]
+
+    def pragmas(self):
+        self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out")
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE ap_ctrl_none port=return"
+        )
+
+        # the channelwise parameter tensor is acc_type [PE][TMEM][N_PARAMS_PER_CHANNEL]
+        # partition for parallel access along PE and N_PARAMS_PER_CHANNEL
+        # dimensions (dims 1 and 3)
+        self.code_gen_dict["$PRAGMAS$"].append(
+            (
+                "#pragma HLS ARRAY_PARTITION variable=threshs.parameters "
+                "complete dim=1"
+            )
+        )
+        # self.code_gen_dict["$PRAGMAS$"].append(
+        #     (
+        #         "#pragma HLS ARRAY_PARTITION variable=threshs.parameters "
+        #         "complete dim=3"
+        #     )
+        # )
+
+        # set resource type
+        ram_style = self.get_nodeattr("ram_style")
+        pe = self.get_nodeattr("PE")
+        ich = self.get_nodeattr("NumChannels")
+        # if PE less than NumChannels, assign cores according to ram_style;
+        # otherwise if PE == NumChannels, Vivado HLS will unroll to FFs
+        if pe < ich:
+            if ram_style == "distributed":
+                self.code_gen_dict["$PRAGMAS$"].append(
+                    (
+                        "#pragma HLS RESOURCE variable=threshs.parameters "
+                        "core=ROM_2P_LUTRAM"
+                    )
+                )
+            elif ram_style == "block":
+                self.code_gen_dict["$PRAGMAS$"].append(
+                    (
+                        "#pragma HLS RESOURCE variable=threshs.parameters "
+                        "core=ROM_2P_BRAM"
+                    )
+                )
+            else:
+                raise Exception(
+                    """Invalid value for attribute ram_style! Is currently set to: {}
+                    has to be set to one of ("block", "distributed")""".format(
+                        ram_style
+                    )
+                )
diff --git a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
index 9e6c63dc510aab5f6baff9cb6326a2d0476f67a9..83152dea6cc494b8464c78605399b21b38d48b80 100644
--- a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
@@ -75,16 +75,19 @@ class GlobalAccPool_Batch(HLSCustomOp):
     def get_normal_output_shape(self):
         ch = self.get_nodeattr("NumChannels")
         vecs = list(self.get_nodeattr("numInputVectors"))
-        oshape = tuple([vecs[0]] + [ch])
+        if len(vecs) == 1:
+            oshape = tuple(vecs + [ch])
+        elif len(vecs) == 3:
+            oshape = tuple([vecs[0]] + [1, 1, ch])
         return oshape
 
     def get_folded_output_shape(self):
         ch = self.get_nodeattr("NumChannels")
         pe = self.get_nodeattr("PE")
-        vecs = list(self.get_nodeattr("numInputVectors"))
+        unfolded_shape = list(self.get_normal_output_shape())
         assert ch % pe == 0, "PE must divide NumChannels"
         folds = int(ch / pe)
-        oshape = tuple([vecs[0]] + [folds, pe])
+        oshape = tuple(unfolded_shape[:-1] + [folds, pe])
         return oshape
 
     def make_shape_compatible_op(self, model):
diff --git a/src/finn/custom_op/registry.py b/src/finn/custom_op/registry.py
index 0060e5d400f30055d532671c8cf1680f0668442a..d1f8f02a00810804163918d5fd4336ab6523bde0 100644
--- a/src/finn/custom_op/registry.py
+++ b/src/finn/custom_op/registry.py
@@ -51,6 +51,7 @@ from finn.custom_op.fpgadataflow.addstreams_batch import AddStreams_Batch
 from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch
 from finn.custom_op.quantavgpool2d import QuantAvgPool2d
 from finn.custom_op.fpgadataflow.duplicatestreams_batch import DuplicateStreams_Batch
+from finn.custom_op.fpgadataflow.channelwise_op_batch import ChannelwiseOp_Batch
 
 # create a mapping of all known CustomOp names and classes
 custom_op = {}
@@ -74,6 +75,7 @@ custom_op["AddStreams_Batch"] = AddStreams_Batch
 custom_op["LabelSelect_Batch"] = LabelSelect_Batch
 custom_op["QuantAvgPool2d"] = QuantAvgPool2d
 custom_op["DuplicateStreams_Batch"] = DuplicateStreams_Batch
+custom_op["ChannelwiseOp_Batch"] = ChannelwiseOp_Batch
 
 
 def getCustomOp(node):
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index b70b126680d650547cf376dd601c048c73a1cfd4..afb37b1ec0e0fc19a170b19337bceafd34f11a0e 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -27,6 +27,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 from onnx import helper, TensorProto
+import numpy as np
 
 from finn.core.datatype import DataType
 from finn.transformation import Transformation
@@ -34,6 +35,8 @@ from finn.custom_op.registry import getCustomOp
 from finn.transformation.infer_shapes import InferShapes
 from finn.transformation.infer_datatypes import InferDataTypes
 import finn.core.data_layout as DataLayout
+from finn.util.onnx import nchw_to_nhwc
+import warnings
 from finn.util.basic import get_by_name
@@ -627,3 +630,243 @@ class InferThresholdingLayer(Transformation):
         model = model.transform(InferShapes())
         model = model.transform(InferDataTypes())
         return (model, graph_modified)
+
+
+class InferChannelwiseLinearLayer(Transformation):
+    """Convert any channel-wise Add/Mul into a HLS layer."""
+
+    def get_smallest_possible(self, vals):
+        """Returns the smallest (fewest bits) possible DataType that can
+        represent all values in vals. Prefers unsigned integers where possible."""
+        vals = np.array(vals)
+        for v in vals:
+            assert int(v) == v, "Expected integer values only"
+
+        for k in DataType.__members__:
+            dt = DataType[k]
+
+            if dt in [DataType.BIPOLAR, DataType.TERNARY, DataType.FLOAT32]:
+                # not currently supported
+                continue
+
+            if (dt.min() <= vals).all() and (vals <= dt.max()).all():
+                return dt
+
+        warnings.warn(
+            """InferChannelwiseLinearLayer: Output values may not be
+            representable with supported data types.
+            Setting maximum width data type available.
+            This will lead to errors if there are no constraints on the input
+            """
+        )
+
+        if (0 <= vals).all():
+            return DataType.UINT32
+        else:
+            return DataType.INT32
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for node in graph.node:
+            node_ind += 1
+            if node.op_type == "Add" or node.op_type == "Mul":
+                # assuming input[0] is dynamic
+                ll_input = node.input[0]
+                ll_output = node.output[0]
+                ll_in_shape = model.get_tensor_shape(ll_input)
+
+                # check if input 1 has an initializer
+                ll_const = node.input[1]
+                if ll_const is not None:
+                    ll_cinit = model.get_initializer(ll_const)
+                    if ll_cinit is None:
+                        # input 1 is also dynamic
+                        continue
+                else:
+                    continue
+
+                # get number of channels and channel index from input
+                ll_in_layout = model.get_tensor_layout(ll_input)
+                if ll_in_layout == DataLayout.NHWC or ll_in_layout == DataLayout.NC:
+                    ch_index = -1
+                    ch = ll_in_shape[-1]
+                elif ll_in_layout == DataLayout.NCHW:
+                    ch_index = 1
+                    ch = ll_in_shape[1]
+                else:
+                    continue
+
+                # check if the shape of initializer is compatible
+                ll_cinit_shape = list(ll_cinit.shape)
+                if np.prod(ll_cinit_shape) == 1:
+                    warnings.warn(
+                        "Broadcasting " + str(node.op_type) + "(" + node.name + ")"
+                    )
+                    ll_cinit = np.full((ch), ll_cinit.flatten()[0])
+                elif np.prod(ll_cinit_shape) != ch or ll_cinit_shape[ch_index] != ch:
+                    # parameter shape not compatible with ChannelwiseOp_Batch
+                    continue
+
+                # check initializer contains integers as floats
+                if not (ll_cinit.astype(np.int32) == ll_cinit).all():
+                    continue
+                # all initializer conditions are met
+
+                # check inputs
+                idt = model.get_tensor_datatype(ll_input)
+                if not idt.is_integer():
+                    # skip conversion for layers with float input
+                    continue
+
+                # check layout of inputs/outputs, and convert if needed
+                if ll_in_layout == DataLayout.NCHW:
+                    ll_input = nchw_to_nhwc(ll_input, model, node_ind)
+                    node_ind += 1
+                    ll_in_shape = model.get_tensor_shape(ll_input)
+
+                # keep track of where we need to insert the HLS Op
+                # it has to be ahead of the output transform
+                insert_point = node_ind
+
+                ll_output_layout = model.get_tensor_layout(ll_output)
+                if ll_output_layout == DataLayout.NCHW:
+                    ll_output = nchw_to_nhwc(ll_output, model, node_ind, reverse=True)
+                    node_ind += 1
+
+                # get parameter data type
+                param_min = min(ll_cinit.flatten())
+                param_max = max(ll_cinit.flatten())
+                pdt = self.get_smallest_possible([param_min, param_max])
+
+                # set function and determine output data type
+                if node.op_type == "Add":
+                    func = "add"
+                    out_min = idt.min() + param_min
+                    out_max = idt.max() + param_max
+                    odt = self.get_smallest_possible([out_min, out_max])
+                elif node.op_type == "Mul":
+                    func = "mul"
+                    possible_limits = []
+                    possible_limits += [idt.min() * param_min]
+                    possible_limits += [idt.min() * param_max]
+                    possible_limits += [idt.max() * param_min]
+                    possible_limits += [idt.max() * param_max]
+                    odt = self.get_smallest_possible(possible_limits)
+
+                model.set_initializer(ll_const, ll_cinit.reshape(ch))
+                model.set_tensor_datatype(ll_output, odt)
+
+                # create node with no parallelization first
+                pe = 1
+                assert ch % pe == 0, "Requirement IFC divisible by PE is violated."
+                # create and insert node
+                new_node = helper.make_node(
+                    "ChannelwiseOp_Batch",
+                    [ll_input, ll_const],
+                    [ll_output],
+                    domain="finn",
+                    backend="fpgadataflow",
+                    Func=func,
+                    NumChannels=ch,
+                    PE=pe,
+                    inputDataType=idt.name,
+                    paramDataType=pdt.name,
+                    outputDataType=odt.name,
+                    numInputVectors=list(ll_in_shape[:-1]),
+                )
+                graph.node.insert(insert_point, new_node)
+                # remove old node
+                graph.node.remove(node)
+                graph_modified = True
+
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
+
+
+class InferGlobalAccPoolLayer(Transformation):
+    """Convert any GlobalAveragePool into a GlobalAccPool HLS layer and a scalar Mul."""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for node in graph.node:
+            node_ind += 1
+            if node.op_type == "GlobalAveragePool":
+                in0 = node.input[0]
+                result = node.output[0]
+                in0_shape = model.get_tensor_shape(in0)
+
+                idt = model.get_tensor_datatype(in0)
+
+                # skip conversion for layers with float input
+                if not idt.is_integer():
+                    continue
+
+                # check layout and convert if necessary
+                in0_layout = model.get_tensor_layout(in0)
+                result_layout = model.get_tensor_layout(result)
+
+                if in0_layout == DataLayout.NCHW:
+                    in0 = nchw_to_nhwc(in0, model, node_ind)
+                    node_ind += 1
+                    in0_shape = model.get_tensor_shape(in0)
+
+                # keep track of where we need to insert the HLS Op
+                # it has to be ahead of the output transform
+                insert_point = node_ind
+
+                if result_layout == DataLayout.NCHW:
+                    result = nchw_to_nhwc(result, model, node_ind, reverse=True)
+                    node_ind += 1
+
+                num_ch = int(in0_shape[-1])
+                vecs = in0_shape[:-1]
+                # create node with no parallelization first
+                pe = 1
+                assert (
+                    num_ch % pe == 0
+                ), "Requirement NumChannels divisible by PE is violated."
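+
+                # numeric example for the decomposition built below: for a
+                # (1, 7, 7, C) NHWC input, GlobalAccPool_Batch sums the
+                # 7 * 7 = 49 pixels per channel into pool_out, and the scalar
+                # Mul rescales by 1 / (7 * 7), together reproducing
+                # GlobalAveragePool for integer inputs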
+
+                # create an additional tensor of the same shape and layout as result
+                out_shape = model.get_tensor_shape(result)
+                pool_out = helper.make_tensor_value_info(
+                    model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape
+                )
+                model.graph.value_info.append(pool_out)
+                pool_out = pool_out.name
+                model.set_tensor_layout(pool_out, model.get_tensor_layout(result))
+
+                new_pool = helper.make_node(
+                    "GlobalAccPool_Batch",
+                    [in0],
+                    [pool_out],
+                    domain="finn",
+                    backend="fpgadataflow",
+                    NumChannels=num_ch,
+                    PE=pe,
+                    inputDataType=idt.name,
+                    numInputVectors=vecs,
+                )
+
+                mul_value = helper.make_tensor_value_info(
+                    model.make_new_valueinfo_name(), TensorProto.FLOAT, [1]
+                )
+                model.graph.value_info.append(mul_value)
+                model.set_initializer(mul_value.name, np.array(1 / (vecs[1] * vecs[2])))
+                new_mul = helper.make_node("Mul", [pool_out, mul_value.name], [result])
+                graph.node.insert(insert_point, new_pool)
+                graph.node.insert(insert_point + 1, new_mul)
+                node_ind += 1
+                # remove old node
+                graph.node.remove(node)
+                graph_modified = True
+
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
diff --git a/src/finn/util/onnx.py b/src/finn/util/onnx.py
index b9932111d86d7206b23e1d0e49a6aa8451f8ba24..4d7cdd126ededac887639a932c2021ef5f081c02 100644
--- a/src/finn/util/onnx.py
+++ b/src/finn/util/onnx.py
@@ -28,6 +28,7 @@
 
 import numpy as np
 import onnx
+import finn.core.data_layout as DataLayout
 
 
 def valueinfo_to_tensor(vi):
@@ -37,3 +38,38 @@ def valueinfo_to_tensor(vi):
     return np.zeros(
         dims, dtype=onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[vi.type.tensor_type.elem_type]
    )
+
+
+def nchw_to_nhwc(t, model, idx, reverse=False):
+    """Converts between NCHW <-> NHWC layouts for tensor t (assumed NCHW in
+    both cases) by inserting a Transpose node and returning the name of the
+    new NHWC tensor. If reverse=False, the new tensor is the transpose's
+    output (NCHW -> NHWC); if reverse=True, it is the transpose's input
+    (NHWC -> NCHW), i.e. it feeds t.
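+
+    Example (illustrative): for t of shape (1, 3, 32, 32), a new (1, 32, 32, 3)
+    tensor is created and a Transpose node (perm=[0, 2, 3, 1] if reverse=False,
+    perm=[0, 3, 1, 2] if reverse=True) is inserted at node position idx; the
+    name of the new tensor is returned.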
+ """ + graph = model.graph + # create new NHWC tensor + t_shape = model.get_tensor_shape(t) + bs = t_shape[0] + ch = t_shape[1] + height = t_shape[2] + width = t_shape[3] + t_trans = onnx.helper.make_tensor_value_info( + model.make_new_valueinfo_name(), + onnx.TensorProto.FLOAT, + (bs, height, width, ch), # NHWC + ) + graph.value_info.append(t_trans) + dt = model.get_tensor_datatype(t) + t_trans = t_trans.name + model.set_tensor_datatype(t_trans, dt) + model.set_tensor_layout(t_trans, DataLayout.NHWC) + # NCHW <-> NHWC transpose + if reverse: + t_trans_node = onnx.helper.make_node( + "Transpose", [t_trans], [t], perm=[0, 3, 1, 2] + ) + else: + t_trans_node = onnx.helper.make_node( + "Transpose", [t], [t_trans], perm=[0, 2, 3, 1] + ) + graph.node.insert(idx, t_trans_node) + return t_trans diff --git a/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py b/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..d09c64a1250f78604c1a0a362cf234712de2cf57 --- /dev/null +++ b/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py @@ -0,0 +1,115 @@ +import pytest + +from onnx import TensorProto, helper + +import finn.core.onnx_exec as oxe +from finn.core.datatype import DataType +from finn.core.modelwrapper import ModelWrapper +import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( + ReplaceVerilogRelPaths, +) +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode + +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.infer_data_layouts import InferDataLayouts +from finn.transformation.general import GiveUniqueNodeNames +from finn.util.basic import gen_finn_dt_tensor +from finn.transformation.infer_shapes import InferShapes +import numpy as np + + +def prepare_inputs(input_tensor): + return {"inp": input_tensor} + + +def make_single_maxpool_modelwrapper(onnx_op_name, ishape, idt, pdt, pshape): + + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, ishape) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, ishape) + p0 = helper.make_tensor_value_info("p0", TensorProto.FLOAT, pshape) + + model = helper.make_model( + helper.make_graph( + name="test", + inputs=[inp], + outputs=[outp], + value_info=[p0], + nodes=[helper.make_node(onnx_op_name, ["inp", "p0"], ["outp"])], + ) + ) + + model = ModelWrapper(model) + model.set_initializer("p0", gen_finn_dt_tensor(pdt, pshape)) + model.set_tensor_datatype("inp", idt) + model.transform(InferDataLayouts(), make_deepcopy=False) + model.transform(InferShapes(), make_deepcopy=False) + return model + + +# parameter datatype +@pytest.mark.parametrize("pdt", [DataType.BIPOLAR, DataType.UINT4, DataType.INT2]) +# input datatype +@pytest.mark.parametrize("idt", [DataType.INT32, DataType.UINT4, DataType.INT4]) +# function +@pytest.mark.parametrize("onnx_op_name", ["Add", "Mul"]) +# vector parameter or scalar parameter (broadcast) +@pytest.mark.parametrize("scalar_param", [True, False]) +# execution mode +@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +@pytest.mark.vivado +@pytest.mark.slow +def 
test_convert_to_hls_channelwise_layer( + pdt, idt, onnx_op_name, scalar_param, exec_mode +): + ifm_ch = 16 + ifm_dim = 5 + ishape = (1, ifm_ch, ifm_dim, ifm_dim) + if scalar_param: + pshape = (1,) + else: + pshape = (1, ifm_ch, 1, 1) + + np.random.seed(0) + model = make_single_maxpool_modelwrapper(onnx_op_name, ishape, idt, pdt, pshape) + + # Since the aren't Data types with a bit width of a non power of 2, + # there are cases where the input won't use it full range. + if idt == DataType.INT32: + x = gen_finn_dt_tensor(DataType.INT16, (1, ifm_ch, ifm_dim, ifm_dim)) + elif idt == DataType.UINT32: + x = gen_finn_dt_tensor(DataType.UINT16, (1, ifm_ch, ifm_dim, ifm_dim)) + else: + x = gen_finn_dt_tensor(idt, (1, ifm_ch, ifm_dim, ifm_dim)) + + input_dict = prepare_inputs(x) + y_expected = oxe.execute_onnx(model, input_dict)["outp"] + + new_model = model.transform(to_hls.InferChannelwiseLinearLayer()) + new_model = new_model.transform(GiveUniqueNodeNames()) + + if exec_mode == "cppsim": + new_model = new_model.transform(PrepareCppSim()) + new_model = new_model.transform(CompileCppSim()) + new_model = new_model.transform(SetExecMode("cppsim")) + elif exec_mode == "rtlsim": + new_model = new_model.transform(SetExecMode("rtlsim")) + new_model = new_model.transform(GiveUniqueNodeNames()) + new_model = new_model.transform(PrepareIP("xc7z020clg400-1", 5)) + new_model = new_model.transform(HLSSynthIP()) + new_model = new_model.transform(ReplaceVerilogRelPaths()) + new_model = new_model.transform(PrepareRTLSim()) + else: + raise Exception("Unknown exec_mode") + + ctx_produced = oxe.execute_onnx( + new_model, input_dict, return_full_exec_context=True + ) + y_produced = ctx_produced["outp"] + + assert (y_produced == y_expected).all() + assert new_model.graph.node[1].op_type == "ChannelwiseOp_Batch" diff --git a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..6a69d8180a4a9825a83b419666bbccb9f203a7d8 --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py @@ -0,0 +1,152 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +import numpy as np +from onnx import TensorProto, helper + +import finn.core.onnx_exec as oxe +from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation +from finn.core.datatype import DataType +from finn.core.modelwrapper import ModelWrapper +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.general import GiveUniqueNodeNames +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.util.basic import gen_finn_dt_tensor +from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( + ReplaceVerilogRelPaths, +) + + +def make_modelwrapper(C, pe, idt, odt, func, vecs): + NumChannels = C.shape[0] + + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, vecs + [NumChannels]) + outp = helper.make_tensor_value_info( + "outp", TensorProto.FLOAT, vecs + [NumChannels] + ) + + node_inp_list = ["inp", "const"] + + node = helper.make_node( + "ChannelwiseOp_Batch", + node_inp_list, + ["outp"], + domain="finn", + backend="fpgadataflow", + NumChannels=NumChannels, + Func=func, + PE=pe, + inputDataType=idt.name, + outputDataType=odt.name, + numInputVectors=vecs, + ) + graph = helper.make_graph(nodes=[node], name="graph", inputs=[inp], outputs=[outp],) + + model = helper.make_model(graph, producer_name="model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", odt) + + model.set_tensor_datatype("const", idt) + model.set_initializer("const", C) + return model + + +# activation: None or DataType +@pytest.mark.parametrize("act", [DataType.INT8]) +# input datatype +@pytest.mark.parametrize("idt", [DataType.INT4]) +# folding, -1 is maximum possible +@pytest.mark.parametrize("nf", [-1, 2]) +# number of input features +@pytest.mark.parametrize("ich", [16]) +# vecs +@pytest.mark.parametrize("vecs", [[1], [1, 7, 7]]) +# function +@pytest.mark.parametrize("func", ["add", "mul"]) +# execution mode +@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +@pytest.mark.vivado +@pytest.mark.slow +def test_fpgadataflow_addmul(idt, act, nf, ich, func, vecs, exec_mode): + if nf == -1: + nf = ich + pe = ich // nf + assert ich % pe == 0 + + # generate input data + x = gen_finn_dt_tensor(idt, tuple(vecs + [ich])) + + odt = act + C = np.random.randint(idt.min(), idt.max() + 1, ich).astype(np.float32) + + model = make_modelwrapper(C, pe, idt, odt, func, vecs) + + if exec_mode == "cppsim": + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + model = model.transform(SetExecMode("cppsim")) + elif exec_mode == "rtlsim": + model = model.transform(SetExecMode("rtlsim")) + model = 
model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(ReplaceVerilogRelPaths()) + model = model.transform(PrepareRTLSim()) + else: + raise Exception("Unknown exec_mode") + + # package input data as dictionary + input_dict = {"inp": x} + + oshape = model.get_tensor_shape("outp") + + C_reshaped = np.broadcast_to(C.flatten(), x.shape) + if func == "add": + y = x + C_reshaped + elif func == "mul": + y = x * C_reshaped + + y_expected = y.reshape(oshape) + # execute model + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + + y_produced = y_produced.reshape(y_expected.shape) + + assert (y_produced == y_expected).all(), "cppsim failed" + + if exec_mode == "rtlsim": + hls_synt_res_est = model.analysis(hls_synth_res_estimation) + assert "ChannelwiseOp_Batch_0" in hls_synt_res_est
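+
+
+# Minimal usage sketch (illustrative only, not collected by pytest): build a
+# 4-channel "add" layer and execute it via cppsim; the values are made up.
+#
+#   C = np.asarray([1, 2, 3, 4], dtype=np.float32)
+#   model = make_modelwrapper(C, 4, DataType.INT4, DataType.INT8, "add", [1])
+#   model = model.transform(PrepareCppSim())
+#   model = model.transform(CompileCppSim())
+#   model = model.transform(SetExecMode("cppsim"))
+#   x = gen_finn_dt_tensor(DataType.INT4, (1, 4))
+#   y = oxe.execute_onnx(model, {"inp": x})["outp"]  # equals x + C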