diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh index 0074cce02f7de57dc778e0b671c484233df72a8a..132d5bdaa286ba3e50bbd06971e9139f5859ef11 100644 --- a/docker/finn_entrypoint.sh +++ b/docker/finn_entrypoint.sh @@ -13,9 +13,9 @@ gecho () { # checkout the correct dependency repo commits # the repos themselves are cloned in the Dockerfile -BREVITAS_COMMIT=026a509186b7e7b0b65d46a2f905043d41069306 +BREVITAS_COMMIT=f9a27226d4acf1661dd38bc449f71f89e0983cce CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4 -HLSLIB_COMMIT=13e9b0772a27a3a1efc40c878d8e78ed09efb716 +HLSLIB_COMMIT=8aed899c278c36c977a249558d71795086cf852c PYVERILATOR_COMMIT=c97a5ba41bbc7c419d6f25c74cdf3bdc3393174f PYNQSHELL_COMMIT=0c82a61b0ec1a07fa275a14146233824ded7a13d OMX_COMMIT=1bae737669901e762f581af73348332b5c4b2ada diff --git a/docs/finn/getting_started.rst b/docs/finn/getting_started.rst index 95594bb67a2be3a4c3fbba488c75a704f623c136..5beabeb2980840fd1dbd2ee5b058738fa8553152 100644 --- a/docs/finn/getting_started.rst +++ b/docs/finn/getting_started.rst @@ -18,6 +18,7 @@ Requirements * A working Vivado 2019.1 installation * A `VIVADO_PATH` environment variable pointing to the Vivado installation directory (e.g. 
the directory where settings64.sh is located) * (optional) A PYNQ board with a network connection + * the ``bitstring`` package must be installed on the PYNQ: ``sudo pip3 install bitstring`` Running FINN in Docker ====================== diff --git a/src/finn/core/onnx_exec.py b/src/finn/core/onnx_exec.py index efdfaa19d9f9e5dfa41911a2184e989337b3d9c2..7c3123cd5eb29a54dc5cbfb912225ad3fdb0f219 100644 --- a/src/finn/core/onnx_exec.py +++ b/src/finn/core/onnx_exec.py @@ -108,7 +108,9 @@ def execute_node(node, context, graph): context[outp] = output_list[list_ind] -def execute_onnx(model, input_dict, return_full_exec_context=False): +def execute_onnx( + model, input_dict, return_full_exec_context=False, start_node=None, end_node=None +): """Executes given ONNX ModelWrapper with given named inputs. If return_full_exec_context is False, a dict of named outputs is returned @@ -116,7 +118,12 @@ def execute_onnx(model, input_dict, return_full_exec_context=False): If return return_full_exec_context is True, the full set of tensors used by the execution (including inputs, weights, activations and final outputs) - will be returned as a dict.""" + will be returned as a dict. + + When start_node and end_node are set to None, the whole graph is executed. + If they are set to particular ONNX nodes, only the subgraph between (and + including) those nodes is executed. 
+ """ if not model.check_all_tensor_shapes_specified(): raise Exception("Found unspecified tensor shapes, try infer_shapes") @@ -159,7 +166,17 @@ def execute_onnx(model, input_dict, return_full_exec_context=False): # execute the model node by node # we can simply walk down the list since the ONNX spec guarantees that it is # topologically sorted - for node in graph.node: + subgraph = [] + if start_node is None: + start_node = model.graph.node[0] + if end_node is None: + end_node = model.graph.node[-1] + # select the nodes between specified start/end nodes + start_ind = model.get_node_index(start_node) + end_ind = model.get_node_index(end_node) + 1 + assert end_ind >= start_ind, "Start/end nodes must define valid subgraph" + subgraph = graph.node[start_ind:end_ind] + for node in subgraph: if get_sanitize_quant_tensors() != 0: # round input values to match quantization annotation execution_context = sanitize_quant_values( diff --git a/src/finn/custom_op/fpgadataflow/fmpadding.py b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py similarity index 88% rename from src/finn/custom_op/fpgadataflow/fmpadding.py rename to src/finn/custom_op/fpgadataflow/fmpadding_batch.py index fa321dfa65d14b67fa218fb6a49f602ddab8d57e..d326ae7dfc7830a0081c3b13233d67ef08b12eff 100644 --- a/src/finn/custom_op/fpgadataflow/fmpadding.py +++ b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py @@ -21,6 +21,8 @@ class FMPadding_Batch(HLSCustomOp): "Padding": ("i", True, 2), # number of channels in input image "NumChannels": ("i", True, 0), + # SIMD Input parallelism + "SIMD": ("i", False, 1), # FINN input datatype "inputDataType": ("s", True, ""), # controls distribution of padded pixels @@ -55,20 +57,22 @@ class FMPadding_Batch(HLSCustomOp): return oshape def get_folded_input_shape(self): - # even though there is no folding in the current hlslib op, - # insert a time multiplexing axis to remain compatible with the - # shapes produced by the rest of the dataflow pipeline - ret = 
list(self.get_normal_input_shape()) - ret.insert(-1, 1) - return tuple(ret) + normal_ishape = list(self.get_normal_input_shape()) + ifm_ch = self.get_nodeattr("NumChannels") + simd = self.get_nodeattr("SIMD") + assert ifm_ch % simd == 0, "SIMD must divide input channels" + fold = int(normal_ishape[-1] / simd) + folded_ishape = normal_ishape[:-1] + [fold, simd] + return tuple(folded_ishape) def get_folded_output_shape(self): - # even though there is no folding in the current hlslib op, - # insert a time multiplexing axis to remain compatible with the - # shapes produced by the rest of the dataflow pipeline - ret = list(self.get_normal_output_shape()) - ret.insert(-1, 1) - return tuple(ret) + normal_oshape = list(self.get_normal_output_shape()) + ifm_ch = self.get_nodeattr("NumChannels") + simd = self.get_nodeattr("SIMD") + assert ifm_ch % simd == 0, "SIMD must divide input channels" + fold = int(normal_oshape[-1] / simd) + folded_oshape = normal_oshape[:-1] + [fold, simd] + return tuple(folded_oshape) def make_shape_compatible_op(self, model): exp_ishape = self.get_normal_input_shape() @@ -114,15 +118,13 @@ class FMPadding_Batch(HLSCustomOp): def get_instream_width(self): ibits = self.get_input_datatype().bitwidth() - num_ch = self.get_nodeattr("NumChannels") - - return ibits * num_ch + simd = self.get_nodeattr("SIMD") + return ibits * simd def get_outstream_width(self): obits = self.get_output_datatype().bitwidth() - num_ch = self.get_nodeattr("NumChannels") - - return obits * num_ch + simd = self.get_nodeattr("SIMD") + return obits * simd def get_number_output_values(self): folded_oshape = self.get_folded_output_shape() @@ -135,13 +137,15 @@ class FMPadding_Batch(HLSCustomOp): self.code_gen_dict["$DEFINES$"] = [ """#define ImgDim1 {}\n#define OutputDim1 {}\n #define Padding1 {}\n#define NumChannels1 {}\n - #define PaddingStyle1 {}\n#define numReps {}\n""".format( + #define PaddingStyle1 {}\n#define numReps {} + #define SIMD1 {}\n""".format( 
self.get_nodeattr("ImgDim"), self.get_padded_odim(), self.get_nodeattr("Padding"), self.get_nodeattr("NumChannels"), self.get_nodeattr("PaddingStyle"), self.get_nodeattr("numInputVectors"), + self.get_nodeattr("SIMD"), ) ] @@ -176,7 +180,7 @@ class FMPadding_Batch(HLSCustomOp): in_t = self.get_input_datatype().get_hls_datatype_str() node = self.onnx_node self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}<ImgDim1, OutputDim1, Padding1, NumChannels1, + """{}<ImgDim1, OutputDim1, Padding1, NumChannels1,SIMD1, {}, PaddingStyle1> (in0, out, numReps);""".format( node.op_type, in_t ) @@ -232,6 +236,7 @@ class FMPadding_Batch(HLSCustomOp): node = self.onnx_node exp_ishape = self.get_normal_input_shape() exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() folded_oshape = self.get_folded_output_shape() if mode == "cppsim": @@ -254,10 +259,8 @@ class FMPadding_Batch(HLSCustomOp): match expected shape (1, ImgDim, ImgDim, NumChannels).""" export_idt = self.get_input_datatype() - # no reshaping for input since assuming no folding on input - # make copy before saving array - inp = inp.copy() - np.save(os.path.join(code_gen_dir, "input_0.npy"), inp) + reshaped_input = inp.reshape(folded_ishape) + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) if mode == "cppsim": # execute the precompiled model diff --git a/src/finn/custom_op/fpgadataflow/pool_batch.py b/src/finn/custom_op/fpgadataflow/pool_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..c7edc24d0e24eef1154293caca2519ab3aa68358 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/pool_batch.py @@ -0,0 +1,395 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. 
+# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import numpy as np + +from finn.custom_op.fpgadataflow import HLSCustomOp +from finn.core.datatype import DataType +from onnx import TensorProto, helper +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class Pool_Batch(HLSCustomOp): + """Class that corresponds to finn-hlslib Pool_batch function. + Requires ConvolutionInputGenerator(depthwise == 1) to format its input + + TODO: explain input shape (to reuse im2col code) + Input shape (BatchSize,OutImgDim,OutImgDim,KernelSize^2*Channels) + Output shape (BatchSize,OutImgDim,OutImgDim,Channels) + + # note: the actual data layout produced by the hlslib kernels is different + # for depthwise ops. 
+ # * depthwise SWG: (1, OFMDim, OFMDim, IFMChannels/PE, K, K, PE) + + Channels can be folded using PE (SIMD from the input perspective) + TODO: doc + """ + + def get_nodeattr_types(self): + my_attrs = { + "Channels": ("i", True, 0), + "PE": ("i", True, 1), + "KernelSize": ("i", True, 0), + # Function: + # - MaxPool + # - AvgPool (not yet supported, but HLSLIB does) + # - AccPool (not yet supported, but HLSLIB does) + "Function": ("s", True, ""), + "OutImgDim": ("i", True, 0), + # FINN DataTypes for inputs/outputs + "dataType": ("s", True, ""), + "BatchSize": ("i", False, 1), + } + + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_input_datatype(self): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("dataType")] + + def get_output_datatype(self): + """Returns FINN DataType of output.""" + fxn = self.get_nodeattr("Function") + if fxn == "MaxPool": + # Same as input + return DataType[self.get_nodeattr("dataType")] + else: + raise Exception("Pool_Batch doesn't currently support " + fxn) + + def get_normal_input_shape(self): + ifm_ch = self.get_nodeattr("Channels") + odim = self.get_nodeattr("OutImgDim") + batch_size = self.get_nodeattr("BatchSize") + k = self.get_nodeattr("KernelSize") + ishape = (batch_size, odim, odim, k * k * ifm_ch) + return ishape + + def get_folded_input_shape(self): + normal_ishape = list(self.get_normal_input_shape()) + ifm_ch = self.get_nodeattr("Channels") + pe = self.get_nodeattr("PE") + assert ifm_ch % pe == 0, "PE must divide input channels" + fold = int(normal_ishape[-1] / pe) + folded_ishape = normal_ishape[:-1] + [fold, pe] + return tuple(folded_ishape) + + def get_normal_output_shape(self): + ofm_ch = self.get_nodeattr("Channels") + odim = self.get_nodeattr("OutImgDim") + batch_size = self.get_nodeattr("BatchSize") + oshape = (batch_size, odim, odim, ofm_ch) + return oshape + + def get_folded_output_shape(self): + normal_oshape = list(self.get_normal_output_shape()) + ifm_ch = 
self.get_nodeattr("Channels") + pe = self.get_nodeattr("PE") + assert ifm_ch % pe == 0, "PE must divide input channels" + fold = int(ifm_ch / pe) + folded_oshape = normal_oshape[:-1] + [fold, pe] + return tuple(folded_oshape) + + def get_number_output_values(self): + folded_oshape = self.get_folded_output_shape() + return np.prod(folded_oshape[1:-1]) + + def get_instream_width(self): + dt_bits = self.get_input_datatype().bitwidth() + pe = self.get_nodeattr("PE") + # ofm_ch = self.get_nodeattr("Channels") + # k = self.get_nodeattr("KernelSize") + # assert ifm_ch % pe == 0, "PE must divide input channels" + # simd = int(ifm_ch/pe) + in_width = int(dt_bits * pe) + return in_width + + def get_outstream_width(self): + fxn = self.get_nodeattr("Function") + if fxn == "MaxPool": + return self.get_instream_width() + else: + raise Exception("Pool_Batch doesn't currently support " + fxn) + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input shape for Pool_Batch." 
+ # implement tensor with correct shape + values = np.random.randn(*oshape).astype(np.float32) + return helper.make_node( + "Constant", + inputs=[], + outputs=[self.onnx_node.output[0]], + value=helper.make_tensor( + name="const_tensor", + data_type=TensorProto.FLOAT, + dims=values.shape, + vals=values.flatten().astype(float), + ), + ) + + def infer_node_datatype(self, model): + node = self.onnx_node + # data type stays the same + dtype = self.get_output_datatype() + model.set_tensor_datatype(node.output[0], dtype) + + def verify_node(self): + info_messages = [] + + # verify that "domain" is set to "finn" + domain_value = self.onnx_node.domain + if domain_value == "finn": + info_messages.append("Attribute domain is set correctly") + else: + info_messages.append('Attribute domain should be set to "finn"') + + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify the number of inputs + if len(self.onnx_node.input) == 1: + info_messages.append("The number of inputs is correct") + else: + info_messages.append("""Pool_Batch needs 1 data input""") + + # check supported function + fnx = self.get_nodeattr("Function") + if fnx == "MaxPool": + info_messages.append( + "Attribute Function contains a supported pool function" + ) + else: + info_messages.append( + "Attribute Function contains an unsupported pool function" + ) + return info_messages + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "maxpool.h"'] + self.code_gen_dict["$GLOBALS$"] += ['#include "pool.hpp"'] + + def defines(self, var): + self.code_gen_dict["$DEFINES$"] = [] + + ifm_ch = self.get_nodeattr("Channels") + self.code_gen_dict["$DEFINES$"] += ["#define Channels {}".format(ifm_ch)] + + pe = self.get_nodeattr("PE") + self.code_gen_dict["$DEFINES$"] 
+= ["#define PE {}".format(pe)] + + k = self.get_nodeattr("KernelSize") + self.code_gen_dict["$DEFINES$"] += ["#define KernelSize {}".format(k)] + + odim = self.get_nodeattr("OutImgDim") + self.code_gen_dict["$DEFINES$"] += ["#define OFMDim {}".format(odim)] + + numReps = self.get_nodeattr("BatchSize") + self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(numReps)] + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + if dtype == DataType.BIPOLAR: + # use binary for bipolar storage + dtype = DataType.BINARY + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0,false);' + % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + ) + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width()) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width()) + ) + + def docompute(self): + idt = self.get_input_datatype() + i_hls_dt = idt.get_hls_datatype_str() + odt = self.get_output_datatype() + o_hls_dt = odt.get_hls_datatype_str() + + self.code_gen_dict["$DOCOMPUTE$"] = [] + + fxn = self.get_nodeattr("Function") + if fxn == "MaxPool": + self.code_gen_dict["$DOCOMPUTE$"] += [ + "MaxPoolFunction<{},KernelSize> pool_fxn;".format(i_hls_dt) + ] + else: + raise Exception("Pool_Batch doesn't currently support " + fxn) + + self.code_gen_dict["$DOCOMPUTE$"] += [ + """Pool_batch<Channels, PE, KernelSize,Slice<{} >, Slice< {} > > + (in0,out, pool_fxn, 
OFMDim*OFMDim*numReps);""".format( + i_hls_dt, o_hls_dt + ) + ] + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + if dtype == DataType.BIPOLAR: + # use binary for bipolar storage + dtype = DataType.BINARY + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + oshape = self.get_folded_output_shape() + oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") + + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s",false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + oshape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + packed_ibits = self.get_instream_width() + packed_in_hls_type = "ap_uint<%d>" % packed_ibits + + packed_obits = self.get_outstream_width() + packed_out_hls_type = "ap_uint<%d>" % packed_obits + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)" + % (self.onnx_node.name, packed_in_hls_type, packed_out_hls_type) + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out") + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE ap_ctrl_none port=return" + ) + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + folded_ishape = self.get_folded_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_oshape = self.get_folded_output_shape() + + # TODO ensure codegen dir exists + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif 
mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input shape doesn't + match expected shape (batch_size,odim,odim,k*k*ifm_ch).""" + + export_idt = self.get_input_datatype() + reshaped_input = inp.reshape(folded_ishape) + + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == folded_oshape + ), "cppsim did not produce expected folded output shape" + context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output + shape doesn't match expected shape (1, ofm_dim, ofm_dim, k*k*ifm_ch).""" diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py index 9b73ba1e100aa83fd19aa8799195c99891fca3fd..b6c992e4b5ea1ced088928b3d9a4db381d82db22 100644 --- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py +++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py @@ -290,12 +290,15 @@ class StreamingFCLayer_Batch(HLSCustomOp): return out_width def get_weightstream_width(self): - """Returns weight stream width. Used in decoupled mode.""" - pe = self.get_nodeattr("PE") - simd = self.get_nodeattr("SIMD") - wp = self.get_weight_datatype().bitwidth() - w_width = pe * simd * wp - return w_width + """Returns weight stream width. Used only in decoupled mode.""" + if self.get_nodeattr("mem_mode") == "decoupled": + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + wp = self.get_weight_datatype().bitwidth() + w_width = pe * simd * wp + return w_width + else: + return 0 def get_weightstream_width_padded(self): """Returns weight stream width padded to a multiple of 8. 
This is required diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py index 1a8216f64bf71b7fb9f1f8becf4732970b5bf451..1da60a5124fa86b4336bae8fd1a587672f2f2e6f 100644 --- a/src/finn/custom_op/fpgadataflow/templates.py +++ b/src/finn/custom_op/fpgadataflow/templates.py @@ -99,6 +99,7 @@ set_top $config_toplevelfxn open_solution sol1 set_part $config_proj_part +config_compile -ignore_long_run_time -disable_unroll_code_size_check config_interface -m_axi_addr64 config_rtl -auto_prefix $EXTRA_DIRECTIVES$ diff --git a/src/finn/custom_op/fpgadataflow/tlastmarker.py b/src/finn/custom_op/fpgadataflow/tlastmarker.py index 25ea05e3607a52731ae1b64de421837bf137ee2b..17ba44b959577faf573d77ae222f7b2a3be6669d 100644 --- a/src/finn/custom_op/fpgadataflow/tlastmarker.py +++ b/src/finn/custom_op/fpgadataflow/tlastmarker.py @@ -30,20 +30,30 @@ from finn.custom_op.fpgadataflow import HLSCustomOp class TLastMarker(HLSCustomOp): - """Class that corresponds to the TLastMarker node that needs to be - inserted at the end of the model for rtlsim with stitched IP. - It marks the end of the current image/input sample.""" + """Node that adds/removes AXI stream TLAST signals where needed. Its behavior + is transparent in node-by-node execution, only visible in IP-stitched rtlsim or + actual hardware. 
+ This node may be needed at the end of the network to signal a DMA write (needed by the + FINN PYNQ shell) or at the beginning to remove the end-of-burst from DMA read.""" def __init__(self, onnx_node): super().__init__(onnx_node) def get_nodeattr_types(self): my_attrs = { + # number of (static) iterations until TLAST=1 is generated for Direction=out "NumIters": ("i", True, 0), + # whether static or dynamic (from AXI lite) number of iterations are used + "DynIters": ("i", False, 1), + # direction: whether to insert or remove TLAST + "Direction": ("s", False, "out"), # width of input-output data streams, in bits "StreamWidth": ("i", True, 0), # width of individual element in stream, in bits "ElemWidth": ("i", True, 0), + # Protocol: external or internal + # Vitis docs recommend using qdma_axis for external, ap_axiu for internal + "Protocol": ("s", False, "external"), } my_attrs.update(super().get_nodeattr_types()) return my_attrs @@ -76,12 +86,33 @@ class TLastMarker(HLSCustomOp): def defines(self, var): stream_width = self.get_nodeattr("StreamWidth") + direction = self.get_nodeattr("Direction") + protocol = self.get_nodeattr("Protocol") # output stream must have TLAST, so we use this stream data type: # qdma_axis<stream_data_width,0,0,0 > - out_stream_dtype = "qdma_axis<%d,0,0,0>" % stream_width + if direction == "out": + if protocol == "external": + out_stream_dtype = "qdma_axis<%d,0,0,0>" % stream_width + elif protocol == "internal": + out_stream_dtype = "ap_axiu<%d,0,0,0>" % stream_width + else: + raise Exception("Unrecognized Protocol in TLastMarker") + in_stream_dtype = "ap_uint<%d>" % stream_width + elif direction == "in": + out_stream_dtype = "ap_uint<%d>" % stream_width + if protocol == "external": + in_stream_dtype = "qdma_axis<%d,0,0,0>" % stream_width + elif protocol == "internal": + in_stream_dtype = "ap_axiu<%d,0,0,0>" % stream_width + else: + raise Exception("Unrecognized Protocol in TLastMarker") + else: + raise Exception("Unrecognized Direction in 
TLastMarker") + self.code_gen_dict["$DEFINES$"] = [ "#define StreamWidth %d" % stream_width, "#define OutDType %s" % out_stream_dtype, + "#define InDType %s" % in_stream_dtype, "#define NumItersPerImg %d" % self.get_nodeattr("NumIters"), ] @@ -89,27 +120,60 @@ class TLastMarker(HLSCustomOp): self.code_gen_dict["$READNPYDATA$"] = [] def docompute(self): - self.code_gen_dict["$DOCOMPUTE$"] = [ - "unsigned int n = 1;", - "OutDType t;", - "t.set_keep(-1);", - "io_section: { // start of cycle accurate region", - "#pragma HLS protocol fixed", - "// do a first read from stream before we decide on numIters", - "// giving software a chance to set up the numIters prior to startup", - "t.set_data(in0.read());", - "n = (numIters == 0 ? NumItersPerImg : numIters);", - "t.set_last(n==1);", - "out.write(t);", - "} // end of cycle accurate region", - "// do one less iteration than spec since we already did one", - "for(unsigned int i=1; i<n; i++) {", - "#pragma HLS PIPELINE II=1", - "t.set_data(in0.read());", - "t.set_last(i==(n-1));", - "out.write(t);", - "}", - ] + dyn_iters = self.get_nodeattr("DynIters") + direction = self.get_nodeattr("Direction") + use_qdma_axis = self.get_nodeattr("Protocol") == "external" + if direction == "in": + # read from input and just pass data along; ignore tlast + # no dyn iters on input, it doesnt make sense + self.code_gen_dict["$DOCOMPUTE$"] = [ + "for(unsigned int i=0; i<NumItersPerImg; i++) {", + "#pragma HLS PIPELINE II=1", + "out.write(in0.read().get_data());" + if use_qdma_axis + else "out.write(in0.read().data);", + "}", + ] + + elif dyn_iters == 1: + # output, with dynamic iteration counts + self.code_gen_dict["$DOCOMPUTE$"] = [ + "unsigned int n = 1;", + "OutDType t;", + "t.set_keep(-1);" if use_qdma_axis else "t.keep = -1;", + "io_section: { // start of cycle accurate region", + "#pragma HLS protocol fixed", + "// do a first read from stream before we decide on numIters", + "// giving software a chance to set up the numIters prior to 
startup", + "t.set_data(in0.read());" if use_qdma_axis else "t.data = in0.read();", + "n = (numIters == 0 ? NumItersPerImg : numIters);", + "t.set_last(n==1);" if use_qdma_axis else "t.last = (n==1);", + "out.write(t);", + "} // end of cycle accurate region", + "// do one less iteration than spec since we already did one", + "for(unsigned int i=1; i<n; i++) {", + "#pragma HLS PIPELINE II=1", + "t.set_data(in0.read());" if use_qdma_axis else "t.data = in0.read();", + "t.set_last(i==(n-1));" if use_qdma_axis else "t.last = (i==(n-1));", + "out.write(t);", + "}", + ] + + else: + # output, with static iteration counts + self.code_gen_dict["$DOCOMPUTE$"] = [ + "unsigned int n = 1;", + "OutDType t;", + "t.set_keep(-1);" if use_qdma_axis else "t.keep = -1;", + "for(unsigned int i=0; i<NumItersPerImg; i++) {", + "#pragma HLS PIPELINE II=1", + "t.set_data(in0.read());" if use_qdma_axis else "t.data = in0.read();", + "t.set_last(i==(NumItersPerImg-1));" + if use_qdma_axis + else "t.last = (i==(NumItersPerImg-1));", + "out.write(t);", + "}", + ] def dataoutstrm(self): self.code_gen_dict["$DATAOUTSTREAM$"] = [] @@ -118,18 +182,30 @@ class TLastMarker(HLSCustomOp): self.code_gen_dict["$SAVEASCNPY$"] = [] def blackboxfunction(self): - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void %s(hls::stream<ap_uint<StreamWidth> > &in0, - hls::stream<OutDType> &out, unsigned int numIters)""" - % self.onnx_node.name - ] + dyn_iters = self.get_nodeattr("DynIters") + + if dyn_iters == 1: + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void %s(hls::stream<InDType> &in0, + hls::stream<OutDType> &out, unsigned int numIters)""" + % self.onnx_node.name + ] + else: + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void %s(hls::stream<InDType> &in0, hls::stream<OutDType> &out)""" + % self.onnx_node.name + ] def pragmas(self): self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out") - 
self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE s_axilite port=numIters bundle=control" - ) + + dyn_iters = self.get_nodeattr("DynIters") + if dyn_iters == 1: + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE s_axilite port=numIters bundle=control" + ) + self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE ap_ctrl_none port=return" ) @@ -158,7 +234,7 @@ class TLastMarker(HLSCustomOp): def strm_decl(self): self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width()) + 'hls::stream<InDType> in0 ("in0");' ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( 'hls::stream<OutDType> out ("out");' diff --git a/src/finn/custom_op/im2col.py b/src/finn/custom_op/im2col.py index 82a6b140f7af1be4e5c0f429d077b99c7865383e..8ed0041704d421dab587f08bcbcd9e739e8434e9 100644 --- a/src/finn/custom_op/im2col.py +++ b/src/finn/custom_op/im2col.py @@ -80,6 +80,8 @@ class Im2Col(CustomOp): "input_shape": ("s", True, ""), "pad_amount": ("i", False, 0), "pad_value": ("i", False, 0), + # depthwise: if != 0, infer ConvolutionInputGenerator with depthwise == 1 + "depthwise": ("i", False, 0), } def make_shape_compatible_op(self, model): diff --git a/src/finn/custom_op/quantavgpool2d.py b/src/finn/custom_op/quantavgpool2d.py index 3bc328a9f4f6670041d33491d58af6c553bafac9..fb5c78bc0c8419ba519c5c3113d9b0c7ae2dd3b7 100644 --- a/src/finn/custom_op/quantavgpool2d.py +++ b/src/finn/custom_op/quantavgpool2d.py @@ -4,6 +4,7 @@ import onnxruntime as rt from finn.custom_op import CustomOp from finn.core.datatype import DataType +from finn.custom_op.maxpoolnhwc import compute_pool_output_dim class QuantAvgPool2d(CustomOp): @@ -16,20 +17,51 @@ class QuantAvgPool2d(CustomOp): "kernel": ("i", True, 1), "ibits": ("i", True, 1), "obits": ("i", True, 1), + # determines if values are signed (set to "1") or unsigned ("0") "signed": ("i", True, 0), + # data 
layout attribute can be set to "NCHW" or "NHWC" + "data_layout": ("s", False, "NCHW"), } def make_shape_compatible_op(self, model): node = self.onnx_node k = self.get_nodeattr("kernel") s = self.get_nodeattr("stride") - return helper.make_node( - "AveragePool", - inputs=[node.input[0]], - outputs=[node.output[0]], - kernel_shape=[k, k], - strides=[s, s], - ) + data_layout = self.get_nodeattr("data_layout") + if data_layout == "NCHW": + return helper.make_node( + "AveragePool", + inputs=[node.input[0]], + outputs=[node.output[0]], + kernel_shape=[k, k], + strides=[s, s], + ) + elif data_layout == "NHWC": + iname = node.input[0] + ishape = model.get_tensor_shape(iname) + (n, hi, wi, c) = ishape + ho = compute_pool_output_dim(hi, k, s) + wo = compute_pool_output_dim(wi, k, s) + oshape = (n, ho, wo, c) + # implement tensor with correct shape + values = np.random.randn(*oshape).astype(np.float32) + return helper.make_node( + "Constant", + inputs=[], + outputs=[node.output[0]], + value=helper.make_tensor( + name="const_tensor", + data_type=TensorProto.FLOAT, + dims=values.shape, + vals=values.flatten().astype(float), + ), + ) + + else: + raise Exception( + """Datalayout for QuantAvgPool2d is set to an invalid value. 
+ Has to be set to "NCHW" or "NHWC".""" + ) def infer_node_datatype(self, model): node = self.onnx_node @@ -48,8 +80,12 @@ class QuantAvgPool2d(CustomOp): node = self.onnx_node k = self.get_nodeattr("kernel") s = self.get_nodeattr("stride") - ishape = context[node.input[0]].shape + inp_values = context[node.input[0]] oshape = context[node.output[0]].shape + if self.get_nodeattr("data_layout") == "NHWC": + inp_values = inp_values.transpose(0, 3, 1, 2) + oshape = (context[node.output[0]]).transpose(0, 3, 1, 2).shape + ishape = inp_values.shape inp = helper.make_tensor_value_info(node.input[0], TensorProto.FLOAT, ishape) outp = helper.make_tensor_value_info(node.output[0], TensorProto.FLOAT, oshape) node_avgpool = helper.make_node( @@ -66,7 +102,7 @@ class QuantAvgPool2d(CustomOp): outputs=[outp], ) model_avgpool = helper.make_model(graph_avgpool) - idict = {node.input[0]: context[node.input[0]]} + idict = {node.input[0]: inp_values} sess = rt.InferenceSession(model_avgpool.SerializeToString()) result_temp = sess.run(None, idict) # remove scaling introduced by average @@ -77,7 +113,16 @@ class QuantAvgPool2d(CustomOp): max_bit_width = int(max_value).bit_length() shift_bits = max_bit_width - self.get_nodeattr("obits") result = np.right_shift(result_temp.astype(int), shift_bits) + if self.get_nodeattr("data_layout") == "NHWC": + result = result.transpose(0, 2, 3, 1) context[node.output[0]] = result.astype(np.float32) def verify_node(self): - pass + info_messages = [] + # verify that "domain" is set to "finn" + domain_value = self.onnx_node.domain + if domain_value == "finn": + info_messages.append("Attribute domain is set correctly") + else: + info_messages.append('Attribute domain should be set to "finn"') + return info_messages diff --git a/src/finn/custom_op/registry.py b/src/finn/custom_op/registry.py index 2dae826cf9712bef17d0053a0878c41ef51fec36..0060e5d400f30055d532671c8cf1680f0668442a 100644 --- a/src/finn/custom_op/registry.py +++ 
b/src/finn/custom_op/registry.py @@ -44,7 +44,8 @@ from finn.custom_op.fpgadataflow.streamingdatawidthconverter_batch import ( StreamingDataWidthConverter_Batch, ) from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch -from finn.custom_op.fpgadataflow.fmpadding import FMPadding_Batch +from finn.custom_op.fpgadataflow.pool_batch import Pool_Batch +from finn.custom_op.fpgadataflow.fmpadding_batch import FMPadding_Batch from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch from finn.custom_op.fpgadataflow.addstreams_batch import AddStreams_Batch from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch @@ -66,6 +67,7 @@ custom_op["MaxPoolNHWC"] = MaxPoolNHWC custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch custom_op["StreamingFIFO"] = StreamingFIFO custom_op["GlobalAccPool_Batch"] = GlobalAccPool_Batch +custom_op["Pool_Batch"] = Pool_Batch custom_op["FMPadding_Batch"] = FMPadding_Batch custom_op["Thresholding_Batch"] = Thresholding_Batch custom_op["AddStreams_Batch"] = AddStreams_Batch diff --git a/src/finn/transformation/change_datalayout.py b/src/finn/transformation/change_datalayout.py new file mode 100644 index 0000000000000000000000000000000000000000..d5b393a25e57122b059a44f70904a6dbe5bbaa3f --- /dev/null +++ b/src/finn/transformation/change_datalayout.py @@ -0,0 +1,110 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from onnx import helper, TensorProto + +from finn.transformation import Transformation +from finn.transformation.infer_shapes import InferShapes +from finn.util.basic import get_by_name + + +class ChangeDataLayoutQuantAvgPool2d(Transformation): + """Replace QuantAvgPool2d with datalayout (N,C,H,W) with Transpose nodes + and QuantAvgPool2dNHWC with datalayout (N,H,W,C)""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if n.op_type == "QuantAvgPool2d" and ( + get_by_name(n.attribute, "data_layout") is None + or get_by_name(n.attribute, "data_layout").s.decode("UTF-8") == "NCHW" + ): + graph_modified = True + node_input = n.input[0] + node_output = n.output[0] + s = get_by_name(n.attribute, "stride").i + k = get_by_name(n.attribute, "kernel").i + ibits = get_by_name(n.attribute, "ibits").i + obits = get_by_name(n.attribute, "obits").i + signed = get_by_name(n.attribute, "signed").i + batchsize = 
model.get_tensor_shape(n.input[0])[0] # assume NCHW + channels = model.get_tensor_shape(n.input[0])[1] # assume NCHW + idim = model.get_tensor_shape(n.input[0])[-1] # assume NCHW + odim = model.get_tensor_shape(n.output[0])[-1] # assume NCHW + + # create new nodes + # NCHW -> NHWC + # create new intermediate values + inp_trans_out = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + (batchsize, idim, idim, channels), # NHWC + ) + graph.value_info.append(inp_trans_out) + inp_trans_out = inp_trans_out.name + quantavg_out = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + (batchsize, odim, odim, channels), + ) + graph.value_info.append(quantavg_out) + quantavg_out = quantavg_out.name + inp_trans_node = helper.make_node( + "Transpose", [node_input], [inp_trans_out], perm=[0, 2, 3, 1] + ) + quantavg_node = helper.make_node( + "QuantAvgPool2d", + [inp_trans_out], + [quantavg_out], + domain="finn", + stride=s, + kernel=k, + ibits=ibits, + obits=obits, + signed=signed, + data_layout="NHWC", + ) + # NHWC -> NCHW + out_trans_node = helper.make_node( + "Transpose", [quantavg_out], [node_output], perm=[0, 3, 1, 2] + ) + # insert nodes + graph.node.insert(node_ind, inp_trans_node) + graph.node.insert(node_ind + 1, quantavg_node) + graph.node.insert(node_ind + 2, out_trans_node) + # remove old nodes + graph.node.remove(n) + + # set shapes + model.set_tensor_shape(inp_trans_out, (batchsize, idim, idim, channels)) + model.set_tensor_shape(quantavg_out, (batchsize, odim, odim, channels)) + model = model.transform(InferShapes()) + return (model, graph_modified) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py index d421a5f3ef8ca980b399087de1482b2ae913da1b..b70b126680d650547cf376dd601c048c73a1cfd4 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py +++ 
b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py @@ -34,6 +34,7 @@ from finn.custom_op.registry import getCustomOp from finn.transformation.infer_shapes import InferShapes from finn.transformation.infer_datatypes import InferDataTypes import finn.core.data_layout as DataLayout +from finn.util.basic import get_by_name class InferConvInpGen(Transformation): @@ -56,6 +57,7 @@ class InferConvInpGen(Transformation): k = i2c_inst.get_nodeattr("kernel_size") pad = i2c_inst.get_nodeattr("pad_amount") pad_val = i2c_inst.get_nodeattr("pad_value") + depthwise = i2c_inst.get_nodeattr("depthwise") ifm_ch = i2c_in_shape[-1] ifm_dim = i2c_in_shape[1] ofm_dim = i2c_out_shape[1] @@ -67,7 +69,11 @@ class InferConvInpGen(Transformation): if pad > 0: # if padding enabled, ensure pad_val supported by DataType - assert dt.allowed(pad_val), "Im2Col DataType must support pad_val" + # assert dt.allowed(pad_val),"""FMPadding_Batch DataType + # must support pad_val""" + assert ( + pad_val == 0 + ), "FMPadding_Batch doesn't currently support pad_val!= 0" odim_padding = ifm_dim + 2 * pad @@ -112,6 +118,7 @@ class InferConvInpGen(Transformation): Stride=stride, inputDataType=dt.name, outputDataType=dt.name, + depthwise=depthwise, ) graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) # remove old nodes @@ -169,6 +176,137 @@ class InferStreamingMaxPool(Transformation): return (model, graph_modified) +class InferPool_Batch(Transformation): + """If kernel_shape > strides, replace Pool layer with a combination of Im2col + + pool(with kernel_shape == strides), plus Transpose layers to keep the original + data layout.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if n.op_type in ["MaxPool"]: + # extract pool parameters + k = get_by_name(n.attribute, "kernel_shape").ints[-1] + stride = get_by_name(n.attribute, "strides").ints[-1] + + if k <= stride: + continue + + try: + pad = get_by_name(n.attribute, 
"pads").ints[-1] + except AttributeError: + pad = 0 + + node_input = n.input[0] + node_output = n.output[0] + idt = model.get_tensor_datatype(node_input) + if not idt.is_integer(): + continue + + # odt = model.get_tensor_datatype(node_output) + + ifm_ch = model.get_tensor_shape(n.input[0])[1] # assume NCHW + ofm_ch = ifm_ch + ifm_dim = model.get_tensor_shape(n.input[0])[-1] # assume NCHW + ofm_dim = model.get_tensor_shape(n.output[0])[-1] # assume NCHW + # create new intermediate values + inp_trans_out = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + (1, ifm_dim, ifm_dim, ifm_ch), # NHWC + ) + graph.value_info.append(inp_trans_out) + inp_trans_out = inp_trans_out.name + model.set_tensor_datatype(inp_trans_out, idt) + + im2col_out = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + (1, ofm_dim, ofm_dim, ifm_ch * k * k), + ) + graph.value_info.append(im2col_out) + im2col_out = im2col_out.name + model.set_tensor_datatype(im2col_out, idt) + + pool_output = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + (1, ofm_dim, ofm_dim, ofm_ch), + ) + graph.value_info.append(pool_output) + pool_output = pool_output.name + # model.set_tensor_datatype(pool_output, odt) + + # create new nodes + # NCHW -> NHWC + inp_trans_node = helper.make_node( + "Transpose", [node_input], [inp_trans_out], perm=[0, 2, 3, 1] + ) + + if n.op_type == "MaxPool": + pool_fxn = "MaxPool" + pad_value = idt.min() + else: + raise Exception( + "pad_value and pool_fxn not configured for {}".format(n.op_type) + ) + + # format input tensor + im2col_node = helper.make_node( + "Im2Col", + [inp_trans_out], + [im2col_out], + domain="finn", + stride=stride, + kernel_size=k, + pad_amount=pad, + pad_value=pad_value, + depthwise=1, + input_shape="(1,{},{},{})".format(ifm_dim, ifm_dim, ifm_ch), + ) + + # Warning PE has to be equal to ifm_ch until Im2Col is replaced by + # ConvolutionInputGenerator with 
depthwise=1. + # For other settings the output will be incorrect due to incorrect input + # data layout + pool_node = helper.make_node( + "Pool_Batch", + [im2col_out], + [pool_output], + domain="finn", + backend="fpgadataflow", + dataType=idt.name, + Channels=ifm_ch, + PE=ifm_ch, + KernelSize=k, + Function=pool_fxn, + OutImgDim=ofm_dim, + BatchSize=1, + ) + + # NHWC -> NCHW + out_trans_node = helper.make_node( + "Transpose", [pool_output], [node_output], perm=[0, 3, 1, 2] + ) + + # insert nodes where the conv is to preserve topological ordering + graph.node.insert(node_ind, inp_trans_node) + graph.node.insert(node_ind + 1, im2col_node) + graph.node.insert(node_ind + 2, pool_node) + graph.node.insert(node_ind + 3, out_trans_node) + # remove old node + graph.node.remove(n) + graph_modified = True + + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) + + class InferBinaryStreamingFCLayer(Transformation): """Convert XnorPopcountMatMul layers to StreamingFCLayer_Batch layers. Any immediately following MultiThreshold diff --git a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py index 32f32ece585a93465ba32fede45d5eb606a2b0a3..04dd437af27b9fbe18b2255c20a8e4acda03b3d0 100644 --- a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py +++ b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py @@ -31,23 +31,34 @@ from onnx import helper as oh from finn.custom_op.registry import getCustomOp from finn.transformation import Transformation +from finn.util.basic import get_by_name + +import numpy as np class InsertTLastMarker(Transformation): - """Ensure that the graph is terminated with a TLastMarker node, inserting - one if necessary.""" + """Ensure that the graph is started/terminated with a TLastMarker node, inserting + one if necessary. Use constructor args to determine type of TLastMarker to be inserted. 
+ More information available on the TLastMarker documentation. + """ - def __init__(self): + def __init__(self, both=False, external=True, dynamic=True): super().__init__() + self.dyniters = dynamic + self.external = external + self.both = both def apply(self, model): # TODO only makes sense for a pure fpgadataflow graph -- check! graph_out_name = model.graph.output[0].name final_node = model.find_producer(graph_out_name) - if final_node.op_type == "TLastMarker": - # TODO maybe check the correctness of properties - return (model, False) - else: + graph_modified = False + if final_node.op_type != "TLastMarker" and not ( + final_node.op_type == "IODMA" + and get_by_name(final_node.attribute, "direction").s.decode("UTF-8") + == "out" + ): + custom_op = getCustomOp(final_node) num_iters = int(custom_op.get_number_output_values()) stream_width = int(custom_op.get_outstream_width()) @@ -69,8 +80,51 @@ class InsertTLastMarker(Transformation): NumIters=num_iters, StreamWidth=stream_width, ElemWidth=elem_width, + DynIters=(1 if self.dyniters else 0), + Direction="out", + Protocol=("external" if self.external else "internal"), domain="finn", backend="fpgadataflow", ) model.graph.node.append(tlast_node) - return (model, True) + graph_modified = True + # if both is True, also insert marker on input + if self.both: + graph_in_name = model.graph.input[0].name + first_node = model.find_consumer(graph_in_name) + if first_node.op_type != "TLastMarker" and not ( + first_node.op_type == "IODMA" + and get_by_name(first_node.attribute, "direction").s.decode("UTF-8") + == "in" + ): + + custom_op = getCustomOp(first_node) + num_iters = np.prod(custom_op.get_folded_input_shape()[1:-1]) + stream_width = int(custom_op.get_instream_width()) + in_shape = model.get_tensor_shape(graph_in_name) + in_dtype = model.get_tensor_datatype(graph_in_name) + elem_width = in_dtype.bitwidth() + # make new buffer + first_node_in = oh.make_tensor_value_info( + model.make_new_valueinfo_name(), 
TensorProto.FLOAT, in_shape + ) + model.graph.value_info.append(first_node_in) + model.set_tensor_datatype(first_node_in.name, in_dtype) + # reroute first node input to first_node_in + first_node.input[0] = first_node_in.name + tlast_node = oh.make_node( + "TLastMarker", + [graph_in_name], + [first_node_in.name], + NumIters=num_iters, + StreamWidth=stream_width, + ElemWidth=elem_width, + DynIters=(1 if self.dyniters else 0), + Direction="in", + Protocol=("external" if self.external else "internal"), + domain="finn", + backend="fpgadataflow", + ) + model.graph.node.insert(0, tlast_node) + graph_modified = True + return (model, graph_modified) diff --git a/src/finn/transformation/fpgadataflow/prepare_cppsim.py b/src/finn/transformation/fpgadataflow/prepare_cppsim.py index 4f050be8540ddf5ef48699d1658b571852ff4510..6eae560e1191642cfaf85d92c6d0fcf644630973 100644 --- a/src/finn/transformation/fpgadataflow/prepare_cppsim.py +++ b/src/finn/transformation/fpgadataflow/prepare_cppsim.py @@ -80,7 +80,6 @@ class PrepareCppSim(Transformation): self._num_workers = mp.cpu_count() def prepareCppSim_node(self, node): - print(node.name) if is_fpgadataflow_node(node) is True: _codegen_single_node(node, self.model) return (node, False) diff --git a/src/finn/transformation/infer_data_layouts.py b/src/finn/transformation/infer_data_layouts.py index 9ac75578ffb911cc44cfddc2b2119b55e6abf2dd..e7a6b88239a1735d5379e165333f8356ae6f88a1 100644 --- a/src/finn/transformation/infer_data_layouts.py +++ b/src/finn/transformation/infer_data_layouts.py @@ -38,7 +38,7 @@ def _dims_to_layout(model, node, ndims): return DataLayout.NC else: if node.domain == "finn": - if node.op_type == "MultiThreshold": + if node.op_type == "MultiThreshold" or node.op_type == "QuantAvgPool2d": mt_inst = registry.getCustomOp(node) layout = mt_inst.get_nodeattr("data_layout") if layout == "NHWC" and ndims == 4: diff --git a/src/finn/transformation/streamline/absorb.py b/src/finn/transformation/streamline/absorb.py 
index dbcf97361017144174f9fbfca35a84361b5abd26..4266488c7d1b86f2997d4c77d70b80f88bf37442 100644 --- a/src/finn/transformation/streamline/absorb.py +++ b/src/finn/transformation/streamline/absorb.py @@ -28,11 +28,13 @@ import numpy as np from onnx import helper as oh +import warnings from finn.core.datatype import DataType from finn.transformation import Transformation from finn.util.basic import get_by_name from finn.custom_op.registry import getCustomOp +from finn.transformation.infer_shapes import InferShapes from finn.transformation.infer_datatypes import InferDataTypes @@ -290,3 +292,38 @@ class AbsorbTransposeIntoMultiThreshold(Transformation): if graph_modified: model = model.transform(InferDataTypes()) return (model, graph_modified) + + +class AbsorbScalarMulIntoTopK(Transformation): + """Absorb a mul node into a succeeding topk node if the mul is scalar.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if n.op_type == "TopK": + prod = model.find_producer(n.input[0]) + if prod is not None and prod.op_type == "Mul": + prod_input = prod.input[0] + param_name = prod.input[1] + A = model.get_initializer(param_name) + if A is None: + warnings.warn("Param is not constant, skipping") + continue + if all(x == 1 for x in A.shape) and A > 0: + # if the mul is scalar and positive, we can just delete the + # mul node and rewire the top k node. 
Because the top k node + # works with probabilities and their relation to each other + # the relation doesn't change if every value is multiplied + # with a scalar + graph.node.remove(prod) + n.input[0] = prod_input + # to avoid error the datatype is set to float32 + model.set_tensor_datatype(n.input[0], DataType.FLOAT32) + graph_modified = True + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py index b46b82c77a3f1b70a3b05d87cd3c48fc1d94fd45..a1bd16f6d0b70193122d5d067ccdee395260c7b1 100644 --- a/src/finn/transformation/streamline/reorder.py +++ b/src/finn/transformation/streamline/reorder.py @@ -32,6 +32,7 @@ from onnx import helper as oh from finn.transformation import Transformation from finn.transformation.infer_shapes import InferShapes +from finn.core.datatype import DataType from finn.core.onnx_exec import execute_node from finn.util.basic import get_by_name from finn.custom_op.registry import getCustomOp @@ -338,6 +339,71 @@ class MoveScalarMulPastConv(Transformation): return (model, graph_modified) +class MoveMulPastDWConv(Transformation): + """Move channelwise mul operations past depthwise conv operations. We want to have muls + next to each other such that they can be collapsed into a single mul.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if ( + n.op_type == "Mul" + and not model.is_fork_node(n) + and not model.is_join_node(n) + ): + consumer = model.find_consumer(n.output[0]) + if ( + consumer is not None + and consumer.op_type == "Conv" + and not model.is_join_node(consumer) + ): + mul_weight_name = n.input[1] + A = model.get_initializer(mul_weight_name) + if A is None: + warnings.warn( + """Mul weight tensor is not set. 
If it is a constant, + please use set_initializer to set the tensor.""" + ) + continue + conv_node = consumer + mul_node = n + start_name = mul_node.input[0] + conv_in_name = conv_node.input[0] + conv_in_shape = model.get_tensor_shape(conv_in_name) + ifm_ch = conv_in_shape[1] + group_attribute = get_by_name(consumer.attribute, "group") + if group_attribute is None: + continue + group_attribute = group_attribute.i + conv_out_name = conv_node.output[0] + conv_out_shape = model.get_tensor_shape(conv_out_name) + if A.shape == (1, ifm_ch, 1, 1) and ifm_ch == group_attribute: + # if the mul is channelwise and conv is depthwise, + # we can simply swap the order of ops + # rewire mul input to be conv input + conv_node.input[0] = start_name + model.set_tensor_shape(start_name, conv_in_shape) + model.set_tensor_datatype(start_name, DataType.FLOAT32) + # use old conv input tensor as conv output + conv_node.output[0] = conv_in_name + model.set_tensor_shape(conv_in_name, conv_out_shape) + model.set_tensor_datatype(conv_in_name, DataType.FLOAT32) + # use new conv output as new mul node input + mul_node.input[0] = conv_in_name + # use old conv output as new mul node output + mul_node.output[0] = conv_out_name + model.set_tensor_datatype(conv_out_name, DataType.FLOAT32) + # move mul node past conv node + graph.node.remove(mul_node) + graph.node.insert(node_ind, mul_node) + graph_modified = True + model = model.transform(InferShapes()) + return (model, graph_modified) + + class MoveLinearPastEltwiseAdd(Transformation): """Move linear operations (mul, add) past elementwise add operations where possible. 
Specifically,matches and transforms the following patterns: diff --git a/tests/core/test_basic_onnx_exec.py b/tests/core/test_basic_onnx_exec.py index 7b0412432cc6360cb9c42d66417bd187ed142563..ddb2cbfc40c7647970f0c51ecb95340e7d1dddae 100644 --- a/tests/core/test_basic_onnx_exec.py +++ b/tests/core/test_basic_onnx_exec.py @@ -49,19 +49,33 @@ def test_mnist_onnx_download_extract_run(): raw_o = get_data("finn", "data/onnx/mnist-conv/test_data_set_0/output_0.pb") input_tensor = onnx.load_tensor_from_string(raw_i) output_tensor = onnx.load_tensor_from_string(raw_o) - # run using FINN-based execution + # run using FINN-based execution (full graph) input_dict = {"Input3": np_helper.to_array(input_tensor)} - output_dict = oxe.execute_onnx(model, input_dict) + output_dict = oxe.execute_onnx(model, input_dict, return_full_exec_context=True) assert np.isclose( np_helper.to_array(output_tensor), output_dict["Plus214_Output_0"], atol=1e-3 ).all() + # test subgraph execution + start_node = model.graph.node[1] + end_node = model.graph.node[3] + subgraph_i_dict = {start_node.input[0]: output_dict[start_node.input[0]]} + subgraph_o_dict = oxe.execute_onnx( + model, + subgraph_i_dict, + return_full_exec_context=True, + start_node=start_node, + end_node=end_node, + ) + assert np.isclose( + subgraph_o_dict[end_node.output[0]], output_dict[end_node.output[0]], atol=1e-3 + ).all() def test_onnx_exec_internal_rounding(): inp0 = onnx.helper.make_tensor_value_info("inp0", onnx.TensorProto.FLOAT, [2, 2]) inp1 = onnx.helper.make_tensor_value_info("inp1", onnx.TensorProto.FLOAT, [1]) outp = onnx.helper.make_tensor_value_info("outp", onnx.TensorProto.FLOAT, [2, 2]) - mul_node = onnx.helper.make_node("Mul", inputs=["inp0", "inp1"], outputs=["outp"],) + mul_node = onnx.helper.make_node("Mul", inputs=["inp0", "inp1"], outputs=["outp"]) graph = onnx.helper.make_graph( nodes=[mul_node], name="mul_graph", inputs=[inp0, inp1], outputs=[outp] ) diff --git 
a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..c9f78dcea1a1ce364d0657ad64de7d440d41b822 --- /dev/null +++ b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py @@ -0,0 +1,160 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import pytest + +from onnx import TensorProto, helper +import numpy as np +import finn.core.onnx_exec as oxe +from finn.core.datatype import DataType +from finn.core.modelwrapper import ModelWrapper +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +from finn.transformation.general import GiveUniqueNodeNames +from finn.custom_op.registry import getCustomOp +from finn.util.basic import gen_finn_dt_tensor +from finn.transformation.infer_shapes import InferShapes + + +def make_single_maxpool_modelwrapper(k, stride, pad, ifm_ch, ifm_dim, ofm_dim, idt): + odt = idt + inp = helper.make_tensor_value_info( + "inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim, ifm_dim] + ) + outp = helper.make_tensor_value_info( + "outp", TensorProto.FLOAT, [1, ifm_ch, ofm_dim, ofm_dim] + ) + + mp_node = helper.make_node( + "MaxPool", + ["inp"], + ["outp"], + kernel_shape=[k, k], + pads=[pad, pad, pad, pad], + strides=[stride, stride], + ) + graph = helper.make_graph( + nodes=[mp_node], name="mp_graph", inputs=[inp], outputs=[outp] + ) + + model = helper.make_model(graph, producer_name="mp-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", odt) + model = model.transform(InferShapes()) + + return model + + +def prepare_inputs(input_tensor): + return {"inp": input_tensor} + + +# input datatype +@pytest.mark.parametrize("idt", [DataType.UINT4, DataType.INT4]) +# pool configuration: ( k,stride, pad, ifm_dim ) +@pytest.mark.parametrize( + "pool_config", [(3, 2, 0, 5), (3, 2, 1, 5), (2, 
2, 0, 8), (5, 2, 2, 7)] +) +# input channels +@pytest.mark.parametrize("ifm_ch", [1, 4, 20]) +# number of out channel computed in parallel +@pytest.mark.parametrize("pe", [1, 4, 20]) +# execution mode +@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +# pool type +@pytest.mark.parametrize("op_type", ["MaxPool"]) +@pytest.mark.slow +@pytest.mark.vivado +def test_convert_to_hls_pool_batch(idt, pool_config, ifm_ch, pe, exec_mode, op_type): + k, stride, pad, ifm_dim = pool_config + + if ifm_ch % pe != 0: + pytest.skip("ifm_ch%pe != 0. Skipping") + + if pad != 0 and idt.signed(): + pytest.skip("No support for pal_val != 0. Skipping") + + np.random.seed(0) + ofm_dim = int(((ifm_dim + 2 * pad - k) / stride) + 1) + + x = gen_finn_dt_tensor(idt, (1, ifm_ch, ifm_dim, ifm_dim)) + # prepare input data + input_dict = prepare_inputs(x) + if op_type == "MaxPool": + model = make_single_maxpool_modelwrapper( + k, stride, pad, ifm_ch, ifm_dim, ofm_dim, idt + ) + else: + assert False, "{} is not a supported op_type".format(op_type) + + y_expected = oxe.execute_onnx(model, input_dict)["outp"] + + new_model = model.transform(to_hls.InferPool_Batch()) + new_model = new_model.transform(GiveUniqueNodeNames()) + + if ifm_ch != pe: + new_model = new_model.transform(to_hls.InferConvInpGen()) + # Folding + for n in new_model.graph.node: + if n.op_type == "ConvolutionInputGenerator": + inst = getCustomOp(n) + inst.set_nodeattr("SIMD", pe) + elif n.op_type == "Pool_Batch": + inst = getCustomOp(n) + inst.set_nodeattr("PE", pe) + + if exec_mode == "cppsim": + new_model = new_model.transform(SetExecMode("cppsim")) + new_model = new_model.transform(PrepareCppSim()) + new_model = new_model.transform(CompileCppSim()) + elif exec_mode == "rtlsim": + new_model = new_model.transform(SetExecMode("rtlsim")) + new_model = new_model.transform(GiveUniqueNodeNames()) + new_model = new_model.transform(PrepareIP("xc7z020clg400-1", 5)) + new_model = new_model.transform(HLSSynthIP()) + new_model = 
new_model.transform(PrepareRTLSim()) + else: + raise Exception("Unknown exec_mode") + + # execute new_model + y_produced = oxe.execute_onnx(new_model, input_dict)["outp"] + assert (y_produced == y_expected).all() + if stride != k: + if pad == 0 or ifm_ch == pe: + assert len(new_model.graph.node) == 4 + else: + assert len(new_model.graph.node) == 5 + else: + assert len(new_model.graph.node) == 1 diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py index 9d6390b2673e5d2c0e72748183ac04ed222d078e..5ff3da87228a2a32a41226bb46e0b16b1a44df50 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py +++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py @@ -23,7 +23,7 @@ test_fpga_part = pynq_part_map[test_pynq_board] target_clk_ns = 10 -def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style): +def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_style): assert pad_style == 2, "only pad_style == 2 supported in hlslib" assert padding > 0, "Output dim should be greater than input dim" odim = idim + padding @@ -47,6 +47,7 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style): inputDataType=str(idt.name), PaddingStyle=pad_style, numInputVectors=1, + SIMD=simd, ) graph = helper.make_graph( @@ -63,11 +64,13 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style): # input image dimension -@pytest.mark.parametrize("idim", [8, 16]) +@pytest.mark.parametrize("idim", [8]) # number of rows and number of cols to add @pytest.mark.parametrize("pad", [2, 3]) # number of channels -@pytest.mark.parametrize("num_ch", [1, 2]) +@pytest.mark.parametrize("num_ch", [2, 4]) +# Input parallelism +@pytest.mark.parametrize("simd", [1, 2]) # PaddingStyle: selects behavior when (odim-idim)%2 != 0 @pytest.mark.parametrize("pad_style", [2]) # FINN input datatype @@ -76,14 +79,15 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, 
idt, pad_style): @pytest.mark.parametrize("mode", ["cppsim", "rtlsim"]) @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_fmpadding(idim, pad, num_ch, pad_style, idt, mode): - +def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, pad_style, idt, mode): + if num_ch % simd != 0: + pytest.skip(" num_ch % simd != 0, skipping") # generate input data x = gen_finn_dt_tensor(idt, [1, idim, idim, num_ch]) input_dict = {"inp": x} odim = idim + pad - model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, idt, pad_style) + model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, simd, idt, pad_style) model = model.transform(InferShapes()) model = model.transform(SetExecMode(mode)) model = model.transform(GiveUniqueNodeNames()) diff --git a/tests/transformation/test_absorb_mul_into_topk.py b/tests/transformation/test_absorb_mul_into_topk.py new file mode 100644 index 0000000000000000000000000000000000000000..1394220f7c336ccea8fe9c494734c4175bf2e847 --- /dev/null +++ b/tests/transformation/test_absorb_mul_into_topk.py @@ -0,0 +1,108 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import pytest + +import numpy as np +from onnx import TensorProto, helper + +from finn.core.modelwrapper import ModelWrapper +from finn.transformation.infer_shapes import InferShapes +from finn.transformation.infer_datatypes import InferDataTypes +from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames +from finn.transformation.insert_topk import InsertTopK +from finn.transformation.streamline.absorb import AbsorbScalarMulIntoTopK +import finn.core.onnx_exec as oxe + +# parameter to indicate if mul parameter is negative or positive +@pytest.mark.parametrize("mul_positive", [True, False]) +# parameter to indicate if mul parameter is scalar or not +@pytest.mark.parametrize("scalar", [True, False]) +def test_absorb_mul_into_topk(mul_positive, scalar): + if scalar is True: + shape = [1] + else: + shape = [1, 1, 1, 1000] + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, 1, 1, 1000]) + a0 = helper.make_tensor_value_info("a0", TensorProto.FLOAT, shape) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, 1, 1, 1000]) + + mul_node = helper.make_node("Mul", ["inp", "a0"], ["outp"]) + mul_graph = helper.make_graph( + nodes=[mul_node], + 
name="mul-graph", + inputs=[inp], + outputs=[outp], + value_info=[a0], + ) + + model = helper.make_model(mul_graph, producer_name="mul_model") + model = ModelWrapper(model) + # initialize values + if mul_positive is True: + a0_values = np.random.uniform(low=0.1, high=1, size=tuple(shape)).astype( + np.float32 + ) + else: + a0_values = np.random.uniform(low=-1, high=-0.1, size=tuple(shape)).astype( + np.float32 + ) + model.set_initializer("a0", a0_values) + model = model.transform(InsertTopK()) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model_transformed = model.transform(AbsorbScalarMulIntoTopK()) + + # compare execution results + inp_values = np.random.uniform(low=-10, high=10, size=(1, 1, 1, 1000)).astype( + np.float32 + ) + idict = {"global_in": inp_values} + odict = oxe.execute_onnx(model, idict, True) + y_indices = odict["global_out"] + y_values = odict["TopK_0_out0"] + odict = oxe.execute_onnx(model_transformed, idict, True) + y_tr_indices = odict["global_out"] + y_tr_values = odict["TopK_0_out0"] + + # the indices stay the same, if the model is transformed or not + assert (y_indices == y_tr_indices).all() + + if scalar is True and mul_positive is True: + # the values change if the model was transformed + assert (y_values != y_tr_values).all() + + # check for new order + assert model.graph != model_transformed.graph + assert len(model.graph.node) - 1 == len(model_transformed.graph.node) + assert model_transformed.graph.node[0].op_type == "TopK" + + else: + assert (y_values == y_tr_values).all() + assert model.graph == model_transformed.graph diff --git a/tests/transformation/test_change_datalayout.py b/tests/transformation/test_change_datalayout.py new file mode 100644 index 0000000000000000000000000000000000000000..66459d574957575e61ec1bec631fb7030a27cca1 --- /dev/null +++ 
b/tests/transformation/test_change_datalayout.py @@ -0,0 +1,112 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import pytest +from onnx import helper, TensorProto + +from finn.custom_op.maxpoolnhwc import compute_pool_output_dim +from finn.core.modelwrapper import ModelWrapper +from finn.core.datatype import DataType +import finn.core.data_layout as DataLayout +from finn.transformation.change_datalayout import ChangeDataLayoutQuantAvgPool2d +from finn.transformation.infer_shapes import InferShapes +from finn.transformation.infer_datatypes import InferDataTypes +from finn.transformation.infer_data_layouts import InferDataLayouts +from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames +from finn.util.basic import gen_finn_dt_tensor +from finn.util.basic import get_by_name +import finn.core.onnx_exec as oxe + +# stride +@pytest.mark.parametrize("s", [1, 2]) +# kernel +@pytest.mark.parametrize("k", [3, 4]) +# ibits +@pytest.mark.parametrize("ibits", [4, 8]) +# obits +@pytest.mark.parametrize("obits", [2, 4]) +# signed +@pytest.mark.parametrize("signed", [False, True]) +# channels +@pytest.mark.parametrize("c", [2, 3]) +# input dimension +@pytest.mark.parametrize("idim", [6, 7]) +def test_change_datalayout_quantavgpool(s, k, ibits, obits, signed, c, idim): + n = 1 + odim = compute_pool_output_dim(idim, k, s) + # determine input FINN datatype + if signed is True: + prefix = "INT" + else: + prefix = "UINT" + dt_name = prefix + str(ibits) + dtype = DataType[dt_name] + + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [n, c, idim, idim]) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [n, c, odim, odim]) + + node = helper.make_node( + "QuantAvgPool2d", + ["inp"], + ["outp"], + domain="finn", + stride=s, + kernel=k, + ibits=ibits, + obits=obits, + signed=signed, + data_layout="NCHW", + ) + graph = helper.make_graph( + nodes=[node], name="single-quantavgpool", inputs=[inp], outputs=[outp] + ) + + model = helper.make_model(graph) + model = ModelWrapper(model) + model = model.transform(InferShapes()) + model = 
model.transform(InferDataTypes()) + model = model.transform(InferDataLayouts()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model_transformed = model.transform(ChangeDataLayoutQuantAvgPool2d()) + model_transformed = model_transformed.transform(InferShapes()) + model_transformed = model_transformed.transform(InferDataTypes()) + model_transformed = model_transformed.transform(InferDataLayouts()) + model_transformed = model_transformed.transform(GiveUniqueNodeNames()) + model_transformed = model_transformed.transform(GiveReadableTensorNames()) + inp_values = gen_finn_dt_tensor(dtype, [n, c, idim, idim]) + idict = {"inp": inp_values} + assert oxe.compare_execution(model, model_transformed, idict) + assert len(model.graph.node) + 2 == len(model_transformed.graph.node) + assert model_transformed.graph.node[-1].op_type == "Transpose" + assert model_transformed.graph.node[0].op_type == "Transpose" + # check if QuantAvgPool2d node has datalayout set correctly + node = model_transformed.graph.node[1] + d_layout = get_by_name(node.attribute, "data_layout").s.decode("UTF-8") + assert d_layout == "NHWC" + assert model_transformed.get_tensor_layout(node.input[0]) == DataLayout.NHWC + assert model_transformed.get_tensor_layout(node.output[0]) == DataLayout.NHWC diff --git a/tests/transformation/test_move_mul_past_dw_conv.py b/tests/transformation/test_move_mul_past_dw_conv.py new file mode 100644 index 0000000000000000000000000000000000000000..1ae8fbfe89986d58d3d71f5f8735a98469d9d1e3 --- /dev/null +++ b/tests/transformation/test_move_mul_past_dw_conv.py @@ -0,0 +1,93 @@ +import pytest + +from onnx import helper, TensorProto +from finn.custom_op.im2col import compute_conv_output_dim +import finn.core.onnx_exec as oxe +from finn.core.datatype import DataType +from finn.core.modelwrapper import ModelWrapper +from finn.transformation.infer_datatypes import InferDataTypes +from finn.transformation.infer_shapes import 
InferShapes +from finn.util.basic import gen_finn_dt_tensor +from finn.transformation.streamline.reorder import MoveMulPastDWConv + + +# input dimension +@pytest.mark.parametrize("ifm_dim", [4, 7]) +# input channels +@pytest.mark.parametrize("ifm_ch", [2, 3]) +# kernel size +@pytest.mark.parametrize("k", [2, 3]) +# stride +@pytest.mark.parametrize("stride", [1, 2]) +# padding +@pytest.mark.parametrize("pad_amt", [0, 1]) +# depthwise +@pytest.mark.parametrize("dw", [0, 1]) +def test_move_mul_past_dw_conv(ifm_dim, ifm_ch, k, stride, pad_amt, dw): + if dw == 1: + ofm_ch = ifm_ch + groups = ifm_ch + W_shape = [ofm_ch, 1, k, k] + else: + ofm_ch = ifm_ch + 2 + groups = 1 + W_shape = [ofm_ch, ifm_ch, k, k] + + ofm_dim = compute_conv_output_dim(ifm_dim, k, stride, pad_amt) + + # set up onnx model + inp = helper.make_tensor_value_info( + "inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim, ifm_dim] + ) + mul = helper.make_tensor_value_info("mul", TensorProto.FLOAT, [1, ifm_ch, 1, 1]) + W = helper.make_tensor_value_info("W", TensorProto.FLOAT, W_shape) + outp = helper.make_tensor_value_info( + "outp", TensorProto.FLOAT, [1, ofm_ch, ofm_dim, ofm_dim] + ) + + Mul_node = helper.make_node("Mul", ["inp", "mul"], ["mul_out"]) + + Conv_node = helper.make_node( + "Conv", + ["mul_out", "W"], + ["outp"], + group=groups, + kernel_shape=[k, k], + pads=[pad_amt, pad_amt, pad_amt, pad_amt], + strides=[stride, stride], + ) + + graph = helper.make_graph( + nodes=[Mul_node, Conv_node], + name="mulpastconv_graph", + inputs=[inp], + outputs=[outp], + value_info=[mul, W], + ) + + model = helper.make_model(graph, producer_name="mulpastconv-model") + model = ModelWrapper(model) + inp_values = gen_finn_dt_tensor(DataType.INT2, [1, ifm_ch, ifm_dim, ifm_dim]) + mul_values = gen_finn_dt_tensor(DataType.INT2, [1, ifm_ch, 1, 1]) + W_values = gen_finn_dt_tensor(DataType.INT2, W_shape) + model.set_initializer("W", W_values) + model.set_initializer("mul", mul_values) + model = model.transform(InferShapes()) + 
model = model.transform(InferDataTypes()) + idict = {"inp": inp_values} + odict = oxe.execute_onnx(model, idict, True) + out_before = odict["outp"] + + # move channelwise multiplication past depthwise conv + model_transformed = model.transform(MoveMulPastDWConv()) + odict = oxe.execute_onnx(model_transformed, idict, True) + out_after = odict["outp"] + + assert (out_before == out_after).all() + + if dw == 0: + assert model.graph.node[0].op_type == model_transformed.graph.node[0].op_type + assert model.graph.node[1].op_type == model_transformed.graph.node[1].op_type + else: + assert model.graph.node[0].op_type == model_transformed.graph.node[1].op_type + assert model.graph.node[1].op_type == model_transformed.graph.node[0].op_type