diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh index 65513f3148e0fed2583d02e1eba249bc9a1f2f6e..80e5261c9b6568e0f340ae0add86d269c036ff16 100644 --- a/docker/finn_entrypoint.sh +++ b/docker/finn_entrypoint.sh @@ -13,9 +13,9 @@ gecho () { # checkout the correct dependency repo commits # the repos themselves are cloned in the Dockerfile -BREVITAS_COMMIT=7696326e5f279cacffd5b6ac8d9e8d81deec3978 +BREVITAS_COMMIT=026a509186b7e7b0b65d46a2f905043d41069306 CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4 -HLSLIB_COMMIT=13e9b0772a27a3a1efc40c878d8e78ed09efb716 +HLSLIB_COMMIT=8aed899c278c36c977a249558d71795086cf852c PYVERILATOR_COMMIT=c97a5ba41bbc7c419d6f25c74cdf3bdc3393174f PYNQSHELL_COMMIT=0c82a61b0ec1a07fa275a14146233824ded7a13d OMX_COMMIT=1bae737669901e762f581af73348332b5c4b2ada diff --git a/docs/finn/getting_started.rst b/docs/finn/getting_started.rst index 95594bb67a2be3a4c3fbba488c75a704f623c136..5beabeb2980840fd1dbd2ee5b058738fa8553152 100644 --- a/docs/finn/getting_started.rst +++ b/docs/finn/getting_started.rst @@ -18,6 +18,7 @@ Requirements * A working Vivado 2019.1 installation * A `VIVADO_PATH` environment variable pointing to the Vivado installation directory (e.g. the directory where settings64.sh is located) * (optional) A PYNQ board with a network connection + * the ``bitstring`` package must be installed on the PYNQ: ``sudo pip3 install bitstring`` Running FINN in Docker ====================== diff --git a/docs/finn/internals.rst b/docs/finn/internals.rst index 010cdece978cde078c3df4c64177fa1c5455aa0a..dee62f09a9253380e05300dac8fa34915c20dab5 100644 --- a/docs/finn/internals.rst +++ b/docs/finn/internals.rst @@ -18,6 +18,7 @@ ONNX does not support datatypes smaller than 8-bit integers, whereas in FINN we Note that FINN uses floating point tensors as a carrier data type to represent integers. Floating point arithmetic can introduce rounding errors, e.g. (int_num * float_scale) / float_scale is not always equal to int_num. When using the custom ONNX execution flow, FINN will attempt to sanitize any rounding errors for integer tensors. See (:py:mod:`finn.util.basic.sanitize_quant_values`) for more information. +This behavior can be disabled (not recommended!) by setting the environment variable SANITIZE_QUANT_TENSORS=0. Custom Operations/Nodes ======================= diff --git a/src/finn/core/onnx_exec.py b/src/finn/core/onnx_exec.py index 218df22e07537034b377abc077aa7902bc0c4cfc..efdfaa19d9f9e5dfa41911a2184e989337b3d9c2 100644 --- a/src/finn/core/onnx_exec.py +++ b/src/finn/core/onnx_exec.py @@ -39,7 +39,7 @@ from finn.core.remote_exec import remote_exec from finn.core.rtlsim_exec import rtlsim_exec from finn.custom_op.registry import getCustomOp import finn.analysis.topology as ta -from finn.util.basic import sanitize_quant_values +from finn.util.basic import sanitize_quant_values, get_sanitize_quant_tensors def execute_node(node, context, graph): @@ -160,14 +160,17 @@ def execute_onnx(model, input_dict, return_full_exec_context=False): # we can simply walk down the list since the ONNX spec guarantees that it is # topologically sorted for node in graph.node: - # call util function match input values to quantization annotation - execution_context = sanitize_quant_values( - model, node.input, execution_context - ) + if get_sanitize_quant_tensors() != 0: + # round input values to match quantization annotation + execution_context = sanitize_quant_values( + model, node.input, execution_context + ) execute_node(node, execution_context, graph) - execution_context = sanitize_quant_values( - model, node.output, execution_context - ) + if get_sanitize_quant_tensors() != 0: + # round output values to quantization annotation + execution_context = sanitize_quant_values( + model, node.output, execution_context + ) elif model_exec_mode == "remote_pynq": # use remote exec metadata built into model to execute on a remote PYNQ remote_exec(model, execution_context) diff --git a/src/finn/custom_op/fpgadataflow/fmpadding.py b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py similarity index 88% rename from src/finn/custom_op/fpgadataflow/fmpadding.py rename to src/finn/custom_op/fpgadataflow/fmpadding_batch.py index fa321dfa65d14b67fa218fb6a49f602ddab8d57e..d326ae7dfc7830a0081c3b13233d67ef08b12eff 100644 --- a/src/finn/custom_op/fpgadataflow/fmpadding.py +++ b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py @@ -21,6 +21,8 @@ class FMPadding_Batch(HLSCustomOp): "Padding": ("i", True, 2), # number of channels in input image "NumChannels": ("i", True, 0), + # SIMD Input parallelism + "SIMD": ("i", False, 1), # FINN input datatype "inputDataType": ("s", True, ""), # controls distribution of padded pixels @@ -55,20 +57,22 @@ class FMPadding_Batch(HLSCustomOp): return oshape def get_folded_input_shape(self): - # even though there is no folding in the current hlslib op, - # insert a time multiplexing axis to remain compatible with the - # shapes produced by the rest of the dataflow pipeline - ret = list(self.get_normal_input_shape()) - ret.insert(-1, 1) - return tuple(ret) + normal_ishape = list(self.get_normal_input_shape()) + ifm_ch = self.get_nodeattr("NumChannels") + simd = self.get_nodeattr("SIMD") + assert ifm_ch % simd == 0, "SIMD must divide input channels" + fold = int(normal_ishape[-1] / simd) + folded_ishape = normal_ishape[:-1] + [fold, simd] + return tuple(folded_ishape) def get_folded_output_shape(self): - # even though there is no folding in the current hlslib op, - # insert a time multiplexing axis to remain compatible with the - # shapes produced by the rest of the dataflow pipeline - ret = list(self.get_normal_output_shape()) - ret.insert(-1, 1) - return tuple(ret) + normal_oshape = list(self.get_normal_output_shape()) + ifm_ch = self.get_nodeattr("NumChannels") + simd = self.get_nodeattr("SIMD") + assert ifm_ch % simd == 0, "SIMD must divide input channels" + fold = int(normal_oshape[-1] / simd) + folded_oshape = normal_oshape[:-1] + [fold, simd] + return tuple(folded_oshape) def make_shape_compatible_op(self, model): exp_ishape = self.get_normal_input_shape() @@ -114,15 +118,13 @@ class FMPadding_Batch(HLSCustomOp): def get_instream_width(self): ibits = self.get_input_datatype().bitwidth() - num_ch = self.get_nodeattr("NumChannels") - - return ibits * num_ch + simd = self.get_nodeattr("SIMD") + return ibits * simd def get_outstream_width(self): obits = self.get_output_datatype().bitwidth() - num_ch = self.get_nodeattr("NumChannels") - - return obits * num_ch + simd = self.get_nodeattr("SIMD") + return obits * simd def get_number_output_values(self): folded_oshape = self.get_folded_output_shape() @@ -135,13 +137,15 @@ class FMPadding_Batch(HLSCustomOp): self.code_gen_dict["$DEFINES$"] = [ """#define ImgDim1 {}\n#define OutputDim1 {}\n #define Padding1 {}\n#define NumChannels1 {}\n - #define PaddingStyle1 {}\n#define numReps {}\n""".format( + #define PaddingStyle1 {}\n#define numReps {} + #define SIMD1 {}\n""".format( self.get_nodeattr("ImgDim"), self.get_padded_odim(), self.get_nodeattr("Padding"), self.get_nodeattr("NumChannels"), self.get_nodeattr("PaddingStyle"), self.get_nodeattr("numInputVectors"), + self.get_nodeattr("SIMD"), ) ] @@ -176,7 +180,7 @@ class FMPadding_Batch(HLSCustomOp): in_t = self.get_input_datatype().get_hls_datatype_str() node = self.onnx_node self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}<ImgDim1, OutputDim1, Padding1, NumChannels1, + """{}<ImgDim1, OutputDim1, Padding1, NumChannels1,SIMD1, {}, PaddingStyle1> (in0, out, numReps);""".format( node.op_type, in_t ) @@ -232,6 +236,7 @@ class FMPadding_Batch(HLSCustomOp): node = self.onnx_node exp_ishape = self.get_normal_input_shape() exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() folded_oshape = self.get_folded_output_shape() if mode == "cppsim": @@ -254,10 +259,8 @@ class FMPadding_Batch(HLSCustomOp): match expected shape (1, ImgDim, ImgDim, NumChannels).""" export_idt = self.get_input_datatype() - # no reshaping for input since assuming no folding on input - # make copy before saving array - inp = inp.copy() - np.save(os.path.join(code_gen_dir, "input_0.npy"), inp) + reshaped_input = inp.reshape(folded_ishape) + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) if mode == "cppsim": # execute the precompiled model diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py index 9b73ba1e100aa83fd19aa8799195c99891fca3fd..b6c992e4b5ea1ced088928b3d9a4db381d82db22 100644 --- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py +++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py @@ -290,12 +290,15 @@ class StreamingFCLayer_Batch(HLSCustomOp): return out_width def get_weightstream_width(self): - """Returns weight stream width. Used in decoupled mode.""" - pe = self.get_nodeattr("PE") - simd = self.get_nodeattr("SIMD") - wp = self.get_weight_datatype().bitwidth() - w_width = pe * simd * wp - return w_width + """Returns weight stream width. Used only in decoupled mode.""" + if self.get_nodeattr("mem_mode") == "decoupled": + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + wp = self.get_weight_datatype().bitwidth() + w_width = pe * simd * wp + return w_width + else: + return 0 def get_weightstream_width_padded(self): """Returns weight stream width padded to a multiple of 8. This is required diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py index 1a8216f64bf71b7fb9f1f8becf4732970b5bf451..1da60a5124fa86b4336bae8fd1a587672f2f2e6f 100644 --- a/src/finn/custom_op/fpgadataflow/templates.py +++ b/src/finn/custom_op/fpgadataflow/templates.py @@ -99,6 +99,7 @@ set_top $config_toplevelfxn open_solution sol1 set_part $config_proj_part +config_compile -ignore_long_run_time -disable_unroll_code_size_check config_interface -m_axi_addr64 config_rtl -auto_prefix $EXTRA_DIRECTIVES$ diff --git a/src/finn/custom_op/fpgadataflow/tlastmarker.py b/src/finn/custom_op/fpgadataflow/tlastmarker.py index 25ea05e3607a52731ae1b64de421837bf137ee2b..17ba44b959577faf573d77ae222f7b2a3be6669d 100644 --- a/src/finn/custom_op/fpgadataflow/tlastmarker.py +++ b/src/finn/custom_op/fpgadataflow/tlastmarker.py @@ -30,20 +30,30 @@ from finn.custom_op.fpgadataflow import HLSCustomOp class TLastMarker(HLSCustomOp): - """Class that corresponds to the TLastMarker node that needs to be - inserted at the end of the model for rtlsim with stitched IP. - It marks the end of the current image/input sample.""" + """Node that adds/removes AXI stream TLAST signals where needed. Its behavior + is transparent in node-by-node execution, only visible in IP-stitched rtlsim or + actual hardware. + This node may be needed at the end of the network to signal a DMA write (needed by the + FINN PYNQ shell) or at the beginning to remove the end-of-burst from DMA read.""" def __init__(self, onnx_node): super().__init__(onnx_node) def get_nodeattr_types(self): my_attrs = { + # number of (static) iterations until TLAST=1 is generated for Direction=out "NumIters": ("i", True, 0), + # whether static or dynamic (from AXI lite) number of iterations are used + "DynIters": ("i", False, 1), + # direction: whether to insert or remove TLAST + "Direction": ("s", False, "out"), # width of input-output data streams, in bits "StreamWidth": ("i", True, 0), # width of individual element in stream, in bits "ElemWidth": ("i", True, 0), + # Protocol: external or internal + # Vitis docs recommend using qdma_axis for external, ap_axiu for internal + "Protocol": ("s", False, "external"), } my_attrs.update(super().get_nodeattr_types()) return my_attrs @@ -76,12 +86,33 @@ class TLastMarker(HLSCustomOp): def defines(self, var): stream_width = self.get_nodeattr("StreamWidth") + direction = self.get_nodeattr("Direction") + protocol = self.get_nodeattr("Protocol") # output stream must have TLAST, so we use this stream data type: # qdma_axis<stream_data_width,0,0,0 > - out_stream_dtype = "qdma_axis<%d,0,0,0>" % stream_width + if direction == "out": + if protocol == "external": + out_stream_dtype = "qdma_axis<%d,0,0,0>" % stream_width + elif protocol == "internal": + out_stream_dtype = "ap_axiu<%d,0,0,0>" % stream_width + else: + raise Exception("Unrecognized Protocol in TLastMarker") + in_stream_dtype = "ap_uint<%d>" % stream_width + elif direction == "in": + out_stream_dtype = "ap_uint<%d>" % stream_width + if protocol == "external": + in_stream_dtype = "qdma_axis<%d,0,0,0>" % stream_width + elif protocol == "internal": + in_stream_dtype = "ap_axiu<%d,0,0,0>" % stream_width + else: + raise Exception("Unrecognized Protocol in TLastMarker") + else: + raise Exception("Unrecognized Direction in TLastMarker") + self.code_gen_dict["$DEFINES$"] = [ "#define StreamWidth %d" % stream_width, "#define OutDType %s" % out_stream_dtype, + "#define InDType %s" % in_stream_dtype, "#define NumItersPerImg %d" % self.get_nodeattr("NumIters"), ] @@ -89,27 +120,60 @@ class TLastMarker(HLSCustomOp): self.code_gen_dict["$READNPYDATA$"] = [] def docompute(self): - self.code_gen_dict["$DOCOMPUTE$"] = [ - "unsigned int n = 1;", - "OutDType t;", - "t.set_keep(-1);", - "io_section: { // start of cycle accurate region", - "#pragma HLS protocol fixed", - "// do a first read from stream before we decide on numIters", - "// giving software a chance to set up the numIters prior to startup", - "t.set_data(in0.read());", - "n = (numIters == 0 ? NumItersPerImg : numIters);", - "t.set_last(n==1);", - "out.write(t);", - "} // end of cycle accurate region", - "// do one less iteration than spec since we already did one", - "for(unsigned int i=1; i<n; i++) {", - "#pragma HLS PIPELINE II=1", - "t.set_data(in0.read());", - "t.set_last(i==(n-1));", - "out.write(t);", - "}", - ] + dyn_iters = self.get_nodeattr("DynIters") + direction = self.get_nodeattr("Direction") + use_qdma_axis = self.get_nodeattr("Protocol") == "external" + if direction == "in": + # read from input and just pass data along; ignore tlast + # no dyn iters on input, it doesnt make sense + self.code_gen_dict["$DOCOMPUTE$"] = [ + "for(unsigned int i=0; i<NumItersPerImg; i++) {", + "#pragma HLS PIPELINE II=1", + "out.write(in0.read().get_data());" + if use_qdma_axis + else "out.write(in0.read().data);", + "}", + ] + + elif dyn_iters == 1: + # output, with dynamic iteration counts + self.code_gen_dict["$DOCOMPUTE$"] = [ + "unsigned int n = 1;", + "OutDType t;", + "t.set_keep(-1);" if use_qdma_axis else "t.keep = -1;", + "io_section: { // start of cycle accurate region", + "#pragma HLS protocol fixed", + "// do a first read from stream before we decide on numIters", + "// giving software a chance to set up the numIters prior to startup", + "t.set_data(in0.read());" if use_qdma_axis else "t.data = in0.read();", + "n = (numIters == 0 ? NumItersPerImg : numIters);", + "t.set_last(n==1);" if use_qdma_axis else "t.last = (n==1);", + "out.write(t);", + "} // end of cycle accurate region", + "// do one less iteration than spec since we already did one", + "for(unsigned int i=1; i<n; i++) {", + "#pragma HLS PIPELINE II=1", + "t.set_data(in0.read());" if use_qdma_axis else "t.data = in0.read();", + "t.set_last(i==(n-1));" if use_qdma_axis else "t.last = (i==(n-1));", + "out.write(t);", + "}", + ] + + else: + # output, with static iteration counts + self.code_gen_dict["$DOCOMPUTE$"] = [ + "unsigned int n = 1;", + "OutDType t;", + "t.set_keep(-1);" if use_qdma_axis else "t.keep = -1;", + "for(unsigned int i=0; i<NumItersPerImg; i++) {", + "#pragma HLS PIPELINE II=1", + "t.set_data(in0.read());" if use_qdma_axis else "t.data = in0.read();", + "t.set_last(i==(NumItersPerImg-1));" + if use_qdma_axis + else "t.last = (i==(NumItersPerImg-1));", + "out.write(t);", + "}", + ] def dataoutstrm(self): self.code_gen_dict["$DATAOUTSTREAM$"] = [] @@ -118,18 +182,30 @@ class TLastMarker(HLSCustomOp): self.code_gen_dict["$SAVEASCNPY$"] = [] def blackboxfunction(self): - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void %s(hls::stream<ap_uint<StreamWidth> > &in0, - hls::stream<OutDType> &out, unsigned int numIters)""" - % self.onnx_node.name - ] + dyn_iters = self.get_nodeattr("DynIters") + + if dyn_iters == 1: + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void %s(hls::stream<InDType> &in0, + hls::stream<OutDType> &out, unsigned int numIters)""" + % self.onnx_node.name + ] + else: + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void %s(hls::stream<InDType> &in0, hls::stream<OutDType> &out)""" + % self.onnx_node.name + ] def pragmas(self): self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out") - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE s_axilite port=numIters bundle=control" - ) + + dyn_iters = self.get_nodeattr("DynIters") + if dyn_iters == 1: + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE s_axilite port=numIters bundle=control" + ) + self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE ap_ctrl_none port=return" ) @@ -158,7 +234,7 @@ class TLastMarker(HLSCustomOp): def strm_decl(self): self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width()) + 'hls::stream<InDType> in0 ("in0");' ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( 'hls::stream<OutDType> out ("out");' diff --git a/src/finn/custom_op/quantavgpool2d.py b/src/finn/custom_op/quantavgpool2d.py new file mode 100644 index 0000000000000000000000000000000000000000..3bc328a9f4f6670041d33491d58af6c553bafac9 --- /dev/null +++ b/src/finn/custom_op/quantavgpool2d.py @@ -0,0 +1,83 @@ +import numpy as np +from onnx import TensorProto, helper +import onnxruntime as rt + +from finn.custom_op import CustomOp +from finn.core.datatype import DataType + + +class QuantAvgPool2d(CustomOp): + """Class that corresponds to the quantized average pooling + layer from brevitas""" + + def get_nodeattr_types(self): + return { + "stride": ("i", True, 1), + "kernel": ("i", True, 1), + "ibits": ("i", True, 1), + "obits": ("i", True, 1), + "signed": ("i", True, 0), + } + + def make_shape_compatible_op(self, model): + node = self.onnx_node + k = self.get_nodeattr("kernel") + s = self.get_nodeattr("stride") + return helper.make_node( + "AveragePool", + inputs=[node.input[0]], + outputs=[node.output[0]], + kernel_shape=[k, k], + strides=[s, s], + ) + + def infer_node_datatype(self, model): + node = self.onnx_node + bw = self.get_nodeattr("obits") + if bw in [2, 4, 8, 16, 32]: + if self.get_nodeattr("signed") == 0: + dtype = DataType["UINT%d" % bw] + else: + dtype = DataType["INT%d" % bw] + else: + raise Exception("Unsupported output datatype for QuantAvgPool2d") + model.set_tensor_datatype(node.output[0], dtype) + + def execute_node(self, context, graph): + # create a standard average pooling node to help calculate the result + node = self.onnx_node + k = self.get_nodeattr("kernel") + s = self.get_nodeattr("stride") + ishape = context[node.input[0]].shape + oshape = context[node.output[0]].shape + inp = helper.make_tensor_value_info(node.input[0], TensorProto.FLOAT, ishape) + outp = helper.make_tensor_value_info(node.output[0], TensorProto.FLOAT, oshape) + node_avgpool = helper.make_node( + "AveragePool", + inputs=[node.input[0]], + outputs=[node.output[0]], + kernel_shape=[k, k], + strides=[s, s], + ) + graph_avgpool = helper.make_graph( + nodes=[node_avgpool], + name="single-avgpool-exec", + inputs=[inp], + outputs=[outp], + ) + model_avgpool = helper.make_model(graph_avgpool) + idict = {node.input[0]: context[node.input[0]]} + sess = rt.InferenceSession(model_avgpool.SerializeToString()) + result_temp = sess.run(None, idict) + # remove scaling introduced by average + result_temp = result_temp[0] * (k * k) + ibits = self.get_nodeattr("ibits") + max_value = 2 ** ibits - 1 + max_value = max_value * k * k + max_bit_width = int(max_value).bit_length() + shift_bits = max_bit_width - self.get_nodeattr("obits") + result = np.right_shift(result_temp.astype(int), shift_bits) + context[node.output[0]] = result.astype(np.float32) + + def verify_node(self): + pass diff --git a/src/finn/custom_op/registry.py b/src/finn/custom_op/registry.py index 614a3d7ffd70d0b102bad2b76177a2d3b32765c7..3bfdebed58b3995cfeb6e97ed2836d9ee6a0da79 100644 --- a/src/finn/custom_op/registry.py +++ b/src/finn/custom_op/registry.py @@ -44,10 +44,11 @@ from finn.custom_op.fpgadataflow.streamingdatawidthconverter_batch import ( StreamingDataWidthConverter_Batch, ) from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch -from finn.custom_op.fpgadataflow.fmpadding import FMPadding_Batch +from finn.custom_op.fpgadataflow.fmpadding_batch import FMPadding_Batch from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch from finn.custom_op.fpgadataflow.addstreams_batch import AddStreams_Batch from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch +from finn.custom_op.quantavgpool2d import QuantAvgPool2d from finn.custom_op.fpgadataflow.duplicatestreams_batch import DuplicateStreams_Batch # create a mapping of all known CustomOp names and classes @@ -69,6 +70,7 @@ custom_op["FMPadding_Batch"] = FMPadding_Batch custom_op["Thresholding_Batch"] = Thresholding_Batch custom_op["AddStreams_Batch"] = AddStreams_Batch custom_op["LabelSelect_Batch"] = LabelSelect_Batch +custom_op["QuantAvgPool2d"] = QuantAvgPool2d custom_op["DuplicateStreams_Batch"] = DuplicateStreams_Batch diff --git a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py index 32f32ece585a93465ba32fede45d5eb606a2b0a3..04dd437af27b9fbe18b2255c20a8e4acda03b3d0 100644 --- a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py +++ b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py @@ -31,23 +31,34 @@ from onnx import helper as oh from finn.custom_op.registry import getCustomOp from finn.transformation import Transformation +from finn.util.basic import get_by_name + +import numpy as np class InsertTLastMarker(Transformation): - """Ensure that the graph is terminated with a TLastMarker node, inserting - one if necessary.""" + """Ensure that the graph is started/terminated with a TLastMarker node, inserting + one if necessary. Use constructor args to determine type of TLastMarker to be inserted. + More information available on the TLastMarker documentation. + """ - def __init__(self): + def __init__(self, both=False, external=True, dynamic=True): super().__init__() + self.dyniters = dynamic + self.external = external + self.both = both def apply(self, model): # TODO only makes sense for a pure fpgadataflow graph -- check! graph_out_name = model.graph.output[0].name final_node = model.find_producer(graph_out_name) - if final_node.op_type == "TLastMarker": - # TODO maybe check the correctness of properties - return (model, False) - else: + graph_modified = False + if final_node.op_type != "TLastMarker" and not ( + final_node.op_type == "IODMA" + and get_by_name(final_node.attribute, "direction").s.decode("UTF-8") + == "out" + ): + custom_op = getCustomOp(final_node) num_iters = int(custom_op.get_number_output_values()) stream_width = int(custom_op.get_outstream_width()) @@ -69,8 +80,51 @@ class InsertTLastMarker(Transformation): NumIters=num_iters, StreamWidth=stream_width, ElemWidth=elem_width, + DynIters=(1 if self.dyniters else 0), + Direction="out", + Protocol=("external" if self.external else "internal"), domain="finn", backend="fpgadataflow", ) model.graph.node.append(tlast_node) - return (model, True) + graph_modified = True + # if both is True, also insert marker on input + if self.both: + graph_in_name = model.graph.input[0].name + first_node = model.find_consumer(graph_in_name) + if first_node.op_type != "TLastMarker" and not ( + first_node.op_type == "IODMA" + and get_by_name(first_node.attribute, "direction").s.decode("UTF-8") + == "in" + ): + + custom_op = getCustomOp(first_node) + num_iters = np.prod(custom_op.get_folded_input_shape()[1:-1]) + stream_width = int(custom_op.get_instream_width()) + in_shape = model.get_tensor_shape(graph_in_name) + in_dtype = model.get_tensor_datatype(graph_in_name) + elem_width = in_dtype.bitwidth() + # make new buffer + first_node_in = oh.make_tensor_value_info( + model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape + ) + model.graph.value_info.append(first_node_in) + model.set_tensor_datatype(first_node_in.name, in_dtype) + # reroute final node output to first_node_in_name + first_node.input[0] = first_node_in.name + tlast_node = oh.make_node( + "TLastMarker", + [graph_in_name], + [first_node_in.name], + NumIters=num_iters, + StreamWidth=stream_width, + ElemWidth=elem_width, + DynIters=(1 if self.dyniters else 0), + Direction="in", + Protocol=("external" if self.external else "internal"), + domain="finn", + backend="fpgadataflow", + ) + model.graph.node.insert(0, tlast_node) + graph_modified = True + return (model, graph_modified) diff --git a/src/finn/transformation/infer_datatypes.py b/src/finn/transformation/infer_datatypes.py index 1acd4e3abe2d77248810cf15c15475e806a3bd32..39b7a787be8c725e7b6d474757dd96fc4848dfe0 100644 --- a/src/finn/transformation/infer_datatypes.py +++ b/src/finn/transformation/infer_datatypes.py @@ -71,7 +71,13 @@ def _infer_node_datatype(model, node): else: # unknown, assume node produces float32 outputs for o in node.output: - model.set_tensor_datatype(o, DataType.FLOAT32) + # check if output datatype is already set to a value != FLOAT32 + odtype = model.get_tensor_datatype(o) + if odtype is not None and odtype != DataType.FLOAT32: + # don't change data type + model.set_tensor_datatype(o, odtype) + else: + model.set_tensor_datatype(o, DataType.FLOAT32) # compare old and new output dtypes to see if anything changed new_odtypes = list(map(lambda x: model.get_tensor_datatype(x), node.output)) graph_modified = new_odtypes != odtypes diff --git a/src/finn/transformation/streamline/collapse_repeated.py b/src/finn/transformation/streamline/collapse_repeated.py index 67824ad4f633983b93e3178d03118927a1ddd85b..769bed841ce07c1c9c62f762de4b2c0937a6d68f 100644 --- a/src/finn/transformation/streamline/collapse_repeated.py +++ b/src/finn/transformation/streamline/collapse_repeated.py @@ -30,6 +30,7 @@ from onnx import helper as oh from finn.transformation import Transformation from finn.transformation.infer_shapes import InferShapes +from finn.core.datatype import DataType class CollapseRepeatedOp(Transformation): @@ -83,6 +84,9 @@ class CollapseRepeatedOp(Transformation): graph.node.insert(node_ind, new_node) # replace parameter value model.set_initializer(new_node_param_name, new_param) + # be conservative with param/output DataTypes + model.set_tensor_datatype(new_node_param_name, DataType.FLOAT32) + model.set_tensor_datatype(end_name, DataType.FLOAT32) # remove old nodes graph.node.remove(n) graph.node.remove(consumer) diff --git a/src/finn/transformation/streamline/remove.py b/src/finn/transformation/streamline/remove.py new file mode 100644 index 0000000000000000000000000000000000000000..ddc4233ddafbc70c4d20d316ea72ea6bba1b82a8 --- /dev/null +++ b/src/finn/transformation/streamline/remove.py @@ -0,0 +1,69 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +from finn.transformation import Transformation +from finn.transformation.infer_shapes import InferShapes +import numpy as np + +class RemoveIdentityOps(Transformation): + """Remove identity ops like Add/Sub with zero or Mul/Div with one""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if ( + n.op_type in ["Add", "Sub"] + and not model.is_fork_node(n) + and not model.is_join_node(n) + ): + A = model.get_initializer(n.input[1]) + if A is not None and (A == np.zeros_like(A)).all(): + producer = model.find_producer(n.input[0]) + # remove node and wire output tensor to + # output of producer node + producer.output[0] = n.output[0] + graph.node.remove(n) + + elif ( + n.op_type in ["Mul", "Div"] + and not model.is_fork_node(n) + and not model.is_join_node(n) + ): + A = model.get_initializer(n.input[1]) + if A is not None and (A == np.ones_like(A)).all(): + producer = model.find_producer(n.input[0]) + # remove node and wire output tensor to + # output of producer node + producer.output[0] = n.output[0] + graph.node.remove(n) + model = model.transform(InferShapes()) + return (model, graph_modified) diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index 585d6b90a59ca9f3dac56a6133de705c2f56f025..4a8277e08d3fc21e0b20668edf2ecad947b36647 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -106,6 +106,25 @@ def get_finn_root(): ) +def get_execution_error_thresh(): + "Return the max error that is allowed for rounding in FINN execution." + try: + return float(os.environ["ERROR_THRESH"]) + except KeyError: + return 1e-2 + + +def get_sanitize_quant_tensors(): + """Return whether tensors with quantization annotations should be sanitized. + Enabled by default, disabling will yield faster ONNX execution but may give + incorrect results. Use with caution.""" + try: + return int(os.environ["SANITIZE_QUANT_TENSORS"]) + except KeyError: + # enabled by default + return 1 + + def make_build_dir(prefix=""): """Creates a temporary folder with given prefix to be used as a build dir. Use this function instead of tempfile.mkdtemp to ensure any generated files @@ -305,7 +324,7 @@ def sanitize_quant_values(model, node_tensors, execution_context, check_values=F ) # check if rounded values are not too far from original values max_error = max(np.abs(current_values - updated_values).flatten()) - if max_error <= 1e-4: + if max_error <= get_execution_error_thresh(): if check_values is True: # check again if values can now be represented with set finn datatype # TODO: vectorize with numpy diff --git a/tests/brevitas/test_brevitas_avg_pool_export.py b/tests/brevitas/test_brevitas_avg_pool_export.py new file mode 100644 index 0000000000000000000000000000000000000000..24854a2153df9af78feb8352ca119e831a9ac9eb --- /dev/null +++ b/tests/brevitas/test_brevitas_avg_pool_export.py @@ -0,0 +1,103 @@ +import os + +import onnx # noqa +import torch +import numpy as np +import brevitas.onnx as bo +from brevitas.nn import QuantAvgPool2d +from brevitas.quant_tensor import pack_quant_tensor +from brevitas.core.quant import QuantType +from finn.core.modelwrapper import ModelWrapper +from finn.core.datatype import DataType +from finn.transformation.infer_shapes import InferShapes +from finn.transformation.infer_datatypes import InferDataTypes +from finn.util.basic import gen_finn_dt_tensor +import finn.core.onnx_exec as oxe + +import pytest + +export_onnx_path = "test_avg_pool.onnx" + + +@pytest.mark.parametrize("kernel_size", [2, 3]) +@pytest.mark.parametrize("stride", [1, 2]) +@pytest.mark.parametrize("signed", [False, True]) +@pytest.mark.parametrize("bit_width", [2, 4]) +@pytest.mark.parametrize("input_bit_width", [4, 8, 32]) +@pytest.mark.parametrize("channels", [2, 4]) +@pytest.mark.parametrize("idim", [7, 8]) +def test_brevitas_avg_pool_export( + kernel_size, stride, signed, bit_width, input_bit_width, channels, idim +): + ishape = (1, channels, idim, idim) + ibw_tensor = torch.Tensor([input_bit_width]) + + b_avgpool = QuantAvgPool2d( + kernel_size=kernel_size, + stride=stride, + signed=signed, + min_overall_bit_width=bit_width, + max_overall_bit_width=bit_width, + quant_type=QuantType.INT, + ) + # call forward pass manually once to cache scale factor and bitwidth + input_tensor = torch.from_numpy(np.zeros(ishape)).float() + scale = np.ones((1, channels, 1, 1)) + output_scale = torch.from_numpy(scale).float() + input_quant_tensor = pack_quant_tensor( + tensor=input_tensor, scale=output_scale, bit_width=ibw_tensor + ) + bo.export_finn_onnx(b_avgpool, ishape, export_onnx_path, input_t=input_quant_tensor) + model = ModelWrapper(export_onnx_path) + + # determine input FINN datatype + if signed is True: + prefix = "INT" + else: + prefix = "UINT" + dt_name = prefix + str(input_bit_width // 2) + dtype = DataType[dt_name] + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + + # execution with input tensor using integers and scale = 1 + # calculate golden output + inp = gen_finn_dt_tensor(dtype, ishape) + input_tensor = torch.from_numpy(inp).float() + input_quant_tensor = pack_quant_tensor( + tensor=input_tensor, scale=output_scale, bit_width=ibw_tensor + ) + b_avgpool.eval() + expected = b_avgpool.forward(input_quant_tensor).tensor.detach().numpy() + + # finn execution + idict = {model.graph.input[0].name: inp} + odict = oxe.execute_onnx(model, idict, True) + produced = odict[model.graph.output[0].name] + assert (expected == produced).all() + + # execution with input tensor using float and scale != 1 + scale = np.random.uniform(low=0, high=1, size=(1, channels, 1, 1)).astype( + np.float32 + ) + inp_tensor = inp * scale + input_tensor = torch.from_numpy(inp_tensor).float() + input_scale = torch.from_numpy(scale).float() + input_quant_tensor = pack_quant_tensor( + tensor=input_tensor, scale=input_scale, bit_width=ibw_tensor + ) + # export again to set the scale values correctly + bo.export_finn_onnx(b_avgpool, ishape, export_onnx_path, input_t=input_quant_tensor) + model = ModelWrapper(export_onnx_path) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + b_avgpool.eval() + expected = b_avgpool.forward(input_quant_tensor).tensor.detach().numpy() + # finn execution + idict = {model.graph.input[0].name: inp_tensor} + odict = oxe.execute_onnx(model, idict, True) + produced = odict[model.graph.output[0].name] + + assert np.isclose(expected, produced).all() + + os.remove(export_onnx_path) diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py index 9d6390b2673e5d2c0e72748183ac04ed222d078e..5ff3da87228a2a32a41226bb46e0b16b1a44df50 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py +++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py @@ -23,7 +23,7 @@ test_fpga_part = pynq_part_map[test_pynq_board] target_clk_ns = 10 -def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style): +def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_style): assert pad_style == 2, "only pad_style == 2 supported in hlslib" assert padding > 0, "Output dim should be greater than input dim" odim = idim + padding @@ -47,6 +47,7 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style): inputDataType=str(idt.name), PaddingStyle=pad_style, numInputVectors=1, + SIMD=simd, ) graph = helper.make_graph( @@ -63,11 +64,13 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style): # input image dimension -@pytest.mark.parametrize("idim", [8, 16]) +@pytest.mark.parametrize("idim", [8]) # number of rows and number of cols to add @pytest.mark.parametrize("pad", [2, 3]) # number of channels -@pytest.mark.parametrize("num_ch", [1, 2]) +@pytest.mark.parametrize("num_ch", [2, 4]) +# Input parallelism +@pytest.mark.parametrize("simd", [1, 2]) # PaddingStyle: selects behavior when (odim-idim)%2 != 0 @pytest.mark.parametrize("pad_style", [2]) # FINN input datatype @@ -76,14 +79,15 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style): @pytest.mark.parametrize("mode", ["cppsim", "rtlsim"]) @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_fmpadding(idim, pad, num_ch, pad_style, idt, mode): - +def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, pad_style, idt, mode): + if num_ch % simd != 0: + pytest.skip(" num_ch % simd != 0, skipping") # generate input data x = gen_finn_dt_tensor(idt, [1, idim, idim, num_ch]) input_dict = {"inp": x} odim = idim + pad - model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, idt, pad_style) + model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, simd, idt, pad_style) model = model.transform(InferShapes()) model = model.transform(SetExecMode(mode)) model = model.transform(GiveUniqueNodeNames()) diff --git a/tests/transformation/test_remove_identity_ops.py b/tests/transformation/test_remove_identity_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..536c1ab0b48fa44388da23f45b528da3c5f3b2f2 --- /dev/null +++ b/tests/transformation/test_remove_identity_ops.py @@ -0,0 +1,81 @@ +import pytest + +import numpy as np +from onnx import helper, TensorProto +import finn.core.onnx_exec as oxe +from finn.core.datatype import DataType +from finn.core.modelwrapper import ModelWrapper +from finn.transformation.infer_datatypes import InferDataTypes +from finn.transformation.infer_shapes import InferShapes +from finn.transformation.streamline.remove import RemoveIdentityOps +from finn.util.basic import gen_finn_dt_tensor + + +def insert_identity_op(model, op): + if op in ["Add", "Sub"]: + val = np.asarray([0.0], dtype=np.float32) + elif op in ["Mul", "Div"]: + val = np.asarray([1.0], dtype=np.float32) + else: + return + + identity_node = helper.make_node(op, ["div_out", "value"], ["ident_out"]) + graph = model.graph + graph.node.insert(3, identity_node) + graph.node[-1].input[0] = "ident_out" + model.set_initializer("value", val) + + return model + + +# identity operations to be inserted +@pytest.mark.parametrize("op", ["Add", "Sub", "Mul", "Div"]) +def test_remove_identity_ops(op): + + # set up onnx model + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, 4, 1, 1]) + mul = helper.make_tensor_value_info("mul", TensorProto.FLOAT, []) + shape = helper.make_tensor_value_info("shape", TensorProto.FLOAT, [2]) + div = helper.make_tensor_value_info("div", TensorProto.FLOAT, []) + matmul = helper.make_tensor_value_info("matmul", TensorProto.FLOAT, [4, 2]) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, 2]) + + mul_node = helper.make_node("Mul", ["inp", "mul"], ["mul_out"]) + reshape_node = helper.make_node("Reshape", ["mul_out", "shape"], ["reshape_out"]) + div_node = helper.make_node("Div", ["reshape_out", "div"], ["div_out"]) + matmul_node = helper.make_node("MatMul", ["div_out", "matmul"], ["outp"]) + + graph = helper.make_graph( + nodes=[mul_node, reshape_node, div_node, matmul_node], + name="identity-graph", + inputs=[inp], + outputs=[outp], + value_info=[mul, shape, div, matmul], + ) + + model = helper.make_model(graph, producer_name="mulpastconv-model") + model = ModelWrapper(model) + inp_values = gen_finn_dt_tensor(DataType.INT2, [1, 4, 1, 1]) + mul_values = np.random.uniform(low=0.1, high=0.99, size=(1)).astype(np.float32) + shape_values = np.asarray([1, -1], dtype=np.int64) + div_values = np.random.uniform(low=0.1, high=0.99, size=(1)).astype(np.float32) + matmul_values = gen_finn_dt_tensor(DataType.INT2, [4, 2]) + model.set_initializer("mul", mul_values) + model.set_initializer("shape", shape_values) + model.set_initializer("div", div_values) + model.set_initializer("matmul", matmul_values) + insert_identity_op(model, op) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + idict = {"inp": inp_values} + odict = oxe.execute_onnx(model, idict) + out_before = odict["outp"] + num_of_nodes_before = len(model.graph.node) + + model = model.transform(RemoveIdentityOps()) + num_of_nodes_after = len(model.graph.node) + assert num_of_nodes_before - 1 == num_of_nodes_after + + odict = oxe.execute_onnx(model, idict) + out_after = odict["outp"] + assert (out_before == out_after).all()