diff --git a/.gitignore b/.gitignore index f838c1695130d232ac6a2b888aed0cea31aafaa7..8b3166a44070a4575aac86c445c4504b594cda08 100644 --- a/.gitignore +++ b/.gitignore @@ -78,3 +78,6 @@ MANIFEST # Jenkins cfg dir /docker/jenkins_home + +# SSH key dir mounted into Docker +/ssh_keys/ diff --git a/docker/Dockerfile.finn_dev b/docker/Dockerfile.finn_dev index 22e3eb623c7a5da19a5e3ae2284557577898ad23..0e12b504a26ccdb8fd78e162f04cfdeab5a186f1 100644 --- a/docker/Dockerfile.finn_dev +++ b/docker/Dockerfile.finn_dev @@ -42,7 +42,7 @@ WORKDIR /workspace RUN apt-get update RUN apt-get -y upgrade RUN apt-get install -y build-essential libglib2.0-0 libsm6 libxext6 libxrender-dev -RUN apt-get install -y verilator nano zsh +RUN apt-get install -y verilator nano zsh rsync RUN apt-get -y install sshpass RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh index e34b6ce9cc4488c806da8bcf3cc5cc8e500ae806..132d5bdaa286ba3e50bbd06971e9139f5859ef11 100644 --- a/docker/finn_entrypoint.sh +++ b/docker/finn_entrypoint.sh @@ -13,9 +13,9 @@ gecho () { # checkout the correct dependency repo commits # the repos themselves are cloned in the Dockerfile -BREVITAS_COMMIT=989cdfdba4700fdd900ba0b25a820591d561c21a +BREVITAS_COMMIT=f9a27226d4acf1661dd38bc449f71f89e0983cce CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4 -HLSLIB_COMMIT=13e9b0772a27a3a1efc40c878d8e78ed09efb716 +HLSLIB_COMMIT=8aed899c278c36c977a249558d71795086cf852c PYVERILATOR_COMMIT=c97a5ba41bbc7c419d6f25c74cdf3bdc3393174f PYNQSHELL_COMMIT=0c82a61b0ec1a07fa275a14146233824ded7a13d OMX_COMMIT=1bae737669901e762f581af73348332b5c4b2ada diff --git a/docs/finn/example_networks.rst b/docs/finn/example_networks.rst index 9f221871f09bf655db9d81988d6fa83e53473634..86bb2bd11fd805a23a3bdf6da8a8ed686259ecc1 100644 --- a/docs/finn/example_networks.rst +++ b/docs/finn/example_networks.rst @@ -20,17 +20,17 @@ version, this is indicated by an x mark in the table. 
+-----------------------+------------+----------+----------+----------+----------+----------+----------+ | Export/Import | x | x | x | x | x | x | x | +-----------------------+------------+----------+----------+----------+----------+----------+----------+ -| Streamlining | x | x | x | x | x | | | +| Streamlining | x | x | x | x | x | | x | +-----------------------+------------+----------+----------+----------+----------+----------+----------+ -| Convert to HLS layers | x | x | x | x | x | | | +| Convert to HLS layers | x | x | x | x | x | | x | +-----------------------+------------+----------+----------+----------+----------+----------+----------+ -| Stitched IP | x | x | x | x | x | | | +| Stitched IP | x | x | x | x | x | | x | +-----------------------+------------+----------+----------+----------+----------+----------+----------+ -| Hardware test | x | x | x | | x | | | +| Hardware test | x | x | x | | x | | x | +-----------------------+------------+----------+----------+----------+----------+----------+----------+ -| cppsim | x | x | x | x | x | | | +| cppsim | x | x | x | x | x | | x | +-----------------------+------------+----------+----------+----------+----------+----------+----------+ -| rtlsim node-by-node | x | x | x | x | x | | | +| rtlsim node-by-node | x | x | x | x | x | | x | +-----------------------+------------+----------+----------+----------+----------+----------+----------+ -| rtlsim stitched IP | x | x | x | x | x | | | +| rtlsim stitched IP | x | x | x | x | x | | x | +-----------------------+------------+----------+----------+----------+----------+----------+----------+ diff --git a/docs/finn/getting_started.rst b/docs/finn/getting_started.rst index 95594bb67a2be3a4c3fbba488c75a704f623c136..f4fa7a13dcbe4fe8ab9667a111df00c605747710 100644 --- a/docs/finn/getting_started.rst +++ b/docs/finn/getting_started.rst @@ -18,6 +18,7 @@ Requirements * A working Vivado 2019.1 installation * A `VIVADO_PATH` environment variable pointing to the Vivado 
installation directory (e.g. the directory where settings64.sh is located) * (optional) A PYNQ board with a network connection + * the ``bitstring`` package must be installed on the PYNQ: ``sudo pip3 install bitstring`` Running FINN in Docker ====================== @@ -30,6 +31,7 @@ Getting an interactive shell for development or experimentation sh run_docker.sh Simply running sh run-docker.sh without any additional arguments will clone the dependency repos, create a Docker container and give you a terminal with you can use for development for experimentation. +If you want a new terminal on an already-running container, you can do this with `docker exec -it finn_dev_<username> bash`. .. warning:: The Docker container is spawned with the `--rm` option, so make sure that any important files you created inside the container are either in the /workspace/finn folder (which is mounted from the host computer) or otherwise backed up. diff --git a/docs/finn/internals.rst b/docs/finn/internals.rst index 7a4bc687eeb827320991f7d3f1ef8cc35e97f3da..dee62f09a9253380e05300dac8fa34915c20dab5 100644 --- a/docs/finn/internals.rst +++ b/docs/finn/internals.rst @@ -16,6 +16,10 @@ Custom Quantization Annotations ONNX does not support datatypes smaller than 8-bit integers, whereas in FINN we are interested in smaller integers down to ternary and bipolar. To make this work, FINN uses the quantization_annotation field in ONNX to annotate tensors with their FINN DataType (:py:mod:`finn.core.datatype.DataType`) information. However, all tensors are expected to use single-precision floating point (float32) storage in FINN. This means we store even a 1-bit value as floating point for the purposes of representation. The FINN compiler flow is responsible for eventually producing a packed representation for the target hardware, where the 1-bit is actually stored as 1-bit. +Note that FINN uses floating point tensors as a carrier data type to represent integers. 
Floating point arithmetic can introduce rounding errors, e.g. (int_num * float_scale) / float_scale is not always equal to int_num. +When using the custom ONNX execution flow, FINN will attempt to sanitize any rounding errors for integer tensors. See (:py:mod:`finn.util.basic.sanitize_quant_values`) for more information. +This behavior can be disabled (not recommended!) by setting the environment variable SANITIZE_QUANT_TENSORS=0. + Custom Operations/Nodes ======================= diff --git a/docs/finn/verification.rst b/docs/finn/verification.rst index 391c6f999312839daca0d4161336c7c0ae822f89..c52c0840aa40566d930164490b1fd249d7c07757 100644 --- a/docs/finn/verification.rst +++ b/docs/finn/verification.rst @@ -28,4 +28,15 @@ This simulation can be used for a model containing several HLS custom operations Emulation using PyVerilator =========================== -The emulation using PyVerilator can be used when IP blocks were generated, either node by node or of a whole design. For that purpose PyVerilator gets the generated verilog files. +The emulation using PyVerilator can be used when IP blocks were generated, either node by node or of a whole (IP-stitched) design. For that purpose PyVerilator gets the generated verilog files. + +For debugging purposes, it's possible to generate .vcd trace files that show the value of external & internal signals as the emulation is running. To enable this: + - for node-by-node rtlsim, set the `rtlsim_trace` attribute of each node of interest to either a file name for the vcd or `default` to use the node name as the filename. + - for IP-stitched rtlsim, set the `rtlsim_trace` metadata_prop for the graph as per above. 
+ +To control the tracing depth in the module hierarchy, use the `RTLSIM_TRACE_DEPTH` environment variable (default is 1): + - level 1 shows top-level input/output streams + - level 2 shows per-layer input/output streams + - level 3 shows per full-layer I/O including FIFO count signals + +Note that deeper tracing will take longer to execute and may produce very large .vcd files. diff --git a/run-docker.sh b/run-docker.sh index e07556716db335421f57a390f1e6a17168ac058b..00ca8f86985a78d8f2af099c51dcd4b80cd2e974 100755 --- a/run-docker.sh +++ b/run-docker.sh @@ -65,6 +65,11 @@ DOCKER_INST_NAME="finn_dev_${DOCKER_UNAME}" # ensure Docker tag and inst. name are all lowercase DOCKER_TAG=$(echo "$DOCKER_TAG" | tr '[:upper:]' '[:lower:]') DOCKER_INST_NAME=$(echo "$DOCKER_INST_NAME" | tr '[:upper:]' '[:lower:]') +# Absolute path to this script, e.g. /home/user/bin/foo.sh +SCRIPT=$(readlink -f "$0") +# Absolute path this script is in, thus /home/user/bin +SCRIPTPATH=$(dirname "$SCRIPT") + # the settings below will be taken from environment variables if available, # otherwise the defaults below will be used : ${JUPYTER_PORT=8888} @@ -74,11 +79,7 @@ DOCKER_INST_NAME=$(echo "$DOCKER_INST_NAME" | tr '[:upper:]' '[:lower:]') : ${PYNQ_BOARD="Pynq-Z1"} : ${PYNQ_TARGET_DIR="/home/xilinx/$DOCKER_INST_NAME"} : ${NUM_DEFAULT_WORKERS=1} - -# Absolute path to this script, e.g. 
/home/user/bin/foo.sh -SCRIPT=$(readlink -f "$0") -# Absolute path this script is in, thus /home/user/bin -SCRIPTPATH=$(dirname "$SCRIPT") +: ${FINN_SSH_KEY_DIR="$SCRIPTPATH/ssh_keys"} BUILD_LOCAL=/tmp/$DOCKER_INST_NAME VIVADO_HLS_LOCAL=$VIVADO_PATH @@ -87,6 +88,7 @@ VIVADO_IP_CACHE=$BUILD_LOCAL/vivado_ip_cache # ensure build dir exists locally mkdir -p $BUILD_LOCAL mkdir -p $VIVADO_IP_CACHE +mkdir -p $FINN_SSH_KEY_DIR gecho "Instance is named as $DOCKER_INST_NAME" gecho "Mounting $BUILD_LOCAL into $BUILD_LOCAL" @@ -133,6 +135,7 @@ docker run -t --rm --name $DOCKER_INST_NAME $DOCKER_INTERACTIVE --init \ -v $SCRIPTPATH:/workspace/finn \ -v $BUILD_LOCAL:$BUILD_LOCAL \ -v $VIVADO_PATH:$VIVADO_PATH \ +-v $FINN_SSH_KEY_DIR:/home/$DOCKER_UNAME/.ssh \ -e VIVADO_PATH=$VIVADO_PATH \ -e FINN_INST_NAME=$DOCKER_INST_NAME \ -e FINN_ROOT="/workspace/finn" \ diff --git a/src/finn/core/onnx_exec.py b/src/finn/core/onnx_exec.py index c2f68a35076418e0cf2edb578bdb8d548772fc78..7c3123cd5eb29a54dc5cbfb912225ad3fdb0f219 100644 --- a/src/finn/core/onnx_exec.py +++ b/src/finn/core/onnx_exec.py @@ -39,6 +39,7 @@ from finn.core.remote_exec import remote_exec from finn.core.rtlsim_exec import rtlsim_exec from finn.custom_op.registry import getCustomOp import finn.analysis.topology as ta +from finn.util.basic import sanitize_quant_values, get_sanitize_quant_tensors def execute_node(node, context, graph): @@ -102,15 +103,14 @@ def execute_node(node, context, graph): raise Exception( """Output shapes disagree after node execution: found %s vs expected %s""" - % ( - str(output_list[list_ind].shape), - str(context[outp].shape), - ) + % (str(output_list[list_ind].shape), str(context[outp].shape)) ) context[outp] = output_list[list_ind] -def execute_onnx(model, input_dict, return_full_exec_context=False): +def execute_onnx( + model, input_dict, return_full_exec_context=False, start_node=None, end_node=None +): """Executes given ONNX ModelWrapper with given named inputs. 
If return_full_exec_context is False, a dict of named outputs is returned @@ -118,7 +118,12 @@ def execute_onnx(model, input_dict, return_full_exec_context=False): If return return_full_exec_context is True, the full set of tensors used by the execution (including inputs, weights, activations and final outputs) - will be returned as a dict.""" + will be returned as a dict. + + When start_node and end_node are set to None, the whole graph is executed. + If they are set to particular ONNX nodes, only the subgraph between (and + including) those nodes is executed. + """ if not model.check_all_tensor_shapes_specified(): raise Exception("Found unspecified tensor shapes, try infer_shapes") @@ -161,8 +166,28 @@ def execute_onnx(model, input_dict, return_full_exec_context=False): # execute the model node by node # we can simply walk down the list since the ONNX spec guarantees that it is # topologically sorted - for node in graph.node: + subgraph = [] + if start_node is None: + start_node = model.graph.node[0] + if end_node is None: + end_node = model.graph.node[-1] + # select the nodes between specified start/end nodes + start_ind = model.get_node_index(start_node) + end_ind = model.get_node_index(end_node) + 1 + assert end_ind >= start_ind, "Start/end nodes must define valid subgraph" + subgraph = graph.node[start_ind:end_ind] + for node in subgraph: + if get_sanitize_quant_tensors() != 0: + # round input values to match quantization annotation + execution_context = sanitize_quant_values( + model, node.input, execution_context + ) execute_node(node, execution_context, graph) + if get_sanitize_quant_tensors() != 0: + # round output values to quantization annotation + execution_context = sanitize_quant_values( + model, node.output, execution_context + ) elif model_exec_mode == "remote_pynq": # use remote exec metadata built into model to execute on a remote PYNQ remote_exec(model, execution_context) diff --git a/src/finn/custom_op/fpgadataflow/__init__.py 
b/src/finn/custom_op/fpgadataflow/__init__.py index c4800011fe2c944fa877b12d0247795beda4a5e6..71c731f96ca45519c443a5f932ead050770e17de 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -173,9 +173,15 @@ class HLSCustomOp(CustomOp): of the node as a dictionary.""" ret = dict() ret["BRAM_18K"] = self.bram_estimation() + ret["BRAM_efficiency"] = self.bram_efficiency_estimation() ret["LUT"] = self.lut_estimation() return ret + def bram_efficiency_estimation(self): + """Function for BRAM efficiency estimation: actual parameter storage + needed divided by the allocated BRAM storage (from estimation)""" + return 1 + def bram_estimation(self): """Function for BRAM resource estimation, is member function of HLSCustomOp class but has to be filled by every node""" diff --git a/src/finn/custom_op/fpgadataflow/fmpadding.py b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py similarity index 88% rename from src/finn/custom_op/fpgadataflow/fmpadding.py rename to src/finn/custom_op/fpgadataflow/fmpadding_batch.py index fa321dfa65d14b67fa218fb6a49f602ddab8d57e..d326ae7dfc7830a0081c3b13233d67ef08b12eff 100644 --- a/src/finn/custom_op/fpgadataflow/fmpadding.py +++ b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py @@ -21,6 +21,8 @@ class FMPadding_Batch(HLSCustomOp): "Padding": ("i", True, 2), # number of channels in input image "NumChannels": ("i", True, 0), + # SIMD Input parallelism + "SIMD": ("i", False, 1), # FINN input datatype "inputDataType": ("s", True, ""), # controls distribution of padded pixels @@ -55,20 +57,22 @@ class FMPadding_Batch(HLSCustomOp): return oshape def get_folded_input_shape(self): - # even though there is no folding in the current hlslib op, - # insert a time multiplexing axis to remain compatible with the - # shapes produced by the rest of the dataflow pipeline - ret = list(self.get_normal_input_shape()) - ret.insert(-1, 1) - return tuple(ret) + normal_ishape = 
list(self.get_normal_input_shape()) + ifm_ch = self.get_nodeattr("NumChannels") + simd = self.get_nodeattr("SIMD") + assert ifm_ch % simd == 0, "SIMD must divide input channels" + fold = int(normal_ishape[-1] / simd) + folded_ishape = normal_ishape[:-1] + [fold, simd] + return tuple(folded_ishape) def get_folded_output_shape(self): - # even though there is no folding in the current hlslib op, - # insert a time multiplexing axis to remain compatible with the - # shapes produced by the rest of the dataflow pipeline - ret = list(self.get_normal_output_shape()) - ret.insert(-1, 1) - return tuple(ret) + normal_oshape = list(self.get_normal_output_shape()) + ifm_ch = self.get_nodeattr("NumChannels") + simd = self.get_nodeattr("SIMD") + assert ifm_ch % simd == 0, "SIMD must divide input channels" + fold = int(normal_oshape[-1] / simd) + folded_oshape = normal_oshape[:-1] + [fold, simd] + return tuple(folded_oshape) def make_shape_compatible_op(self, model): exp_ishape = self.get_normal_input_shape() @@ -114,15 +118,13 @@ class FMPadding_Batch(HLSCustomOp): def get_instream_width(self): ibits = self.get_input_datatype().bitwidth() - num_ch = self.get_nodeattr("NumChannels") - - return ibits * num_ch + simd = self.get_nodeattr("SIMD") + return ibits * simd def get_outstream_width(self): obits = self.get_output_datatype().bitwidth() - num_ch = self.get_nodeattr("NumChannels") - - return obits * num_ch + simd = self.get_nodeattr("SIMD") + return obits * simd def get_number_output_values(self): folded_oshape = self.get_folded_output_shape() @@ -135,13 +137,15 @@ class FMPadding_Batch(HLSCustomOp): self.code_gen_dict["$DEFINES$"] = [ """#define ImgDim1 {}\n#define OutputDim1 {}\n #define Padding1 {}\n#define NumChannels1 {}\n - #define PaddingStyle1 {}\n#define numReps {}\n""".format( + #define PaddingStyle1 {}\n#define numReps {} + #define SIMD1 {}\n""".format( self.get_nodeattr("ImgDim"), self.get_padded_odim(), self.get_nodeattr("Padding"), self.get_nodeattr("NumChannels"), 
self.get_nodeattr("PaddingStyle"), self.get_nodeattr("numInputVectors"), + self.get_nodeattr("SIMD"), ) ] @@ -176,7 +180,7 @@ class FMPadding_Batch(HLSCustomOp): in_t = self.get_input_datatype().get_hls_datatype_str() node = self.onnx_node self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}<ImgDim1, OutputDim1, Padding1, NumChannels1, + """{}<ImgDim1, OutputDim1, Padding1, NumChannels1,SIMD1, {}, PaddingStyle1> (in0, out, numReps);""".format( node.op_type, in_t ) @@ -232,6 +236,7 @@ class FMPadding_Batch(HLSCustomOp): node = self.onnx_node exp_ishape = self.get_normal_input_shape() exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() folded_oshape = self.get_folded_output_shape() if mode == "cppsim": @@ -254,10 +259,8 @@ class FMPadding_Batch(HLSCustomOp): match expected shape (1, ImgDim, ImgDim, NumChannels).""" export_idt = self.get_input_datatype() - # no reshaping for input since assuming no folding on input - # make copy before saving array - inp = inp.copy() - np.save(os.path.join(code_gen_dir, "input_0.npy"), inp) + reshaped_input = inp.reshape(folded_ishape) + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) if mode == "cppsim": # execute the precompiled model diff --git a/src/finn/custom_op/fpgadataflow/pool_batch.py b/src/finn/custom_op/fpgadataflow/pool_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..c7edc24d0e24eef1154293caca2519ab3aa68358 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/pool_batch.py @@ -0,0 +1,395 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. 
+# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import numpy as np + +from finn.custom_op.fpgadataflow import HLSCustomOp +from finn.core.datatype import DataType +from onnx import TensorProto, helper +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class Pool_Batch(HLSCustomOp): + """Class that corresponds to finn-hlslib Pool_batch function. + Requires ConvolutionInputGenerator(depthwise == 1) to format its input + + TODO: explain input shape (to reuse im2col code) + Input shape (BatchSize,OutImgDim,OutImgDim,KernelSize^2*Channels) + Output shape (BatchSize,OutImgDim,OutImgDim,Channels) + + # note: the actual data layout produced by the hlslib kernels is different + # for depthwise ops. 
+ # * depthwise SWG: (1, OFMDim, OFMDim, IFMChannels/PE, K, K, PE) + + Channels can be folded using PE (SIMD from the input perspective) + TODO: doc + """ + + def get_nodeattr_types(self): + my_attrs = { + "Channels": ("i", True, 0), + "PE": ("i", True, 1), + "KernelSize": ("i", True, 0), + # Function: + # - MaxPool + # - AvgPool (not yet supported, but HLSLIB does) + # - AccPool (not yet supported, but HLSLIB does) + "Function": ("s", True, ""), + "OutImgDim": ("i", True, 0), + # FINN DataTypes for inputs/outputs + "dataType": ("s", True, ""), + "BatchSize": ("i", False, 1), + } + + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_input_datatype(self): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("dataType")] + + def get_output_datatype(self): + """Returns FINN DataType of output.""" + fxn = self.get_nodeattr("Function") + if fxn == "MaxPool": + # Same as input + return DataType[self.get_nodeattr("dataType")] + else: + raise Exception("Pool_Batch doesn't currently support " + fxn) + + def get_normal_input_shape(self): + ifm_ch = self.get_nodeattr("Channels") + odim = self.get_nodeattr("OutImgDim") + batch_size = self.get_nodeattr("BatchSize") + k = self.get_nodeattr("KernelSize") + ishape = (batch_size, odim, odim, k * k * ifm_ch) + return ishape + + def get_folded_input_shape(self): + normal_ishape = list(self.get_normal_input_shape()) + ifm_ch = self.get_nodeattr("Channels") + pe = self.get_nodeattr("PE") + assert ifm_ch % pe == 0, "PE must divide input channels" + fold = int(normal_ishape[-1] / pe) + folded_ishape = normal_ishape[:-1] + [fold, pe] + return tuple(folded_ishape) + + def get_normal_output_shape(self): + ofm_ch = self.get_nodeattr("Channels") + odim = self.get_nodeattr("OutImgDim") + batch_size = self.get_nodeattr("BatchSize") + oshape = (batch_size, odim, odim, ofm_ch) + return oshape + + def get_folded_output_shape(self): + normal_oshape = list(self.get_normal_output_shape()) + ifm_ch = 
self.get_nodeattr("Channels") + pe = self.get_nodeattr("PE") + assert ifm_ch % pe == 0, "PE must divide input channels" + fold = int(ifm_ch / pe) + folded_oshape = normal_oshape[:-1] + [fold, pe] + return tuple(folded_oshape) + + def get_number_output_values(self): + folded_oshape = self.get_folded_output_shape() + return np.prod(folded_oshape[1:-1]) + + def get_instream_width(self): + dt_bits = self.get_input_datatype().bitwidth() + pe = self.get_nodeattr("PE") + # ofm_ch = self.get_nodeattr("Channels") + # k = self.get_nodeattr("KernelSize") + # assert ifm_ch % pe == 0, "PE must divide input channels" + # simd = int(ifm_ch/pe) + in_width = int(dt_bits * pe) + return in_width + + def get_outstream_width(self): + fxn = self.get_nodeattr("Function") + if fxn == "MaxPool": + return self.get_instream_width() + else: + raise Exception("Pool_Batch doesn't currently support " + fxn) + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input shape for Pool_Batch." 
+ # implement tensor with correct shape + values = np.random.randn(*oshape).astype(np.float32) + return helper.make_node( + "Constant", + inputs=[], + outputs=[self.onnx_node.output[0]], + value=helper.make_tensor( + name="const_tensor", + data_type=TensorProto.FLOAT, + dims=values.shape, + vals=values.flatten().astype(float), + ), + ) + + def infer_node_datatype(self, model): + node = self.onnx_node + # data type stays the same + dtype = self.get_output_datatype() + model.set_tensor_datatype(node.output[0], dtype) + + def verify_node(self): + info_messages = [] + + # verify that "domain" is set to "finn" + domain_value = self.onnx_node.domain + if domain_value == "finn": + info_messages.append("Attribute domain is set correctly") + else: + info_messages.append('Attribute domain should be set to "finn"') + + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify the number of inputs + if len(self.onnx_node.input) == 1: + info_messages.append("The number of inputs is correct") + else: + info_messages.append("""Pool_Batch needs 1 data input""") + + # check supported function + fnx = self.get_nodeattr("Function") + if fnx == "MaxPool": + info_messages.append( + "Attribute Function contains a supported pool function" + ) + else: + info_messages.append( + "Attribute Function contains an unsupported pool function" + ) + return info_messages + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "maxpool.h"'] + self.code_gen_dict["$GLOBALS$"] += ['#include "pool.hpp"'] + + def defines(self, var): + self.code_gen_dict["$DEFINES$"] = [] + + ifm_ch = self.get_nodeattr("Channels") + self.code_gen_dict["$DEFINES$"] += ["#define Channels {}".format(ifm_ch)] + + pe = self.get_nodeattr("PE") + self.code_gen_dict["$DEFINES$"] 
+= ["#define PE {}".format(pe)] + + k = self.get_nodeattr("KernelSize") + self.code_gen_dict["$DEFINES$"] += ["#define KernelSize {}".format(k)] + + odim = self.get_nodeattr("OutImgDim") + self.code_gen_dict["$DEFINES$"] += ["#define OFMDim {}".format(odim)] + + numReps = self.get_nodeattr("BatchSize") + self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(numReps)] + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + if dtype == DataType.BIPOLAR: + # use binary for bipolar storage + dtype = DataType.BINARY + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0,false);' + % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + ) + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width()) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width()) + ) + + def docompute(self): + idt = self.get_input_datatype() + i_hls_dt = idt.get_hls_datatype_str() + odt = self.get_output_datatype() + o_hls_dt = odt.get_hls_datatype_str() + + self.code_gen_dict["$DOCOMPUTE$"] = [] + + fxn = self.get_nodeattr("Function") + if fxn == "MaxPool": + self.code_gen_dict["$DOCOMPUTE$"] += [ + "MaxPoolFunction<{},KernelSize> pool_fxn;".format(i_hls_dt) + ] + else: + raise Exception("Pool_Batch doesn't currently support " + fxn) + + self.code_gen_dict["$DOCOMPUTE$"] += [ + """Pool_batch<Channels, PE, KernelSize,Slice<{} >, Slice< {} > > + (in0,out, pool_fxn, 
OFMDim*OFMDim*numReps);""".format( + i_hls_dt, o_hls_dt + ) + ] + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + if dtype == DataType.BIPOLAR: + # use binary for bipolar storage + dtype = DataType.BINARY + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + oshape = self.get_folded_output_shape() + oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") + + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s",false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + oshape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + packed_ibits = self.get_instream_width() + packed_in_hls_type = "ap_uint<%d>" % packed_ibits + + packed_obits = self.get_outstream_width() + packed_out_hls_type = "ap_uint<%d>" % packed_obits + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)" + % (self.onnx_node.name, packed_in_hls_type, packed_out_hls_type) + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out") + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE ap_ctrl_none port=return" + ) + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + folded_ishape = self.get_folded_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_oshape = self.get_folded_output_shape() + + # TODO ensure codegen dir exists + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif 
mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input shape doesn't + match expected shape (batch_size,odim,odim,k*k*ifm_ch).""" + + export_idt = self.get_input_datatype() + reshaped_input = inp.reshape(folded_ishape) + + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == folded_oshape + ), "cppsim did not produce expected folded output shape" + context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output + shape doesn't match expected shape (1, ofm_dim, ofm_dim, k*k*ifm_ch).""" diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py index 9b73ba1e100aa83fd19aa8799195c99891fca3fd..a7ebff68749120868cae9ce5ac18d2856fe2cb8a 100644 --- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py +++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py @@ -240,11 +240,21 @@ class StreamingFCLayer_Batch(HLSCustomOp): Q = self.get_nodeattr("SIMD") wdt = self.get_weight_datatype() W = wdt.bitwidth() - D_in = self.get_instream_width() - D_out = self.get_outstream_width() + D_in = self.get_nodeattr("MW") + D_out = self.get_nodeattr("MH") omega = (D_in * D_out) / (Q * P) return P * (math.ceil(omega / 512)) * (math.ceil((Q * W) / 36)) + def bram_efficiency_estimation(self): + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + D_in = self.get_nodeattr("MW") + D_out = self.get_nodeattr("MH") + bram16_est = self.bram_estimation() + wbits = W * D_in * D_out + bram16_est_capacity = bram16_est * 36 * 512 + return wbits / bram16_est_capacity + def lut_estimation(self): """Calculates resource estimations for LUTs based on: - FINN-R: An End-to-End Deep-Learning Framework for Fast @@ -290,12 +300,15 @@ class StreamingFCLayer_Batch(HLSCustomOp): return out_width def get_weightstream_width(self): - """Returns weight stream width. Used in decoupled mode.""" - pe = self.get_nodeattr("PE") - simd = self.get_nodeattr("SIMD") - wp = self.get_weight_datatype().bitwidth() - w_width = pe * simd * wp - return w_width + """Returns weight stream width. 
Used only in decoupled mode.""" + if self.get_nodeattr("mem_mode") == "decoupled": + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + wp = self.get_weight_datatype().bitwidth() + w_width = pe * simd * wp + return w_width + else: + return 0 def get_weightstream_width_padded(self): """Returns weight stream width padded to a multiple of 8. This is required diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py index 1a8216f64bf71b7fb9f1f8becf4732970b5bf451..1da60a5124fa86b4336bae8fd1a587672f2f2e6f 100644 --- a/src/finn/custom_op/fpgadataflow/templates.py +++ b/src/finn/custom_op/fpgadataflow/templates.py @@ -99,6 +99,7 @@ set_top $config_toplevelfxn open_solution sol1 set_part $config_proj_part +config_compile -ignore_long_run_time -disable_unroll_code_size_check config_interface -m_axi_addr64 config_rtl -auto_prefix $EXTRA_DIRECTIVES$ diff --git a/src/finn/custom_op/fpgadataflow/tlastmarker.py b/src/finn/custom_op/fpgadataflow/tlastmarker.py index 25ea05e3607a52731ae1b64de421837bf137ee2b..17ba44b959577faf573d77ae222f7b2a3be6669d 100644 --- a/src/finn/custom_op/fpgadataflow/tlastmarker.py +++ b/src/finn/custom_op/fpgadataflow/tlastmarker.py @@ -30,20 +30,30 @@ from finn.custom_op.fpgadataflow import HLSCustomOp class TLastMarker(HLSCustomOp): - """Class that corresponds to the TLastMarker node that needs to be - inserted at the end of the model for rtlsim with stitched IP. - It marks the end of the current image/input sample.""" + """Node that adds/removes AXI stream TLAST signals where needed. Its behavior + is transparent in node-by-node execution, only visible in IP-stitched rtlsim or + actual hardware. 
+ This node may be needed at the end of the network to signal a DMA write (needed by the + FINN PYNQ shell) or at the beginning to remove the end-of-burst from DMA read.""" def __init__(self, onnx_node): super().__init__(onnx_node) def get_nodeattr_types(self): my_attrs = { + # number of (static) iterations until TLAST=1 is generated for Direction=out "NumIters": ("i", True, 0), + # whether static or dynamic (from AXI lite) number of iterations are used + "DynIters": ("i", False, 1), + # direction: whether to insert or remove TLAST + "Direction": ("s", False, "out"), # width of input-output data streams, in bits "StreamWidth": ("i", True, 0), # width of individual element in stream, in bits "ElemWidth": ("i", True, 0), + # Protocol: external or internal + # Vitis docs recommend using qdma_axis for external, ap_axiu for internal + "Protocol": ("s", False, "external"), } my_attrs.update(super().get_nodeattr_types()) return my_attrs @@ -76,12 +86,33 @@ class TLastMarker(HLSCustomOp): def defines(self, var): stream_width = self.get_nodeattr("StreamWidth") + direction = self.get_nodeattr("Direction") + protocol = self.get_nodeattr("Protocol") # output stream must have TLAST, so we use this stream data type: # qdma_axis<stream_data_width,0,0,0 > - out_stream_dtype = "qdma_axis<%d,0,0,0>" % stream_width + if direction == "out": + if protocol == "external": + out_stream_dtype = "qdma_axis<%d,0,0,0>" % stream_width + elif protocol == "internal": + out_stream_dtype = "ap_axiu<%d,0,0,0>" % stream_width + else: + raise Exception("Unrecognized Protocol in TLastMarker") + in_stream_dtype = "ap_uint<%d>" % stream_width + elif direction == "in": + out_stream_dtype = "ap_uint<%d>" % stream_width + if protocol == "external": + in_stream_dtype = "qdma_axis<%d,0,0,0>" % stream_width + elif protocol == "internal": + in_stream_dtype = "ap_axiu<%d,0,0,0>" % stream_width + else: + raise Exception("Unrecognized Protocol in TLastMarker") + else: + raise Exception("Unrecognized Direction in 
TLastMarker") + self.code_gen_dict["$DEFINES$"] = [ "#define StreamWidth %d" % stream_width, "#define OutDType %s" % out_stream_dtype, + "#define InDType %s" % in_stream_dtype, "#define NumItersPerImg %d" % self.get_nodeattr("NumIters"), ] @@ -89,27 +120,60 @@ class TLastMarker(HLSCustomOp): self.code_gen_dict["$READNPYDATA$"] = [] def docompute(self): - self.code_gen_dict["$DOCOMPUTE$"] = [ - "unsigned int n = 1;", - "OutDType t;", - "t.set_keep(-1);", - "io_section: { // start of cycle accurate region", - "#pragma HLS protocol fixed", - "// do a first read from stream before we decide on numIters", - "// giving software a chance to set up the numIters prior to startup", - "t.set_data(in0.read());", - "n = (numIters == 0 ? NumItersPerImg : numIters);", - "t.set_last(n==1);", - "out.write(t);", - "} // end of cycle accurate region", - "// do one less iteration than spec since we already did one", - "for(unsigned int i=1; i<n; i++) {", - "#pragma HLS PIPELINE II=1", - "t.set_data(in0.read());", - "t.set_last(i==(n-1));", - "out.write(t);", - "}", - ] + dyn_iters = self.get_nodeattr("DynIters") + direction = self.get_nodeattr("Direction") + use_qdma_axis = self.get_nodeattr("Protocol") == "external" + if direction == "in": + # read from input and just pass data along; ignore tlast + # no dyn iters on input, it doesnt make sense + self.code_gen_dict["$DOCOMPUTE$"] = [ + "for(unsigned int i=0; i<NumItersPerImg; i++) {", + "#pragma HLS PIPELINE II=1", + "out.write(in0.read().get_data());" + if use_qdma_axis + else "out.write(in0.read().data);", + "}", + ] + + elif dyn_iters == 1: + # output, with dynamic iteration counts + self.code_gen_dict["$DOCOMPUTE$"] = [ + "unsigned int n = 1;", + "OutDType t;", + "t.set_keep(-1);" if use_qdma_axis else "t.keep = -1;", + "io_section: { // start of cycle accurate region", + "#pragma HLS protocol fixed", + "// do a first read from stream before we decide on numIters", + "// giving software a chance to set up the numIters prior to 
startup", + "t.set_data(in0.read());" if use_qdma_axis else "t.data = in0.read();", + "n = (numIters == 0 ? NumItersPerImg : numIters);", + "t.set_last(n==1);" if use_qdma_axis else "t.last = (n==1);", + "out.write(t);", + "} // end of cycle accurate region", + "// do one less iteration than spec since we already did one", + "for(unsigned int i=1; i<n; i++) {", + "#pragma HLS PIPELINE II=1", + "t.set_data(in0.read());" if use_qdma_axis else "t.data = in0.read();", + "t.set_last(i==(n-1));" if use_qdma_axis else "t.last = (i==(n-1));", + "out.write(t);", + "}", + ] + + else: + # output, with static iteration counts + self.code_gen_dict["$DOCOMPUTE$"] = [ + "unsigned int n = 1;", + "OutDType t;", + "t.set_keep(-1);" if use_qdma_axis else "t.keep = -1;", + "for(unsigned int i=0; i<NumItersPerImg; i++) {", + "#pragma HLS PIPELINE II=1", + "t.set_data(in0.read());" if use_qdma_axis else "t.data = in0.read();", + "t.set_last(i==(NumItersPerImg-1));" + if use_qdma_axis + else "t.last = (i==(NumItersPerImg-1));", + "out.write(t);", + "}", + ] def dataoutstrm(self): self.code_gen_dict["$DATAOUTSTREAM$"] = [] @@ -118,18 +182,30 @@ class TLastMarker(HLSCustomOp): self.code_gen_dict["$SAVEASCNPY$"] = [] def blackboxfunction(self): - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void %s(hls::stream<ap_uint<StreamWidth> > &in0, - hls::stream<OutDType> &out, unsigned int numIters)""" - % self.onnx_node.name - ] + dyn_iters = self.get_nodeattr("DynIters") + + if dyn_iters == 1: + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void %s(hls::stream<InDType> &in0, + hls::stream<OutDType> &out, unsigned int numIters)""" + % self.onnx_node.name + ] + else: + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void %s(hls::stream<InDType> &in0, hls::stream<OutDType> &out)""" + % self.onnx_node.name + ] def pragmas(self): self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out") - 
self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE s_axilite port=numIters bundle=control" - ) + + dyn_iters = self.get_nodeattr("DynIters") + if dyn_iters == 1: + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE s_axilite port=numIters bundle=control" + ) + self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE ap_ctrl_none port=return" ) @@ -158,7 +234,7 @@ class TLastMarker(HLSCustomOp): def strm_decl(self): self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width()) + 'hls::stream<InDType> in0 ("in0");' ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( 'hls::stream<OutDType> out ("out");' diff --git a/src/finn/custom_op/im2col.py b/src/finn/custom_op/im2col.py index 82a6b140f7af1be4e5c0f429d077b99c7865383e..8ed0041704d421dab587f08bcbcd9e739e8434e9 100644 --- a/src/finn/custom_op/im2col.py +++ b/src/finn/custom_op/im2col.py @@ -80,6 +80,8 @@ class Im2Col(CustomOp): "input_shape": ("s", True, ""), "pad_amount": ("i", False, 0), "pad_value": ("i", False, 0), + # depthwise: if != 0, infer ConvolutionInputGenerator with depthwise == 1 + "depthwise": ("i", False, 0), } def make_shape_compatible_op(self, model): diff --git a/src/finn/custom_op/quantavgpool2d.py b/src/finn/custom_op/quantavgpool2d.py new file mode 100644 index 0000000000000000000000000000000000000000..fb5c78bc0c8419ba519c5c3113d9b0c7ae2dd3b7 --- /dev/null +++ b/src/finn/custom_op/quantavgpool2d.py @@ -0,0 +1,128 @@ +import numpy as np +from onnx import TensorProto, helper +import onnxruntime as rt + +from finn.custom_op import CustomOp +from finn.core.datatype import DataType +from finn.custom_op.maxpoolnhwc import compute_pool_output_dim + + +class QuantAvgPool2d(CustomOp): + """Class that corresponds to the quantized average pooling + layer from brevitas""" + + def get_nodeattr_types(self): + return { + "stride": ("i", True, 1), + "kernel": ("i", 
True, 1), + "ibits": ("i", True, 1), + "obits": ("i", True, 1), + # determines if values are signed (set to "1") or unsigned ("0") + "signed": ("i", True, 0), + # data layout attribute can be set to "NCHW" or "NHWC" + "data_layout": ("s", False, "NCHW"), + } + + def make_shape_compatible_op(self, model): + node = self.onnx_node + k = self.get_nodeattr("kernel") + s = self.get_nodeattr("stride") + data_layout = self.get_nodeattr("data_layout") + if data_layout == "NCHW": + return helper.make_node( + "AveragePool", + inputs=[node.input[0]], + outputs=[node.output[0]], + kernel_shape=[k, k], + strides=[s, s], + ) + elif data_layout == "NHWC": + iname = node.input[0] + ishape = model.get_tensor_shape(iname) + (n, hi, wi, c) = ishape + ho = compute_pool_output_dim(hi, k, s) + wo = compute_pool_output_dim(wi, k, s) + oshape = (n, ho, wo, c) + # implement tensor with correct shape + values = np.random.randn(*oshape).astype(np.float32) + return helper.make_node( + "Constant", + inputs=[], + outputs=[node.output[0]], + value=helper.make_tensor( + name="const_tensor", + data_type=TensorProto.FLOAT, + dims=values.shape, + vals=values.flatten().astype(float), + ), + ) + + else: + raise Exception( + """Datalayout for QuantAvgPool2d is set to an invalid value. 
+ Has to be set to "NCHW" or "NHWC".""" + ) + + def infer_node_datatype(self, model): + node = self.onnx_node + bw = self.get_nodeattr("obits") + if bw in [2, 4, 8, 16, 32]: + if self.get_nodeattr("signed") == 0: + dtype = DataType["UINT%d" % bw] + else: + dtype = DataType["INT%d" % bw] + else: + raise Exception("Unsupported output datatype for QuantAvgPool2d") + model.set_tensor_datatype(node.output[0], dtype) + + def execute_node(self, context, graph): + # create a standard average pooling node to help calculate the result + node = self.onnx_node + k = self.get_nodeattr("kernel") + s = self.get_nodeattr("stride") + inp_values = context[node.input[0]] + oshape = context[node.output[0]].shape + if self.get_nodeattr("data_layout") == "NHWC": + inp_values = inp_values.transpose(0, 3, 1, 2) + oshape = (context[node.output[0]]).transpose(0, 3, 1, 2).shape + ishape = inp_values.shape + inp = helper.make_tensor_value_info(node.input[0], TensorProto.FLOAT, ishape) + outp = helper.make_tensor_value_info(node.output[0], TensorProto.FLOAT, oshape) + node_avgpool = helper.make_node( + "AveragePool", + inputs=[node.input[0]], + outputs=[node.output[0]], + kernel_shape=[k, k], + strides=[s, s], + ) + graph_avgpool = helper.make_graph( + nodes=[node_avgpool], + name="single-avgpool-exec", + inputs=[inp], + outputs=[outp], + ) + model_avgpool = helper.make_model(graph_avgpool) + idict = {node.input[0]: inp_values} + sess = rt.InferenceSession(model_avgpool.SerializeToString()) + result_temp = sess.run(None, idict) + # remove scaling introduced by average + result_temp = result_temp[0] * (k * k) + ibits = self.get_nodeattr("ibits") + max_value = 2 ** ibits - 1 + max_value = max_value * k * k + max_bit_width = int(max_value).bit_length() + shift_bits = max_bit_width - self.get_nodeattr("obits") + result = np.right_shift(result_temp.astype(int), shift_bits) + if self.get_nodeattr("data_layout") == "NHWC": + result = result.transpose(0, 2, 3, 1) + context[node.output[0]] = 
result.astype(np.float32) + + def verify_node(self): + info_messages = [] + # verify that "domain" is set to "finn" + domain_value = self.onnx_node.domain + if domain_value == "finn": + info_messages.append("Attribute domain is set correctly") + else: + info_messages.append('Attribute domain should be set to "finn"') + return info_messages diff --git a/src/finn/custom_op/registry.py b/src/finn/custom_op/registry.py index 614a3d7ffd70d0b102bad2b76177a2d3b32765c7..0060e5d400f30055d532671c8cf1680f0668442a 100644 --- a/src/finn/custom_op/registry.py +++ b/src/finn/custom_op/registry.py @@ -44,10 +44,12 @@ from finn.custom_op.fpgadataflow.streamingdatawidthconverter_batch import ( StreamingDataWidthConverter_Batch, ) from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch -from finn.custom_op.fpgadataflow.fmpadding import FMPadding_Batch +from finn.custom_op.fpgadataflow.pool_batch import Pool_Batch +from finn.custom_op.fpgadataflow.fmpadding_batch import FMPadding_Batch from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch from finn.custom_op.fpgadataflow.addstreams_batch import AddStreams_Batch from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch +from finn.custom_op.quantavgpool2d import QuantAvgPool2d from finn.custom_op.fpgadataflow.duplicatestreams_batch import DuplicateStreams_Batch # create a mapping of all known CustomOp names and classes @@ -65,10 +67,12 @@ custom_op["MaxPoolNHWC"] = MaxPoolNHWC custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch custom_op["StreamingFIFO"] = StreamingFIFO custom_op["GlobalAccPool_Batch"] = GlobalAccPool_Batch +custom_op["Pool_Batch"] = Pool_Batch custom_op["FMPadding_Batch"] = FMPadding_Batch custom_op["Thresholding_Batch"] = Thresholding_Batch custom_op["AddStreams_Batch"] = AddStreams_Batch custom_op["LabelSelect_Batch"] = LabelSelect_Batch +custom_op["QuantAvgPool2d"] = QuantAvgPool2d custom_op["DuplicateStreams_Batch"] = 
DuplicateStreams_Batch diff --git a/src/finn/transformation/change_datalayout.py b/src/finn/transformation/change_datalayout.py new file mode 100644 index 0000000000000000000000000000000000000000..d5b393a25e57122b059a44f70904a6dbe5bbaa3f --- /dev/null +++ b/src/finn/transformation/change_datalayout.py @@ -0,0 +1,110 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from onnx import helper, TensorProto + +from finn.transformation import Transformation +from finn.transformation.infer_shapes import InferShapes +from finn.util.basic import get_by_name + + +class ChangeDataLayoutQuantAvgPool2d(Transformation): + """Replace QuantAvgPool2d with datalayout (N,C,H,W) with Transpose nodes + and QuantAvgPool2dNHWC with datalayout (N,H,W,C)""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if n.op_type == "QuantAvgPool2d" and ( + get_by_name(n.attribute, "data_layout") is None + or get_by_name(n.attribute, "data_layout").s.decode("UTF-8") == "NCHW" + ): + graph_modified = True + node_input = n.input[0] + node_output = n.output[0] + s = get_by_name(n.attribute, "stride").i + k = get_by_name(n.attribute, "kernel").i + ibits = get_by_name(n.attribute, "ibits").i + obits = get_by_name(n.attribute, "obits").i + signed = get_by_name(n.attribute, "signed").i + batchsize = model.get_tensor_shape(n.input[0])[0] # assume NCHW + channels = model.get_tensor_shape(n.input[0])[1] # assume NCHW + idim = model.get_tensor_shape(n.input[0])[-1] # assume NCHW + odim = model.get_tensor_shape(n.output[0])[-1] # assume NCHW + + # create new nodes + # NCHW -> NHWC + # create new intermediate values + inp_trans_out = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + (batchsize, idim, idim, channels), # NHWC + ) + graph.value_info.append(inp_trans_out) + inp_trans_out = inp_trans_out.name + quantavg_out = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + (batchsize, odim, odim, channels), + ) + graph.value_info.append(quantavg_out) + quantavg_out = quantavg_out.name + inp_trans_node = helper.make_node( + "Transpose", [node_input], [inp_trans_out], perm=[0, 2, 3, 1] + ) + quantavg_node = helper.make_node( + "QuantAvgPool2d", + [inp_trans_out], + [quantavg_out], + domain="finn", + stride=s, + kernel=k, 
+ ibits=ibits, + obits=obits, + signed=signed, + data_layout="NHWC", + ) + # NHWC -> NCHW + out_trans_node = helper.make_node( + "Transpose", [quantavg_out], [node_output], perm=[0, 3, 1, 2] + ) + # insert nodes + graph.node.insert(node_ind, inp_trans_node) + graph.node.insert(node_ind + 1, quantavg_node) + graph.node.insert(node_ind + 2, out_trans_node) + # remove old nodes + graph.node.remove(n) + + # set shapes + model.set_tensor_shape(inp_trans_out, (batchsize, idim, idim, channels)) + model.set_tensor_shape(quantavg_out, (batchsize, odim, odim, channels)) + model = model.transform(InferShapes()) + return (model, graph_modified) diff --git a/src/finn/transformation/fpgadataflow/annotate_resources.py b/src/finn/transformation/fpgadataflow/annotate_resources.py index 207075b00de1871da19ea78472125d435449ed6e..62ee92df54eee2b63d84657515d7fbc3a8808b81 100644 --- a/src/finn/transformation/fpgadataflow/annotate_resources.py +++ b/src/finn/transformation/fpgadataflow/annotate_resources.py @@ -69,6 +69,9 @@ class AnnotateResources(Transformation): total_dict[r_type] += r_amount else: total_dict[r_type] = r_amount + for k in total_dict.keys(): + if "efficiency" in k: + total_dict[k] = total_dict[k] / len(graph.node) model.set_metadata_prop("res_total_" + self.mode, str(total_dict)) for node in graph.node: if _is_fpgadataflow_node(node) and node.name in res_dict.keys(): diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py index d421a5f3ef8ca980b399087de1482b2ae913da1b..b70b126680d650547cf376dd601c048c73a1cfd4 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py @@ -34,6 +34,7 @@ from finn.custom_op.registry import getCustomOp from finn.transformation.infer_shapes import InferShapes from finn.transformation.infer_datatypes import InferDataTypes import finn.core.data_layout as DataLayout +from 
finn.util.basic import get_by_name class InferConvInpGen(Transformation): @@ -56,6 +57,7 @@ class InferConvInpGen(Transformation): k = i2c_inst.get_nodeattr("kernel_size") pad = i2c_inst.get_nodeattr("pad_amount") pad_val = i2c_inst.get_nodeattr("pad_value") + depthwise = i2c_inst.get_nodeattr("depthwise") ifm_ch = i2c_in_shape[-1] ifm_dim = i2c_in_shape[1] ofm_dim = i2c_out_shape[1] @@ -67,7 +69,11 @@ class InferConvInpGen(Transformation): if pad > 0: # if padding enabled, ensure pad_val supported by DataType - assert dt.allowed(pad_val), "Im2Col DataType must support pad_val" + # assert dt.allowed(pad_val),"""FMPadding_Batch DataType + # must support pad_val""" + assert ( + pad_val == 0 + ), "FMPadding_Batch doesn't currently support pad_val!= 0" odim_padding = ifm_dim + 2 * pad @@ -112,6 +118,7 @@ class InferConvInpGen(Transformation): Stride=stride, inputDataType=dt.name, outputDataType=dt.name, + depthwise=depthwise, ) graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) # remove old nodes @@ -169,6 +176,137 @@ class InferStreamingMaxPool(Transformation): return (model, graph_modified) +class InferPool_Batch(Transformation): + """If kernel_shape > strides, replace Pool layer with with of Im2col + + pool(with kernel_shape == strides), plus Transpose layers to keep the original + data layout.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if n.op_type in ["MaxPool"]: + # extract pool parameters + k = get_by_name(n.attribute, "kernel_shape").ints[-1] + stride = get_by_name(n.attribute, "strides").ints[-1] + + if k <= stride: + continue + + try: + pad = get_by_name(n.attribute, "pads").ints[-1] + except AttributeError: + pad = 0 + + node_input = n.input[0] + node_output = n.output[0] + idt = model.get_tensor_datatype(node_input) + if not idt.is_integer(): + continue + + # odt = model.get_tensor_datatype(node_output) + + ifm_ch = model.get_tensor_shape(n.input[0])[1] # assume 
NCHW + ofm_ch = ifm_ch + ifm_dim = model.get_tensor_shape(n.input[0])[-1] # assume NCHW + ofm_dim = model.get_tensor_shape(n.output[0])[-1] # assume NCHW + # create new intermediate values + inp_trans_out = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + (1, ifm_dim, ifm_dim, ifm_ch), # NHWC + ) + graph.value_info.append(inp_trans_out) + inp_trans_out = inp_trans_out.name + model.set_tensor_datatype(inp_trans_out, idt) + + im2col_out = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + (1, ofm_dim, ofm_dim, ifm_ch * k * k), + ) + graph.value_info.append(im2col_out) + im2col_out = im2col_out.name + model.set_tensor_datatype(im2col_out, idt) + + pool_output = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + (1, ofm_dim, ofm_dim, ofm_ch), + ) + graph.value_info.append(pool_output) + pool_output = pool_output.name + # model.set_tensor_datatype(pool_output, odt) + + # create new nodes + # NCHW -> NHWC + inp_trans_node = helper.make_node( + "Transpose", [node_input], [inp_trans_out], perm=[0, 2, 3, 1] + ) + + if n.op_type == "MaxPool": + pool_fxn = "MaxPool" + pad_value = idt.min() + else: + raise Exception( + "pad_value and pool_fxn not configured for {}".format(n.op_type) + ) + + # format input tensor + im2col_node = helper.make_node( + "Im2Col", + [inp_trans_out], + [im2col_out], + domain="finn", + stride=stride, + kernel_size=k, + pad_amount=pad, + pad_value=pad_value, + depthwise=1, + input_shape="(1,{},{},{})".format(ifm_dim, ifm_dim, ifm_ch), + ) + + # Warning PE has to be equal to ifm_ch until Im2Col is replaced by + # ConvolutionInputGenerator with depthwise=1. 
+ # For other settings the output will be incorrect due to incorrect input + # data layout + pool_node = helper.make_node( + "Pool_Batch", + [im2col_out], + [pool_output], + domain="finn", + backend="fpgadataflow", + dataType=idt.name, + Channels=ifm_ch, + PE=ifm_ch, + KernelSize=k, + Function=pool_fxn, + OutImgDim=ofm_dim, + BatchSize=1, + ) + + # NHWC -> NCHW + out_trans_node = helper.make_node( + "Transpose", [pool_output], [node_output], perm=[0, 3, 1, 2] + ) + + # insert nodes where the conv is to preserve topological ordering + graph.node.insert(node_ind, inp_trans_node) + graph.node.insert(node_ind + 1, im2col_node) + graph.node.insert(node_ind + 2, pool_node) + graph.node.insert(node_ind + 3, out_trans_node) + # remove old node + graph.node.remove(n) + graph_modified = True + + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) + + class InferBinaryStreamingFCLayer(Transformation): """Convert XnorPopcountMatMul layers to StreamingFCLayer_Batch layers. Any immediately following MultiThreshold diff --git a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py index 32f32ece585a93465ba32fede45d5eb606a2b0a3..04dd437af27b9fbe18b2255c20a8e4acda03b3d0 100644 --- a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py +++ b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py @@ -31,23 +31,34 @@ from onnx import helper as oh from finn.custom_op.registry import getCustomOp from finn.transformation import Transformation +from finn.util.basic import get_by_name + +import numpy as np class InsertTLastMarker(Transformation): - """Ensure that the graph is terminated with a TLastMarker node, inserting - one if necessary.""" + """Ensure that the graph is started/terminated with a TLastMarker node, inserting + one if necessary. Use constructor args to determine type of TLastMarker to be inserted. 
+ More information available on the TLastMarker documentation. + """ - def __init__(self): + def __init__(self, both=False, external=True, dynamic=True): super().__init__() + self.dyniters = dynamic + self.external = external + self.both = both def apply(self, model): # TODO only makes sense for a pure fpgadataflow graph -- check! graph_out_name = model.graph.output[0].name final_node = model.find_producer(graph_out_name) - if final_node.op_type == "TLastMarker": - # TODO maybe check the correctness of properties - return (model, False) - else: + graph_modified = False + if final_node.op_type != "TLastMarker" and not ( + final_node.op_type == "IODMA" + and get_by_name(final_node.attribute, "direction").s.decode("UTF-8") + == "out" + ): + custom_op = getCustomOp(final_node) num_iters = int(custom_op.get_number_output_values()) stream_width = int(custom_op.get_outstream_width()) @@ -69,8 +80,51 @@ class InsertTLastMarker(Transformation): NumIters=num_iters, StreamWidth=stream_width, ElemWidth=elem_width, + DynIters=(1 if self.dyniters else 0), + Direction="out", + Protocol=("external" if self.external else "internal"), domain="finn", backend="fpgadataflow", ) model.graph.node.append(tlast_node) - return (model, True) + graph_modified = True + # if both is True, also insert marker on input + if self.both: + graph_in_name = model.graph.input[0].name + first_node = model.find_consumer(graph_in_name) + if first_node.op_type != "TLastMarker" and not ( + first_node.op_type == "IODMA" + and get_by_name(first_node.attribute, "direction").s.decode("UTF-8") + == "in" + ): + + custom_op = getCustomOp(first_node) + num_iters = np.prod(custom_op.get_folded_input_shape()[1:-1]) + stream_width = int(custom_op.get_instream_width()) + in_shape = model.get_tensor_shape(graph_in_name) + in_dtype = model.get_tensor_datatype(graph_in_name) + elem_width = in_dtype.bitwidth() + # make new buffer + first_node_in = oh.make_tensor_value_info( + model.make_new_valueinfo_name(), 
TensorProto.FLOAT, in_shape + ) + model.graph.value_info.append(first_node_in) + model.set_tensor_datatype(first_node_in.name, in_dtype) + # reroute final node output to first_node_in_name + first_node.input[0] = first_node_in.name + tlast_node = oh.make_node( + "TLastMarker", + [graph_in_name], + [first_node_in.name], + NumIters=num_iters, + StreamWidth=stream_width, + ElemWidth=elem_width, + DynIters=(1 if self.dyniters else 0), + Direction="in", + Protocol=("external" if self.external else "internal"), + domain="finn", + backend="fpgadataflow", + ) + model.graph.node.insert(0, tlast_node) + graph_modified = True + return (model, graph_modified) diff --git a/src/finn/transformation/fpgadataflow/prepare_cppsim.py b/src/finn/transformation/fpgadataflow/prepare_cppsim.py index a1524322ec03a4e96ef41f999144e3eed349c5af..6eae560e1191642cfaf85d92c6d0fcf644630973 100644 --- a/src/finn/transformation/fpgadataflow/prepare_cppsim.py +++ b/src/finn/transformation/fpgadataflow/prepare_cppsim.py @@ -29,9 +29,12 @@ import os import finn.custom_op.registry as registry -from finn.transformation import Transformation from finn.util.basic import make_build_dir from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.transformation import Transformation +from finn.util.basic import get_num_default_workers +import multiprocessing as mp +import copy def _codegen_single_node(node, model): @@ -66,8 +69,39 @@ class PrepareCppSim(Transformation): that contains generated C++ code that can be used to simulate node using cppsim. The subsequent transformation is CompileCppSim""" + def __init__(self, num_workers=None): + super().__init__() + if num_workers is None: + self._num_workers = get_num_default_workers() + else: + self._num_workers = num_workers + assert self._num_workers >= 0, "Number of workers must be nonnegative." 
+ if self._num_workers == 0: + self._num_workers = mp.cpu_count() + + def prepareCppSim_node(self, node): + if is_fpgadataflow_node(node) is True: + _codegen_single_node(node, self.model) + return (node, False) + def apply(self, model): - for node in model.graph.node: - if is_fpgadataflow_node(node) is True: - _codegen_single_node(node, model) - return (model, False) + # Remove old nodes from the current model + self.model = copy.deepcopy(model) + old_nodes = [] + for i in range(len(model.graph.node)): + old_nodes.append(model.graph.node.pop()) + + # Execute transformation in parallel + with mp.Pool(self._num_workers) as p: + new_nodes_and_bool = p.map(self.prepareCppSim_node, old_nodes, chunksize=1) + + # extract nodes and check if the transformation needs to run again + # Note: .pop() had initially reversed the node order + run_again = False + for node, run in reversed(new_nodes_and_bool): + # Reattach new nodes to old model + model.graph.node.append(node) + if run is True: + run_again = True + + return (model, run_again) diff --git a/src/finn/transformation/infer_data_layouts.py b/src/finn/transformation/infer_data_layouts.py index 9ac75578ffb911cc44cfddc2b2119b55e6abf2dd..e7a6b88239a1735d5379e165333f8356ae6f88a1 100644 --- a/src/finn/transformation/infer_data_layouts.py +++ b/src/finn/transformation/infer_data_layouts.py @@ -38,7 +38,7 @@ def _dims_to_layout(model, node, ndims): return DataLayout.NC else: if node.domain == "finn": - if node.op_type == "MultiThreshold": + if node.op_type == "MultiThreshold" or node.op_type == "QuantAvgPool2d": mt_inst = registry.getCustomOp(node) layout = mt_inst.get_nodeattr("data_layout") if layout == "NHWC" and ndims == 4: diff --git a/src/finn/transformation/infer_datatypes.py b/src/finn/transformation/infer_datatypes.py index 1acd4e3abe2d77248810cf15c15475e806a3bd32..39b7a787be8c725e7b6d474757dd96fc4848dfe0 100644 --- a/src/finn/transformation/infer_datatypes.py +++ b/src/finn/transformation/infer_datatypes.py @@ -71,7 
+71,13 @@ def _infer_node_datatype(model, node): else: # unknown, assume node produces float32 outputs for o in node.output: - model.set_tensor_datatype(o, DataType.FLOAT32) + # check if output datatype is already set to a value != FLOAT32 + odtype = model.get_tensor_datatype(o) + if odtype is not None and odtype != DataType.FLOAT32: + # don't change data type + model.set_tensor_datatype(o, odtype) + else: + model.set_tensor_datatype(o, DataType.FLOAT32) # compare old and new output dtypes to see if anything changed new_odtypes = list(map(lambda x: model.get_tensor_datatype(x), node.output)) graph_modified = new_odtypes != odtypes diff --git a/src/finn/transformation/lower_convs_to_matmul.py b/src/finn/transformation/lower_convs_to_matmul.py index 3da785d8dd21b2c6701bffc8ce3869fb14b237a9..aa231a43a3865a161a501b4997ff2f538800554f 100644 --- a/src/finn/transformation/lower_convs_to_matmul.py +++ b/src/finn/transformation/lower_convs_to_matmul.py @@ -80,14 +80,19 @@ class LowerConvsToMatMul(Transformation): inp_trans_out = inp_trans_out.name model.set_tensor_datatype(inp_trans_out, idt) - im2col_out = helper.make_tensor_value_info( - model.make_new_valueinfo_name(), - TensorProto.FLOAT, - (1, ofm_dim, ofm_dim, ifm_ch * k * k), - ) - graph.value_info.append(im2col_out) - im2col_out = im2col_out.name - model.set_tensor_datatype(im2col_out, idt) + need_im2col = True + if k == 1 and pad == 0 and stride == 1: + need_im2col = False + + if need_im2col: + im2col_out = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + (1, ofm_dim, ofm_dim, ifm_ch * k * k), + ) + graph.value_info.append(im2col_out) + im2col_out = im2col_out.name + model.set_tensor_datatype(im2col_out, idt) matmul_out = helper.make_tensor_value_info( model.make_new_valueinfo_name(), @@ -104,19 +109,23 @@ class LowerConvsToMatMul(Transformation): "Transpose", [cnv_input], [inp_trans_out], perm=[0, 2, 3, 1] ) # lower input tensor - im2col_node = helper.make_node( - "Im2Col", 
- [inp_trans_out], - [im2col_out], - domain="finn", - stride=stride, - kernel_size=k, - pad_amount=pad, - input_shape="(1,{},{},{})".format(ifm_dim, ifm_dim, ifm_ch), - ) + matmul_input = inp_trans_out + if need_im2col: + matmul_input = im2col_out + im2col_node = helper.make_node( + "Im2Col", + [inp_trans_out], + [im2col_out], + domain="finn", + stride=stride, + kernel_size=k, + pad_amount=pad, + input_shape="(1,{},{},{})".format(ifm_dim, ifm_dim, ifm_ch), + ) + # do matmul matmul_node = helper.make_node( - "MatMul", [im2col_out, weight_name], [matmul_out] + "MatMul", [matmul_input, weight_name], [matmul_out] ) # NHWC -> NCHW out_trans_node = helper.make_node( @@ -124,9 +133,13 @@ class LowerConvsToMatMul(Transformation): ) # insert nodes where the conv is to preserve topological ordering graph.node.insert(node_ind, inp_trans_node) - graph.node.insert(node_ind + 1, im2col_node) - graph.node.insert(node_ind + 2, matmul_node) - graph.node.insert(node_ind + 3, out_trans_node) + if need_im2col: + graph.node.insert(node_ind + 1, im2col_node) + graph.node.insert(node_ind + 2, matmul_node) + graph.node.insert(node_ind + 3, out_trans_node) + else: + graph.node.insert(node_ind + 1, matmul_node) + graph.node.insert(node_ind + 2, out_trans_node) # remove old nodes graph.node.remove(n) model = model.transform(InferShapes()) diff --git a/src/finn/transformation/remove_identity.py b/src/finn/transformation/remove_identity.py new file mode 100644 index 0000000000000000000000000000000000000000..d7a58d59c1bb8ff643e691442e7eda3c0516aa5c --- /dev/null +++ b/src/finn/transformation/remove_identity.py @@ -0,0 +1,62 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. 
+# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from finn.transformation import Transformation + + +def _is_identity(node, model): + if node.op_type == "Mul": + scale = model.get_initializer(node.input[1]) + if scale is not None: + return (scale == 1).all() + elif node.op_type == "Add": + bias = model.get_initializer(node.input[1]) + if bias is not None: + return (bias == 0).all() + return False + + +class RemoveIdentity(Transformation): + """Remove nodes that apply identity ops from the graph, including: + * Multiply by 1 + * Add 0 + .""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + node_ind += 1 + if _is_identity(node, model): + node_src = node.input[0] + node_dst = node.output[0] + graph.node.remove(node) + model.rename_tensor(node_dst, node_src) + graph_modified = True + return (model, graph_modified) diff --git a/src/finn/transformation/streamline/__init__.py b/src/finn/transformation/streamline/__init__.py index c9c73fa4c8303ee28bc1cc6aee879d633740e01e..d9c12a20975084705b801c0ff027d4b99aff9490 100644 --- a/src/finn/transformation/streamline/__init__.py +++ b/src/finn/transformation/streamline/__init__.py @@ -41,6 +41,7 @@ from finn.transformation.streamline.absorb import ( FactorOutMulSignMagnitude, Absorb1BitMulIntoMatMul, Absorb1BitMulIntoConv, + AbsorbSignBiasIntoMultiThreshold, ) from finn.transformation.streamline.collapse_repeated import ( @@ -59,6 +60,7 @@ from finn.transformation.streamline.reorder import ( from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds from finn.transformation.streamline.sign_to_thres import ConvertSignToThres from finn.transformation.batchnorm_to_affine import BatchNormToAffine +from finn.transformation.streamline.remove import RemoveIdentityOps class Streamline(Transformation): @@ -70,6 +72,7 @@ class Streamline(Transformation): ConvertDivToMul(), BatchNormToAffine(), ConvertSignToThres(), + AbsorbSignBiasIntoMultiThreshold(), MoveAddPastMul(), 
MoveScalarAddPastMatMul(), MoveScalarAddPastConv(), @@ -87,6 +90,7 @@ class Streamline(Transformation): ] for trn in streamline_transformations: model = model.transform(trn) + model = model.transform(RemoveIdentityOps()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) model = model.transform(InferDataTypes()) diff --git a/src/finn/transformation/streamline/absorb.py b/src/finn/transformation/streamline/absorb.py index dbcf97361017144174f9fbfca35a84361b5abd26..dc01eea411fc1f640e481c9be02a92acdd59533f 100644 --- a/src/finn/transformation/streamline/absorb.py +++ b/src/finn/transformation/streamline/absorb.py @@ -28,14 +28,80 @@ import numpy as np from onnx import helper as oh +import warnings from finn.core.datatype import DataType from finn.transformation import Transformation from finn.util.basic import get_by_name from finn.custom_op.registry import getCustomOp +from finn.transformation.infer_shapes import InferShapes from finn.transformation.infer_datatypes import InferDataTypes +class AbsorbSignBiasIntoMultiThreshold(Transformation): + """Absorb scalar bias originating from signed int export back into + MultiThreshold and re-evaluate the output datatype.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + # search for (MultiThreshold, Add) pair + node_ind += 1 + if ( + n.op_type == "MultiThreshold" + and not model.is_fork_node(n) + and not model.is_join_node(n) + ): + consumer = model.find_consumer(n.output[0]) + if consumer is not None and consumer.op_type == "Add": + mt_node = n + add_node = consumer + threshold_name = mt_node.input[1] + add_weight_name = add_node.input[1] + T = model.get_initializer(threshold_name) + A = model.get_initializer(add_weight_name) + if (A is None) or (T is None): + warnings.warn("Threshold or add bias not constant, skipping") + continue + end_name = add_node.output[0] + # we can only absorb scalar adds + is_scalar = 
A.ndim == 0 or all(x == 1 for x in A.shape) + if not is_scalar: + continue + bias = A.flatten()[0] + # set MultiThreshold bias property + mt_inst = getCustomOp(mt_node) + bias += mt_inst.get_nodeattr("out_bias") + mt_inst.set_nodeattr("out_bias", bias) + graph_modified = True + # compute new DataType for MultiThreshold output + steps = T.shape[-1] + new_min = bias + new_max = steps + bias + odt = DataType.get_smallest_possible(steps).name.replace( + "UINT", "INT" + ) + odt = DataType[odt] + assert odt.allowed(new_max) and odt.allowed( + new_min + ), """Could + not compute new MultiThreshold DataType (min = %d max = %d)""" % ( + new_min, + new_max, + ) + mt_inst.set_nodeattr("out_dtype", odt.name) + # remove Add node, rewire MultiThreshold + graph.node.remove(add_node) + mt_node.output[0] = end_name + # set datatype + model.set_tensor_datatype(end_name, odt) + if graph_modified: + model = model.transform(InferDataTypes()) + return (model, graph_modified) + + class AbsorbAddIntoMultiThreshold(Transformation): """Absorb preceding Add ops into MultiThreshold by updating the threshold values. 
Only scalar/1D add vectors can be absorbed.""" @@ -290,3 +356,38 @@ class AbsorbTransposeIntoMultiThreshold(Transformation): if graph_modified: model = model.transform(InferDataTypes()) return (model, graph_modified) + + +class AbsorbScalarMulIntoTopK(Transformation): + """Absorb a mul node into a suceeding topk node if the mul is scalar.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if n.op_type == "TopK": + prod = model.find_producer(n.input[0]) + if prod is not None and prod.op_type == "Mul": + prod_input = prod.input[0] + param_name = prod.input[1] + A = model.get_initializer(param_name) + if A is None: + warnings.warn("Param is not constant, skipping") + continue + if all(x == 1 for x in A.shape) and A > 0: + # if the mul is scalar and positive, we can just delete the + # mul node and rewire the top k node. Because the top k node + # works with probabilities and their relation to each other + # the relation doesn't change if every value is multiplied + # with a scalar + graph.node.remove(prod) + n.input[0] = prod_input + # to avoid error the dataype is set to float32 + model.set_tensor_datatype(n.input[0], DataType.FLOAT32) + graph_modified = True + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) diff --git a/src/finn/transformation/streamline/collapse_repeated.py b/src/finn/transformation/streamline/collapse_repeated.py index 67824ad4f633983b93e3178d03118927a1ddd85b..769bed841ce07c1c9c62f762de4b2c0937a6d68f 100644 --- a/src/finn/transformation/streamline/collapse_repeated.py +++ b/src/finn/transformation/streamline/collapse_repeated.py @@ -30,6 +30,7 @@ from onnx import helper as oh from finn.transformation import Transformation from finn.transformation.infer_shapes import InferShapes +from finn.core.datatype import DataType class CollapseRepeatedOp(Transformation): @@ -83,6 +84,9 @@ 
class CollapseRepeatedOp(Transformation): graph.node.insert(node_ind, new_node) # replace parameter value model.set_initializer(new_node_param_name, new_param) + # be conservative with param/output DataTypes + model.set_tensor_datatype(new_node_param_name, DataType.FLOAT32) + model.set_tensor_datatype(end_name, DataType.FLOAT32) # remove old nodes graph.node.remove(n) graph.node.remove(consumer) diff --git a/src/finn/transformation/streamline/remove.py b/src/finn/transformation/streamline/remove.py new file mode 100644 index 0000000000000000000000000000000000000000..ddc4233ddafbc70c4d20d316ea72ea6bba1b82a8 --- /dev/null +++ b/src/finn/transformation/streamline/remove.py @@ -0,0 +1,69 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +from finn.transformation import Transformation +from finn.transformation.infer_shapes import InferShapes +import numpy as np + +class RemoveIdentityOps(Transformation): + """Remove identity ops like Add/Sub with zero or Mul/Div with one""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if ( + n.op_type in ["Add", "Sub"] + and not model.is_fork_node(n) + and not model.is_join_node(n) + ): + A = model.get_initializer(n.input[1]) + if A is not None and (A == np.zeros_like(A)).all(): + producer = model.find_producer(n.input[0]) + # remove node and wire output tensor to + # output of producer node + producer.output[0] = n.output[0] + graph.node.remove(n) + + elif ( + n.op_type in ["Mul", "Div"] + and not model.is_fork_node(n) + and not model.is_join_node(n) + ): + A = model.get_initializer(n.input[1]) + if A is not None and (A == np.ones_like(A)).all(): + producer = model.find_producer(n.input[0]) + # remove node and wire output tensor to + # output of producer node + producer.output[0] = n.output[0] + graph.node.remove(n) + model = model.transform(InferShapes()) + return (model, graph_modified) diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py index 0b6259a61d3eb67b7b38d4c6939019ce2893a875..a1bd16f6d0b70193122d5d067ccdee395260c7b1 100644 --- a/src/finn/transformation/streamline/reorder.py +++ 
b/src/finn/transformation/streamline/reorder.py @@ -27,12 +27,15 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import numpy as np +import warnings from onnx import helper as oh from finn.transformation import Transformation from finn.transformation.infer_shapes import InferShapes +from finn.core.datatype import DataType from finn.core.onnx_exec import execute_node from finn.util.basic import get_by_name +from finn.custom_op.registry import getCustomOp class MoveAddPastMul(Transformation): @@ -336,6 +339,71 @@ class MoveScalarMulPastConv(Transformation): return (model, graph_modified) +class MoveMulPastDWConv(Transformation): + """Move channelwise mul operations past depthwise conv operations. We want to have muls + next to each other such that they can be collapsed into a single mul.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if ( + n.op_type == "Mul" + and not model.is_fork_node(n) + and not model.is_join_node(n) + ): + consumer = model.find_consumer(n.output[0]) + if ( + consumer is not None + and consumer.op_type == "Conv" + and not model.is_join_node(consumer) + ): + mul_weight_name = n.input[1] + A = model.get_initializer(mul_weight_name) + if A is None: + warnings.warn( + """Mul weight tensor is not set. 
If it is a constant, + please use set_initializer to set the tensor.""" + ) + continue + conv_node = consumer + mul_node = n + start_name = mul_node.input[0] + conv_in_name = conv_node.input[0] + conv_in_shape = model.get_tensor_shape(conv_in_name) + ifm_ch = conv_in_shape[1] + group_attribute = get_by_name(consumer.attribute, "group") + if group_attribute is None: + continue + group_attribute = group_attribute.i + conv_out_name = conv_node.output[0] + conv_out_shape = model.get_tensor_shape(conv_out_name) + if A.shape == (1, ifm_ch, 1, 1) and ifm_ch == group_attribute: + # if the mul is channelwise and conv is depthwise, + # we can simply swap the order of ops + # rewire mul input to be conv input + conv_node.input[0] = start_name + model.set_tensor_shape(start_name, conv_in_shape) + model.set_tensor_datatype(start_name, DataType.FLOAT32) + # use old conv input tensor as conv output + conv_node.output[0] = conv_in_name + model.set_tensor_shape(conv_in_name, conv_out_shape) + model.set_tensor_datatype(conv_in_name, DataType.FLOAT32) + # use new conv output as new mul node input + mul_node.input[0] = conv_in_name + # use old conv output as new mul node output + mul_node.output[0] = conv_out_name + model.set_tensor_datatype(conv_out_name, DataType.FLOAT32) + # move mul node past conv node + graph.node.remove(mul_node) + graph.node.insert(node_ind, mul_node) + graph_modified = True + model = model.transform(InferShapes()) + return (model, graph_modified) + + class MoveLinearPastEltwiseAdd(Transformation): """Move linear operations (mul, add) past elementwise add operations where possible. 
Specifically,matches and transforms the following patterns: @@ -531,3 +599,67 @@ class MoveMulPastFork(MoveOpPastFork): class MoveLinearPastFork(MoveOpPastFork): def __init__(self): super().__init__(["Add", "Mul"]) + + +class MoveMaxPoolPastMultiThreshold(Transformation): + """Move MaxPool nodes past MultiThreshold nodes on linear segments of the graph.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + nodes = [n for n in graph.node] + for n in nodes: + node_ind += 1 + if n.op_type == "MaxPool" and not model.is_fork_node(n): + consumer = model.find_consumer(n.output[0]) + pads = get_by_name(n.attribute, "pads") + has_padding = False + if pads is not None: + pads = list(pads.ints) + has_padding = np.prod(pads) != 0 + if consumer is not None and consumer.op_type == "MultiThreshold": + mt_out = consumer.output[0] + mt_odt = model.get_tensor_datatype(mt_out) + if mt_odt.signed() and has_padding: + warnings.warn( + "Skipping padded MaxPool + signed-output MultiThreshold" + ) + continue + # check for non-decreasing thresholds and nonnegative + # scale factor in MultiThreshold + # otherwise we cannot do the reordering + T = model.get_initializer(consumer.input[1]) + T_sorted = np.sort(T, axis=1) + assert ( + T == T_sorted + ).all(), "MultiThreshold must have non-decreasing thresholds" + mt_inst = getCustomOp(consumer) + if mt_inst.get_nodeattr("out_scale") < 0: + warnings.warn("Skipping MultiThreshold with negative out_scale") + continue + + # remove old nodes + graph.node.remove(n) + graph.node.remove(consumer) + + # swap conections + group_in = n.input[0] + # new tensor because dims change + group_middle = model.make_new_valueinfo_name() + group_out = consumer.output[0] + + consumer.input[0] = group_in + consumer.output[0] = group_middle + + n.input[0] = group_middle + n.output[0] = group_out + + # insert them back in + graph.node.insert(node_ind - 1, consumer) + graph.node.insert(node_ind, n) + + graph_modified = True + + 
model = model.transform(InferShapes()) + return (model, graph_modified) diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index eb3d46bcd66e3dc307a679e6b8dfbb9913398d36..4a8277e08d3fc21e0b20668edf2ecad947b36647 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -31,6 +31,7 @@ import random import string import subprocess import tempfile +import warnings import numpy as np @@ -105,6 +106,25 @@ def get_finn_root(): ) +def get_execution_error_thresh(): + "Return the max error that is allowed for rounding in FINN execution." + try: + return float(os.environ["ERROR_THRESH"]) + except KeyError: + return 1e-2 + + +def get_sanitize_quant_tensors(): + """Return whether tensors with quantization annotations should be sanitized. + Enabled by default, disabling will yield faster ONNX execution but may give + incorrect results. Use with caution.""" + try: + return int(os.environ["SANITIZE_QUANT_TENSORS"]) + except KeyError: + # enabled by default + return 1 + + def make_build_dir(prefix=""): """Creates a temporary folder with given prefix to be used as a build dir. Use this function instead of tempfile.mkdtemp to ensure any generated files @@ -264,6 +284,69 @@ def calculate_signed_dot_prod_range(dt_a, dt_b, len): return (min_prod, max_prod) +def sanitize_quant_values(model, node_tensors, execution_context, check_values=False): + """ Sanitize given list of tensors in execution_context by rounding values + that are supposed to be integers (as indicated by their quantization + annotation). Will raise an assertion if the amount of rounding is too large. + Returns the sanitized execution context. + + If check_values is specified, an extra DataType.allowed() check will be + performed on any rounded tensors. + + Background: + FINN uses floating point tensors as a carrier data type to represent + integers. Floating point arithmetic can introduce rounding errors, e.g. + (int_num * float_scale) / float_scale is not always equal to int_num. 
+ We use this function to ensure that the values that are supposed to be + integers are indeed integers. + """ + + for tensor in node_tensors: + dtype = model.get_tensor_datatype(tensor) + # floats don't need sanitization, skip to next + # introduces less quicker runtime + if dtype == DataType.FLOAT32: + continue + current_values = execution_context[tensor] + updated_values = current_values + has_to_be_rounded = False + # TODO: vectorize with numpy + for value in np.nditer(current_values): + if not dtype.allowed(value): + has_to_be_rounded = True + break + if has_to_be_rounded: + updated_values = np.round(current_values) + warnings.warn( + "The values of tensor {} can't be represented " + "with the set FINN datatype ({}), they will be rounded to match the " + "FINN datatype.".format(tensor, dtype) + ) + # check if rounded values are not too far from original values + max_error = max(np.abs(current_values - updated_values).flatten()) + if max_error <= get_execution_error_thresh(): + if check_values is True: + # check again if values can now be represented with set finn datatype + # TODO: vectorize with numpy + for value in np.nditer(updated_values): + if not dtype.allowed(value): + raise Exception( + """Values can't be represented with set + finn datatype ({}) for input {}""".format( + dtype, tensor + ) + ) + execution_context[tensor] = updated_values + else: + raise Exception( + """Rounding error is too high to match set FINN + datatype ({}) for input {}""".format( + dtype, tensor + ) + ) + return execution_context + + class CppBuilder: """Builds the g++ compiler command to produces the executable of the c++ code in code_gen_dir which is passed to the function build() of this class.""" diff --git a/src/finn/util/vivado.py b/src/finn/util/vivado.py index 0f82c52cb2c1fc5ee4ed5a1927f46e222e0ab9b5..6b6df3940cfeeed292345382471719c49f725de6 100644 --- a/src/finn/util/vivado.py +++ b/src/finn/util/vivado.py @@ -28,6 +28,7 @@ import os import subprocess +import stat from 
finn.util.basic import get_remote_vivado @@ -91,6 +92,7 @@ def out_of_context_synth( vivado_proj_folder = "%s/results_%s" % (verilog_dir, top_name) res_counts_path = vivado_proj_folder + "/res.txt" if remote_server is not None: + print("Using remote Vivado OOC synth, remote server %s" % remote_server) run_synth = """ #!/bin/bash which vivado; @@ -105,14 +107,17 @@ cat %s ) with open(vivado_proj_folder + "/run.sh", "w") as f: f.write(run_synth) + st = os.stat(vivado_proj_folder + "/run.sh") + os.chmod(vivado_proj_folder + "/run.sh", st.st_mode | stat.S_IEXEC) # note that this assumes the same temp folder can be created on the # remote server - remote_server_uri = remote_server + ":" + verilog_dir - copy_files = "rsync -avz %s %s" % (verilog_dir + "/", remote_server_uri + "/") + # note we set target path as / due to use of -R (relative) + remote_server_uri = remote_server + ":/" + copy_files = "rsync -avzR %s %s" % (verilog_dir + "/", remote_server_uri) copy_files = copy_files.split() proc = subprocess.Popen(copy_files, cwd=verilog_dir, env=os.environ) proc.communicate() - vivado_cmd = "bash %s/run.sh" % vivado_proj_folder + vivado_cmd = "bash -ic %s/run.sh" % vivado_proj_folder run_vivado = ["ssh", "-t", remote_server, vivado_cmd] proc = subprocess.Popen(run_vivado, cwd=verilog_dir, env=os.environ) proc.communicate() diff --git a/tests/brevitas/test_brevitas_avg_pool_export.py b/tests/brevitas/test_brevitas_avg_pool_export.py new file mode 100644 index 0000000000000000000000000000000000000000..24854a2153df9af78feb8352ca119e831a9ac9eb --- /dev/null +++ b/tests/brevitas/test_brevitas_avg_pool_export.py @@ -0,0 +1,103 @@ +import os + +import onnx # noqa +import torch +import numpy as np +import brevitas.onnx as bo +from brevitas.nn import QuantAvgPool2d +from brevitas.quant_tensor import pack_quant_tensor +from brevitas.core.quant import QuantType +from finn.core.modelwrapper import ModelWrapper +from finn.core.datatype import DataType +from 
finn.transformation.infer_shapes import InferShapes +from finn.transformation.infer_datatypes import InferDataTypes +from finn.util.basic import gen_finn_dt_tensor +import finn.core.onnx_exec as oxe + +import pytest + +export_onnx_path = "test_avg_pool.onnx" + + +@pytest.mark.parametrize("kernel_size", [2, 3]) +@pytest.mark.parametrize("stride", [1, 2]) +@pytest.mark.parametrize("signed", [False, True]) +@pytest.mark.parametrize("bit_width", [2, 4]) +@pytest.mark.parametrize("input_bit_width", [4, 8, 32]) +@pytest.mark.parametrize("channels", [2, 4]) +@pytest.mark.parametrize("idim", [7, 8]) +def test_brevitas_avg_pool_export( + kernel_size, stride, signed, bit_width, input_bit_width, channels, idim +): + ishape = (1, channels, idim, idim) + ibw_tensor = torch.Tensor([input_bit_width]) + + b_avgpool = QuantAvgPool2d( + kernel_size=kernel_size, + stride=stride, + signed=signed, + min_overall_bit_width=bit_width, + max_overall_bit_width=bit_width, + quant_type=QuantType.INT, + ) + # call forward pass manually once to cache scale factor and bitwidth + input_tensor = torch.from_numpy(np.zeros(ishape)).float() + scale = np.ones((1, channels, 1, 1)) + output_scale = torch.from_numpy(scale).float() + input_quant_tensor = pack_quant_tensor( + tensor=input_tensor, scale=output_scale, bit_width=ibw_tensor + ) + bo.export_finn_onnx(b_avgpool, ishape, export_onnx_path, input_t=input_quant_tensor) + model = ModelWrapper(export_onnx_path) + + # determine input FINN datatype + if signed is True: + prefix = "INT" + else: + prefix = "UINT" + dt_name = prefix + str(input_bit_width // 2) + dtype = DataType[dt_name] + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + + # execution with input tensor using integers and scale = 1 + # calculate golden output + inp = gen_finn_dt_tensor(dtype, ishape) + input_tensor = torch.from_numpy(inp).float() + input_quant_tensor = pack_quant_tensor( + tensor=input_tensor, scale=output_scale, bit_width=ibw_tensor + ) 
+ b_avgpool.eval() + expected = b_avgpool.forward(input_quant_tensor).tensor.detach().numpy() + + # finn execution + idict = {model.graph.input[0].name: inp} + odict = oxe.execute_onnx(model, idict, True) + produced = odict[model.graph.output[0].name] + assert (expected == produced).all() + + # execution with input tensor using float and scale != 1 + scale = np.random.uniform(low=0, high=1, size=(1, channels, 1, 1)).astype( + np.float32 + ) + inp_tensor = inp * scale + input_tensor = torch.from_numpy(inp_tensor).float() + input_scale = torch.from_numpy(scale).float() + input_quant_tensor = pack_quant_tensor( + tensor=input_tensor, scale=input_scale, bit_width=ibw_tensor + ) + # export again to set the scale values correctly + bo.export_finn_onnx(b_avgpool, ishape, export_onnx_path, input_t=input_quant_tensor) + model = ModelWrapper(export_onnx_path) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + b_avgpool.eval() + expected = b_avgpool.forward(input_quant_tensor).tensor.detach().numpy() + # finn execution + idict = {model.graph.input[0].name: inp_tensor} + odict = oxe.execute_onnx(model, idict, True) + produced = odict[model.graph.output[0].name] + + assert np.isclose(expected, produced).all() + + os.remove(export_onnx_path) diff --git a/tests/brevitas/test_brevitas_relu_act_export.py b/tests/brevitas/test_brevitas_relu_act_export.py index c9d8f2d812bc7bea1a2fd2598a7711099ad421e6..c5ddad12ca3e8d353682fbb20449d44358485f69 100644 --- a/tests/brevitas/test_brevitas_relu_act_export.py +++ b/tests/brevitas/test_brevitas_relu_act_export.py @@ -23,6 +23,7 @@ export_onnx_path = "test_act.onnx" def test_brevitas_act_export_relu(abits, max_val, scaling_impl_type): min_val = -1.0 ishape = (1, 15) + b_act = QuantReLU( bit_width=abits, max_val=max_val, @@ -67,3 +68,60 @@ scaling_impl.learned_value": torch.tensor( assert np.isclose(produced, expected, atol=1e-3).all() os.remove(export_onnx_path) + + +@pytest.mark.parametrize("abits", [1, 2, 
4, 8]) +@pytest.mark.parametrize("max_val", [1.0, 1.5, 1 - 2 ** (-7)]) +@pytest.mark.parametrize("scaling_per_channel", [True, False]) +def test_brevitas_act_export_relu_imagenet(abits, max_val, scaling_per_channel): + out_channels = 32 + ishape = (1, out_channels, 1, 1) + min_val = -1.0 + b_act = QuantReLU( + bit_width=abits, + quant_type=QuantType.INT, + scaling_impl_type=ScalingImplType.PARAMETER, + scaling_per_channel=scaling_per_channel, + restrict_scaling_type=RestrictValueType.LOG_FP, + scaling_min_val=2e-16, + max_val=6.0, + return_quant_tensor=True, + per_channel_broadcastable_shape=(1, out_channels, 1, 1), + ) + if scaling_per_channel is True: + rand_tensor = (2) * torch.rand((1, out_channels, 1, 1)) + else: + rand_tensor = torch.tensor(1.2398) + checkpoint = { + "act_quant_proxy.fused_activation_quant_proxy.tensor_quant.\ +scaling_impl.learned_value": rand_tensor.type( + torch.FloatTensor + ) + } + b_act.load_state_dict(checkpoint) + bo.export_finn_onnx(b_act, ishape, export_onnx_path) + model = ModelWrapper(export_onnx_path) + model = model.transform(InferShapes()) + inp_tensor = np.random.uniform(low=min_val, high=max_val, size=ishape).astype( + np.float32 + ) + idict = {model.graph.input[0].name: inp_tensor} + odict = oxe.execute_onnx(model, idict, True) + produced = odict[model.graph.output[0].name] + inp_tensor = torch.from_numpy(inp_tensor).float() + b_act.eval() + expected = b_act.forward(inp_tensor).tensor.detach().numpy() + if not np.isclose(produced, expected, atol=1e-3).all(): + print(abits, max_val) + print("scale: ", b_act.quant_act_scale().type(torch.FloatTensor).detach()) + if abits < 5: + print( + "thres:", + ", ".join(["{:8.4f}".format(x) for x in b_act.export_thres[0]]), + ) + print("input:", ", ".join(["{:8.4f}".format(x) for x in inp_tensor[0]])) + print("prod :", ", ".join(["{:8.4f}".format(x) for x in produced[0]])) + print("expec:", ", ".join(["{:8.4f}".format(x) for x in expected[0]])) + + assert np.isclose(produced, expected, 
atol=1e-3).all() + os.remove(export_onnx_path) diff --git a/tests/core/test_basic_onnx_exec.py b/tests/core/test_basic_onnx_exec.py index a7b6da9965aa5912870812a8c1f8d6da2ee0d181..ddb2cbfc40c7647970f0c51ecb95340e7d1dddae 100644 --- a/tests/core/test_basic_onnx_exec.py +++ b/tests/core/test_basic_onnx_exec.py @@ -35,6 +35,8 @@ import onnx.numpy_helper as np_helper import finn.core.onnx_exec as oxe from finn.core.modelwrapper import ModelWrapper from finn.transformation.infer_shapes import InferShapes +from finn.core.datatype import DataType +from finn.util.basic import gen_finn_dt_tensor def test_mnist_onnx_download_extract_run(): @@ -47,9 +49,50 @@ def test_mnist_onnx_download_extract_run(): raw_o = get_data("finn", "data/onnx/mnist-conv/test_data_set_0/output_0.pb") input_tensor = onnx.load_tensor_from_string(raw_i) output_tensor = onnx.load_tensor_from_string(raw_o) - # run using FINN-based execution + # run using FINN-based execution (full graph) input_dict = {"Input3": np_helper.to_array(input_tensor)} - output_dict = oxe.execute_onnx(model, input_dict) + output_dict = oxe.execute_onnx(model, input_dict, return_full_exec_context=True) assert np.isclose( np_helper.to_array(output_tensor), output_dict["Plus214_Output_0"], atol=1e-3 ).all() + # test subgraph execution + start_node = model.graph.node[1] + end_node = model.graph.node[3] + subgraph_i_dict = {start_node.input[0]: output_dict[start_node.input[0]]} + subgraph_o_dict = oxe.execute_onnx( + model, + subgraph_i_dict, + return_full_exec_context=True, + start_node=start_node, + end_node=end_node, + ) + assert np.isclose( + subgraph_o_dict[end_node.output[0]], output_dict[end_node.output[0]], atol=1e-3 + ).all() + + +def test_onnx_exec_internal_rounding(): + inp0 = onnx.helper.make_tensor_value_info("inp0", onnx.TensorProto.FLOAT, [2, 2]) + inp1 = onnx.helper.make_tensor_value_info("inp1", onnx.TensorProto.FLOAT, [1]) + outp = onnx.helper.make_tensor_value_info("outp", onnx.TensorProto.FLOAT, [2, 2]) + 
mul_node = onnx.helper.make_node("Mul", inputs=["inp0", "inp1"], outputs=["outp"]) + graph = onnx.helper.make_graph( + nodes=[mul_node], name="mul_graph", inputs=[inp0, inp1], outputs=[outp] + ) + + model = onnx.helper.make_model(graph, producer_name="mul-model") + model = ModelWrapper(model) + idt = DataType.INT2 + model.set_tensor_datatype("inp0", idt) + model.set_tensor_datatype("inp1", idt) + model.transform(InferShapes()) + + mul_value = np.asarray([-1], dtype=np.float32) + inp_int = gen_finn_dt_tensor(idt, [2, 2]) + scale = np.random.uniform(low=0, high=1, size=(2, 2)).astype(np.float32) + inp_rounded = (inp_int * scale) / (scale + 1e-7) + input_dict = {"inp0": inp_rounded, "inp1": mul_value} + output_dict = oxe.execute_onnx(model, input_dict) + produced = output_dict["outp"] + expected = np.multiply(inp_int, mul_value) + assert (produced == expected).all() diff --git a/tests/end2end/test_end2end_cnv_w2a2.py b/tests/end2end/test_end2end_cnv_w2a2.py new file mode 100644 index 0000000000000000000000000000000000000000..31ccebd4c175ad2badef17499bf113d978b637f7 --- /dev/null +++ b/tests/end2end/test_end2end_cnv_w2a2.py @@ -0,0 +1,377 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os + +import numpy as np + +# as of Feb'20 there is a bug that segfaults ONNX shape inference if we +# import pytorch before onnx, so we make sure to import onnx first +import onnx # NOQA + +import pytest +import pkg_resources as pk +from finn.custom_op.registry import getCustomOp +from finn.core.onnx_exec import execute_onnx +from finn.transformation.double_to_single_float import DoubleToSingleFloat +from finn.transformation.infer_shapes import InferShapes +from finn.transformation.move_reshape import RemoveCNVtoFCFlatten +from finn.transformation.fold_constants import FoldConstants +from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames +from finn.transformation.streamline import Streamline +from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul +import finn.transformation.streamline.absorb as absorb +from finn.transformation.streamline.reorder import MakeMaxPoolNHWC +import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +from finn.transformation.fpgadataflow.create_dataflow_partition import ( + CreateDataflowPartition, +) +from finn.transformation.fpgadataflow.insert_dwc import InsertDWC +from 
finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( + ReplaceVerilogRelPaths, +) +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver +from finn.transformation.fpgadataflow.make_pynq_proj import MakePYNQProject +from finn.transformation.fpgadataflow.synth_pynq_proj import SynthPYNQProject +from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ +from finn.util.basic import pynq_part_map +from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip +from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO +from finn.core.throughput_test import throughput_test_rtlsim + +build_dir = "/tmp/" + os.environ["FINN_INST_NAME"] +test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1") +test_fpga_part = pynq_part_map[test_pynq_board] +target_clk_ns = 10 +mem_mode = "decoupled" + + +def test_end2end_cnv_w2a2_export(): + import brevitas.onnx as bo + + cnv = get_test_model_trained("CNV", 2, 2) + bo.export_finn_onnx( + cnv, (1, 3, 32, 32), build_dir + "/end2end_cnv_w2a2_export.onnx" + ) + + +def test_end2end_cnv_w2a2_import_and_tidy(): + model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_export.onnx") + model = model.transform(DoubleToSingleFloat()) + model = model.transform(InferShapes()) + model = 
model.transform(FoldConstants()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model.save(build_dir + "/end2end_cnv_w2a2_tidy.onnx") + + +def test_end2end_cnv_w2a2_streamline(): + model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_tidy.onnx") + model = model.transform(Streamline()) + model = model.transform(LowerConvsToMatMul()) + model = model.transform(MakeMaxPoolNHWC()) + model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold()) + model = model.transform(Streamline()) + model.save(build_dir + "/end2end_cnv_w2a2_streamlined.onnx") + + +def test_end2end_cnv_w2a2_convert_to_hls_layers(): + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w2a2_streamlined.onnx" + ) + model = model.transform(to_hls.InferQuantizedStreamingFCLayer(mem_mode)) + model = model.transform(to_hls.InferConvInpGen()) + model = model.transform(to_hls.InferStreamingMaxPool()) + model = model.transform(RemoveCNVtoFCFlatten()) + model.save(build_dir + "/end2end_cnv_w2a2_hls_layers.onnx") + + +def test_end2end_cnv_w2a2_create_dataflow_partition(): + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w2a2_hls_layers.onnx" + ) + parent_model = model.transform(CreateDataflowPartition()) + parent_model.save(build_dir + "/end2end_cnv_w2a2_dataflow_parent.onnx") + sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0] + sdp_node = getCustomOp(sdp_node) + dataflow_model_filename = sdp_node.get_nodeattr("model") + dataflow_model = load_test_checkpoint_or_skip(dataflow_model_filename) + dataflow_model.save(build_dir + "/end2end_cnv_w2a2_dataflow_model.onnx") + + +def test_end2end_cnv_w2a2_fold_and_tlastmarker(): + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w2a2_dataflow_model.onnx" + ) + fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch") + # each tuple is (PE, SIMD, in_fifo_depth, ram_style) for a layer + folding = [ + (8, 3, 256, "auto"), + 
(16, 16, 256, "auto"), + (8, 16, 256, "auto"), + (8, 16, 256, "block"), + (4, 8, 214, "auto"), + (1, 8, 2, "auto"), + (1, 2, 126, "distributed"), + (2, 2, 62, "block"), + (5, 1, 6, "distributed"), + ] + for fcl, (pe, simd, ififodepth, ramstyle) in zip(fc_layers, folding): + fcl_inst = getCustomOp(fcl) + fcl_inst.set_nodeattr("PE", pe) + fcl_inst.set_nodeattr("SIMD", simd) + fcl_inst.set_nodeattr("inFIFODepth", ififodepth) + fcl_inst.set_nodeattr("ram_style", ramstyle) + + swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator") + swg_idepth = [2, 51, 9, 106, 2, 2] + for i in range(len(swg_layers)): + swg_inst = getCustomOp(swg_layers[i]) + simd = folding[i][1] + swg_inst.set_nodeattr("SIMD", simd) + swg_inst.set_nodeattr("inFIFODepth", swg_idepth[i]) + + model = model.transform(InsertDWC()) + model = model.transform(InsertFIFO()) + model = model.transform(InsertTLastMarker()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(AnnotateResources("estimate")) + model.save(build_dir + "/end2end_cnv_w2a2_folded.onnx") + + +@pytest.mark.slow +@pytest.mark.vivado +def test_end2end_cnv_w2a2_gen_hls_ip(): + model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_folded.onnx") + model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) + model = model.transform(HLSSynthIP()) + model = model.transform(AnnotateResources("hls")) + model.save(build_dir + "/end2end_cnv_w2a2_ipgen.onnx") + + +@pytest.mark.vivado +def test_end2end_cnv_w2a2_ip_stitch(): + model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_ipgen.onnx") + model = model.transform(ReplaceVerilogRelPaths()) + model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) + model.save(build_dir + "/end2end_cnv_w2a2_ipstitch.onnx") + + +@pytest.mark.vivado +def test_end2end_cnv_w2a2_verify_dataflow_part(): + model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_ipstitch.onnx") + x = np.zeros((1, 32, 32, 3), dtype=np.float32) + 
inp_name = model.graph.input[0].name + out_name = model.graph.output[0].name + inp_dict = {inp_name: x} + # cppsim + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + model = model.transform(SetExecMode("cppsim")) + model.save(build_dir + "/end2end_cnv_w2a2_ipgen_cppsim.onnx") + ret_cppsim = execute_onnx(model, inp_dict, True) + res_cppsim = ret_cppsim[out_name] + # node-by-node rtlsim + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(PrepareRTLSim()) + model.save(build_dir + "/end2end_cnv_w2a2_ipgen_nodebynode_rtlsim.onnx") + ret_rtlsim_nodebynode = execute_onnx(model, inp_dict, True) + res_rtlsim_nodebynode = ret_rtlsim_nodebynode[out_name] + # whole-network (ip-stitched) rtlsim + model.set_metadata_prop("exec_mode", "rtlsim") + model.save(build_dir + "/end2end_cnv_w2a2_ipstitch_whole_rtlsim.onnx") + # this is a particularly long-running test, set liveness thr. to unlimited + os.environ["LIVENESS_THRESHOLD"] = "-1" + ret_rtlsim_whole = execute_onnx(model, inp_dict, True) + res_rtlsim_whole = ret_rtlsim_whole[out_name] + assert np.isclose(res_cppsim, res_rtlsim_nodebynode).all() + assert np.isclose(res_cppsim, res_rtlsim_whole).all() + + +@pytest.mark.vivado +def test_end2end_cnv_w2a2_throughput_test_rtlsim(): + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w2a2_ipstitch_whole_rtlsim.onnx" + ) + model.set_metadata_prop("rtlsim_trace", "rtlsim_trace.vcd") + # os.environ["RTLSIM_TRACE_DEPTH"] = "4" + # run through IP-stitched rtlsim with increasing batch sizes and + # check the number of cycles it takes to execute + ret = throughput_test_rtlsim(model, 10) + # TODO check for expected performance + assert ret["cycles"] > 0 + + +@pytest.mark.vivado +def test_end2end_cnv_w2a2_verify_all(): + # use the streamlined model as the "golden" model for right answers + golden = load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w2a2_streamlined.onnx" + ) + iname = golden.graph.input[0].name + 
oname = golden.graph.output[0].name + # load one of the test vectors + fn = pk.resource_filename("finn", "data/cifar10/cifar10-test-data-class3.npz") + input_tensor = np.load(fn)["arr_0"].astype(np.float32) + input_tensor = input_tensor / 255 + assert input_tensor.shape == (1, 3, 32, 32) + x = input_tensor + # x = np.zeros(ishape, dtype=np.float32) + ret_golden = execute_onnx(golden, {iname: x}, True) + y_golden = ret_golden[oname] + # set up parent+child graph to test + # we'll use models from the previous step as the child model + parent_model = load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w2a2_dataflow_parent.onnx" + ) + iname = parent_model.graph.input[0].name + oname = parent_model.graph.output[0].name + # produce results with cppsim + sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0] + sdp_node = getCustomOp(sdp_node) + load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_ipgen_cppsim.onnx") + sdp_node.set_nodeattr("model", build_dir + "/end2end_cnv_w2a2_ipgen_cppsim.onnx") + ret_cppsim = execute_onnx(parent_model, {iname: x}, True) + y_cppsim = ret_cppsim[oname] + # produce results with node-by-node rtlsim + load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w2a2_ipgen_nodebynode_rtlsim.onnx" + ) + sdp_node.set_nodeattr( + "model", build_dir + "/end2end_cnv_w2a2_ipgen_nodebynode_rtlsim.onnx" + ) + ret_nodebynode_rtlsim = execute_onnx(parent_model, {iname: x}, True) + y_nodebynode_rtlsim = ret_nodebynode_rtlsim[oname] + # produce results with whole-network (stitched ip) rtlsim + load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w2a2_ipstitch_whole_rtlsim.onnx" + ) + sdp_node.set_nodeattr( + "model", build_dir + "/end2end_cnv_w2a2_ipstitch_whole_rtlsim.onnx" + ) + # this is a particularly long-running test, set liveness thr. 
to unlimited + os.environ["LIVENESS_THRESHOLD"] = "-1" + ret_whole_rtlsim = execute_onnx(parent_model, {iname: x}, True) + y_whole_rtlsim = ret_whole_rtlsim[oname] + assert np.isclose(y_golden, y_cppsim).all() + assert np.isclose(y_golden, y_nodebynode_rtlsim).all() + assert np.isclose(y_golden, y_whole_rtlsim).all() + assert np.argmax(y_golden) == 3 + + +@pytest.mark.vivado +def test_end2end_cnv_w2a2_make_pynq_proj(): + model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_ipstitch.onnx") + model = model.transform(MakePYNQProject(test_pynq_board)) + model.save(build_dir + "/end2end_cnv_w2a2_pynq_project.onnx") + + +@pytest.mark.slow +@pytest.mark.vivado +def test_end2end_cnv_w2a2_synth_pynq_project(): + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w2a2_pynq_project.onnx" + ) + model = model.transform(SynthPYNQProject()) + model = model.transform(AnnotateResources("synth")) + model.save(build_dir + "/end2end_cnv_w2a2_synth.onnx") + + +def test_end2end_cnv_w2a2_make_driver(): + model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_synth.onnx") + model = model.transform(MakePYNQDriver()) + model.save(build_dir + "/end2end_cnv_w2a2_pynq_driver.onnx") + + +def test_end2end_cnv_w2a2_deploy_on_pynq(): + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w2a2_pynq_driver.onnx" + ) + try: + ip = os.environ["PYNQ_IP"] # no fault for this one; skip if not defined + if ip == "": + pytest.skip("PYNQ board IP address not specified") + username = os.getenv("PYNQ_USERNAME", "xilinx") + password = os.getenv("PYNQ_PASSWORD", "xilinx") + port = os.getenv("PYNQ_PORT", 22) + target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn") + model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir)) + # save the model to be able to link it to the parent + model.save(build_dir + "/end2end_cnv_w2a2_pynq_deploy.onnx") + except KeyError: + pytest.skip("PYNQ board IP address not specified") + + +def 
test_end2end_cnv_w2a2_run_on_pynq(): + # use the streamlined model as the "golden" model for right answers + golden = load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w2a2_streamlined.onnx" + ) + iname = golden.graph.input[0].name + oname = golden.graph.output[0].name + # load one of the test vectors + fn = pk.resource_filename("finn", "data/cifar10/cifar10-test-data-class3.npz") + input_tensor = np.load(fn)["arr_0"].astype(np.float32) + input_tensor = input_tensor / 255 + assert input_tensor.shape == (1, 3, 32, 32) + x = input_tensor + # run using FINN-based execution + ret_golden = execute_onnx(golden, {iname: x}, True) + y_golden = ret_golden[oname] + # set up parent+child graph to test + # we'll use models from the previous step as the child model + parent_model = load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w2a2_dataflow_parent.onnx" + ) + iname = parent_model.graph.input[0].name + oname = parent_model.graph.output[0].name + try: + ip = os.environ["PYNQ_IP"] # NOQA + if ip == "": + pytest.skip("PYNQ board IP address not specified") + # produce results with the PYNQ-deployed model as the child model + sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0] + sdp_node = getCustomOp(sdp_node) + load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_pynq_deploy.onnx") + sdp_node.set_nodeattr("model", build_dir + "/end2end_cnv_w2a2_pynq_deploy.onnx") + ret = execute_onnx(parent_model, {iname: x}, True) + y = ret[oname] + assert np.isclose(y, y_golden).all() + assert np.argmax(y) == 3 + + except KeyError: + pytest.skip("PYNQ board IP address not specified") diff --git a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..c9f78dcea1a1ce364d0657ad64de7d440d41b822 --- /dev/null +++ b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py @@ -0,0 +1,160 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import pytest + +from onnx import TensorProto, helper +import numpy as np +import finn.core.onnx_exec as oxe +from finn.core.datatype import DataType +from finn.core.modelwrapper import ModelWrapper +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +from finn.transformation.general import GiveUniqueNodeNames +from finn.custom_op.registry import getCustomOp +from finn.util.basic import gen_finn_dt_tensor +from finn.transformation.infer_shapes import InferShapes + + +def make_single_maxpool_modelwrapper(k, stride, pad, ifm_ch, ifm_dim, ofm_dim, idt): + odt = idt + inp = helper.make_tensor_value_info( + "inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim, ifm_dim] + ) + outp = helper.make_tensor_value_info( + "outp", TensorProto.FLOAT, [1, ifm_ch, ofm_dim, ofm_dim] + ) + + mp_node = helper.make_node( + "MaxPool", + ["inp"], + ["outp"], + kernel_shape=[k, k], + pads=[pad, pad, pad, pad], + strides=[stride, stride], + ) + graph = helper.make_graph( + nodes=[mp_node], name="mp_graph", inputs=[inp], outputs=[outp] + ) + + model = helper.make_model(graph, producer_name="mp-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", odt) + model = model.transform(InferShapes()) + + return model + + +def prepare_inputs(input_tensor): + return {"inp": input_tensor} + + +# input datatype +@pytest.mark.parametrize("idt", [DataType.UINT4, DataType.INT4]) +# pool configuration: ( k,stride, pad, ifm_dim ) +@pytest.mark.parametrize( + "pool_config", [(3, 2, 0, 5), (3, 2, 1, 5), (2, 
2, 0, 8), (5, 2, 2, 7)] +) +# input channels +@pytest.mark.parametrize("ifm_ch", [1, 4, 20]) +# number of out channel computed in parallel +@pytest.mark.parametrize("pe", [1, 4, 20]) +# execution mode +@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +# pool type +@pytest.mark.parametrize("op_type", ["MaxPool"]) +@pytest.mark.slow +@pytest.mark.vivado +def test_convert_to_hls_pool_batch(idt, pool_config, ifm_ch, pe, exec_mode, op_type): + k, stride, pad, ifm_dim = pool_config + + if ifm_ch % pe != 0: + pytest.skip("ifm_ch%pe != 0. Skipping") + + if pad != 0 and idt.signed(): + pytest.skip("No support for pal_val != 0. Skipping") + + np.random.seed(0) + ofm_dim = int(((ifm_dim + 2 * pad - k) / stride) + 1) + + x = gen_finn_dt_tensor(idt, (1, ifm_ch, ifm_dim, ifm_dim)) + # prepare input data + input_dict = prepare_inputs(x) + if op_type == "MaxPool": + model = make_single_maxpool_modelwrapper( + k, stride, pad, ifm_ch, ifm_dim, ofm_dim, idt + ) + else: + assert False, "{} is not a supported op_type".format(op_type) + + y_expected = oxe.execute_onnx(model, input_dict)["outp"] + + new_model = model.transform(to_hls.InferPool_Batch()) + new_model = new_model.transform(GiveUniqueNodeNames()) + + if ifm_ch != pe: + new_model = new_model.transform(to_hls.InferConvInpGen()) + # Folding + for n in new_model.graph.node: + if n.op_type == "ConvolutionInputGenerator": + inst = getCustomOp(n) + inst.set_nodeattr("SIMD", pe) + elif n.op_type == "Pool_Batch": + inst = getCustomOp(n) + inst.set_nodeattr("PE", pe) + + if exec_mode == "cppsim": + new_model = new_model.transform(SetExecMode("cppsim")) + new_model = new_model.transform(PrepareCppSim()) + new_model = new_model.transform(CompileCppSim()) + elif exec_mode == "rtlsim": + new_model = new_model.transform(SetExecMode("rtlsim")) + new_model = new_model.transform(GiveUniqueNodeNames()) + new_model = new_model.transform(PrepareIP("xc7z020clg400-1", 5)) + new_model = new_model.transform(HLSSynthIP()) + new_model = 
new_model.transform(PrepareRTLSim()) + else: + raise Exception("Unknown exec_mode") + + # execute new_model + y_produced = oxe.execute_onnx(new_model, input_dict)["outp"] + assert (y_produced == y_expected).all() + if stride != k: + if pad == 0 or ifm_ch == pe: + assert len(new_model.graph.node) == 4 + else: + assert len(new_model.graph.node) == 5 + else: + assert len(new_model.graph.node) == 1 diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py index 9d6390b2673e5d2c0e72748183ac04ed222d078e..5ff3da87228a2a32a41226bb46e0b16b1a44df50 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py +++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py @@ -23,7 +23,7 @@ test_fpga_part = pynq_part_map[test_pynq_board] target_clk_ns = 10 -def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style): +def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_style): assert pad_style == 2, "only pad_style == 2 supported in hlslib" assert padding > 0, "Output dim should be greater than input dim" odim = idim + padding @@ -47,6 +47,7 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style): inputDataType=str(idt.name), PaddingStyle=pad_style, numInputVectors=1, + SIMD=simd, ) graph = helper.make_graph( @@ -63,11 +64,13 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style): # input image dimension -@pytest.mark.parametrize("idim", [8, 16]) +@pytest.mark.parametrize("idim", [8]) # number of rows and number of cols to add @pytest.mark.parametrize("pad", [2, 3]) # number of channels -@pytest.mark.parametrize("num_ch", [1, 2]) +@pytest.mark.parametrize("num_ch", [2, 4]) +# Input parallelism +@pytest.mark.parametrize("simd", [1, 2]) # PaddingStyle: selects behavior when (odim-idim)%2 != 0 @pytest.mark.parametrize("pad_style", [2]) # FINN input datatype @@ -76,14 +79,15 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, 
idt, pad_style): @pytest.mark.parametrize("mode", ["cppsim", "rtlsim"]) @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_fmpadding(idim, pad, num_ch, pad_style, idt, mode): - +def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, pad_style, idt, mode): + if num_ch % simd != 0: + pytest.skip(" num_ch % simd != 0, skipping") # generate input data x = gen_finn_dt_tensor(idt, [1, idim, idim, num_ch]) input_dict = {"inp": x} odim = idim + pad - model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, idt, pad_style) + model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, simd, idt, pad_style) model = model.transform(InferShapes()) model = model.transform(SetExecMode(mode)) model = model.transform(GiveUniqueNodeNames()) diff --git a/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py similarity index 99% rename from tests/fpgadataflow/test_fpgadataflow_ip_stitch.py rename to tests/fpgadataflow/test_fpgadataflow_ipstitch.py index 61dd81b728aafcd8ccc812cf0cb4c27eff00f471..b830693c32afe629dd6fc70868d0bddacac4c887 100644 --- a/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py +++ b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py @@ -300,7 +300,7 @@ def test_fpgadataflow_ipstitch_synth_ooc(): assert ret["FF"] > 0 assert ret["DSP"] == 0 assert ret["BRAM"] == 0 - assert ret["fmax_mz"] > 100 + assert ret["fmax_mhz"] > 100 @pytest.mark.vivado diff --git a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py index 38f792ed3cdd52044b28b4c19ac0603da4e502e6..398a17132a2ef6c92e600102ff5c0b71a1f65aaa 100644 --- a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py +++ b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py @@ -92,7 +92,7 @@ def test_res_estimate(): model = model.transform(GiveUniqueNodeNames()) prod_resource_estimation = model.analysis(res_estimation) expect_resource_estimation = { - "StreamingFCLayer_Batch_0": {"BRAM_18K": 1, "LUT": 
304.4} + "StreamingFCLayer_Batch_0": {"BRAM_18K": 1, 'BRAM_efficiency': 0.001736111111111111, "LUT": 304.4} } assert check_two_dict_for_equality( diff --git a/tests/pynq/test_pynq_performance_end2end.py b/tests/pynq/test_pynq_performance_end2end.py index 66a93a190061e0142637be19bb2ea841d192745a..3b6ea86741b8adefce4faaa65b791f1d213cf3ae 100644 --- a/tests/pynq/test_pynq_performance_end2end.py +++ b/tests/pynq/test_pynq_performance_end2end.py @@ -10,7 +10,7 @@ from finn.core.throughput_test import throughput_test build_dir = "/tmp/" + os.environ["FINN_INST_NAME"] -@pytest.mark.parametrize("end2end_example", ["tfc_w1a1", "cnv_w1a1"]) +@pytest.mark.parametrize("end2end_example", ["tfc_w1a1", "cnv_w1a1", "cnv_w2a2"]) @pytest.mark.slow def test_pynq_performance_end2end(end2end_example): model = load_test_checkpoint_or_skip( diff --git a/tests/transformation/streamline/test_streamline_cnv.py b/tests/transformation/streamline/test_streamline_cnv.py index 56dcd26076ec0a5fba6e9be6acac7f5e13572c3d..103967dfb6b86cc6e2ce2bc9ab78249d8945d47d 100644 --- a/tests/transformation/streamline/test_streamline_cnv.py +++ b/tests/transformation/streamline/test_streamline_cnv.py @@ -44,9 +44,9 @@ from finn.transformation.double_to_single_float import DoubleToSingleFloat export_onnx_path = make_build_dir("test_streamline_cnv_") # act bits -@pytest.mark.parametrize("abits", [1]) +@pytest.mark.parametrize("abits", [1, 2]) # weight bits -@pytest.mark.parametrize("wbits", [1]) +@pytest.mark.parametrize("wbits", [1, 2]) # network topology / size @pytest.mark.parametrize("size", ["CNV"]) def test_streamline_cnv(size, wbits, abits): @@ -74,6 +74,7 @@ def test_streamline_cnv(size, wbits, abits): # model.save("orig_cnv.onnx") model = model.transform(Streamline()) # model.save("streamlined_cnv.onnx") + assert len(model.graph.node) == 23 produced_ctx = oxe.execute_onnx(model, input_dict, True) produced = produced_ctx[model.graph.output[0].name] assert np.isclose(expected, produced, atol=1e-3).all() 
diff --git a/tests/transformation/test_absorb_mul_into_topk.py b/tests/transformation/test_absorb_mul_into_topk.py new file mode 100644 index 0000000000000000000000000000000000000000..1394220f7c336ccea8fe9c494734c4175bf2e847 --- /dev/null +++ b/tests/transformation/test_absorb_mul_into_topk.py @@ -0,0 +1,108 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# parameter to indicate if mul parameter is negative or positive
@pytest.mark.parametrize("mul_positive", [True, False])
# parameter to indicate if mul parameter is scalar or not
@pytest.mark.parametrize("scalar", [True, False])
def test_absorb_mul_into_topk(mul_positive, scalar):
    """Exercise AbsorbScalarMulIntoTopK on a Mul graph extended by InsertTopK.

    A single Mul node with a random constant (sign and scalar-ness chosen by
    the parameters) is built, InsertTopK and the usual clean-up passes are
    applied, and the model is executed before and after the transformation.

    Expectations checked:
    * the TopK indices ("global_out") never change;
    * only for a positive scalar multiplier the graph loses one node, starts
      with TopK and produces different TopK values ("TopK_0_out0");
    * in every other case values and graph stay identical.
    """
    data_shape = [1, 1, 1, 1000]
    mul_shape = [1] if scalar is True else [1, 1, 1, 1000]

    mul_graph = helper.make_graph(
        nodes=[helper.make_node("Mul", ["inp", "a0"], ["outp"])],
        name="mul-graph",
        inputs=[helper.make_tensor_value_info("inp", TensorProto.FLOAT, data_shape)],
        outputs=[
            helper.make_tensor_value_info("outp", TensorProto.FLOAT, data_shape)
        ],
        value_info=[helper.make_tensor_value_info("a0", TensorProto.FLOAT, mul_shape)],
    )
    model = ModelWrapper(helper.make_model(mul_graph, producer_name="mul_model"))

    # initialize values: the multiplier range excludes zero and has a fixed sign
    low, high = (0.1, 1) if mul_positive is True else (-1, -0.1)
    a0_values = np.random.uniform(low=low, high=high, size=tuple(mul_shape)).astype(
        np.float32
    )
    model.set_initializer("a0", a0_values)

    preparation = (
        InsertTopK,
        InferShapes,
        InferDataTypes,
        GiveUniqueNodeNames,
        GiveReadableTensorNames,
    )
    for transformation in preparation:
        model = model.transform(transformation())
    model_transformed = model.transform(AbsorbScalarMulIntoTopK())

    # compare execution results
    idict = {
        "global_in": np.random.uniform(low=-10, high=10, size=(1, 1, 1, 1000)).astype(
            np.float32
        )
    }
    reference_ctx = oxe.execute_onnx(model, idict, True)
    y_indices = reference_ctx["global_out"]
    y_values = reference_ctx["TopK_0_out0"]
    transformed_ctx = oxe.execute_onnx(model_transformed, idict, True)
    y_tr_indices = transformed_ctx["global_out"]
    y_tr_values = transformed_ctx["TopK_0_out0"]

    # the indices stay the same, if the model is transformed or not
    assert (y_indices == y_tr_indices).all()

    if not (scalar is True and mul_positive is True):
        # nothing may be absorbed: same values, same graph
        assert (y_values == y_tr_values).all()
        assert model.graph == model_transformed.graph
        return

    # the values change if the model was transformed
    assert (y_values != y_tr_values).all()

    # check for new order
    assert model.graph != model_transformed.graph
    assert len(model.graph.node) - 1 == len(model_transformed.graph.node)
    assert model_transformed.graph.node[0].op_type == "TopK"
# stride
@pytest.mark.parametrize("s", [1, 2])
# kernel
@pytest.mark.parametrize("k", [3, 4])
# ibits
@pytest.mark.parametrize("ibits", [4, 8])
# obits
@pytest.mark.parametrize("obits", [2, 4])
# signed
@pytest.mark.parametrize("signed", [False, True])
# channels
@pytest.mark.parametrize("c", [2, 3])
# input dimension
@pytest.mark.parametrize("idim", [6, 7])
def test_change_datalayout_quantavgpool(s, k, ibits, obits, signed, c, idim):
    """Check ChangeDataLayoutQuantAvgPool2d on a single QuantAvgPool2d node.

    A one-node NCHW QuantAvgPool2d graph is built from the parameters (stride
    ``s``, kernel ``k``, input/output bit widths, signedness, channel count
    ``c`` and square input size ``idim``). After the transformation the test
    expects:

    * identical execution results for the original and transformed model;
    * exactly two additional nodes, with a Transpose first and last;
    * the node at position 1 to carry ``data_layout == "NHWC"`` and to have
      NHWC-annotated input and output tensors.
    """
    n = 1
    odim = compute_pool_output_dim(idim, k, s)
    # determine input FINN datatype
    prefix = "INT" if signed is True else "UINT"
    dtype = DataType[prefix + str(ibits)]

    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [n, c, idim, idim])
    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [n, c, odim, odim])

    node = helper.make_node(
        "QuantAvgPool2d",
        ["inp"],
        ["outp"],
        domain="finn",
        stride=s,
        kernel=k,
        ibits=ibits,
        obits=obits,
        signed=signed,
        data_layout="NCHW",
    )
    graph = helper.make_graph(
        nodes=[node], name="single-quantavgpool", inputs=[inp], outputs=[outp]
    )

    model = helper.make_model(graph)
    model = ModelWrapper(model)
    model = model.transform(InferShapes())
    model = model.transform(InferDataTypes())
    model = model.transform(InferDataLayouts())
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(GiveReadableTensorNames())
    model_transformed = model.transform(ChangeDataLayoutQuantAvgPool2d())
    model_transformed = model_transformed.transform(InferShapes())
    model_transformed = model_transformed.transform(InferDataTypes())
    model_transformed = model_transformed.transform(InferDataLayouts())
    model_transformed = model_transformed.transform(GiveUniqueNodeNames())
    model_transformed = model_transformed.transform(GiveReadableTensorNames())

    inp_values = gen_finn_dt_tensor(dtype, [n, c, idim, idim])
    # Both models went through GiveReadableTensorNames, so the graph input is
    # presumably no longer called "inp". Look the name up instead of
    # hard-coding it; otherwise the generated tensor may never reach the
    # executor and the comparison would not exercise the random data.
    input_name = model.graph.input[0].name
    assert (
        model_transformed.graph.input[0].name == input_name
    ), "both models must share the input tensor name to reuse one input dict"
    idict = {input_name: inp_values}
    assert oxe.compare_execution(model, model_transformed, idict)

    # the pooling node must now be wrapped by two layout-changing Transposes
    assert len(model.graph.node) + 2 == len(model_transformed.graph.node)
    assert model_transformed.graph.node[-1].op_type == "Transpose"
    assert model_transformed.graph.node[0].op_type == "Transpose"
    # check if QuantAvgPool2d node has datalayout set correctly
    node = model_transformed.graph.node[1]
    d_layout = get_by_name(node.attribute, "data_layout").s.decode("UTF-8")
    assert d_layout == "NHWC"
    assert model_transformed.get_tensor_layout(node.input[0]) == DataLayout.NHWC
    assert model_transformed.get_tensor_layout(node.output[0]) == DataLayout.NHWC
def get_multithreshold_rand_params(channels, num_of_thres, seed=None):
    """Return random, evenly spaced MultiThreshold parameters as float32.

    For every channel a step (drawn from [0, 2)) and a bias (drawn from
    [0, 10)) are sampled with ``np.random``; the thresholds of that channel
    are ``(arange(num_of_thres) - bias) * step``. The result has one row per
    channel. When ``seed`` is not None the global NumPy RNG is re-seeded
    first.
    """
    if seed is not None:
        np.random.seed(seed)
    # draw order matters for reproducibility: steps first, then bias
    steps = np.random.rand(channels, 1) * 2
    bias = np.random.rand(channels, 1) * 10
    levels = np.array([np.arange(num_of_thres)] * channels)
    return ((levels - bias) * steps).astype(np.float32)


def test_move_maxpool_past_multithreshold():
    """Check MoveMaxPoolPastMultiThreshold on MaxPool/MultiThreshold pairs.

    The graph is MaxPool -> MultiThreshold(BIPOLAR) -> MaxPool ->
    MultiThreshold(UINT4). After the transformation execution results must
    match the original model and the node order must be MaxPool,
    MultiThreshold, MultiThreshold, MaxPool.
    """
    # generate test vectors of correct shape
    ch = 64
    ifmdim = 16
    ofmdim = 16 // 4
    input_shape = (1, ch, ifmdim, ifmdim)
    output_shape = (1, ch, ofmdim, ofmdim)
    thres1_shape = [1, 1]
    thres2_shape = [ch, 14]

    maxpool_config = {
        "pads": [1, 1, 1, 1],
        "kernel_shape": [3, 3],
        "strides": [2, 2],
    }

    nodes = [
        helper.make_node("MaxPool", ["top_in"], ["t1"], **maxpool_config),
        helper.make_node(
            "MultiThreshold",
            ["t1", "thres1"],
            ["t2"],
            domain="finn",
            out_dtype="BIPOLAR",
            out_bias=-1.0,
            out_scale=1.0,
        ),
        helper.make_node("MaxPool", ["t2"], ["t3"], **maxpool_config),
        helper.make_node(
            "MultiThreshold",
            ["t3", "thres2"],
            ["top_out"],
            domain="finn",
            out_dtype="UINT4",
        ),
    ]
    value_info = [
        helper.make_tensor_value_info("thres1", TensorProto.FLOAT, thres1_shape),
        helper.make_tensor_value_info("thres2", TensorProto.FLOAT, thres2_shape),
    ]
    top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape)
    top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape)

    graph = helper.make_graph(
        name="test",
        inputs=[top_in],
        outputs=[top_out],
        value_info=value_info,
        nodes=nodes,
    )
    model = ModelWrapper(helper.make_model(graph))
    model = model.transform(InferShapes())
    model = model.transform(InferDataTypes())

    model.set_initializer("thres1", np.array([[0]]))
    model.set_initializer(
        "thres2", get_multithreshold_rand_params(*thres2_shape, seed=0)
    )

    # Transform
    new_model = model.transform(MoveMaxPoolPastMultiThreshold())
    inp_dict = {"top_in": np.random.rand(*input_shape).astype(np.float32)}

    # Test
    assert oxe.compare_execution(model, new_model, inp_dict)
    expected_order = ("MaxPool", "MultiThreshold", "MultiThreshold", "MaxPool")
    for position, op_type in enumerate(expected_order):
        assert new_model.graph.node[position].op_type == op_type
    assert len(new_model.graph.node) == 4
# input dimension
@pytest.mark.parametrize("ifm_dim", [4, 7])
# input channels
@pytest.mark.parametrize("ifm_ch", [2, 3])
# kernel size
@pytest.mark.parametrize("k", [2, 3])
# stride
@pytest.mark.parametrize("stride", [1, 2])
# padding
@pytest.mark.parametrize("pad_amt", [0, 1])
# depthwise
@pytest.mark.parametrize("dw", [0, 1])
def test_move_mul_past_dw_conv(ifm_dim, ifm_ch, k, stride, pad_amt, dw):
    """Check MoveMulPastDWConv on a channelwise Mul followed by a Conv.

    With ``dw == 1`` the Conv is built with ``group == ifm_ch`` and the test
    expects the first two node op types to be swapped afterwards; otherwise a
    regular Conv (``group == 1``) is used and the node order must stay as it
    was. In both cases the executed output must be unchanged.
    """
    if dw != 1:
        ofm_ch, groups = ifm_ch + 2, 1
        W_shape = [ofm_ch, ifm_ch, k, k]
    else:
        ofm_ch, groups = ifm_ch, ifm_ch
        W_shape = [ofm_ch, 1, k, k]

    ofm_dim = compute_conv_output_dim(ifm_dim, k, stride, pad_amt)
    in_shape = [1, ifm_ch, ifm_dim, ifm_dim]
    scale_shape = [1, ifm_ch, 1, 1]

    # set up onnx model
    graph = helper.make_graph(
        nodes=[
            helper.make_node("Mul", ["inp", "mul"], ["mul_out"]),
            helper.make_node(
                "Conv",
                ["mul_out", "W"],
                ["outp"],
                group=groups,
                kernel_shape=[k, k],
                pads=[pad_amt, pad_amt, pad_amt, pad_amt],
                strides=[stride, stride],
            ),
        ],
        name="mulpastconv_graph",
        inputs=[helper.make_tensor_value_info("inp", TensorProto.FLOAT, in_shape)],
        outputs=[
            helper.make_tensor_value_info(
                "outp", TensorProto.FLOAT, [1, ofm_ch, ofm_dim, ofm_dim]
            )
        ],
        value_info=[
            helper.make_tensor_value_info("mul", TensorProto.FLOAT, scale_shape),
            helper.make_tensor_value_info("W", TensorProto.FLOAT, W_shape),
        ],
    )
    model = ModelWrapper(helper.make_model(graph, producer_name="mulpastconv-model"))

    # random tensors are drawn in the order input, multiplier, weights
    inp_values = gen_finn_dt_tensor(DataType.INT2, in_shape)
    mul_values = gen_finn_dt_tensor(DataType.INT2, scale_shape)
    W_values = gen_finn_dt_tensor(DataType.INT2, W_shape)
    model.set_initializer("W", W_values)
    model.set_initializer("mul", mul_values)
    model = model.transform(InferShapes())
    model = model.transform(InferDataTypes())

    idict = {"inp": inp_values}
    out_before = oxe.execute_onnx(model, idict, True)["outp"]

    # move channelwise multiplication past depthwise conv
    model_transformed = model.transform(MoveMulPastDWConv())
    out_after = oxe.execute_onnx(model_transformed, idict, True)["outp"]

    assert (out_before == out_after).all()

    # positions in the transformed graph that original nodes 0 and 1 map to
    first, second = (0, 1) if dw == 0 else (1, 0)
    assert model.graph.node[0].op_type == model_transformed.graph.node[first].op_type
    assert model.graph.node[1].op_type == model_transformed.graph.node[second].op_type
def insert_identity_op(model, op):
    """Insert an identity operation of type ``op`` behind the Div node.

    "Add"/"Sub" get the constant 0.0, "Mul"/"Div" the constant 1.0. The new
    node reads "div_out", writes "ident_out", is placed at node position 3 and
    the last node of the graph is rewired to consume "ident_out". Returns the
    modified model, or None (graph untouched) for any other ``op``.
    """
    additive = op in ["Add", "Sub"]
    if not additive and op not in ["Mul", "Div"]:
        return None
    val = np.asarray([0.0 if additive else 1.0], dtype=np.float32)

    identity_node = helper.make_node(op, ["div_out", "value"], ["ident_out"])
    graph = model.graph
    graph.node.insert(3, identity_node)
    graph.node[-1].input[0] = "ident_out"
    model.set_initializer("value", val)

    return model


# identity operations to be inserted
@pytest.mark.parametrize("op", ["Add", "Sub", "Mul", "Div"])
def test_remove_identity_ops(op):
    """Check that RemoveIdentityOps drops exactly the inserted identity node.

    The base graph is Mul -> Reshape -> Div -> MatMul; an identity ``op`` is
    inserted before the MatMul. After the transformation the graph must have
    one node fewer and produce the same output.
    """

    def float_info(name, dims):
        # every tensor of this graph is declared as FLOAT
        return helper.make_tensor_value_info(name, TensorProto.FLOAT, dims)

    # set up onnx model
    inp = float_info("inp", [1, 4, 1, 1])
    mul = float_info("mul", [])
    shape = float_info("shape", [2])
    div = float_info("div", [])
    matmul = float_info("matmul", [4, 2])
    outp = float_info("outp", [1, 2])

    graph = helper.make_graph(
        nodes=[
            helper.make_node("Mul", ["inp", "mul"], ["mul_out"]),
            helper.make_node("Reshape", ["mul_out", "shape"], ["reshape_out"]),
            helper.make_node("Div", ["reshape_out", "div"], ["div_out"]),
            helper.make_node("MatMul", ["div_out", "matmul"], ["outp"]),
        ],
        name="identity-graph",
        inputs=[inp],
        outputs=[outp],
        value_info=[mul, shape, div, matmul],
    )
    model = ModelWrapper(helper.make_model(graph, producer_name="mulpastconv-model"))

    # random draws happen in the order: input, mul, div, matmul
    inp_values = gen_finn_dt_tensor(DataType.INT2, [1, 4, 1, 1])
    mul_values = np.random.uniform(low=0.1, high=0.99, size=1).astype(np.float32)
    shape_values = np.asarray([1, -1], dtype=np.int64)
    div_values = np.random.uniform(low=0.1, high=0.99, size=1).astype(np.float32)
    matmul_values = gen_finn_dt_tensor(DataType.INT2, [4, 2])
    model.set_initializer("mul", mul_values)
    model.set_initializer("shape", shape_values)
    model.set_initializer("div", div_values)
    model.set_initializer("matmul", matmul_values)
    insert_identity_op(model, op)
    model = model.transform(InferShapes())
    model = model.transform(InferDataTypes())

    idict = {"inp": inp_values}
    out_before = oxe.execute_onnx(model, idict)["outp"]
    num_of_nodes_before = len(model.graph.node)

    model = model.transform(RemoveIdentityOps())
    num_of_nodes_after = len(model.graph.node)
    assert num_of_nodes_before - 1 == num_of_nodes_after

    out_after = oxe.execute_onnx(model, idict)["outp"]
    assert (out_before == out_after).all()