diff --git a/.github/workflows/quicktest-dev-pr.yml b/.github/workflows/quicktest-dev-pr.yml index cd59a629405c748187cdf478c0bdb0694c58c79f..924fbd24a174df49af4b3e259ad57d0a7907d42b 100644 --- a/.github/workflows/quicktest-dev-pr.yml +++ b/.github/workflows/quicktest-dev-pr.yml @@ -18,4 +18,6 @@ jobs: uses: actions/checkout@v2 - name: DockerRunQuicktest + env: + NUM_DEFAULT_WORKERS: 4 run: sh run-docker.sh quicktest diff --git a/docker/Dockerfile.finn_ci b/docker/Dockerfile.finn_ci index 5772b16abc8b927def1e2dfbbb8193a2f964f87d..951616651a6d231591413ddd617e9d22ca26d9ef 100644 --- a/docker/Dockerfile.finn_ci +++ b/docker/Dockerfile.finn_ci @@ -61,6 +61,7 @@ RUN git clone --branch $FINN_CI_BRANCH https://github.com/Xilinx/finn /workspace RUN pip install -r /workspace/finn/requirements.txt RUN apt update; apt install nano RUN pip install pytest-dependency +RUN pip install pytest-xdist ENV PYTHONPATH "${PYTHONPATH}:/workspace/finn/src" ENV PYTHONPATH "${PYTHONPATH}:/workspace/pyverilator" diff --git a/docker/Dockerfile.finn_dev b/docker/Dockerfile.finn_dev index 0e12b504a26ccdb8fd78e162f04cfdeab5a186f1..3c830bf0b5366cc35137c64a496ae3377b144715 100644 --- a/docker/Dockerfile.finn_dev +++ b/docker/Dockerfile.finn_dev @@ -55,6 +55,7 @@ RUN pip install matplotlib RUN pip install pytest-dependency RUN pip install sphinx RUN pip install sphinx_rtd_theme +RUN pip install pytest-xdist # switch user RUN groupadd -g $GID $GNAME diff --git a/docker/Jenkinsfile b/docker/Jenkinsfile index 80be261fb3da057186259598f84d915176577a5d..7ce622c50d8e97032f743a9ce711061fe7f97a0f 100644 --- a/docker/Jenkinsfile +++ b/docker/Jenkinsfile @@ -9,7 +9,7 @@ pipeline { string(name: 'PYNQ_PASSWORD', defaultValue: 'xilinx', description: 'PYNQ board password') string(name: 'PYNQ_TARGET_DIR', defaultValue: '/home/xilinx/finn', description: 'PYNQ board target deployment directory') string(name: 'NUM_DEFAULT_WORKERS', defaultValue: '1', description: 'Number of cores for parallel transformations') - string(name: 'DOCKER_CMD', defaultValue: """python setup.py test""", description: 'Command to run') + string(name: 'DOCKER_CMD', defaultValue: """python setup.py test --addopts "--dist=loadfile -n auto"""", description: 'Command to run') } environment { DOCKER_TAG='finn_ci:$BUILD_ID' diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh index 0074cce02f7de57dc778e0b671c484233df72a8a..b312737c317517ca0ab19c74cf22284b5977b661 100644 --- a/docker/finn_entrypoint.sh +++ b/docker/finn_entrypoint.sh @@ -13,9 +13,9 @@ gecho () { # checkout the correct dependency repo commits # the repos themselves are cloned in the Dockerfile -BREVITAS_COMMIT=026a509186b7e7b0b65d46a2f905043d41069306 +BREVITAS_COMMIT=f9a27226d4acf1661dd38bc449f71f89e0983cce CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4 -HLSLIB_COMMIT=13e9b0772a27a3a1efc40c878d8e78ed09efb716 +HLSLIB_COMMIT=8f9f2018762f654f196b666838aeaf6fc730ad9a PYVERILATOR_COMMIT=c97a5ba41bbc7c419d6f25c74cdf3bdc3393174f PYNQSHELL_COMMIT=0c82a61b0ec1a07fa275a14146233824ded7a13d OMX_COMMIT=1bae737669901e762f581af73348332b5c4b2ada diff --git a/docker/quicktest.sh b/docker/quicktest.sh index 4f6a2d3e230de9fcbb947d794722294880a7730d..b6fc30de17d396f7edcb588a1168b3d32a093711 100755 --- a/docker/quicktest.sh +++ b/docker/quicktest.sh @@ -1,4 +1,6 @@ #!/bin/bash +: ${PYTEST_PARALLEL=auto} + cd $FINN_ROOT -python setup.py test --addopts "-m 'not (vivado or slow)'" +python setup.py test --addopts "-m 'not (vivado or slow)' --dist=loadfile -n $PYTEST_PARALLEL" diff --git 
a/docs/finn/example_networks.rst b/docs/finn/example_networks.rst index 9f221871f09bf655db9d81988d6fa83e53473634..86bb2bd11fd805a23a3bdf6da8a8ed686259ecc1 100644 --- a/docs/finn/example_networks.rst +++ b/docs/finn/example_networks.rst @@ -20,17 +20,17 @@ version, this is indicated by an x mark in the table. +-----------------------+------------+----------+----------+----------+----------+----------+----------+ | Export/Import | x | x | x | x | x | x | x | +-----------------------+------------+----------+----------+----------+----------+----------+----------+ -| Streamlining | x | x | x | x | x | | | +| Streamlining | x | x | x | x | x | | x | +-----------------------+------------+----------+----------+----------+----------+----------+----------+ -| Convert to HLS layers | x | x | x | x | x | | | +| Convert to HLS layers | x | x | x | x | x | | x | +-----------------------+------------+----------+----------+----------+----------+----------+----------+ -| Stitched IP | x | x | x | x | x | | | +| Stitched IP | x | x | x | x | x | | x | +-----------------------+------------+----------+----------+----------+----------+----------+----------+ -| Hardware test | x | x | x | | x | | | +| Hardware test | x | x | x | | x | | x | +-----------------------+------------+----------+----------+----------+----------+----------+----------+ -| cppsim | x | x | x | x | x | | | +| cppsim | x | x | x | x | x | | x | +-----------------------+------------+----------+----------+----------+----------+----------+----------+ -| rtlsim node-by-node | x | x | x | x | x | | | +| rtlsim node-by-node | x | x | x | x | x | | x | +-----------------------+------------+----------+----------+----------+----------+----------+----------+ -| rtlsim stitched IP | x | x | x | x | x | | | +| rtlsim stitched IP | x | x | x | x | x | | x | +-----------------------+------------+----------+----------+----------+----------+----------+----------+ diff --git a/docs/finn/getting_started.rst b/docs/finn/getting_started.rst index 95594bb67a2be3a4c3fbba488c75a704f623c136..f4fa7a13dcbe4fe8ab9667a111df00c605747710 100644 --- a/docs/finn/getting_started.rst +++ b/docs/finn/getting_started.rst @@ -18,6 +18,7 @@ Requirements * A working Vivado 2019.1 installation * A `VIVADO_PATH` environment variable pointing to the Vivado installation directory (e.g. the directory where settings64.sh is located) * (optional) A PYNQ board with a network connection + * the ``bitstring`` package must be installed on the PYNQ: ``sudo pip3 install bitstring`` Running FINN in Docker ====================== @@ -30,6 +31,7 @@ Getting an interactive shell for development or experimentation sh run_docker.sh Simply running sh run-docker.sh without any additional arguments will clone the dependency repos, create a Docker container and give you a terminal with you can use for development for experimentation. +If you want a new terminal on an already-running container, you can do this with `docker exec -it finn_dev_<username> bash`. .. warning:: The Docker container is spawned with the `--rm` option, so make sure that any important files you created inside the container are either in the /workspace/finn folder (which is mounted from the host computer) or otherwise backed up. 
diff --git a/docs/finn/verification.rst b/docs/finn/verification.rst index 391c6f999312839daca0d4161336c7c0ae822f89..c52c0840aa40566d930164490b1fd249d7c07757 100644 --- a/docs/finn/verification.rst +++ b/docs/finn/verification.rst @@ -28,4 +28,15 @@ This simulation can be used for a model containing several HLS custom operations Emulation using PyVerilator =========================== -The emulation using PyVerilator can be used when IP blocks were generated, either node by node or of a whole design. For that purpose PyVerilator gets the generated verilog files. +The emulation using PyVerilator can be used when IP blocks were generated, either node by node or of a whole (IP-stitched) design. For that purpose PyVerilator gets the generated verilog files. + +For debugging purposes, it's possible to generate .vcd trace files that show the value of external & internal signals as the emulation is running. To enable this: + - for node-by-node rtlsim, set the `rtlsim_trace` attribute of each node of interest to either a file name for the vcd or `default` to use the node name as the filename. + - for IP-stitched rtlsim, set the `rtlsim_trace` metadata_prop for the graph as per above. + +To control the tracing depth in the module hierarchy, use the `RTLSIM_TRACE_DEPTH` environment variable (default is 1): + - level 1 shows top-level input/output streams + - level 2 shows per-layer input/output streams + - level 3 shows per full-layer I/O including FIFO count signals + +Note that deeper tracing will take longer to execute and may produce very large .vcd files. diff --git a/notebooks/end2end_example/tfc_end2end_example.ipynb b/notebooks/end2end_example/tfc_end2end_example.ipynb index d573061487de204084e0d3242da8ad1b791f44d8..c84efc964b1f57b7ed385521fc5214fdc2396590 100644 --- a/notebooks/end2end_example/tfc_end2end_example.ipynb +++ b/notebooks/end2end_example/tfc_end2end_example.ipynb @@ -132,7 +132,7 @@ " " ], "text/plain": [ - "<IPython.lib.display.IFrame at 0x7f8890385828>" + "<IPython.lib.display.IFrame at 0x7f7cc4290940>" ] }, "execution_count": 3, @@ -293,7 +293,7 @@ " " ], "text/plain": [ - "<IPython.lib.display.IFrame at 0x7fe1ad0639e8>" + "<IPython.lib.display.IFrame at 0x7f7c6c567f28>" ] }, "execution_count": 6, @@ -333,9 +333,10 @@ " ConvertDivToMul(),\n", " BatchNormToAffine(),\n", " ConvertSignToThres(),\n", + " AbsorbSignBiasIntoMultiThreshold(),\n", " MoveAddPastMul(),\n", " MoveScalarAddPastMatMul(),\n", - " MoveScalarAddPastConv(),\n", + " MoveAddPastConv(),\n", " MoveScalarMulPastMatMul(),\n", " MoveScalarMulPastConv(),\n", " MoveAddPastMul(),\n", @@ -350,6 +351,7 @@ " ]\n", " for trn in streamline_transformations:\n", " model = model.transform(trn)\n", + " model = model.transform(RemoveIdentityOps())\n", " model = model.transform(GiveUniqueNodeNames())\n", " model = model.transform(GiveReadableTensorNames())\n", " model = model.transform(InferDataTypes())\n", @@ -400,7 +402,7 @@ " " ], "text/plain": [ - "<IPython.lib.display.IFrame at 0x7fe1346e4ef0>" + "<IPython.lib.display.IFrame at 0x7f7c6c0bf898>" ] }, "execution_count": 8, @@ -454,7 +456,7 @@ " " ], "text/plain": [ - "<IPython.lib.display.IFrame at 0x7fe1346f7780>" + "<IPython.lib.display.IFrame at 0x7f7c6c0e5c18>" ] }, "execution_count": 9, diff --git a/src/finn/core/onnx_exec.py b/src/finn/core/onnx_exec.py index efdfaa19d9f9e5dfa41911a2184e989337b3d9c2..7c3123cd5eb29a54dc5cbfb912225ad3fdb0f219 100644 --- a/src/finn/core/onnx_exec.py +++ b/src/finn/core/onnx_exec.py @@ -108,7 +108,9 @@ def execute_node(node, context, graph):
context[outp] = output_list[list_ind] -def execute_onnx(model, input_dict, return_full_exec_context=False): +def execute_onnx( + model, input_dict, return_full_exec_context=False, start_node=None, end_node=None +): """Executes given ONNX ModelWrapper with given named inputs. If return_full_exec_context is False, a dict of named outputs is returned @@ -116,7 +118,12 @@ def execute_onnx(model, input_dict, return_full_exec_context=False): If return return_full_exec_context is True, the full set of tensors used by the execution (including inputs, weights, activations and final outputs) - will be returned as a dict.""" + will be returned as a dict. + + When start_node and end_node are set to None, the whole graph is executed. + If they are set to particular ONNX nodes, only the subgraph between (and + including) those nodes is executed. + """ if not model.check_all_tensor_shapes_specified(): raise Exception("Found unspecified tensor shapes, try infer_shapes") @@ -159,7 +166,17 @@ def execute_onnx(model, input_dict, return_full_exec_context=False): # execute the model node by node # we can simply walk down the list since the ONNX spec guarantees that it is # topologically sorted - for node in graph.node: + subgraph = [] + if start_node is None: + start_node = model.graph.node[0] + if end_node is None: + end_node = model.graph.node[-1] + # select the nodes between specified start/end nodes + start_ind = model.get_node_index(start_node) + end_ind = model.get_node_index(end_node) + 1 + assert end_ind >= start_ind, "Start/end nodes must define valid subgraph" + subgraph = graph.node[start_ind:end_ind] + for node in subgraph: if get_sanitize_quant_tensors() != 0: # round input values to match quantization annotation execution_context = sanitize_quant_values( diff --git a/src/finn/custom_op/__init__.py b/src/finn/custom_op/__init__.py index ab6e03bee65b8bf5c4041dd8021b1a561e7673d2..4ae7b9ebffaab6ca6be04b8d73f647b2db22dc78 100644 --- a/src/finn/custom_op/__init__.py +++ b/src/finn/custom_op/__init__.py @@ -56,8 +56,15 @@ class CustomOp(ABC): ret = ret.decode("utf-8") return ret else: - # not set, return default value - return def_val + if req: + raise Exception( + """Required attribute %s unspecified in + a %s node""" + % (name, self.onnx_node.op_type) + ) + else: + # not set, return default value + return def_val except KeyError: raise AttributeError("Op has no such attribute: " + name) diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index a688898f4a43b33fd3f07cda12144b84829e451f..71c731f96ca45519c443a5f932ead050770e17de 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -88,6 +88,8 @@ class HLSCustomOp(CustomOp): "res_hls": ("s", False, ""), "res_synth": ("s", False, ""), "rtlsim_so": ("s", False, ""), + # partitioning info + "partition_id": ("i", False, 0), # input and output FIFO depths "inFIFODepth": ("i", False, 2), "outFIFODepth": ("i", False, 2), @@ -171,9 +173,15 @@ class HLSCustomOp(CustomOp): of the node as a dictionary.""" ret = dict() ret["BRAM_18K"] = self.bram_estimation() + ret["BRAM_efficiency"] = self.bram_efficiency_estimation() ret["LUT"] = self.lut_estimation() return ret + def bram_efficiency_estimation(self): + """Function for BRAM efficiency estimation: actual parameter storage + needed divided by the allocated BRAM storage (from estimation)""" + return 1 + def bram_estimation(self): """Function for BRAM resource estimation, is member function of HLSCustomOp class 
but has to be filled by every node""" @@ -219,7 +227,6 @@ class HLSCustomOp(CustomOp): self.code_gen_dict["$CLKPERIOD$"] = [str(clk)] self.code_gen_dict["$EXTRA_DIRECTIVES$"] = self.ipgen_extra_directives() - template = self.ipgentcl_template for key in self.code_gen_dict: @@ -235,7 +242,7 @@ class HLSCustomOp(CustomOp): def ipgen_extra_directives(self): "Return a list of extra tcl directives for HLS synthesis." return [] - + def ipgen_singlenode_code(self): """Builds the bash script for ip generation using the IPGenBuilder from finn.util.fpgadataflow.""" diff --git a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..ad68a4bde29123b2498ac7789048bcd2e13bf3bc --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py @@ -0,0 +1,576 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from math import ceil +import os + +import numpy as np + +from onnx import TensorProto, helper +from finn.core.datatype import DataType +from finn.custom_op.fpgadataflow import HLSCustomOp +from finn.util.data_packing import ( + npy_to_rtlsim_input, + numpy_to_hls_code, + rtlsim_output_to_npy, +) +from . import templates + +# ONNX i/o tensor shape assumptions for channelwise ops: +# input 0 is the input tensor, shape (..., NumChannels) +# input 1 is the channelwise parameter tensor, shape (NumChannels, params_per_channel) +# output 0 is the output tensor, shape (..., NumChannels) - same as input +# the ... here can be any shape (representing groups of vectors) + + +class ChannelwiseOp_Batch(HLSCustomOp): + """Class that corresponds to finn-hls Thresholding_Batch function. + It can implement a variety of channel-wise parametrized operations, + including Add, Mul and multi-thresholding. 
+ """ + + def __init__(self, onnx_node): + super().__init__(onnx_node) + self.decoupled_wrapper = templates.decoupled_wrapper + + def get_nodeattr_types(self): + my_attrs = { + # channelwise "map" function to apply: + # one of cmp_le, cmp_ge, add, mul + "Func": ("s", False, "cmp_le"), + "PE": ("i", True, 0), + "NumChannels": ("i", True, 0), + # string defining memory resource type for parameters + "ram_style": ("s", False, "distributed"), + # FINN DataTypes for inputs, weights, outputs + "inputDataType": ("s", True, ""), + "paramDataType": ("s", True, ""), + "outputDataType": ("s", True, ""), + # input and output FIFO depths + "inFIFODepth": ("i", False, 0), + "outFIFODepth": ("i", False, 0), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def calc_tmem(self): + """Calculates and returns TMEM, the depth of the memory used + to store the channelwise op parameters.""" + chn = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + return chn // pe + + def make_shape_compatible_op(self, model): + oshape = self.get_normal_output_shape() + # implement tensor with correct shape + values = np.random.randn(*oshape).astype(np.float32) + return helper.make_node( + "Constant", + inputs=[], + outputs=[self.onnx_node.output[0]], + value=helper.make_tensor( + name="const_tensor", + data_type=TensorProto.FLOAT, + dims=values.shape, + vals=values.flatten().astype(float), + ), + ) + + def infer_node_datatype(self, model): + node = self.onnx_node + # check input datatype against property + idt_name = self.get_input_datatype().name + exp_idt_name = self.get_nodeattr("inputDataType") + assert exp_idt_name == idt_name, "Bad input DataType for ChannelwiseOp layer" + # TODO: dynamically infer/update odt based on idt as done in ConvertToHLSLayers? 
+ # set output datatype from property + odt = self.get_output_datatype() + model.set_tensor_datatype(node.output[0], odt) + + def verify_node(self): + info_messages = [] + # verify that "domain" is set to "finn" + domain_value = self.onnx_node.domain + if domain_value == "finn": + info_messages.append("Attribute domain is set correctly") + else: + info_messages.append('Attribute domain should be set to "finn"') + + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify that all necessary attributes exist + # TODO collect automatically from get_nodeattr_types + try: + self.get_nodeattr("code_gen_dir_cppsim") + self.get_nodeattr("executable_path") + self.get_nodeattr("NumChannels") + self.get_nodeattr("PE") + self.get_nodeattr("inputDataType") + self.get_nodeattr("paramDataType") + self.get_nodeattr("outputDataType") + info_messages.append("All necessary attributes exist") + except Exception: + info_messages.append( + """The required Threshold_Batch attributes do not exist.""" + ) + + return info_messages + + def bram_estimation(self): + """Calculates BRAM cost if resource set to BRAM""" + style = self.get_nodeattr("ram_style") + P = self.get_nodeattr("PE") + idt = self.get_input_datatype() + A = idt.bitwidth() + tmem = self.calc_tmem() + + if style == "block" and tmem > 1: + return int(ceil(A * P / 16)) * int(ceil(tmem / 1024)) + else: + return 0 + + def lut_estimation(self): + """Calculates LUT cost, taking memory resource type into account """ + # TODO add in/out FIFO contributions + style = self.get_nodeattr("ram_style") + P = self.get_nodeattr("PE") + idt = self.get_input_datatype() + A = idt.bitwidth() + tmem = self.calc_tmem() + # cost of comparators + comparator_cost = A * P + # cost of LUTRAM + if style == "distributed" and tmem > 1: + lutram_cost = P * A * int(ceil(tmem / 64)) + else: + lutram_cost = 0 + # total cost + return comparator_cost + lutram_cost + + def get_input_datatype(self): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType")] + + def get_output_datatype(self): + """Returns FINN DataType of output.""" + return DataType[self.get_nodeattr("outputDataType")] + + def get_instream_width(self): + i_bits = self.get_input_datatype().bitwidth() + return i_bits * self.get_nodeattr("PE") + + def get_outstream_width(self): + o_bits = self.get_output_datatype().bitwidth() + return o_bits * self.get_nodeattr("PE") + + def get_folded_input_shape(self): + ich = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + fold = ich // pe + vecs = list(self.get_nodeattr("numInputVectors")) + folded_input_shape = tuple(vecs + [fold, pe]) + return folded_input_shape + + def get_folded_output_shape(self): + # same shape as input + return self.get_folded_input_shape() + + def get_normal_input_shape(self): + ich = self.get_nodeattr("NumChannels") + vecs = list(self.get_nodeattr("numInputVectors")) + normal_input_shape = tuple(vecs + [ich]) + return normal_input_shape + + def get_normal_output_shape(self): + # same shape as input + return self.get_normal_input_shape() + + def get_number_output_values(self): + nf = np.prod(self.get_folded_output_shape()[:-1]) + return nf + + def get_template_param_values(self): + """Returns the template parameter values according to input, output and weight + data types.""" + ret 
= dict() + inp_hls_str = self.get_input_datatype().get_hls_datatype_str() + out_hls_str = self.get_output_datatype().get_hls_datatype_str() + # fill in TSrcI + ret["TSrcI"] = "Slice<%s>" % inp_hls_str + # fill in TDstI + ret["TDstI"] = "Slice<%s>" % out_hls_str + + return ret + + def get_hls_compatible_parameter_tensor(self, orig_param_vector): + """Convert the original numpy parameter vector orig_param_vector into + a form suitable for passing to the hlslib call: + * ensure chn % PE == 0 + * interleave rows between PEs + * reshape into (PE, TMEM) and return + """ + chn = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + tmem = chn // pe + assert chn % pe == 0, "Requirement NumChannels divisible by PE is violated." + assert ( + orig_param_vector.ndim == 1 + ), """Parameter vector dimension is {}. + Expected dimension: 1.""".format( + orig_param_vector.ndim + ) + + # if not self.get_input_datatype().signed(): + # # ensure all thresholds are nonnegative + # assert (orig_param_vector >= 0).all() + + # ensure all thresholds are integer + assert (orig_param_vector.astype(np.int32) == orig_param_vector).all() + ret = orig_param_vector + + assert ( + ret.shape[0] == chn + ), "Cardinality of parameter vector is not as expected (chn)" + + # distribute rows between PEs + ret = ret.reshape(tmem, pe).transpose() + assert ( + ret.shape[0] == pe + ), """First dimension after distribution of the + rows between PEs is not as expected (pe)""" + assert ( + ret.shape[1] == tmem + ), """Second dimension after distribution of the + rows between PEs is not as expected (tmem)""" + + return ret.reshape(1, pe, tmem) + + def generate_params(self, model, path): + code_gen_dir = path + # save thresholds in params.h + parameters = model.get_initializer(self.onnx_node.input[1]) + parameter_tensor = self.get_hls_compatible_parameter_tensor(parameters) + pdt = DataType[self.get_nodeattr("paramDataType")] + + parameters_hls_code = numpy_to_hls_code( + parameter_tensor, pdt, "parameters", False, True + ) + # get input data type + export_idt = self.get_input_datatype() + if self.get_input_datatype() == DataType.BIPOLAR: + export_idt = DataType.BINARY + idt_hls = export_idt.get_hls_datatype_str() + + # write parameters into params.h + f_params = open("{}/params.h".format(code_gen_dir), "w") + pdt_hls = pdt.get_hls_datatype_str() + # use binary to export bipolar activations + export_odt = self.get_output_datatype() + if self.get_output_datatype() == DataType.BIPOLAR: + export_odt = DataType.BINARY + odt_hls = export_odt.get_hls_datatype_str() + # get desired function + func = self.get_nodeattr("Func") + if func == "cmp_le": + func_str = "std::less_equal" + elif func == "cmp_ge": + func_str = "std::greater_equal" + elif func == "add": + func_str = "std::plus" + elif func == "mul": + func_str = "std::multiplies" + else: + raise Exception( + """Invalid value for attribute Func!
Is currently set to: {} + has to be set to one of the following value + ("cmp_le", "cmp_ge", "add", "mul")""".format( + func + ) + ) + f_params.write( + "static ChannelWiseOperation<{},{},{},{},{},{}> threshs \ + = ".format( + self.calc_tmem(), + self.get_nodeattr("PE"), + idt_hls, + pdt_hls, + odt_hls, + "%s<%s>" % (func_str, odt_hls), + ) + ) + f_params.write(parameters_hls_code) + f_params.close() + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + + # TODO ensure codegen dir exists + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + # create a npy file fore each input of the node (in_ind is input index) + in_ind = 0 + for inputs in node.input: + # it is assumed that the first input of the node is the data input + # the second input are the weights + # the third input are the thresholds + if in_ind == 0: + assert ( + str(context[inputs].dtype) == "float32" + ), """Input datatype is + not float32 as expected.""" + expected_inp_shape = self.get_folded_input_shape() + reshaped_input = context[inputs].reshape(expected_inp_shape) + export_idt = self.get_input_datatype() + # make copy before saving the array + reshaped_input = reshaped_input.copy() + np.save( + os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), + reshaped_input, + ) + elif in_ind > 2: + raise Exception("Unexpected input found for ChannelwiseOp_Batch") + in_ind += 1 + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + # reinterpret binary output as bipolar where needed + if self.get_output_datatype() == DataType.BIPOLAR: + out = context[node.output[0]] + out = 2 * out - 1 + context[node.output[0]] = out + assert ( + context[node.output[0]].shape == self.get_folded_output_shape() + ), """Output shape is not as expected""" + # reshape output to have expected shape + oshape = self.get_normal_output_shape() + context[node.output[0]] = context[node.output[0]].reshape(*oshape) + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + output = self.rtlsim(sim, inp) + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + + # load and reshape output + output = np.load(out_npy_path) + oshape = self.get_normal_output_shape() + output = np.asarray([output], dtype=np.float32).reshape(*oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "activations.hpp"'] + self.code_gen_dict["$GLOBALS$"] += ['#include "params.h"'] + + # TODO check and add whatever missing + def defines(self, var): + numInputVectors = list(self.get_nodeattr("numInputVectors")) + numReps = numInputVectors[0] + self.code_gen_dict["$DEFINES$"] = [ + """#define NumChannels1 {}\n#define PE1 {}\n#define numReps {}""".format( + self.get_nodeattr("NumChannels"), self.get_nodeattr("PE"), numReps, + ) + ] + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + # note: the innermost dim is reversed for the input + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0, false);' + % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + ) + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width()) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width()) + ) + + def docompute(self): + tmpl_args = self.get_template_param_values() + # TODO: why put some template parameters into defines and not others? + # should ImgDim be defined or just filled in here like we do now? 
+ ishape = self.get_folded_input_shape() + if len(ishape) == 3: + imgdim = 1 + elif len(ishape) == 5: + imgdim = ishape[1] + else: + raise Exception("""Unexpected input shape""") + self.code_gen_dict["$DOCOMPUTE$"] = [ + """Thresholding_Batch<{}, NumChannels1, PE1, {}, {}> + (in0, out, threshs, numReps);""".format( + imgdim, tmpl_args["TSrcI"], tmpl_args["TDstI"], + ) + ] + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + if dtype == DataType.BIPOLAR: + # use binary for bipolar storage + dtype = DataType.BINARY + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + shape = self.get_folded_output_shape() + shape_cpp_str = str(shape).replace("(", "{").replace(")", "}") + + # note: the innermost dim is not reversed for the output + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + shape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream<ap_uint<{}>> &in0, + hls::stream<ap_uint<{}>> &out + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.get_outstream_width(), + ) + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out") + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE ap_ctrl_none port=return" + ) + + # the channelwise parameter tensor is acc_type [PE][TMEM][N_PARAMS_PER_CHANNEL] + # partition for parallel access along PE and N_PARAMS_PER_CHANNEL + # dimensions (dims 1 and 3) + self.code_gen_dict["$PRAGMAS$"].append( + ( + "#pragma HLS ARRAY_PARTITION variable=threshs.parameters " + "complete dim=1" + ) + ) + # self.code_gen_dict["$PRAGMAS$"].append( + # ( + # "#pragma HLS ARRAY_PARTITION variable=threshs.parameters " + # "complete dim=3" + # ) + # ) + + # set resource type + ram_style = self.get_nodeattr("ram_style") + pe = self.get_nodeattr("PE") + ich = self.get_nodeattr("NumChannels") + # if PE less than NumChannels, assign cores according to ram_style; + # otherwise if PE == NumChannels, Vivado HLS will unroll to FFs + if pe < ich: + if ram_style == "distributed": + self.code_gen_dict["$PRAGMAS$"].append( + ( + "#pragma HLS RESOURCE variable=threshs.parameters " + "core=ROM_2P_LUTRAM" + ) + ) + elif ram_style == "block": + self.code_gen_dict["$PRAGMAS$"].append( + ( + "#pragma HLS RESOURCE variable=threshs.parameters " + "core=ROM_2P_BRAM" + ) + ) + else: + raise Exception( + """Invalid value for attribute ram_style!
Is currently set to: {} + has to be set to one of ("block", "distributed")""".format( + ram_style + ) + ) diff --git a/src/finn/custom_op/fpgadataflow/downsampler.py b/src/finn/custom_op/fpgadataflow/downsampler.py new file mode 100644 index 0000000000000000000000000000000000000000..0ce4379a2c41baa5bc009e9df7623d133ee89a09 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/downsampler.py @@ -0,0 +1,297 @@ +import os +import numpy as np +from onnx import TensorProto, helper +from finn.core.datatype import DataType +from finn.custom_op.fpgadataflow import HLSCustomOp +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class DownSampler(HLSCustomOp): + """Corresponds to finn-hlslib ConvolutionInputGenerator_kernel1 function. + Basically performs a down sampling of the image, removing rows and columns.""" + + def __init__(self, onnx_node): + super().__init__(onnx_node) + + def get_nodeattr_types(self): + my_attrs = { + # spatial size of input images + "ImgDim": ("i", True, 0), + # number of channels in input image + "NumChannels": ("i", True, 0), + # Number of input columns computed in parallel + "SIMD": ("i", False, 1), + "Stride": ("i", True, 2), + # FINN input datatype + "inputDataType": ("s", True, ""), + # Batch size + "numInputVectors": ("i", False, 1), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_downsampled_odim(self): + "Return the down sampled spatial size of the output." + idim = self.get_nodeattr("ImgDim") + stride = self.get_nodeattr("Stride") + return int(np.floor((idim - 1) / stride) + 1) + + def get_normal_input_shape(self): + idim = self.get_nodeattr("ImgDim") + num_ch = self.get_nodeattr("NumChannels") + batch = self.get_nodeattr("numInputVectors") + ishape = (batch, idim, idim, num_ch) + return ishape + + def get_normal_output_shape(self): + odim = self.get_downsampled_odim() + num_ch = self.get_nodeattr("NumChannels") + batch = self.get_nodeattr("numInputVectors") + oshape = (batch, odim, odim, num_ch) + return oshape + + def get_folded_input_shape(self): + normal_ishape = list(self.get_normal_input_shape()) + ifm_ch = self.get_nodeattr("NumChannels") + simd = self.get_nodeattr("SIMD") + assert ifm_ch % simd == 0, "SIMD must divide input channels" + fold = int(normal_ishape[-1] / simd) + folded_ishape = normal_ishape[:-1] + [fold, simd] + return tuple(folded_ishape) + + def get_folded_output_shape(self): + normal_oshape = list(self.get_normal_output_shape()) + ifm_ch = self.get_nodeattr("NumChannels") + simd = self.get_nodeattr("SIMD") + assert ifm_ch % simd == 0, "SIMD must divide input channels" + fold = int(normal_oshape[-1] / simd) + folded_oshape = normal_oshape[:-1] + [fold, simd] + return tuple(folded_oshape) + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input shape for DownSampler."
+ # implement tensor with correct shape + values = np.random.randn(*oshape).astype(np.float32) + return helper.make_node( + "Constant", + inputs=[], + outputs=[self.onnx_node.output[0]], + value=helper.make_tensor( + name="const_tensor", + data_type=TensorProto.FLOAT, + dims=values.shape, + vals=values.flatten().astype(float), + ), + ) + + def infer_node_datatype(self, model): + node = self.onnx_node + # data type stays the same + dtype = model.get_tensor_datatype(node.input[0]) + exp_idtype = self.get_input_datatype() + assert dtype == exp_idtype, "Unexpected datatype for DownSampler" + model.set_tensor_datatype(node.output[0], dtype) + + def verify_node(self): + pass + + def get_input_datatype(self): + """Returns FINN DataType of input.""" + ret = DataType[self.get_nodeattr("inputDataType")] + return ret + + def get_output_datatype(self): + """Returns FINN DataType of output. (Same as input datatype)""" + return self.get_input_datatype() + + def get_instream_width(self): + ibits = self.get_input_datatype().bitwidth() + simd = self.get_nodeattr("SIMD") + return ibits * simd + + def get_outstream_width(self): + obits = self.get_output_datatype().bitwidth() + simd = self.get_nodeattr("SIMD") + return obits * simd + + def get_number_output_values(self): + folded_oshape = self.get_folded_output_shape() + return np.prod(folded_oshape[:-1]) + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "slidingwindow.h"'] + + def defines(self, var): + self.code_gen_dict["$DEFINES$"] = [] + + ifm_ch = self.get_nodeattr("NumChannels") + self.code_gen_dict["$DEFINES$"] += ["#define IFMChannels {}".format(ifm_ch)] + + ibits = self.get_input_datatype().bitwidth() + self.code_gen_dict["$DEFINES$"] += ["#define Input_precision {}".format(ibits)] + + idim = self.get_nodeattr("ImgDim") + self.code_gen_dict["$DEFINES$"] += ["#define IFMDim {}".format(idim)] + + simd = self.get_nodeattr("SIMD") + self.code_gen_dict["$DEFINES$"] += ["#define SIMD {}".format(simd)] + + stride = self.get_nodeattr("Stride") + self.code_gen_dict["$DEFINES$"] += ["#define Stride {}".format(stride)] + + batch_size = self.get_nodeattr("numInputVectors") + self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(batch_size)] + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + if dtype == DataType.BIPOLAR: + # use binary for bipolar storage + dtype = DataType.BINARY + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0);' + % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + ) + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width()) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width()) + ) + + def docompute(self): + self.code_gen_dict["$DOCOMPUTE$"] = [ + """ConvolutionInputGenerator_kernel1<IFMChannels, Input_precision, + IFMDim, SIMD,Stride> (in0, out, numReps);""" + ] + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + if 
dtype == DataType.BIPOLAR: + # use binary for bipolar storage + dtype = DataType.BINARY + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + oshape = self.get_folded_output_shape() + oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") + + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + oshape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)" + % (self.onnx_node.name, packed_hls_type, packed_hls_type) + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out") + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE ap_ctrl_none port=return" + ) + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + folded_oshape = self.get_folded_output_shape() + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input shape doesn't + match expected shape (numInputVectors, ImgDim, ImgDim, NumChannels).""" + export_idt = self.get_input_datatype() + + reshaped_input = inp.reshape(folded_ishape) + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == folded_oshape + ), "cppsim did not produce expected folded output shape" + context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output shape doesn't match expected shape + (1, OutputDim, OutputDim, NumChannels).""" diff --git a/src/finn/custom_op/fpgadataflow/fmpadding.py b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py similarity index 88% rename from src/finn/custom_op/fpgadataflow/fmpadding.py rename to src/finn/custom_op/fpgadataflow/fmpadding_batch.py index fa321dfa65d14b67fa218fb6a49f602ddab8d57e..d326ae7dfc7830a0081c3b13233d67ef08b12eff 100644 --- a/src/finn/custom_op/fpgadataflow/fmpadding.py +++ b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py @@ -21,6 +21,8 @@ class FMPadding_Batch(HLSCustomOp): "Padding": ("i", True, 2), # number of channels in input image "NumChannels": ("i", True, 0), + # SIMD Input parallelism + "SIMD": ("i", False, 1), # FINN input datatype "inputDataType": ("s", True, ""), # controls distribution of padded pixels @@ -55,20 +57,22 @@ class FMPadding_Batch(HLSCustomOp): return oshape def get_folded_input_shape(self): - # even though there is no folding in the current hlslib op, - # insert a time multiplexing axis to remain compatible with the - # shapes produced by the rest of the dataflow pipeline - ret = list(self.get_normal_input_shape()) - ret.insert(-1, 1) - return tuple(ret) + normal_ishape = list(self.get_normal_input_shape()) + ifm_ch = self.get_nodeattr("NumChannels") + simd = self.get_nodeattr("SIMD") + assert ifm_ch % simd == 0, "SIMD must divide input channels" + fold = int(normal_ishape[-1] / simd) + folded_ishape = normal_ishape[:-1] + [fold, simd] + return tuple(folded_ishape) def get_folded_output_shape(self): - # even though there is no folding in the current hlslib op, - # insert a time multiplexing axis to remain compatible with the - # shapes produced by the rest of the dataflow pipeline - ret = list(self.get_normal_output_shape()) - ret.insert(-1, 1) - return tuple(ret) + normal_oshape = list(self.get_normal_output_shape()) + ifm_ch = self.get_nodeattr("NumChannels") + simd = self.get_nodeattr("SIMD") + assert ifm_ch % simd == 0, "SIMD must divide input channels" + fold = int(normal_oshape[-1] / simd) + folded_oshape = normal_oshape[:-1] + [fold, simd] + return tuple(folded_oshape) def make_shape_compatible_op(self, model): exp_ishape = self.get_normal_input_shape() @@ -114,15 +118,13 @@ class FMPadding_Batch(HLSCustomOp): def get_instream_width(self): ibits = self.get_input_datatype().bitwidth() - num_ch = self.get_nodeattr("NumChannels") - - return ibits * num_ch + simd = self.get_nodeattr("SIMD") + return ibits * simd def get_outstream_width(self): obits = self.get_output_datatype().bitwidth() - num_ch = self.get_nodeattr("NumChannels") - - return obits * num_ch + simd = self.get_nodeattr("SIMD") + return obits * simd def get_number_output_values(self): folded_oshape = self.get_folded_output_shape() @@ -135,13 +137,15 @@ class FMPadding_Batch(HLSCustomOp): self.code_gen_dict["$DEFINES$"] = [ """#define ImgDim1 {}\n#define OutputDim1 {}\n #define Padding1 {}\n#define NumChannels1 {}\n - #define PaddingStyle1 {}\n#define numReps {}\n""".format( + #define PaddingStyle1 {}\n#define numReps {} + #define SIMD1 {}\n""".format( self.get_nodeattr("ImgDim"), self.get_padded_odim(), self.get_nodeattr("Padding"), self.get_nodeattr("NumChannels"), self.get_nodeattr("PaddingStyle"), self.get_nodeattr("numInputVectors"), + self.get_nodeattr("SIMD"), ) ] @@ -176,7 +180,7 @@ class 
FMPadding_Batch(HLSCustomOp): in_t = self.get_input_datatype().get_hls_datatype_str() node = self.onnx_node self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}<ImgDim1, OutputDim1, Padding1, NumChannels1, + """{}<ImgDim1, OutputDim1, Padding1, NumChannels1,SIMD1, {}, PaddingStyle1> (in0, out, numReps);""".format( node.op_type, in_t ) @@ -232,6 +236,7 @@ class FMPadding_Batch(HLSCustomOp): node = self.onnx_node exp_ishape = self.get_normal_input_shape() exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() folded_oshape = self.get_folded_output_shape() if mode == "cppsim": @@ -254,10 +259,8 @@ class FMPadding_Batch(HLSCustomOp): match expected shape (1, ImgDim, ImgDim, NumChannels).""" export_idt = self.get_input_datatype() - # no reshaping for input since assuming no folding on input - # make copy before saving array - inp = inp.copy() - np.save(os.path.join(code_gen_dir, "input_0.npy"), inp) + reshaped_input = inp.reshape(folded_ishape) + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) if mode == "cppsim": # execute the precompiled model diff --git a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py index 9e6c63dc510aab5f6baff9cb6326a2d0476f67a9..83152dea6cc494b8464c78605399b21b38d48b80 100644 --- a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py +++ b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py @@ -75,16 +75,19 @@ class GlobalAccPool_Batch(HLSCustomOp): def get_normal_output_shape(self): ch = self.get_nodeattr("NumChannels") vecs = list(self.get_nodeattr("numInputVectors")) - oshape = tuple([vecs[0]] + [ch]) + if len(vecs) == 1: + oshape = tuple(vecs + [ch]) + elif len(vecs) == 3: + oshape = tuple([vecs[0]] + [1, 1, ch]) return oshape def get_folded_output_shape(self): ch = self.get_nodeattr("NumChannels") pe = self.get_nodeattr("PE") - vecs = list(self.get_nodeattr("numInputVectors")) + unfolded_shape = list(self.get_normal_output_shape()) assert ch % pe == 0, "PE must divide NumChannels" folds = int(ch / pe) - oshape = tuple([vecs[0]] + [folds, pe]) + oshape = tuple(unfolded_shape[:-1] + [folds, pe]) return oshape def make_shape_compatible_op(self, model): diff --git a/src/finn/custom_op/fpgadataflow/iodma.py b/src/finn/custom_op/fpgadataflow/iodma.py new file mode 100644 index 0000000000000000000000000000000000000000..9b718ecbbc490610790b68871080de23a54f4891 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/iodma.py @@ -0,0 +1,346 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import math +from onnx import TensorProto, helper +from finn.core.datatype import DataType +from finn.custom_op.fpgadataflow import HLSCustomOp + + +# the IODMA interfaces a memory-mapped AXI interface and an AXI stream +# direction "in": pulls data from AXI-MM to AXI stream +# direction "out": pushes data from AXI stream to AXI-MM + +# DMA Addressing +# - burst mode can be "wrap" or "increment" +# - "increment" bursts will increment the address when moving to the next image +# - "wrap" bursts will reinitialize the address to the start address, +# and are useful for e.g. streaming weights, where the same buffer is +# repeatedly read into the FPGA +# - no additional alignment restrictions beyond anything specified in the AXI spec + +# Interfaces +# - AXI-MM name specified by intfName unless this is set to "" (empty, the default) +# in which case output AXI-MM are named "out" and input AXI-MM are named "in0" +# - AXI-MM interface width (in bits) is specified by intfWidth +# - AXI-Stream interface width (in bits) is specified by streamWidth +# - If intfWidth and streamWidth are not equal, the DMA core performs +# width conversion by going up to the least common multiple of bitwidths +# e.g. intfWidth=32b -> 96b -> streamWidth=24b +# - transfers occur in multiples of the AXI-MM interface width, therefore +# the total number of bits in the tensor must be a multiple of intfWidth +# - transfers occur in multiples of the AXI-Stream interface width, therefore +# the total number of bits in the tensor must be a multiple of streamWidth +# - both interface widths must be a multiple of 8b (AXI protocol requirement) +# - in most systems, intfWidth is also restricted to a power of 2 (e.g.
Vitis) +# but this is not universal so we don't check here explicitly + +# Input/output tensor sizes shapes +# - The data being moved is a tensor of shape numInputVectors+[NumChannels] +# - The data type of the tensor elements is specified by dataType +# - on the stream side +# -the normal shape is the same as the ONNX tensor attached to it +# -the folded shape is computed from the stream width and normal shape +# - on the AXI-MM side +# -the normal shape is the same as the one on the stream side +# -the folded shape is not defined + + +class IODMA(HLSCustomOp): + """Class that corresponds to finn-hlslib DMA function(s).""" + + def __init__(self, onnx_node): + super().__init__(onnx_node) + + def get_nodeattr_types(self): + my_attrs = { + "NumChannels": ("i", True, 0), + # FINN input datatype + "dataType": ("s", True, ""), + # Stream parameters + "streamWidth": ("i", False, 32), + # DMA-specific parameters + "intfWidth": ("i", False, 32), + "burstMode": ("s", False, "increment"), + "direction": ("s", False, "in"), + # shape describing input vecs per execution + "numInputVectors": ("ints", False, [1]), + # name of axi-mm interface + "intfName": ("s", False, ""), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_normal_input_shape(self): + vecs = list(self.get_nodeattr("numInputVectors")) + num_ch = self.get_nodeattr("NumChannels") + ishape = tuple(vecs + [num_ch]) + return ishape + + def get_normal_output_shape(self): + return self.get_normal_input_shape() + + def get_folded_input_shape(self): + if self.get_nodeattr("direction") == "in": + raise ValueError("Folded input shape not defined for input IODMA") + else: + shape = list(self.get_normal_input_shape()) + itype_bits = self.get_input_datatype().bitwidth() + intfw = self.get_nodeattr("streamWidth") + assert ( + intfw % itype_bits == 0 + ), "Input stream width must be a multiple of datatype bits" + elems_per_word = intfw // itype_bits + assert shape[-1] % elems_per_word == 0, "Fold depth must be integer" + fold_depth = shape[-1] // elems_per_word + shape[-1] = fold_depth + shape.append(elems_per_word) + return tuple(shape) + + def get_folded_output_shape(self): + if self.get_nodeattr("direction") == "out": + raise ValueError("Folded output shape not defined for output IODMA") + else: + shape = list(self.get_normal_output_shape()) + itype_bits = self.get_output_datatype().bitwidth() + intfw = self.get_nodeattr("streamWidth") + assert ( + intfw % itype_bits == 0 + ), "Input stream width must be a multiple of datatype bits" + elems_per_word = intfw // itype_bits + assert shape[-1] % elems_per_word == 0, "Fold depth must be integer" + fold_depth = shape[-1] // elems_per_word + shape[-1] = fold_depth + shape.append(elems_per_word) + return tuple(shape) + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input shape." 
+ # implement tensor with correct shape + values = np.random.randn(*oshape).astype(np.float32) + return helper.make_node( + "Constant", + inputs=[], + outputs=[self.onnx_node.output[0]], + value=helper.make_tensor( + name="const_tensor", + data_type=TensorProto.FLOAT, + dims=values.shape, + vals=values.flatten().astype(float), + ), + ) + + def infer_node_datatype(self, model): + node = self.onnx_node + # data type stays the same + dtype = model.get_tensor_datatype(node.input[0]) + exp_idtype = self.get_input_datatype() + assert dtype == exp_idtype, "Unexpected datatype." + model.set_tensor_datatype(node.output[0], dtype) + + def verify_node(self): + pass + + def get_input_datatype(self): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("dataType")] + + def get_output_datatype(self): + """Returns FINN DataType of output. (Same as input datatype)""" + return self.get_input_datatype() + + def get_instream_width(self): + if self.get_nodeattr("direction") == "in": + return self.get_nodeattr("intfWidth") + elif self.get_nodeattr("direction") == "out": + return self.get_nodeattr("streamWidth") + else: + raise ValueError("Invalid IODMA direction, please set to in or out") + + def get_outstream_width(self): + if self.get_nodeattr("direction") == "out": + return self.get_nodeattr("intfWidth") + elif self.get_nodeattr("direction") == "in": + return self.get_nodeattr("streamWidth") + else: + raise ValueError("Invalid IODMA direction, please set to in or out") + + def get_number_output_values(self): + oshape = self.get_normal_output_shape() + itype_bits = self.get_input_datatype().bitwidth() + intfw = self.get_nodeattr("intfWidth") + nelems = np.prod(oshape) + nbits = nelems * itype_bits + assert nbits % intfw == 0, "DMA: total transfer size must be word multiple" + ovalues = nbits // intfw + return ovalues + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "dma.h"'] + self.code_gen_dict["$GLOBALS$"].append('#include "streamtools.h"') + + def defines(self, var): + itype_bits = self.get_input_datatype().bitwidth() + total_bits = itype_bits * np.prod(self.get_normal_input_shape()) + assert total_bits % 8 == 0, "DMA input not a multiple of 1 Byte" + total_bytes = total_bits // 8 + self.code_gen_dict["$DEFINES$"] = [ + """#define NumBytes1 {}\n#define DataWidth1 {}\n""".format( + total_bytes, self.get_nodeattr("intfWidth") + ) + ] + + def get_ap_int_max_w(self): + "Return the maximum width of any ap_int used in this module." 
+ instream = self.get_instream_width() + outstream = self.get_outstream_width() + width_lcm = (instream * outstream) // math.gcd(instream, outstream) + return width_lcm + + def docompute(self): + direction = self.get_nodeattr("direction") + mode = self.get_nodeattr("burstMode") + if direction == "in": + if mode == "wrap": + func = "Mem2Stream_Batch_external_wmem" + else: + func = "Mem2Stream_Batch" + dwc_func = "WidthAdjustedOutputStream" + elif direction == "out": + func = "Stream2Mem_Batch" + dwc_func = "WidthAdjustedInputStream" + else: + raise ValueError("Invalid IODMA direction, please set to in or out") + # define templates for instantiation + dma_inst_template = func + "<DataWidth1, NumBytes1>(%s, %s, numReps);" + dwc_inst_template = dwc_func + "<%d, %d, %d> %s(%s, numReps);" + # do stream infrastructure and instantiations + intfw = self.get_nodeattr("intfWidth") + strmw = self.get_nodeattr("streamWidth") + width_lcm = (strmw * intfw) // math.gcd(strmw, intfw) + # we always need two streams: one of width_lcm, and one of intfw width + # because we use WidthAdjustedInputStream, + dtype_bits = self.get_input_datatype().bitwidth() + total_bits = dtype_bits * np.prod(self.get_normal_input_shape()) + if direction == "in": + self.code_gen_dict["$DOCOMPUTE$"] = [ + dwc_inst_template + % (width_lcm, strmw, total_bits // width_lcm, "dwc_lcm", "out"), + dwc_inst_template + % (intfw, width_lcm, total_bits // intfw, "dwc_intfw", "dwc_lcm"), + dma_inst_template % ("in0", "dwc_intfw"), + ] + else: + self.code_gen_dict["$DOCOMPUTE$"] = [ + dwc_inst_template + % (strmw, width_lcm, total_bits // strmw, "dwc_lcm", "in0"), + dwc_inst_template + % (width_lcm, intfw, total_bits // width_lcm, "dwc_intfw", "dwc_lcm"), + dma_inst_template % ("dwc_intfw", "out"), + ] + + def blackboxfunction(self): + packed_ibits = self.get_instream_width() + packed_hls_type_in = "ap_uint<%d>" % packed_ibits + packed_obits = self.get_outstream_width() + packed_hls_type_out = "ap_uint<%d>" % packed_obits + direction = self.get_nodeattr("direction") + if direction == "in": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + "void %s(%s *in0, hls::stream<%s > &out, unsigned int numReps)" + % (self.onnx_node.name, packed_hls_type_in, packed_hls_type_out) + ] + elif direction == "out": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + "void %s(hls::stream<%s > &in0, %s *out, unsigned int numReps)" + % (self.onnx_node.name, packed_hls_type_in, packed_hls_type_out) + ] + else: + raise ValueError("Invalid IODMA direction, please set to in or out") + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE s_axilite port=numReps bundle=control" + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE s_axilite port=return bundle=control" + ) + direction = self.get_nodeattr("direction") + intfname = self.get_nodeattr("intfName") + if direction == "in": + if intfname == "": + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE m_axi offset=slave port=in0" + ) + else: + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE m_axi offset=slave port=%s" % (intfname) + ) + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE s_axilite port=in0 bundle=control" + ) + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out" + ) + elif direction == "out": + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=in0" + ) + if intfname == "": + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE m_axi offset=slave 
port=out" + ) + else: + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE m_axi offset=slave port=%s" % (intfname) + ) + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE s_axilite port=out bundle=control" + ) + else: + raise ValueError("Invalid IODMA direction, please set to in or out") + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS DATAFLOW") + + def execute_node(self, context, graph): + pass + + def dataoutstrm(self): + pass + + def read_npy_data(self): + pass + + def save_as_npy(self): + pass + + def strm_decl(self): + pass diff --git a/src/finn/custom_op/fpgadataflow/pool_batch.py b/src/finn/custom_op/fpgadataflow/pool_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..c7edc24d0e24eef1154293caca2519ab3aa68358 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/pool_batch.py @@ -0,0 +1,395 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import numpy as np + +from finn.custom_op.fpgadataflow import HLSCustomOp +from finn.core.datatype import DataType +from onnx import TensorProto, helper +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class Pool_Batch(HLSCustomOp): + """Class that corresponds to finn-hlslib Pool_batch function. + Requires ConvolutionInputGenerator(depthwise == 1) to format its input + + TODO: explain input shape (to reuse im2col code) + Input shape (BatchSize,OutImgDim,OutImgDim,KernelSize^2*Channels) + Output shape (BatchSize,OutImgDim,OutImgDim,Channels) + + # note: the actual data layout produced by the hlslib kernels is different + # for depthwise ops. 
+ # * depthwise SWG: (1, OFMDim, OFMDim, IFMChannels/PE, K, K, PE) + + Channels can be folded using PE (SIMD from the input perspective) + TODO: doc + """ + + def get_nodeattr_types(self): + my_attrs = { + "Channels": ("i", True, 0), + "PE": ("i", True, 1), + "KernelSize": ("i", True, 0), + # Function: + # - MaxPool + # - AvgPool (not yet supported, but HLSLIB does) + # - AccPool (not yet supported, but HLSLIB does) + "Function": ("s", True, ""), + "OutImgDim": ("i", True, 0), + # FINN DataTypes for inputs/outputs + "dataType": ("s", True, ""), + "BatchSize": ("i", False, 1), + } + + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_input_datatype(self): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("dataType")] + + def get_output_datatype(self): + """Returns FINN DataType of output.""" + fxn = self.get_nodeattr("Function") + if fxn == "MaxPool": + # Same as input + return DataType[self.get_nodeattr("dataType")] + else: + raise Exception("Pool_Batch doesn't currently support " + fxn) + + def get_normal_input_shape(self): + ifm_ch = self.get_nodeattr("Channels") + odim = self.get_nodeattr("OutImgDim") + batch_size = self.get_nodeattr("BatchSize") + k = self.get_nodeattr("KernelSize") + ishape = (batch_size, odim, odim, k * k * ifm_ch) + return ishape + + def get_folded_input_shape(self): + normal_ishape = list(self.get_normal_input_shape()) + ifm_ch = self.get_nodeattr("Channels") + pe = self.get_nodeattr("PE") + assert ifm_ch % pe == 0, "PE must divide input channels" + fold = int(normal_ishape[-1] / pe) + folded_ishape = normal_ishape[:-1] + [fold, pe] + return tuple(folded_ishape) + + def get_normal_output_shape(self): + ofm_ch = self.get_nodeattr("Channels") + odim = self.get_nodeattr("OutImgDim") + batch_size = self.get_nodeattr("BatchSize") + oshape = (batch_size, odim, odim, ofm_ch) + return oshape + + def get_folded_output_shape(self): + normal_oshape = list(self.get_normal_output_shape()) + ifm_ch = self.get_nodeattr("Channels") + pe = self.get_nodeattr("PE") + assert ifm_ch % pe == 0, "PE must divide input channels" + fold = int(ifm_ch / pe) + folded_oshape = normal_oshape[:-1] + [fold, pe] + return tuple(folded_oshape) + + def get_number_output_values(self): + folded_oshape = self.get_folded_output_shape() + return np.prod(folded_oshape[1:-1]) + + def get_instream_width(self): + dt_bits = self.get_input_datatype().bitwidth() + pe = self.get_nodeattr("PE") + # ofm_ch = self.get_nodeattr("Channels") + # k = self.get_nodeattr("KernelSize") + # assert ifm_ch % pe == 0, "PE must divide input channels" + # simd = int(ifm_ch/pe) + in_width = int(dt_bits * pe) + return in_width + + def get_outstream_width(self): + fxn = self.get_nodeattr("Function") + if fxn == "MaxPool": + return self.get_instream_width() + else: + raise Exception("Pool_Batch doesn't currently support " + fxn) + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input shape for Pool_Batch." 
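# Illustrative aside, worked example of the folding methods above with hypothetical
# attribute values Channels=4, PE=2, KernelSize=3, OutImgDim=2, BatchSize=1:
#   normal input  shape: (1, 2, 2, 3*3*4) = (1, 2, 2, 36)
#   folded input  shape: (1, 2, 2, 36//2, 2) = (1, 2, 2, 18, 2)
#   normal output shape: (1, 2, 2, 4)
#   folded output shape: (1, 2, 2, 4//2, 2) = (1, 2, 2, 2, 2)
#   instream width     : PE * input bitwidth, e.g. 2 * 8 = 16 bits for an INT8 input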
+ # implement tensor with correct shape + values = np.random.randn(*oshape).astype(np.float32) + return helper.make_node( + "Constant", + inputs=[], + outputs=[self.onnx_node.output[0]], + value=helper.make_tensor( + name="const_tensor", + data_type=TensorProto.FLOAT, + dims=values.shape, + vals=values.flatten().astype(float), + ), + ) + + def infer_node_datatype(self, model): + node = self.onnx_node + # data type stays the same + dtype = self.get_output_datatype() + model.set_tensor_datatype(node.output[0], dtype) + + def verify_node(self): + info_messages = [] + + # verify that "domain" is set to "finn" + domain_value = self.onnx_node.domain + if domain_value == "finn": + info_messages.append("Attribute domain is set correctly") + else: + info_messages.append('Attribute domain should be set to "finn"') + + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify the number of inputs + if len(self.onnx_node.input) == 1: + info_messages.append("The number of inputs is correct") + else: + info_messages.append("""Pool_Batch needs 1 data input""") + + # check supported function + fnx = self.get_nodeattr("Function") + if fnx == "MaxPool": + info_messages.append( + "Attribute Function contains a supported pool function" + ) + else: + info_messages.append( + "Attribute Function contains an unsupported pool function" + ) + return info_messages + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "maxpool.h"'] + self.code_gen_dict["$GLOBALS$"] += ['#include "pool.hpp"'] + + def defines(self, var): + self.code_gen_dict["$DEFINES$"] = [] + + ifm_ch = self.get_nodeattr("Channels") + self.code_gen_dict["$DEFINES$"] += ["#define Channels {}".format(ifm_ch)] + + pe = self.get_nodeattr("PE") + self.code_gen_dict["$DEFINES$"] += ["#define PE {}".format(pe)] + + k = self.get_nodeattr("KernelSize") + self.code_gen_dict["$DEFINES$"] += ["#define KernelSize {}".format(k)] + + odim = self.get_nodeattr("OutImgDim") + self.code_gen_dict["$DEFINES$"] += ["#define OFMDim {}".format(odim)] + + numReps = self.get_nodeattr("BatchSize") + self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(numReps)] + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + if dtype == DataType.BIPOLAR: + # use binary for bipolar storage + dtype = DataType.BINARY + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0,false);' + % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + ) + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width()) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width()) + ) + + def docompute(self): + idt = self.get_input_datatype() + i_hls_dt = idt.get_hls_datatype_str() + odt = self.get_output_datatype() + o_hls_dt = 
odt.get_hls_datatype_str() + + self.code_gen_dict["$DOCOMPUTE$"] = [] + + fxn = self.get_nodeattr("Function") + if fxn == "MaxPool": + self.code_gen_dict["$DOCOMPUTE$"] += [ + "MaxPoolFunction<{},KernelSize> pool_fxn;".format(i_hls_dt) + ] + else: + raise Exception("Pool_Batch doesn't currently support " + fxn) + + self.code_gen_dict["$DOCOMPUTE$"] += [ + """Pool_batch<Channels, PE, KernelSize,Slice<{} >, Slice< {} > > + (in0,out, pool_fxn, OFMDim*OFMDim*numReps);""".format( + i_hls_dt, o_hls_dt + ) + ] + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + if dtype == DataType.BIPOLAR: + # use binary for bipolar storage + dtype = DataType.BINARY + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + oshape = self.get_folded_output_shape() + oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") + + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s",false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + oshape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + packed_ibits = self.get_instream_width() + packed_in_hls_type = "ap_uint<%d>" % packed_ibits + + packed_obits = self.get_outstream_width() + packed_out_hls_type = "ap_uint<%d>" % packed_obits + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)" + % (self.onnx_node.name, packed_in_hls_type, packed_out_hls_type) + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out") + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE ap_ctrl_none port=return" + ) + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + folded_ishape = self.get_folded_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_oshape = self.get_folded_output_shape() + + # TODO ensure codegen dir exists + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input shape doesn't + match expected shape (batch_size,odim,odim,k*k*ifm_ch).""" + + export_idt = self.get_input_datatype() + reshaped_input = inp.reshape(folded_ishape) + + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == folded_oshape + ), "cppsim did not produce expected folded output shape" + context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output + shape doesn't match expected shape (1, ofm_dim, ofm_dim, k*k*ifm_ch).""" diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py index 9b73ba1e100aa83fd19aa8799195c99891fca3fd..a7ebff68749120868cae9ce5ac18d2856fe2cb8a 100644 --- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py +++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py @@ -240,11 +240,21 @@ class StreamingFCLayer_Batch(HLSCustomOp): Q = self.get_nodeattr("SIMD") wdt = self.get_weight_datatype() W = wdt.bitwidth() - D_in = self.get_instream_width() - D_out = self.get_outstream_width() + D_in = self.get_nodeattr("MW") + D_out = self.get_nodeattr("MH") omega = (D_in * D_out) / (Q * P) return P * (math.ceil(omega / 512)) * (math.ceil((Q * W) / 36)) + def bram_efficiency_estimation(self): + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + D_in = self.get_nodeattr("MW") + D_out = self.get_nodeattr("MH") + bram16_est = self.bram_estimation() + wbits = W * D_in * D_out + bram16_est_capacity = bram16_est * 36 * 512 + return wbits / bram16_est_capacity + def lut_estimation(self): """Calculates resource estimations for LUTs based on: - FINN-R: An End-to-End Deep-Learning Framework for Fast @@ -290,12 +300,15 @@ class StreamingFCLayer_Batch(HLSCustomOp): return out_width def get_weightstream_width(self): - """Returns weight stream width. Used in decoupled mode.""" - pe = self.get_nodeattr("PE") - simd = self.get_nodeattr("SIMD") - wp = self.get_weight_datatype().bitwidth() - w_width = pe * simd * wp - return w_width + """Returns weight stream width. 
Used only in decoupled mode.""" + if self.get_nodeattr("mem_mode") == "decoupled": + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + wp = self.get_weight_datatype().bitwidth() + w_width = pe * simd * wp + return w_width + else: + return 0 def get_weightstream_width_padded(self): """Returns weight stream width padded to a multiple of 8. This is required diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py index 1a8216f64bf71b7fb9f1f8becf4732970b5bf451..1da60a5124fa86b4336bae8fd1a587672f2f2e6f 100644 --- a/src/finn/custom_op/fpgadataflow/templates.py +++ b/src/finn/custom_op/fpgadataflow/templates.py @@ -99,6 +99,7 @@ set_top $config_toplevelfxn open_solution sol1 set_part $config_proj_part +config_compile -ignore_long_run_time -disable_unroll_code_size_check config_interface -m_axi_addr64 config_rtl -auto_prefix $EXTRA_DIRECTIVES$ diff --git a/src/finn/custom_op/fpgadataflow/tlastmarker.py b/src/finn/custom_op/fpgadataflow/tlastmarker.py index 25ea05e3607a52731ae1b64de421837bf137ee2b..17ba44b959577faf573d77ae222f7b2a3be6669d 100644 --- a/src/finn/custom_op/fpgadataflow/tlastmarker.py +++ b/src/finn/custom_op/fpgadataflow/tlastmarker.py @@ -30,20 +30,30 @@ from finn.custom_op.fpgadataflow import HLSCustomOp class TLastMarker(HLSCustomOp): - """Class that corresponds to the TLastMarker node that needs to be - inserted at the end of the model for rtlsim with stitched IP. - It marks the end of the current image/input sample.""" + """Node that adds/removes AXI stream TLAST signals where needed. Its behavior + is transparent in node-by-node execution, only visible in IP-stitched rtlsim or + actual hardware. + This node may be needed at the end of the network to signal a DMA write (needed by the + FINN PYNQ shell) or at the beginning to remove the end-of-burst from DMA read.""" def __init__(self, onnx_node): super().__init__(onnx_node) def get_nodeattr_types(self): my_attrs = { + # number of (static) iterations until TLAST=1 is generated for Direction=out "NumIters": ("i", True, 0), + # whether static or dynamic (from AXI lite) number of iterations are used + "DynIters": ("i", False, 1), + # direction: whether to insert or remove TLAST + "Direction": ("s", False, "out"), # width of input-output data streams, in bits "StreamWidth": ("i", True, 0), # width of individual element in stream, in bits "ElemWidth": ("i", True, 0), + # Protocol: external or internal + # Vitis docs recommend using qdma_axis for external, ap_axiu for internal + "Protocol": ("s", False, "external"), } my_attrs.update(super().get_nodeattr_types()) return my_attrs @@ -76,12 +86,33 @@ class TLastMarker(HLSCustomOp): def defines(self, var): stream_width = self.get_nodeattr("StreamWidth") + direction = self.get_nodeattr("Direction") + protocol = self.get_nodeattr("Protocol") # output stream must have TLAST, so we use this stream data type: # qdma_axis<stream_data_width,0,0,0 > - out_stream_dtype = "qdma_axis<%d,0,0,0>" % stream_width + if direction == "out": + if protocol == "external": + out_stream_dtype = "qdma_axis<%d,0,0,0>" % stream_width + elif protocol == "internal": + out_stream_dtype = "ap_axiu<%d,0,0,0>" % stream_width + else: + raise Exception("Unrecognized Protocol in TLastMarker") + in_stream_dtype = "ap_uint<%d>" % stream_width + elif direction == "in": + out_stream_dtype = "ap_uint<%d>" % stream_width + if protocol == "external": + in_stream_dtype = "qdma_axis<%d,0,0,0>" % stream_width + elif protocol == "internal": + in_stream_dtype = 
"ap_axiu<%d,0,0,0>" % stream_width + else: + raise Exception("Unrecognized Protocol in TLastMarker") + else: + raise Exception("Unrecognized Direction in TLastMarker") + self.code_gen_dict["$DEFINES$"] = [ "#define StreamWidth %d" % stream_width, "#define OutDType %s" % out_stream_dtype, + "#define InDType %s" % in_stream_dtype, "#define NumItersPerImg %d" % self.get_nodeattr("NumIters"), ] @@ -89,27 +120,60 @@ class TLastMarker(HLSCustomOp): self.code_gen_dict["$READNPYDATA$"] = [] def docompute(self): - self.code_gen_dict["$DOCOMPUTE$"] = [ - "unsigned int n = 1;", - "OutDType t;", - "t.set_keep(-1);", - "io_section: { // start of cycle accurate region", - "#pragma HLS protocol fixed", - "// do a first read from stream before we decide on numIters", - "// giving software a chance to set up the numIters prior to startup", - "t.set_data(in0.read());", - "n = (numIters == 0 ? NumItersPerImg : numIters);", - "t.set_last(n==1);", - "out.write(t);", - "} // end of cycle accurate region", - "// do one less iteration than spec since we already did one", - "for(unsigned int i=1; i<n; i++) {", - "#pragma HLS PIPELINE II=1", - "t.set_data(in0.read());", - "t.set_last(i==(n-1));", - "out.write(t);", - "}", - ] + dyn_iters = self.get_nodeattr("DynIters") + direction = self.get_nodeattr("Direction") + use_qdma_axis = self.get_nodeattr("Protocol") == "external" + if direction == "in": + # read from input and just pass data along; ignore tlast + # no dyn iters on input, it doesnt make sense + self.code_gen_dict["$DOCOMPUTE$"] = [ + "for(unsigned int i=0; i<NumItersPerImg; i++) {", + "#pragma HLS PIPELINE II=1", + "out.write(in0.read().get_data());" + if use_qdma_axis + else "out.write(in0.read().data);", + "}", + ] + + elif dyn_iters == 1: + # output, with dynamic iteration counts + self.code_gen_dict["$DOCOMPUTE$"] = [ + "unsigned int n = 1;", + "OutDType t;", + "t.set_keep(-1);" if use_qdma_axis else "t.keep = -1;", + "io_section: { // start of cycle accurate region", + "#pragma HLS protocol fixed", + "// do a first read from stream before we decide on numIters", + "// giving software a chance to set up the numIters prior to startup", + "t.set_data(in0.read());" if use_qdma_axis else "t.data = in0.read();", + "n = (numIters == 0 ? 
NumItersPerImg : numIters);", + "t.set_last(n==1);" if use_qdma_axis else "t.last = (n==1);", + "out.write(t);", + "} // end of cycle accurate region", + "// do one less iteration than spec since we already did one", + "for(unsigned int i=1; i<n; i++) {", + "#pragma HLS PIPELINE II=1", + "t.set_data(in0.read());" if use_qdma_axis else "t.data = in0.read();", + "t.set_last(i==(n-1));" if use_qdma_axis else "t.last = (i==(n-1));", + "out.write(t);", + "}", + ] + + else: + # output, with static iteration counts + self.code_gen_dict["$DOCOMPUTE$"] = [ + "unsigned int n = 1;", + "OutDType t;", + "t.set_keep(-1);" if use_qdma_axis else "t.keep = -1;", + "for(unsigned int i=0; i<NumItersPerImg; i++) {", + "#pragma HLS PIPELINE II=1", + "t.set_data(in0.read());" if use_qdma_axis else "t.data = in0.read();", + "t.set_last(i==(NumItersPerImg-1));" + if use_qdma_axis + else "t.last = (i==(NumItersPerImg-1));", + "out.write(t);", + "}", + ] def dataoutstrm(self): self.code_gen_dict["$DATAOUTSTREAM$"] = [] @@ -118,18 +182,30 @@ class TLastMarker(HLSCustomOp): self.code_gen_dict["$SAVEASCNPY$"] = [] def blackboxfunction(self): - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void %s(hls::stream<ap_uint<StreamWidth> > &in0, - hls::stream<OutDType> &out, unsigned int numIters)""" - % self.onnx_node.name - ] + dyn_iters = self.get_nodeattr("DynIters") + + if dyn_iters == 1: + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void %s(hls::stream<InDType> &in0, + hls::stream<OutDType> &out, unsigned int numIters)""" + % self.onnx_node.name + ] + else: + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void %s(hls::stream<InDType> &in0, hls::stream<OutDType> &out)""" + % self.onnx_node.name + ] def pragmas(self): self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out") - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE s_axilite port=numIters bundle=control" - ) + + dyn_iters = self.get_nodeattr("DynIters") + if dyn_iters == 1: + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE s_axilite port=numIters bundle=control" + ) + self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE ap_ctrl_none port=return" ) @@ -158,7 +234,7 @@ class TLastMarker(HLSCustomOp): def strm_decl(self): self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width()) + 'hls::stream<InDType> in0 ("in0");' ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( 'hls::stream<OutDType> out ("out");' diff --git a/src/finn/custom_op/im2col.py b/src/finn/custom_op/im2col.py index 82a6b140f7af1be4e5c0f429d077b99c7865383e..8ed0041704d421dab587f08bcbcd9e739e8434e9 100644 --- a/src/finn/custom_op/im2col.py +++ b/src/finn/custom_op/im2col.py @@ -80,6 +80,8 @@ class Im2Col(CustomOp): "input_shape": ("s", True, ""), "pad_amount": ("i", False, 0), "pad_value": ("i", False, 0), + # depthwise: if != 0, infer ConvolutionInputGenerator with depthwise == 1 + "depthwise": ("i", False, 0), } def make_shape_compatible_op(self, model): diff --git a/src/finn/custom_op/quantavgpool2d.py b/src/finn/custom_op/quantavgpool2d.py index 3bc328a9f4f6670041d33491d58af6c553bafac9..fb5c78bc0c8419ba519c5c3113d9b0c7ae2dd3b7 100644 --- a/src/finn/custom_op/quantavgpool2d.py +++ b/src/finn/custom_op/quantavgpool2d.py @@ -4,6 +4,7 @@ import onnxruntime as rt from finn.custom_op import CustomOp from finn.core.datatype import DataType 
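# Illustrative aside: compute_pool_output_dim, imported on the next line, is assumed
# here to follow the usual unpadded pooling size formula
#     out_dim = (in_dim - kernel) // stride + 1
# e.g. a 4x4 input with kernel=2, stride=2 gives (4 - 2) // 2 + 1 = 2, so the NHWC
# output shape becomes (n, 2, 2, c); hypothetical numbers for illustration only.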
+from finn.custom_op.maxpoolnhwc import compute_pool_output_dim class QuantAvgPool2d(CustomOp): @@ -16,20 +17,51 @@ class QuantAvgPool2d(CustomOp): "kernel": ("i", True, 1), "ibits": ("i", True, 1), "obits": ("i", True, 1), + # determines if values are signed (set to "1") or unsigned ("0") "signed": ("i", True, 0), + # data layout attribute can be set to "NCHW" or "NHWC" + "data_layout": ("s", False, "NCHW"), } def make_shape_compatible_op(self, model): node = self.onnx_node k = self.get_nodeattr("kernel") s = self.get_nodeattr("stride") - return helper.make_node( - "AveragePool", - inputs=[node.input[0]], - outputs=[node.output[0]], - kernel_shape=[k, k], - strides=[s, s], - ) + data_layout = self.get_nodeattr("data_layout") + if data_layout == "NCHW": + return helper.make_node( + "AveragePool", + inputs=[node.input[0]], + outputs=[node.output[0]], + kernel_shape=[k, k], + strides=[s, s], + ) + elif data_layout == "NHWC": + iname = node.input[0] + ishape = model.get_tensor_shape(iname) + (n, hi, wi, c) = ishape + ho = compute_pool_output_dim(hi, k, s) + wo = compute_pool_output_dim(wi, k, s) + oshape = (n, ho, wo, c) + # implement tensor with correct shape + values = np.random.randn(*oshape).astype(np.float32) + return helper.make_node( + "Constant", + inputs=[], + outputs=[node.output[0]], + value=helper.make_tensor( + name="const_tensor", + data_type=TensorProto.FLOAT, + dims=values.shape, + vals=values.flatten().astype(float), + ), + ) + + else: + raise Exception( + """Datalayout for QuantAvgPool2d is set to an invalid value. + Has to be set to "NCHW" or "NHWC".""" + ) def infer_node_datatype(self, model): node = self.onnx_node @@ -48,8 +80,12 @@ class QuantAvgPool2d(CustomOp): node = self.onnx_node k = self.get_nodeattr("kernel") s = self.get_nodeattr("stride") - ishape = context[node.input[0]].shape + inp_values = context[node.input[0]] oshape = context[node.output[0]].shape + if self.get_nodeattr("data_layout") == "NHWC": + inp_values = inp_values.transpose(0, 3, 1, 2) + oshape = (context[node.output[0]]).transpose(0, 3, 1, 2).shape + ishape = inp_values.shape inp = helper.make_tensor_value_info(node.input[0], TensorProto.FLOAT, ishape) outp = helper.make_tensor_value_info(node.output[0], TensorProto.FLOAT, oshape) node_avgpool = helper.make_node( @@ -66,7 +102,7 @@ class QuantAvgPool2d(CustomOp): outputs=[outp], ) model_avgpool = helper.make_model(graph_avgpool) - idict = {node.input[0]: context[node.input[0]]} + idict = {node.input[0]: inp_values} sess = rt.InferenceSession(model_avgpool.SerializeToString()) result_temp = sess.run(None, idict) # remove scaling introduced by average @@ -77,7 +113,16 @@ class QuantAvgPool2d(CustomOp): max_bit_width = int(max_value).bit_length() shift_bits = max_bit_width - self.get_nodeattr("obits") result = np.right_shift(result_temp.astype(int), shift_bits) + if self.get_nodeattr("data_layout") == "NHWC": + result = result.transpose(0, 2, 3, 1) context[node.output[0]] = result.astype(np.float32) def verify_node(self): - pass + info_messages = [] + # verify that "domain" is set to "finn" + domain_value = self.onnx_node.domain + if domain_value == "finn": + info_messages.append("Attribute domain is set correctly") + else: + info_messages.append('Attribute domain should be set to "finn"') + return info_messages diff --git a/src/finn/custom_op/registry.py b/src/finn/custom_op/registry.py index 2dae826cf9712bef17d0053a0878c41ef51fec36..e4317e02d46df90c8fd0c8854262ca6eb0ea4f48 100644 --- a/src/finn/custom_op/registry.py +++ 
b/src/finn/custom_op/registry.py @@ -31,6 +31,7 @@ from finn.custom_op.fpgadataflow.convolutioninputgenerator import ( ConvolutionInputGenerator, ) +from finn.custom_op.fpgadataflow.downsampler import DownSampler from finn.custom_op.fpgadataflow.streamingfclayer_batch import StreamingFCLayer_Batch from finn.custom_op.fpgadataflow.streamingmaxpool_batch import StreamingMaxPool_Batch from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO @@ -44,17 +45,21 @@ from finn.custom_op.fpgadataflow.streamingdatawidthconverter_batch import ( StreamingDataWidthConverter_Batch, ) from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch -from finn.custom_op.fpgadataflow.fmpadding import FMPadding_Batch +from finn.custom_op.fpgadataflow.pool_batch import Pool_Batch +from finn.custom_op.fpgadataflow.fmpadding_batch import FMPadding_Batch from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch from finn.custom_op.fpgadataflow.addstreams_batch import AddStreams_Batch from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch from finn.custom_op.quantavgpool2d import QuantAvgPool2d from finn.custom_op.fpgadataflow.duplicatestreams_batch import DuplicateStreams_Batch +from finn.custom_op.fpgadataflow.channelwise_op_batch import ChannelwiseOp_Batch +from finn.custom_op.fpgadataflow.iodma import IODMA # create a mapping of all known CustomOp names and classes custom_op = {} custom_op["MultiThreshold"] = MultiThreshold +custom_op["DownSampler"] = DownSampler custom_op["XnorPopcountMatMul"] = XnorPopcountMatMul custom_op["Im2Col"] = Im2Col custom_op["StreamingMaxPool_Batch"] = StreamingMaxPool_Batch @@ -66,12 +71,15 @@ custom_op["MaxPoolNHWC"] = MaxPoolNHWC custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch custom_op["StreamingFIFO"] = StreamingFIFO custom_op["GlobalAccPool_Batch"] = GlobalAccPool_Batch +custom_op["Pool_Batch"] = Pool_Batch custom_op["FMPadding_Batch"] = FMPadding_Batch custom_op["Thresholding_Batch"] = Thresholding_Batch custom_op["AddStreams_Batch"] = AddStreams_Batch custom_op["LabelSelect_Batch"] = LabelSelect_Batch custom_op["QuantAvgPool2d"] = QuantAvgPool2d custom_op["DuplicateStreams_Batch"] = DuplicateStreams_Batch +custom_op["ChannelwiseOp_Batch"] = ChannelwiseOp_Batch +custom_op["IODMA"] = IODMA def getCustomOp(node): diff --git a/src/finn/transformation/bipolar_to_xnor.py b/src/finn/transformation/bipolar_to_xnor.py index 8b65cfee17edd5d89fcca0bd86da12415d38fe78..80f2a73351f8548c99efd8dedd8a04d44c8558a3 100644 --- a/src/finn/transformation/bipolar_to_xnor.py +++ b/src/finn/transformation/bipolar_to_xnor.py @@ -27,6 +27,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
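# Illustrative aside (made-up 4-element vectors): the rewrite in the hunk below rests
# on the standard bipolar / XNOR-popcount identity, sketched here standalone.
a = [+1, -1, +1, +1]                       # bipolar activations
w = [+1, +1, -1, +1]                       # bipolar weights
a_bin = [(x + 1) // 2 for x in a]          # binary encoding [1, 0, 1, 1]
w_bin = [(x + 1) // 2 for x in w]          # binary encoding [1, 1, 0, 1]
popcount_xnor = sum(x == y for x, y in zip(a_bin, w_bin))   # 2 matching bits
assert sum(x * y for x, y in zip(a, w)) == 2 * popcount_xnor - len(a)
# The out_scale=2.0 / out_bias=-1.0 check on the upstream MultiThreshold is the same
# affine map: a binary output y in {0, 1} becomes 2*y - 1 in {-1, +1}.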
import numpy as np +import warnings from onnx import TensorProto from onnx import helper as oh @@ -66,26 +67,40 @@ class ConvertBipolarMatMulToXnorPopcount(Transformation): mt_chain = model.find_upstream(mm_input, find_prod_mt) if len(mt_chain) == 0: - raise Exception( - """Could not find upstream bipolar - MultiThreshold""" - ) - graph_modified = True - mt = mt_chain[-1] - mt_inst = getCustomOp(mt) - # ensure old scale/bias were correct for BIPOLAR - scale_ok = mt_inst.get_nodeattr("out_scale") == 2.0 - bias_ok = mt_inst.get_nodeattr("out_bias") == -1.0 - assert ( - scale_ok and bias_ok - ), """Unexpected scale/bias - attributes for BIPOLAR MultiThreshold node.""" - # start conversion, set MT output to binary - # (this is what XnorPopcountMatMul expects) - mt_inst.set_nodeattr("out_dtype", "BINARY") - mt_inst.set_nodeattr("out_scale", 1.0) - mt_inst.set_nodeattr("out_bias", 0.0) - model.set_tensor_datatype(mm_input, DataType.BINARY) + if mm_input == graph.input[0].name: + # change input datatype to BINARY + model.set_tensor_datatype(mm_input, DataType.BINARY) + graph_modified = True + warnings.warn( + """IMPORTANT: Changing graph input DataType + to BINARY instead of BIPOLAR. Ensure this is respected + when checking for correctness. + """ + ) + else: + raise Exception( + """Could not find upstream bipolar + MultiThreshold, and the MatMul is not the + first node on graph input. Unable to convert + input tensor from BIPOLAR to BINARY.""" + ) + else: + graph_modified = True + mt = mt_chain[-1] + mt_inst = getCustomOp(mt) + # ensure old scale/bias were correct for BIPOLAR + scale_ok = mt_inst.get_nodeattr("out_scale") == 2.0 + bias_ok = mt_inst.get_nodeattr("out_bias") == -1.0 + assert ( + scale_ok and bias_ok + ), """Unexpected scale/bias + attributes for BIPOLAR MultiThreshold node.""" + # start conversion, set MT output to binary + # (this is what XnorPopcountMatMul expects) + mt_inst.set_nodeattr("out_dtype", "BINARY") + mt_inst.set_nodeattr("out_scale", 1.0) + mt_inst.set_nodeattr("out_bias", 0.0) + model.set_tensor_datatype(mm_input, DataType.BINARY) # change node type and domain n.op_type = "XnorPopcountMatMul" n.domain = "finn" diff --git a/src/finn/transformation/change_datalayout.py b/src/finn/transformation/change_datalayout.py new file mode 100644 index 0000000000000000000000000000000000000000..d5b393a25e57122b059a44f70904a6dbe5bbaa3f --- /dev/null +++ b/src/finn/transformation/change_datalayout.py @@ -0,0 +1,110 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from onnx import helper, TensorProto + +from finn.transformation import Transformation +from finn.transformation.infer_shapes import InferShapes +from finn.util.basic import get_by_name + + +class ChangeDataLayoutQuantAvgPool2d(Transformation): + """Replace QuantAvgPool2d with datalayout (N,C,H,W) with Transpose nodes + and QuantAvgPool2dNHWC with datalayout (N,H,W,C)""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if n.op_type == "QuantAvgPool2d" and ( + get_by_name(n.attribute, "data_layout") is None + or get_by_name(n.attribute, "data_layout").s.decode("UTF-8") == "NCHW" + ): + graph_modified = True + node_input = n.input[0] + node_output = n.output[0] + s = get_by_name(n.attribute, "stride").i + k = get_by_name(n.attribute, "kernel").i + ibits = get_by_name(n.attribute, "ibits").i + obits = get_by_name(n.attribute, "obits").i + signed = get_by_name(n.attribute, "signed").i + batchsize = model.get_tensor_shape(n.input[0])[0] # assume NCHW + channels = model.get_tensor_shape(n.input[0])[1] # assume NCHW + idim = model.get_tensor_shape(n.input[0])[-1] # assume NCHW + odim = model.get_tensor_shape(n.output[0])[-1] # assume NCHW + + # create new nodes + # NCHW -> NHWC + # create new intermediate values + inp_trans_out = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + (batchsize, idim, idim, channels), # NHWC + ) + graph.value_info.append(inp_trans_out) + inp_trans_out = inp_trans_out.name + quantavg_out = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + (batchsize, odim, odim, channels), + ) + graph.value_info.append(quantavg_out) + quantavg_out = quantavg_out.name + inp_trans_node = helper.make_node( + "Transpose", [node_input], [inp_trans_out], perm=[0, 2, 3, 1] + ) + quantavg_node = helper.make_node( + "QuantAvgPool2d", + [inp_trans_out], + [quantavg_out], + domain="finn", + stride=s, + kernel=k, + ibits=ibits, + obits=obits, + signed=signed, + data_layout="NHWC", + ) + # NHWC -> NCHW + out_trans_node = helper.make_node( + "Transpose", [quantavg_out], [node_output], perm=[0, 3, 1, 2] + ) + # insert nodes + graph.node.insert(node_ind, inp_trans_node) + graph.node.insert(node_ind + 1, quantavg_node) + graph.node.insert(node_ind + 2, out_trans_node) + # remove old nodes + graph.node.remove(n) + + # set shapes + model.set_tensor_shape(inp_trans_out, (batchsize, idim, idim, channels)) + model.set_tensor_shape(quantavg_out, (batchsize, odim, odim, channels)) + model = model.transform(InferShapes()) + return (model, graph_modified) diff --git a/src/finn/transformation/fpgadataflow/annotate_resources.py b/src/finn/transformation/fpgadataflow/annotate_resources.py index 207075b00de1871da19ea78472125d435449ed6e..62ee92df54eee2b63d84657515d7fbc3a8808b81 100644 --- a/src/finn/transformation/fpgadataflow/annotate_resources.py +++ b/src/finn/transformation/fpgadataflow/annotate_resources.py @@ -69,6 +69,9 @@ 
class AnnotateResources(Transformation): total_dict[r_type] += r_amount else: total_dict[r_type] = r_amount + for k in total_dict.keys(): + if "efficiency" in k: + total_dict[k] = total_dict[k] / len(graph.node) model.set_metadata_prop("res_total_" + self.mode, str(total_dict)) for node in graph.node: if _is_fpgadataflow_node(node) and node.name in res_dict.keys(): diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py index d421a5f3ef8ca980b399087de1482b2ae913da1b..34a697a43426aae0f984770689552063aa35b9e8 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py @@ -27,6 +27,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from onnx import helper, TensorProto +import numpy as np from finn.core.datatype import DataType from finn.transformation import Transformation @@ -34,6 +35,10 @@ from finn.custom_op.registry import getCustomOp from finn.transformation.infer_shapes import InferShapes from finn.transformation.infer_datatypes import InferDataTypes import finn.core.data_layout as DataLayout +from finn.util.onnx import nchw_to_nhwc +import warnings +from finn.util.basic import get_by_name +import warnings class InferConvInpGen(Transformation): @@ -51,11 +56,15 @@ class InferConvInpGen(Transformation): i2c_in_shape = model.get_tensor_shape(i2c_input) i2c_out_shape = model.get_tensor_shape(i2c_output) dt = model.get_tensor_datatype(i2c_input) + if not dt.is_integer(): + warnings.warn("Input is not int. Can't infer ConvInpGen") + continue i2c_inst = getCustomOp(n) stride = i2c_inst.get_nodeattr("stride") k = i2c_inst.get_nodeattr("kernel_size") pad = i2c_inst.get_nodeattr("pad_amount") pad_val = i2c_inst.get_nodeattr("pad_value") + depthwise = i2c_inst.get_nodeattr("depthwise") ifm_ch = i2c_in_shape[-1] ifm_dim = i2c_in_shape[1] ofm_dim = i2c_out_shape[1] @@ -67,7 +76,11 @@ class InferConvInpGen(Transformation): if pad > 0: # if padding enabled, ensure pad_val supported by DataType - assert dt.allowed(pad_val), "Im2Col DataType must support pad_val" + # assert dt.allowed(pad_val),"""FMPadding_Batch DataType + # must support pad_val""" + assert ( + pad_val == 0 + ), "FMPadding_Batch doesn't currently support pad_val!= 0" odim_padding = ifm_dim + 2 * pad @@ -97,23 +110,40 @@ class InferConvInpGen(Transformation): ) graph.node.insert(node_ind, padding_node) - # create equivalent ConvolutionInputGenerator node - ConvInpGen_node = helper.make_node( - "ConvolutionInputGenerator", - [ConvInpGen_input], - [i2c_output], - domain="finn", - backend="fpgadataflow", - ConvKernelDim=k, - IFMChannels=ifm_ch, - IFMDim=ConvInpGen_idim, - OFMDim=ofm_dim, - SIMD=ifm_ch, - Stride=stride, - inputDataType=dt.name, - outputDataType=dt.name, - ) - graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) + if stride > 1 and k == 1: + # create DownSampler node + ConvInpGen_node = helper.make_node( + "DownSampler", + [ConvInpGen_input], + [i2c_output], + domain="finn", + backend="fpgadataflow", + ImgDim=ConvInpGen_idim, + NumChannels=ifm_ch, + SIMD=ifm_ch, + Stride=stride, + inputDataType=dt.name, + ) + graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) + else: + # create equivalent ConvolutionInputGenerator node + ConvInpGen_node = helper.make_node( + "ConvolutionInputGenerator", + [ConvInpGen_input], + [i2c_output], + domain="finn", + backend="fpgadataflow", + ConvKernelDim=k, + IFMChannels=ifm_ch, + 
IFMDim=ConvInpGen_idim, + OFMDim=ofm_dim, + SIMD=ifm_ch, + Stride=stride, + inputDataType=dt.name, + outputDataType=dt.name, + depthwise=depthwise, + ) + graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) # remove old nodes graph.node.remove(n) graph_modified = True @@ -169,6 +199,137 @@ class InferStreamingMaxPool(Transformation): return (model, graph_modified) +class InferPool_Batch(Transformation): + """If kernel_shape > strides, replace Pool layer with with of Im2col + + pool(with kernel_shape == strides), plus Transpose layers to keep the original + data layout.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if n.op_type in ["MaxPool"]: + # extract pool parameters + k = get_by_name(n.attribute, "kernel_shape").ints[-1] + stride = get_by_name(n.attribute, "strides").ints[-1] + + if k <= stride: + continue + + try: + pad = get_by_name(n.attribute, "pads").ints[-1] + except AttributeError: + pad = 0 + + node_input = n.input[0] + node_output = n.output[0] + idt = model.get_tensor_datatype(node_input) + if not idt.is_integer(): + continue + + # odt = model.get_tensor_datatype(node_output) + + ifm_ch = model.get_tensor_shape(n.input[0])[1] # assume NCHW + ofm_ch = ifm_ch + ifm_dim = model.get_tensor_shape(n.input[0])[-1] # assume NCHW + ofm_dim = model.get_tensor_shape(n.output[0])[-1] # assume NCHW + # create new intermediate values + inp_trans_out = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + (1, ifm_dim, ifm_dim, ifm_ch), # NHWC + ) + graph.value_info.append(inp_trans_out) + inp_trans_out = inp_trans_out.name + model.set_tensor_datatype(inp_trans_out, idt) + + im2col_out = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + (1, ofm_dim, ofm_dim, ifm_ch * k * k), + ) + graph.value_info.append(im2col_out) + im2col_out = im2col_out.name + model.set_tensor_datatype(im2col_out, idt) + + pool_output = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + (1, ofm_dim, ofm_dim, ofm_ch), + ) + graph.value_info.append(pool_output) + pool_output = pool_output.name + # model.set_tensor_datatype(pool_output, odt) + + # create new nodes + # NCHW -> NHWC + inp_trans_node = helper.make_node( + "Transpose", [node_input], [inp_trans_out], perm=[0, 2, 3, 1] + ) + + if n.op_type == "MaxPool": + pool_fxn = "MaxPool" + pad_value = idt.min() + else: + raise Exception( + "pad_value and pool_fxn not configured for {}".format(n.op_type) + ) + + # format input tensor + im2col_node = helper.make_node( + "Im2Col", + [inp_trans_out], + [im2col_out], + domain="finn", + stride=stride, + kernel_size=k, + pad_amount=pad, + pad_value=pad_value, + depthwise=1, + input_shape="(1,{},{},{})".format(ifm_dim, ifm_dim, ifm_ch), + ) + + # Warning PE has to be equal to ifm_ch until Im2Col is replaced by + # ConvolutionInputGenerator with depthwise=1. 
+ # For other settings the output will be incorrect due to incorrect input + # data layout + pool_node = helper.make_node( + "Pool_Batch", + [im2col_out], + [pool_output], + domain="finn", + backend="fpgadataflow", + dataType=idt.name, + Channels=ifm_ch, + PE=ifm_ch, + KernelSize=k, + Function=pool_fxn, + OutImgDim=ofm_dim, + BatchSize=1, + ) + + # NHWC -> NCHW + out_trans_node = helper.make_node( + "Transpose", [pool_output], [node_output], perm=[0, 3, 1, 2] + ) + + # insert nodes where the conv is to preserve topological ordering + graph.node.insert(node_ind, inp_trans_node) + graph.node.insert(node_ind + 1, im2col_node) + graph.node.insert(node_ind + 2, pool_node) + graph.node.insert(node_ind + 3, out_trans_node) + # remove old node + graph.node.remove(n) + graph_modified = True + + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) + + class InferBinaryStreamingFCLayer(Transformation): """Convert XnorPopcountMatMul layers to StreamingFCLayer_Batch layers. Any immediately following MultiThreshold @@ -489,3 +650,243 @@ class InferThresholdingLayer(Transformation): model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) return (model, graph_modified) + + +class InferChannelwiseLinearLayer(Transformation): + """Convert any channel-wise Add/Mul into a HLS layer.""" + + def get_smallest_possible(self, vals): + """Returns smallest (fewest bits) possible DataType that can represent + value. Prefers unsigned integers where possible.""" + vals = np.array(vals) + for v in vals: + assert int(v) == v, "Error float value" + + for k in DataType.__members__: + dt = DataType[k] + + if dt in [DataType.BIPOLAR, DataType.TERNARY, DataType.FLOAT32]: + # not currently supported + continue + + if (dt.min() <= vals).all() and (vals <= dt.max()).all(): + return dt + + warnings.warn( + """InferChannelwiseLinearLayer: Output values may not be + representable with supported data types. + Setting maximum width data type available. 
+ This will lead to errors if there are no constrains on the input + """ + ) + + if (0 <= vals).all(): + return DataType.UINT32 + else: + return DataType.INT32 + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + node_ind += 1 + if node.op_type == "Add" or node.op_type == "Mul": + # assuming input[0] is dynamic + ll_input = node.input[0] + ll_output = node.output[0] + ll_in_shape = model.get_tensor_shape(ll_input) + + # check if input 1 has an initializer + ll_const = node.input[1] + if ll_const is not None: + ll_cinit = model.get_initializer(ll_const) + if ll_cinit is None: + # input 1 is also dynamic + continue + else: + continue + + # get number of channels and channel index from input + ll_in_layout = model.get_tensor_layout(ll_input) + if ll_in_layout == DataLayout.NHWC or ll_in_layout == DataLayout.NC: + ch_index = -1 + ch = ll_in_shape[-1] + elif ll_in_layout == DataLayout.NCHW: + ch_index = 1 + ch = ll_in_shape[1] + else: + continue + + # check if the shape of initializer is compatible + ll_cinit_shape = list(ll_cinit.shape) + if np.prod(ll_cinit_shape) == 1: + warnings.warn( + "Broadcasting " + str(node.op_type) + "(" + node.name + ")" + ) + ll_cinit = np.full((ch), ll_cinit.flatten()[0]) + elif np.prod(ll_cinit_shape) != ch or ll_cinit_shape[ch_index] != ch: + # parameter shape not compatible with Channelwise_batch + continue + + # check initializer contains integers as floats + if not (ll_cinit.astype(np.int32) == ll_cinit).all(): + continue + # all initializer conditions are met + + # check inputs + idt = model.get_tensor_datatype(ll_input) + if not idt.is_integer(): + # skip conversion for layers with float input + continue + + # check layout of inputs/outputs, and convert if needed + # check layout and convert if necessary + if ll_in_layout == DataLayout.NCHW: + ll_input = nchw_to_nhwc(ll_input, model, node_ind) + node_ind += 1 + ll_in_shape = model.get_tensor_shape(ll_input) + + # keep track of where we need to insert the HLS Op + # it has to be ahead of the output transform + insert_point = node_ind + ll_output_layout = model.get_tensor_layout(ll_output) + if ll_output_layout == DataLayout.NCHW: + ll_output = nchw_to_nhwc(ll_output, model, node_ind, reverse=True) + node_ind += 1 + + # get parameter data type + param_min = min(ll_cinit.flatten()) + param_max = max(ll_cinit.flatten()) + pdt = self.get_smallest_possible([param_min, param_max]) + + # set function and determine output data type + if node.op_type == "Add": + func = "add" + out_min = idt.min() + param_min + out_max = idt.max() + param_max + odt = self.get_smallest_possible([out_min, out_max]) + elif node.op_type == "Mul": + func = "mul" + possible_limits = [] + possible_limits += [idt.min() * param_min] + possible_limits += [idt.min() * param_max] + possible_limits += [idt.max() * param_min] + possible_limits += [idt.max() * param_max] + odt = self.get_smallest_possible(possible_limits) + + model.set_initializer(ll_const, ll_cinit.reshape(ch)) + model.set_tensor_datatype(ll_output, odt) + + # create node with no parallelization first + pe = 1 + assert ch % pe == 0, "Requirement IFC divisable by PE is violated." 
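# Illustrative aside (hypothetical ranges, not values from this patch): for the
# output-datatype derivation above, an INT4 input (values -8..7) multiplied
# channelwise by parameters spanning -10..10 has corner products
corners = [a * b for a in (-8, 7) for b in (-10, 10)]   # [80, -80, -70, 70]
# so the output range is [-80, 80]; at least an 8-bit signed type is needed, and
# get_smallest_possible() above returns the first DataType whose range covers it.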
+ # create and insert node + new_node = helper.make_node( + "ChannelwiseOp_Batch", + [ll_input, ll_const], + [ll_output], + domain="finn", + backend="fpgadataflow", + Func=func, + NumChannels=ch, + PE=pe, + inputDataType=idt.name, + paramDataType=pdt.name, + outputDataType=odt.name, + numInputVectors=list(ll_in_shape[:-1]), + ) + graph.node.insert(insert_point, new_node) + # remove old node + graph.node.remove(node) + graph_modified = True + + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) + + +class InferGlobalAccPoolLayer(Transformation): + """Convert any GlobalAveragePool into a GlobalAccPool HLS layer and a scalar Mul.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + node_ind += 1 + if node.op_type == "GlobalAveragePool": + in0 = node.input[0] + result = node.output[0] + in0_shape = model.get_tensor_shape(in0) + + idt = model.get_tensor_datatype(in0) + + # skip conversion for layers with float input + if not idt.is_integer(): + continue + + # check layout and convert if necessary + in0_layout = model.get_tensor_layout(in0) + result_layout = model.get_tensor_layout(result) + + if in0_layout == DataLayout.NCHW: + in0 = nchw_to_nhwc(in0, model, node_ind) + node_ind += 1 + in0_shape = model.get_tensor_shape(in0) + + # keep track of where we need to insert the HLS Op + # it has to be ahead of the output transform + insert_point = node_ind + + if result_layout == DataLayout.NCHW: + result = nchw_to_nhwc(result, model, node_ind, reverse=True) + node_ind += 1 + + num_ch = int(in0_shape[-1]) + vecs = in0_shape[:-1] + # create node with no parallelization first + pe = 1 + assert ( + num_ch % pe == 0 + ), "Requirement Labels divisable by PE is violated." + + # create an additional tensor of the same shape and layout as result + out_shape = model.get_tensor_shape(result) + pool_out = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape + ) + model.graph.value_info.append(pool_out) + pool_out = pool_out.name + model.set_tensor_layout(pool_out, model.get_tensor_layout(result)) + + new_pool = helper.make_node( + "GlobalAccPool_Batch", + [in0], + [pool_out], + domain="finn", + backend="fpgadataflow", + NumChannels=num_ch, + PE=pe, + inputDataType=idt.name, + numInputVectors=vecs, + ) + + mul_value = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), TensorProto.FLOAT, [1] + ) + model.graph.value_info.append(mul_value) + model.set_initializer(mul_value.name, np.array(1 / (vecs[1] * vecs[2]))) + new_mul = helper.make_node("Mul", [pool_out, mul_value.name], [result],) + graph.node.insert(insert_point, new_pool) + graph.node.insert(insert_point + 1, new_mul) + node_ind += 1 + # remove old node + graph.node.remove(node) + graph_modified = True + + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) diff --git a/src/finn/transformation/fpgadataflow/floorplan.py b/src/finn/transformation/fpgadataflow/floorplan.py new file mode 100644 index 0000000000000000000000000000000000000000..1d9a51875499d77f384c03f54009a9dd1001dea0 --- /dev/null +++ b/src/finn/transformation/fpgadataflow/floorplan.py @@ -0,0 +1,80 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from finn.custom_op.registry import getCustomOp +from finn.transformation import Transformation +from finn.util.basic import get_by_name + + +class Floorplan(Transformation): + """Perform floorplanning of the dataflow design: separate DMAs into their own + partition IDs, and (TODO) split the design into sections of defined size""" + + def __init__(self, limits=None): + super().__init__() + self.resource_limits = limits + + def apply(self, model): + target_partition_id = 0 + # we currently assume that all dataflow nodes belonging to the same partition + # are connected to each other and there is a single input/output to/from each.
+ all_nodes = list(model.graph.node) + df_nodes = list( + filter(lambda x: get_by_name(x.attribute, "backend") is not None, all_nodes) + ) + dma_nodes = list(filter(lambda x: x.op_type == "IODMA", df_nodes)) + + non_dma_nodes = list(filter(lambda x: x not in dma_nodes, df_nodes)) + dyn_tlastmarker_nodes = list( + filter( + lambda x: x.op_type == "TLastMarker" + and getCustomOp(x).get_nodeattr("DynIters") == "true", + non_dma_nodes, + ) + ) + + non_dma_nodes = list( + filter(lambda x: x not in dyn_tlastmarker_nodes, non_dma_nodes) + ) + + for node in dma_nodes: + node_inst = getCustomOp(node) + node_inst.set_nodeattr("partition_id", target_partition_id) + target_partition_id += 1 + + for node in dyn_tlastmarker_nodes: + node_inst = getCustomOp(node) + node_inst.set_nodeattr("partition_id", target_partition_id) + target_partition_id += 1 + + for node in non_dma_nodes: + # TODO: implement proper floorplanning; for now just a single partition + node_inst = getCustomOp(node) + node_inst.set_nodeattr("partition_id", target_partition_id) + + return (model, False) diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py new file mode 100644 index 0000000000000000000000000000000000000000..e4368edea717f7499481e9b1c6ac20f7d5bb5f58 --- /dev/null +++ b/src/finn/transformation/fpgadataflow/insert_iodma.py @@ -0,0 +1,198 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
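As an illustration (not part of the patch; the mock function below is hypothetical), the partition numbering produced by the Floorplan pass above can be summarized as: each IODMA and each dynamic TLastMarker receives its own partition_id, and every remaining dataflow node is lumped into one final partition:

# illustrative sketch only, mimicking Floorplan's partition_id assignment
def assign_partitions(op_types):
    pid = 0
    partition = {}
    for i, op in enumerate(op_types):
        if op in ("IODMA", "TLastMarker"):  # assuming the TLastMarker is dynamic
            partition[i] = pid
            pid += 1
    for i in range(len(op_types)):
        partition.setdefault(i, pid)  # everything else shares one partition
    return partition

print(assign_partitions(["IODMA", "StreamingFCLayer_Batch", "IODMA"]))  # {0: 0, 2: 1, 1: 2}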
+ +from onnx import TensorProto +from onnx import helper as oh + +from finn.util.basic import get_by_name +from finn.custom_op.registry import getCustomOp +from finn.transformation import Transformation +from finn.transformation.general import SortGraph +import finn.core.data_layout as DataLayout +import math +import numpy as np + + +class InsertIODMA(Transformation): + """Insert DMA nodes on all inputs and outputs.""" + + def __init__(self, max_intfwidth=32): + super().__init__() + assert ( + 2 ** math.log2(max_intfwidth) == max_intfwidth + ), "max_intfwidth must be a power of 2" + self.max_intfwidth = max_intfwidth + + def apply(self, model): + # only makes sense for a pure fpgadataflow graph -- so we check! + all_nodes = list(model.graph.node) + assert all( + get_by_name(x.attribute, "backend").s.decode("UTF-8") == "fpgadataflow" + for x in all_nodes + ) + # parse streamingfclayers looking for external weights with no attached IODMA + fc_extw_nodes = list( + filter( + lambda x: x.op_type == "StreamingFCLayer_Batch" + and get_by_name(x.attribute, "mem_mode") is not None + and get_by_name(x.attribute, "mem_mode").s.decode("UTF-8") == "external" + and model.find_producer(x.input[1]) is None, + all_nodes, + ) + ) + graph_in_name = model.graph.input[0].name + first_node = model.find_consumer(graph_in_name) + graph_out_name = model.graph.output[0].name + final_node = model.find_producer(graph_out_name) + if ( + final_node.op_type == "IODMA" + and first_node.op_type == "IODMA" + and len(fc_extw_nodes) == 0 + ): + # TODO maybe check the correctness of properties + return (model, False) + else: + if final_node.op_type != "IODMA": + # check if tensor is NHWC + assert ( + model.get_tensor_layout(graph_out_name) == DataLayout.NHWC + or model.get_tensor_layout(graph_in_name) == DataLayout.NC + ), "Data layout of tensors must be NHWC or NC" + out_shape = model.get_tensor_shape(graph_out_name) + out_dtype = model.get_tensor_datatype(graph_out_name) + # determine the feasible interface width + transfer_bits = np.prod(out_shape) * out_dtype.bitwidth() + intfwidth = math.gcd(transfer_bits, self.max_intfwidth) + assert ( + intfwidth % 8 == 0 + ), "No feasible interface width for transfer size" + # get width of stream input to DMA + streamWidth = getCustomOp(final_node).get_outstream_width() + # make new buffer + final_node_out = oh.make_tensor_value_info( + model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape + ) + model.graph.value_info.append(final_node_out) + model.set_tensor_datatype(final_node_out.name, out_dtype) + # reroute final node output to final_node_out_name + final_node.output[0] = final_node_out.name + dma_node = oh.make_node( + "IODMA", + [final_node_out.name], + [graph_out_name], + numInputVectors=out_shape[:-1], + NumChannels=out_shape[-1], + dataType=str(out_dtype.name), + intfWidth=intfwidth, + streamWidth=streamWidth, + direction="out", + domain="finn", + backend="fpgadataflow", + ) + model.graph.node.append(dma_node) + if first_node.op_type != "IODMA": + # check if tensor is NHWC + assert ( + model.get_tensor_layout(graph_in_name) == DataLayout.NHWC + or model.get_tensor_layout(graph_in_name) == DataLayout.NC + ), "Data layout of tensors must be NHWC or NC" + in_shape = model.get_tensor_shape(graph_in_name) + in_dtype = model.get_tensor_datatype(graph_in_name) + # determine the feasible interface width + transfer_bits = np.prod(in_shape) * in_dtype.bitwidth() + intfwidth = math.gcd(transfer_bits, self.max_intfwidth) + assert ( + intfwidth % 8 == 0 + ), "No feasible interface 
width for transfer size" + # get width of stream output from DMA + streamWidth = getCustomOp(first_node).get_instream_width() + # make new buffer + first_node_in = oh.make_tensor_value_info( + model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape + ) + model.graph.value_info.append(first_node_in) + model.set_tensor_datatype(first_node_in.name, in_dtype) + # reroute final node output to final_node_out_name + first_node.input[0] = first_node_in.name + dma_node = oh.make_node( + "IODMA", + [graph_in_name], + [first_node_in.name], + numInputVectors=in_shape[:-1], + NumChannels=in_shape[-1], + dataType=str(in_dtype.name), + intfWidth=intfwidth, + streamWidth=streamWidth, + direction="in", + domain="finn", + backend="fpgadataflow", + ) + model.graph.node.insert(0, dma_node) + for fc_node in fc_extw_nodes: + # check if tensor is NHWC + assert ( + model.get_tensor_layout(fc_node.input[1]) == DataLayout.NHWC + or model.get_tensor_layout(graph_in_name) == DataLayout.NC + ), "Data layout of tensors must be NHWC or NC" + fc_w_name = fc_node.input[1] + w_shape = model.get_tensor_shape(fc_w_name) + w_dtype = model.get_tensor_datatype(fc_w_name) + # determine the feasible interface width + transfer_bits = np.prod(w_shape) * w_dtype.bitwidth() + intfwidth = math.gcd(transfer_bits, self.max_intfwidth) + assert ( + intfwidth % 8 == 0 + ), "No feasible interface width for transfer size" + # calculate width of stream output from DMA + pe = get_by_name(fc_node.attribute, "PE").i + simd = get_by_name(fc_node.attribute, "SIMD").i + streamWidth = simd * pe * w_dtype.bitwidth() + # make new buffer + fc_node_in = oh.make_tensor_value_info( + model.make_new_valueinfo_name(), TensorProto.FLOAT, w_shape + ) + model.graph.value_info.append(fc_node_in) + model.set_tensor_datatype(fc_node_in.name, w_dtype) + dma_node = oh.make_node( + "IODMA", + [fc_w_name], + [fc_node_in.name], + numInputVectors=w_shape[:-1], + NumChannels=w_shape[-1], + dataType=str(w_dtype.name), + intfWidth=intfwidth, + streamWidth=streamWidth, + direction="in", + burstMode="wrap", + domain="finn", + backend="fpgadataflow", + ) + fc_node.input[1] = fc_node_in.name + model.graph.node.insert(0, dma_node) + model = model.transform(SortGraph()) + return (model, True) diff --git a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py index 32f32ece585a93465ba32fede45d5eb606a2b0a3..04dd437af27b9fbe18b2255c20a8e4acda03b3d0 100644 --- a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py +++ b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py @@ -31,23 +31,34 @@ from onnx import helper as oh from finn.custom_op.registry import getCustomOp from finn.transformation import Transformation +from finn.util.basic import get_by_name + +import numpy as np class InsertTLastMarker(Transformation): - """Ensure that the graph is terminated with a TLastMarker node, inserting - one if necessary.""" + """Ensure that the graph is started/terminated with a TLastMarker node, inserting + one if necessary. Use constructor args to determine type of TLastMarker to be inserted. + More information available on the TLastMarker documentation. + """ - def __init__(self): + def __init__(self, both=False, external=True, dynamic=True): super().__init__() + self.dyniters = dynamic + self.external = external + self.both = both def apply(self, model): # TODO only makes sense for a pure fpgadataflow graph -- check! 
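As an illustration (not part of the patch; the helper below is hypothetical), the interface-width selection used by InsertIODMA above picks the widest AXI interface width that divides both the total transfer size in bits and max_intfwidth, via a gcd:

import math

# illustrative sketch only, mirroring the transfer_bits / gcd logic in InsertIODMA
def pick_intfwidth(shape, elem_bits, max_intfwidth=32):
    transfer_bits = elem_bits
    for d in shape:
        transfer_bits *= d
    intfwidth = math.gcd(transfer_bits, max_intfwidth)
    assert intfwidth % 8 == 0, "No feasible interface width for transfer size"
    return intfwidth

# e.g. a (1, 32, 32, 3) UINT8 tensor: 24576 bits -> gcd(24576, 32) = 32
print(pick_intfwidth((1, 32, 32, 3), 8))  # 32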
graph_out_name = model.graph.output[0].name final_node = model.find_producer(graph_out_name) - if final_node.op_type == "TLastMarker": - # TODO maybe check the correctness of properties - return (model, False) - else: + graph_modified = False + if final_node.op_type != "TLastMarker" and not ( + final_node.op_type == "IODMA" + and get_by_name(final_node.attribute, "direction").s.decode("UTF-8") + == "out" + ): + custom_op = getCustomOp(final_node) num_iters = int(custom_op.get_number_output_values()) stream_width = int(custom_op.get_outstream_width()) @@ -69,8 +80,51 @@ class InsertTLastMarker(Transformation): NumIters=num_iters, StreamWidth=stream_width, ElemWidth=elem_width, + DynIters=(1 if self.dyniters else 0), + Direction="out", + Protocol=("external" if self.external else "internal"), domain="finn", backend="fpgadataflow", ) model.graph.node.append(tlast_node) - return (model, True) + graph_modified = True + # if both is True, also insert marker on input + if self.both: + graph_in_name = model.graph.input[0].name + first_node = model.find_consumer(graph_in_name) + if first_node.op_type != "TLastMarker" and not ( + first_node.op_type == "IODMA" + and get_by_name(first_node.attribute, "direction").s.decode("UTF-8") + == "in" + ): + + custom_op = getCustomOp(first_node) + num_iters = np.prod(custom_op.get_folded_input_shape()[1:-1]) + stream_width = int(custom_op.get_instream_width()) + in_shape = model.get_tensor_shape(graph_in_name) + in_dtype = model.get_tensor_datatype(graph_in_name) + elem_width = in_dtype.bitwidth() + # make new buffer + first_node_in = oh.make_tensor_value_info( + model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape + ) + model.graph.value_info.append(first_node_in) + model.set_tensor_datatype(first_node_in.name, in_dtype) + # reroute final node output to first_node_in_name + first_node.input[0] = first_node_in.name + tlast_node = oh.make_node( + "TLastMarker", + [graph_in_name], + [first_node_in.name], + NumIters=num_iters, + StreamWidth=stream_width, + ElemWidth=elem_width, + DynIters=(1 if self.dyniters else 0), + Direction="in", + Protocol=("external" if self.external else "internal"), + domain="finn", + backend="fpgadataflow", + ) + model.graph.node.insert(0, tlast_node) + graph_modified = True + return (model, graph_modified) diff --git a/src/finn/transformation/fpgadataflow/prepare_cppsim.py b/src/finn/transformation/fpgadataflow/prepare_cppsim.py index 4f050be8540ddf5ef48699d1658b571852ff4510..6eae560e1191642cfaf85d92c6d0fcf644630973 100644 --- a/src/finn/transformation/fpgadataflow/prepare_cppsim.py +++ b/src/finn/transformation/fpgadataflow/prepare_cppsim.py @@ -80,7 +80,6 @@ class PrepareCppSim(Transformation): self._num_workers = mp.cpu_count() def prepareCppSim_node(self, node): - print(node.name) if is_fpgadataflow_node(node) is True: _codegen_single_node(node, self.model) return (node, False) diff --git a/src/finn/transformation/infer_data_layouts.py b/src/finn/transformation/infer_data_layouts.py index 9ac75578ffb911cc44cfddc2b2119b55e6abf2dd..e7a6b88239a1735d5379e165333f8356ae6f88a1 100644 --- a/src/finn/transformation/infer_data_layouts.py +++ b/src/finn/transformation/infer_data_layouts.py @@ -38,7 +38,7 @@ def _dims_to_layout(model, node, ndims): return DataLayout.NC else: if node.domain == "finn": - if node.op_type == "MultiThreshold": + if node.op_type == "MultiThreshold" or node.op_type == "QuantAvgPool2d": mt_inst = registry.getCustomOp(node) layout = mt_inst.get_nodeattr("data_layout") if layout == "NHWC" and ndims == 4: diff 
--git a/src/finn/transformation/streamline/__init__.py b/src/finn/transformation/streamline/__init__.py index c9c73fa4c8303ee28bc1cc6aee879d633740e01e..d7686eaadcbc800542ab96c5f45145857412b773 100644 --- a/src/finn/transformation/streamline/__init__.py +++ b/src/finn/transformation/streamline/__init__.py @@ -41,6 +41,7 @@ from finn.transformation.streamline.absorb import ( FactorOutMulSignMagnitude, Absorb1BitMulIntoMatMul, Absorb1BitMulIntoConv, + AbsorbSignBiasIntoMultiThreshold, ) from finn.transformation.streamline.collapse_repeated import ( @@ -52,13 +53,14 @@ from finn.transformation.streamline.reorder import ( MoveAddPastMul, MoveScalarMulPastMatMul, MoveScalarAddPastMatMul, - MoveScalarAddPastConv, + MoveAddPastConv, MoveScalarMulPastConv, ) from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds from finn.transformation.streamline.sign_to_thres import ConvertSignToThres from finn.transformation.batchnorm_to_affine import BatchNormToAffine +from finn.transformation.streamline.remove import RemoveIdentityOps class Streamline(Transformation): @@ -70,9 +72,10 @@ class Streamline(Transformation): ConvertDivToMul(), BatchNormToAffine(), ConvertSignToThres(), + AbsorbSignBiasIntoMultiThreshold(), MoveAddPastMul(), MoveScalarAddPastMatMul(), - MoveScalarAddPastConv(), + MoveAddPastConv(), MoveScalarMulPastMatMul(), MoveScalarMulPastConv(), MoveAddPastMul(), @@ -87,6 +90,7 @@ class Streamline(Transformation): ] for trn in streamline_transformations: model = model.transform(trn) + model = model.transform(RemoveIdentityOps()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) model = model.transform(InferDataTypes()) diff --git a/src/finn/transformation/streamline/absorb.py b/src/finn/transformation/streamline/absorb.py index 3dfd4a007eab3319cca546abd44359e182ffa4aa..f04c6ba9a79457ff23bd84fbb92a37756be86fe8 100644 --- a/src/finn/transformation/streamline/absorb.py +++ b/src/finn/transformation/streamline/absorb.py @@ -28,14 +28,81 @@ import numpy as np from onnx import helper as oh +import warnings from finn.core.datatype import DataType +import finn.core.data_layout as DataLayout from finn.transformation import Transformation from finn.util.basic import get_by_name from finn.custom_op.registry import getCustomOp +from finn.transformation.infer_shapes import InferShapes from finn.transformation.infer_datatypes import InferDataTypes +class AbsorbSignBiasIntoMultiThreshold(Transformation): + """Absorb scalar bias originating from signed int export back into + MultiThreshold and re-evaluate the output datatype.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + # search for (MultiThreshold, Add) pair + node_ind += 1 + if ( + n.op_type == "MultiThreshold" + and not model.is_fork_node(n) + and not model.is_join_node(n) + ): + consumer = model.find_consumer(n.output[0]) + if consumer is not None and consumer.op_type == "Add": + mt_node = n + add_node = consumer + threshold_name = mt_node.input[1] + add_weight_name = add_node.input[1] + T = model.get_initializer(threshold_name) + A = model.get_initializer(add_weight_name) + if (A is None) or (T is None): + warnings.warn("Threshold or add bias not constant, skipping") + continue + end_name = add_node.output[0] + # we can only absorb scalar adds + is_scalar = A.ndim == 0 or all(x == 1 for x in A.shape) + if not is_scalar: + continue + bias = A.flatten()[0] + # set MultiThreshold bias property + mt_inst = 
getCustomOp(mt_node) + bias += mt_inst.get_nodeattr("out_bias") + mt_inst.set_nodeattr("out_bias", bias) + graph_modified = True + # compute new DataType for MultiThreshold output + steps = T.shape[-1] + new_min = bias + new_max = steps + bias + odt = DataType.get_smallest_possible(steps).name.replace( + "UINT", "INT" + ) + odt = DataType[odt] + assert odt.allowed(new_max) and odt.allowed( + new_min + ), """Could + not compute new MultiThreshold DataType (min = %d max = %d)""" % ( + new_min, + new_max, + ) + mt_inst.set_nodeattr("out_dtype", odt.name) + # remove Add node, rewire MultiThreshold + graph.node.remove(add_node) + mt_node.output[0] = end_name + # set datatype + model.set_tensor_datatype(end_name, odt) + if graph_modified: + model = model.transform(InferDataTypes()) + return (model, graph_modified) + + class AbsorbAddIntoMultiThreshold(Transformation): """Absorb preceding Add ops into MultiThreshold by updating the threshold values. Only scalar/1D add vectors can be absorbed.""" @@ -292,3 +359,99 @@ class AbsorbTransposeIntoMultiThreshold(Transformation): if graph_modified: model = model.transform(InferDataTypes()) return (model, graph_modified) + +class AbsorbTransposeIntoFlatten(Transformation): + """Absorb transpose node into succeeding flatten node, if H=W=1 and the first + dimension stays the same. Can also be applied if flatten is implemented implicitly + by a reshape node with shape [1, -1] and the first input dimension is 1""" + + def apply(self, model): + graph = model.graph + graph_modified = False + node_ind = 0 + for n in graph.node: + node_ind += 1 + if ( + n.op_type == "Reshape" + and (model.get_initializer(n.input[1]) == [1, -1]).all() + ) or n.op_type == "Flatten": + prod = model.find_producer(n.input[0]) + if ( + prod is not None + and prod.op_type == "Transpose" + # we ensure that the first dimension is not changed from the + # transpose operation + and get_by_name(prod.attribute, "perm").ints[0] == 0 + ): + data_layout = model.get_tensor_layout(prod.input[0]) + # check for the data layout to interpret input shape correctly + if data_layout is None: + warnings.warn( + """Data layout for input tensor of Transpose node is not set. + To use AbsorbTransposeIntoFlatten transformation + please set tensor data layout.""" + ) + continue + elif data_layout == DataLayout.NCHW: + (b, c, h, w) = model.get_tensor_shape(prod.input[0]) + # if h=w=1 the transposition can be absorbed, otherwise + # the absorption would lead to an error in the behavior + if h != 1 or w != 1: + continue + # the flatten node from onnx keeps by default the first + # dim and flattens the rest, that is why this transformation + # can only work with b != 1 if the model contains already a + # flatten node and not a reshape node with shape = [1, -1]. 
+ # If the first dim of the input tensor is not 1, flatten and + # reshape (with shape = [1, -1]) would lead to different results + if n.op_type == "Reshape" and b != 1: + continue + elif data_layout == DataLayout.NHWC: + (b, h, w, c) = model.get_tensor_shape(prod.input[0]) + if h != 1 or w != 1: + continue + if n.op_type == "Reshape" and b != 1: + continue + # create single flatten node and remove obsolete nodes + node = oh.make_node("Flatten", [prod.input[0]], [n.output[0]]) + graph.node.remove(n) + graph.node.remove(prod) + graph.node.insert(node_ind, node) + graph_modified = True + if graph_modified: + model = model.transform(InferDataTypes()) + return (model, graph_modified) + +class AbsorbScalarMulIntoTopK(Transformation): + """Absorb a mul node into a suceeding topk node if the mul is scalar.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if n.op_type == "TopK": + prod = model.find_producer(n.input[0]) + if prod is not None and prod.op_type == "Mul": + prod_input = prod.input[0] + param_name = prod.input[1] + A = model.get_initializer(param_name) + if A is None: + warnings.warn("Param is not constant, skipping") + continue + if all(x == 1 for x in A.shape) and A > 0: + # if the mul is scalar and positive, we can just delete the + # mul node and rewire the top k node. Because the top k node + # works with probabilities and their relation to each other + # the relation doesn't change if every value is multiplied + # with a scalar + graph.node.remove(prod) + n.input[0] = prod_input + # to avoid error the dataype is set to float32 + model.set_tensor_datatype(n.input[0], DataType.FLOAT32) + graph_modified = True + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) diff --git a/src/finn/transformation/streamline/remove.py b/src/finn/transformation/streamline/remove.py new file mode 100644 index 0000000000000000000000000000000000000000..ddc4233ddafbc70c4d20d316ea72ea6bba1b82a8 --- /dev/null +++ b/src/finn/transformation/streamline/remove.py @@ -0,0 +1,69 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +from finn.transformation import Transformation +from finn.transformation.infer_shapes import InferShapes +import numpy as np + +class RemoveIdentityOps(Transformation): + """Remove identity ops like Add/Sub with zero or Mul/Div with one""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if ( + n.op_type in ["Add", "Sub"] + and not model.is_fork_node(n) + and not model.is_join_node(n) + ): + A = model.get_initializer(n.input[1]) + if A is not None and (A == np.zeros_like(A)).all(): + producer = model.find_producer(n.input[0]) + # remove node and wire output tensor to + # output of producer node + producer.output[0] = n.output[0] + graph.node.remove(n) + + elif ( + n.op_type in ["Mul", "Div"] + and not model.is_fork_node(n) + and not model.is_join_node(n) + ): + A = model.get_initializer(n.input[1]) + if A is not None and (A == np.ones_like(A)).all(): + producer = model.find_producer(n.input[0]) + # remove node and wire output tensor to + # output of producer node + producer.output[0] = n.output[0] + graph.node.remove(n) + model = model.transform(InferShapes()) + return (model, graph_modified) diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py index b46b82c77a3f1b70a3b05d87cd3c48fc1d94fd45..2b03532ce3ba7d5159e5ae57e61c2af9c8c37fce 100644 --- a/src/finn/transformation/streamline/reorder.py +++ b/src/finn/transformation/streamline/reorder.py @@ -29,9 +29,14 @@ import numpy as np import warnings from onnx import helper as oh +from onnx import TensorProto from finn.transformation import Transformation +import finn.core.data_layout as DataLayout from finn.transformation.infer_shapes import InferShapes +from finn.transformation.infer_datatypes import InferDataTypes +from finn.transformation.infer_data_layouts import InferDataLayouts +from finn.core.datatype import DataType from finn.core.onnx_exec import execute_node from finn.util.basic import get_by_name from finn.custom_op.registry import getCustomOp @@ -67,8 +72,11 @@ class MoveAddPastMul(Transformation): add_weight_name = n.input[1] A = model.get_initializer(mul_weight_name) B = model.get_initializer(add_weight_name) - assert A is not None, "Initializer for mul weights is not set." - assert B is not None, "Initializer for add weights is not set." + if (A is None) or (B is None): + warnings.warn( + "Mul or add does not have constant params, skipping" + ) + continue start_name = n.input[0] middle_name = n.output[0] end_name = consumer.output[0] @@ -123,8 +131,9 @@ class MoveScalarMulPastMatMul(Transformation): matmul_weight_name = consumer.input[1] A = model.get_initializer(mul_weight_name) W = model.get_initializer(matmul_weight_name) - assert A is not None, "Initializer for mul weights is not set." - assert W is not None, "Initializer for matmul weights is not set." 
+ if (A is None) or (W is None): + warnings.warn("MatMul or Mul params are not constant, skipping") + continue start_name = n.input[0] middle_name = n.output[0] end_name = consumer.output[0] @@ -180,8 +189,9 @@ class MoveScalarAddPastMatMul(Transformation): matmul_weight_name = consumer.input[1] A = model.get_initializer(add_weight_name) W = model.get_initializer(matmul_weight_name) - assert A is not None, "Initializer for add weights is not set." - assert W is not None, "Initializer for matmul weights is not set." + if (A is None) or (W is None): + warnings.warn("MatMul or Add params are not constant, skipping") + continue start_name = n.input[0] middle_name = n.output[0] end_name = consumer.output[0] @@ -215,8 +225,8 @@ class MoveScalarAddPastMatMul(Transformation): return (model, graph_modified) -class MoveScalarAddPastConv(Transformation): - """Move scalar add operations past conv operations. We want to have adds +class MoveAddPastConv(Transformation): + """Move scalar and channelwise add operations past conv operations. We want to have adds next to each other such that they can be collapsed into a single add.""" def apply(self, model): @@ -241,8 +251,12 @@ class MoveScalarAddPastConv(Transformation): add_weight_name = n.input[1] conv_in_name = consumer.input[0] conv_in_shape = model.get_tensor_shape(conv_in_name) + # assume datalayout to be NCHW + channels = conv_in_shape[1] A = model.get_initializer(add_weight_name) - assert A is not None, "Initializer for add weights is not set." + if A is None: + warnings.warn("Add param is not constant, skipping") + continue start_name = n.input[0] end_name = consumer.output[0] conv_out_shape = model.get_tensor_shape(end_name) @@ -251,11 +265,17 @@ class MoveScalarAddPastConv(Transformation): pads = list(get_by_name(consumer.attribute, "pads").ints) if sum(pads) == 0: using_padding = False - if all(x == 1 for x in A.shape) and not using_padding: + if ( + all(x == 1 for x in A.shape) or A.shape == (1, channels, 1, 1) + ) and not using_padding: # create a tensor filled with the add constant, in # the shape expected by the convolution conv_in_const = np.zeros(conv_in_shape, dtype=np.float32) - conv_in_const.fill(A.item()) + if A.shape == (1, channels, 1, 1): + for ch in range(channels): + conv_in_const[0][ch].fill(A[0][ch].item()) + else: + conv_in_const.fill(A.item()) # create an execution context and put in const input exec_ctx = model.make_empty_exec_context() exec_ctx[conv_in_name] = conv_in_const @@ -310,7 +330,9 @@ class MoveScalarMulPastConv(Transformation): ): mul_weight_name = n.input[1] A = model.get_initializer(mul_weight_name) - assert A is not None, "Initializer for mul weights is not set." + if A is None: + warnings.warn("Mul param is not constant, skipping") + continue conv_node = consumer mul_node = n start_name = mul_node.input[0] @@ -338,6 +360,71 @@ class MoveScalarMulPastConv(Transformation): return (model, graph_modified) +class MoveMulPastDWConv(Transformation): + """Move channelwise mul operations past depthwise conv operations. 
We want to have muls + next to each other such that they can be collapsed into a single mul.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if ( + n.op_type == "Mul" + and not model.is_fork_node(n) + and not model.is_join_node(n) + ): + consumer = model.find_consumer(n.output[0]) + if ( + consumer is not None + and consumer.op_type == "Conv" + and not model.is_join_node(consumer) + ): + mul_weight_name = n.input[1] + A = model.get_initializer(mul_weight_name) + if A is None: + warnings.warn( + """Mul weight tensor is not set. If it is a constant, + please use set_initializer to set the tensor.""" + ) + continue + conv_node = consumer + mul_node = n + start_name = mul_node.input[0] + conv_in_name = conv_node.input[0] + conv_in_shape = model.get_tensor_shape(conv_in_name) + ifm_ch = conv_in_shape[1] + group_attribute = get_by_name(consumer.attribute, "group") + if group_attribute is None: + continue + group_attribute = group_attribute.i + conv_out_name = conv_node.output[0] + conv_out_shape = model.get_tensor_shape(conv_out_name) + if A.shape == (1, ifm_ch, 1, 1) and ifm_ch == group_attribute: + # if the mul is channelwise and conv is depthwise, + # we can simply swap the order of ops + # rewire mul input to be conv input + conv_node.input[0] = start_name + model.set_tensor_shape(start_name, conv_in_shape) + model.set_tensor_datatype(start_name, DataType.FLOAT32) + # use old conv input tensor as conv output + conv_node.output[0] = conv_in_name + model.set_tensor_shape(conv_in_name, conv_out_shape) + model.set_tensor_datatype(conv_in_name, DataType.FLOAT32) + # use new conv output as new mul node input + mul_node.input[0] = conv_in_name + # use old conv output as new mul node output + mul_node.output[0] = conv_out_name + model.set_tensor_datatype(conv_out_name, DataType.FLOAT32) + # move mul node past conv node + graph.node.remove(mul_node) + graph.node.insert(node_ind, mul_node) + graph_modified = True + model = model.transform(InferShapes()) + return (model, graph_modified) + + class MoveLinearPastEltwiseAdd(Transformation): """Move linear operations (mul, add) past elementwise add operations where possible. Specifically,matches and transforms the following patterns: @@ -597,3 +684,215 @@ class MoveMaxPoolPastMultiThreshold(Transformation): model = model.transform(InferShapes()) return (model, graph_modified) + +class MoveFlattenPastTopK(Transformation): + """Move flatten node past a succeeding topk node, if the "axis" attribute in topk + is set to -1 and the data layout before the flatten is NHWC with H=W=1""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if n.op_type == "Flatten": + consumer = model.find_consumer(n.output[0]) + if consumer is not None and consumer.op_type == "TopK": + axis = get_by_name(consumer.attribute, "axis") + if axis is None or axis.i != -1: + continue + start_name = n.input[0] + data_layout = model.get_tensor_layout(start_name) + if data_layout != DataLayout.NHWC: + warnings.warn( + """Transformation can't be applied. 
The input + to flatten has to have DataLayout.NHWC""" + ) + continue + (b, h, w, c) = model.get_tensor_shape(start_name) + if h != 1 or w != 1: + continue + # get parameter k from topk + k = model.get_tensor_shape(consumer.output[1])[-1] + + # swap conections + # new tensor because dims change + middle_name = model.make_new_valueinfo_name() + topk_indices = oh.make_tensor_value_info( + middle_name, TensorProto.INT64, [b, h, w, k] + ) + end_name = consumer.output[1] + graph.value_info.append(topk_indices) + + # remove old nodes + graph.node.remove(n) + graph.node.remove(consumer) + + # set inputs and outputs correctly + consumer.input[0] = start_name + consumer.output[1] = middle_name + model.set_tensor_shape(consumer.output[0], (b, h, w, k)) + + n.input[0] = middle_name + n.output[0] = end_name + + # insert them back in + graph.node.insert(node_ind - 1, consumer) + graph.node.insert(node_ind, n) + + graph_modified = True + + model = model.transform(InferShapes()) + return (model, graph_modified) + +class MoveFlattenPastAffine(Transformation): + """Moves a node that implements a (1, -1) reshape past a MatMul, Mul or Add node.""" + + def apply(self, model): + graph = model.graph + graph_modified = False + node_ind = 0 + for n in graph.node: + node_ind += 1 + if ( + n.op_type == "Flatten" + and not model.is_fork_node(n) + and not model.is_join_node(n) + ): + consumer = model.find_consumer(n.output[0]) + if ( + consumer is not None + and ( + consumer.op_type == "MatMul" + or consumer.op_type == "Mul" + or consumer.op_type == "Add" + ) + and not model.is_join_node(consumer) + ): + # move flatten past operation and rewire tensors + start_name = n.input[0] + # check if datalyout is set to NHWC and H=W=1 + datalayout = model.get_tensor_layout(start_name) + if datalayout == DataLayout.NHWC: + (b, h, w, c) = model.get_tensor_shape(start_name) + if h != 1 or w != 1: + warnings.warn( + """The Transformation can only be performed if + H=W=1.""" + ) + continue + else: + warnings.warn( + """The Transformation can only be performed on + operations that operate on data layout NHWC.""" + ) + continue + middle_name = n.output[0] + end_name = consumer.output[0] + op_param_name = consumer.input[1] + A = model.get_initializer(op_param_name) + if A is None: + warnings.warn("Param is not constant, skipping") + continue + op_in_dt = model.get_tensor_datatype(consumer.input[0]) + op_out_dt = model.get_tensor_datatype(consumer.output[0]) + start_shape = model.get_tensor_shape(start_name) + dummy_in = np.random.uniform(low=0, high=1, size=(start_shape)) + + if consumer.op_type == "MatMul": + dummy_out = np.matmul(dummy_in, A) + elif consumer.op_type == "Mul": + dummy_out = dummy_in * A + elif consumer.op_type == "Add": + dummy_out = dummy_in + A + + new_op = oh.make_node( + consumer.op_type, + [start_name, op_param_name], + [middle_name], + name=consumer.name, + ) + new_flatten = oh.make_node("Flatten", [middle_name], [end_name]) + graph.node.insert(node_ind, new_op) + graph.node.insert(node_ind + 1, new_flatten) + model.set_tensor_shape(middle_name, dummy_out.shape) + # because a flatten node doesn't change the datatype we need + # only the datatype of the op node + model.set_tensor_datatype(start_name, op_in_dt) + model.set_tensor_datatype(middle_name, op_out_dt) + model.set_tensor_datatype(end_name, op_out_dt) + # set datalayout + model.set_tensor_layout(start_name, DataLayout.NHWC) + model.set_tensor_layout(middle_name, DataLayout.NHWC) + # remove old nodes + graph.node.remove(n) + graph.node.remove(consumer) + 
graph_modified = True + + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + model = model.transform(InferDataLayouts()) + return (model, graph_modified) + +class MoveTransposePastScalarMul(Transformation): + """Moves a Transpose node past a scalar Mul node""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if ( + n.op_type == "Transpose" + and not model.is_fork_node(n) + and not model.is_join_node(n) + ): + consumer = model.find_consumer(n.output[0]) + if ( + consumer is not None + and consumer.op_type == "Mul" + and not model.is_join_node(consumer) + ): + mul_weight_name = consumer.input[1] + A = model.get_initializer(mul_weight_name) + if A is None: + warnings.warn("Mul param is not constant, skipping") + continue + transp_node = n + mul_node = consumer + start_name = transp_node.input[0] + middle_name = transp_node.output[0] + end_name = mul_node.output[0] + transp_in_shape = model.get_tensor_shape(start_name) + transp_out_shape = model.get_tensor_shape(middle_name) + transp_in_layout = model.get_tensor_layout(start_name) + transp_out_layout = model.get_tensor_layout(middle_name) + if transp_in_layout is None or transp_out_layout is None: + warnings.warn( + """Datalayout is not set for tensors. + Transformation can't be applied.""" + ) + continue + if all(x == 1 for x in A.shape): + # if the mul is scalar, we can simply swap the order of ops + # rewire transpose input to be mul input + mul_node.input[0] = start_name + model.set_tensor_shape(start_name, transp_in_shape) + model.set_tensor_layout(start_name, transp_in_layout) + mul_node.output[0] = middle_name + model.set_tensor_shape(middle_name, transp_in_shape) + model.set_tensor_layout(middle_name, transp_in_layout) + transp_node.input[0] = middle_name + transp_node.output[0] = end_name + model.set_tensor_shape(end_name, transp_out_shape) + model.set_tensor_layout(end_name, transp_out_layout) + graph.node.remove(transp_node) + graph.node.insert(node_ind, transp_node) + graph_modified = True + + if graph_modified is True: + model = model.transform(InferDataLayouts()) + model = model.transform(InferShapes()) + return (model, graph_modified) + diff --git a/src/finn/util/onnx.py b/src/finn/util/onnx.py index b9932111d86d7206b23e1d0e49a6aa8451f8ba24..4d7cdd126ededac887639a932c2021ef5f081c02 100644 --- a/src/finn/util/onnx.py +++ b/src/finn/util/onnx.py @@ -28,6 +28,7 @@ import numpy as np import onnx +import finn.core.data_layout as DataLayout def valueinfo_to_tensor(vi): @@ -37,3 +38,38 @@ def valueinfo_to_tensor(vi): return np.zeros( dims, dtype=onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[vi.type.tensor_type.elem_type] ) + + +def nchw_to_nhwc(t, model, idx, reverse=False): + """Converts between NCHW <-> NHWC layouts for tensor t by inserting a transpose. + If reverse=False, t is assumed NCHW and we insert transpose to convert NCHW -> NHWC + If reverse=True, t is assumed NHWC and we insert transpose to convert NHWC -> NCHW. 
+ """ + graph = model.graph + # create new NHWC tensor + t_shape = model.get_tensor_shape(t) + bs = t_shape[0] + ch = t_shape[1] + height = t_shape[2] + width = t_shape[3] + t_trans = onnx.helper.make_tensor_value_info( + model.make_new_valueinfo_name(), + onnx.TensorProto.FLOAT, + (bs, height, width, ch), # NHWC + ) + graph.value_info.append(t_trans) + dt = model.get_tensor_datatype(t) + t_trans = t_trans.name + model.set_tensor_datatype(t_trans, dt) + model.set_tensor_layout(t_trans, DataLayout.NHWC) + # NCHW <-> NHWC transpose + if reverse: + t_trans_node = onnx.helper.make_node( + "Transpose", [t_trans], [t], perm=[0, 3, 1, 2] + ) + else: + t_trans_node = onnx.helper.make_node( + "Transpose", [t], [t_trans], perm=[0, 2, 3, 1] + ) + graph.node.insert(idx, t_trans_node) + return t_trans diff --git a/tests/brevitas/test_brevitas_avg_pool_export.py b/tests/brevitas/test_brevitas_avg_pool_export.py index 24854a2153df9af78feb8352ca119e831a9ac9eb..e78812b21a03baa6963f1f0efaefdb4c73e4d0db 100644 --- a/tests/brevitas/test_brevitas_avg_pool_export.py +++ b/tests/brevitas/test_brevitas_avg_pool_export.py @@ -16,7 +16,7 @@ import finn.core.onnx_exec as oxe import pytest -export_onnx_path = "test_avg_pool.onnx" +export_onnx_path = "test_brevitas_avg_pool_export.onnx" @pytest.mark.parametrize("kernel_size", [2, 3]) diff --git a/tests/brevitas/test_brevitas_cnv.py b/tests/brevitas/test_brevitas_cnv.py index c04e16ad1923609c81240235057cc7a190c90ffb..f91ca600d3f0ce3b1cda3c29216fe8e0e3f415e4 100644 --- a/tests/brevitas/test_brevitas_cnv.py +++ b/tests/brevitas/test_brevitas_cnv.py @@ -42,7 +42,7 @@ from finn.transformation.general import GiveUniqueNodeNames from finn.transformation.double_to_single_float import DoubleToSingleFloat from finn.util.test import get_test_model_trained -export_onnx_path = "test_output_cnv.onnx" +export_onnx_path = "test_brevitas_cnv.onnx" @pytest.mark.parametrize("abits", [1, 2]) diff --git a/tests/brevitas/test_brevitas_non_scaled_QuantHardTanh_export.py b/tests/brevitas/test_brevitas_non_scaled_QuantHardTanh_export.py index b66348a9902802bc65b2a35e8bc3e311cc81e0bc..9c7296b7b3b6d36cfb43b6d9e96e7fba6bbce49a 100644 --- a/tests/brevitas/test_brevitas_non_scaled_QuantHardTanh_export.py +++ b/tests/brevitas/test_brevitas_non_scaled_QuantHardTanh_export.py @@ -12,7 +12,7 @@ import finn.core.onnx_exec as oxe from finn.transformation.infer_shapes import InferShapes from brevitas.core.quant import QuantType -export_onnx_path = "test_act.onnx" +export_onnx_path = "test_brevitas_non_scaled_QuantHardTanh_export.onnx" @pytest.mark.parametrize("abits", [1, 2, 4, 8]) diff --git a/tests/brevitas/test_brevitas_relu_act_export.py b/tests/brevitas/test_brevitas_relu_act_export.py index c5ddad12ca3e8d353682fbb20449d44358485f69..77974dacb51aa8746ce33f9a490becd35390db5a 100644 --- a/tests/brevitas/test_brevitas_relu_act_export.py +++ b/tests/brevitas/test_brevitas_relu_act_export.py @@ -12,7 +12,7 @@ from finn.core.modelwrapper import ModelWrapper import finn.core.onnx_exec as oxe from finn.transformation.infer_shapes import InferShapes -export_onnx_path = "test_act.onnx" +export_onnx_path = "test_brevitas_relu_act_export.onnx" @pytest.mark.parametrize("abits", [1, 2, 4, 8]) diff --git a/tests/brevitas/test_brevitas_scaled_QHardTanh_export.py b/tests/brevitas/test_brevitas_scaled_QHardTanh_export.py index d499f1517341477eca9915245da9ad12c346c5a9..e0ec82ebed44e2e984be9f62e02bc1721a7f9c33 100644 --- a/tests/brevitas/test_brevitas_scaled_QHardTanh_export.py +++ 
b/tests/brevitas/test_brevitas_scaled_QHardTanh_export.py @@ -12,7 +12,7 @@ from finn.core.modelwrapper import ModelWrapper import finn.core.onnx_exec as oxe from finn.transformation.infer_shapes import InferShapes -export_onnx_path = "test_act.onnx" +export_onnx_path = "test_brevitas_scaled_QHardTanh_export.onnx" @pytest.mark.parametrize("abits", [2, 4, 8]) diff --git a/tests/core/test_basic_onnx_exec.py b/tests/core/test_basic_onnx_exec.py index 7b0412432cc6360cb9c42d66417bd187ed142563..ddb2cbfc40c7647970f0c51ecb95340e7d1dddae 100644 --- a/tests/core/test_basic_onnx_exec.py +++ b/tests/core/test_basic_onnx_exec.py @@ -49,19 +49,33 @@ def test_mnist_onnx_download_extract_run(): raw_o = get_data("finn", "data/onnx/mnist-conv/test_data_set_0/output_0.pb") input_tensor = onnx.load_tensor_from_string(raw_i) output_tensor = onnx.load_tensor_from_string(raw_o) - # run using FINN-based execution + # run using FINN-based execution (full graph) input_dict = {"Input3": np_helper.to_array(input_tensor)} - output_dict = oxe.execute_onnx(model, input_dict) + output_dict = oxe.execute_onnx(model, input_dict, return_full_exec_context=True) assert np.isclose( np_helper.to_array(output_tensor), output_dict["Plus214_Output_0"], atol=1e-3 ).all() + # test subgraph execution + start_node = model.graph.node[1] + end_node = model.graph.node[3] + subgraph_i_dict = {start_node.input[0]: output_dict[start_node.input[0]]} + subgraph_o_dict = oxe.execute_onnx( + model, + subgraph_i_dict, + return_full_exec_context=True, + start_node=start_node, + end_node=end_node, + ) + assert np.isclose( + subgraph_o_dict[end_node.output[0]], output_dict[end_node.output[0]], atol=1e-3 + ).all() def test_onnx_exec_internal_rounding(): inp0 = onnx.helper.make_tensor_value_info("inp0", onnx.TensorProto.FLOAT, [2, 2]) inp1 = onnx.helper.make_tensor_value_info("inp1", onnx.TensorProto.FLOAT, [1]) outp = onnx.helper.make_tensor_value_info("outp", onnx.TensorProto.FLOAT, [2, 2]) - mul_node = onnx.helper.make_node("Mul", inputs=["inp0", "inp1"], outputs=["outp"],) + mul_node = onnx.helper.make_node("Mul", inputs=["inp0", "inp1"], outputs=["outp"]) graph = onnx.helper.make_graph( nodes=[mul_node], name="mul_graph", inputs=[inp0, inp1], outputs=[outp] ) diff --git a/tests/core/test_modelwrapper.py b/tests/core/test_modelwrapper.py index 5fa9b23bad5c5b67f65530c55f862f889c07b1ac..0fb7ae42f3bd556755f81a02be6c71fd73ffc519 100644 --- a/tests/core/test_modelwrapper.py +++ b/tests/core/test_modelwrapper.py @@ -36,7 +36,7 @@ import finn.core.data_layout as DataLayout from finn.core.modelwrapper import ModelWrapper from finn.util.test import get_test_model_trained -export_onnx_path = "test_output_lfc.onnx" +export_onnx_path = "test_modelwrapper.onnx" def test_modelwrapper(): diff --git a/tests/custom_op/test_xnorpopcountmatmul.py b/tests/custom_op/test_xnorpopcountmatmul.py index 37d9b7e5968bdb70023be9b70515410e941f51ce..745b782d418129d96e21c327a49de04d53aa7c48 100644 --- a/tests/custom_op/test_xnorpopcountmatmul.py +++ b/tests/custom_op/test_xnorpopcountmatmul.py @@ -47,7 +47,7 @@ from finn.transformation.infer_shapes import InferShapes from finn.transformation.streamline.sign_to_thres import ConvertSignToThres from finn.util.test import get_test_model_trained -export_onnx_path = "test_output_lfc.onnx" +export_onnx_path = "test_xnorpopcountmatmul.onnx" def test_xnorpopcountmatmul(): diff --git a/tests/end2end/test_end2end_cnv_w2a2.py b/tests/end2end/test_end2end_cnv_w2a2.py new file mode 100644 index 
0000000000000000000000000000000000000000..31ccebd4c175ad2badef17499bf113d978b637f7 --- /dev/null +++ b/tests/end2end/test_end2end_cnv_w2a2.py @@ -0,0 +1,377 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os + +import numpy as np + +# as of Feb'20 there is a bug that segfaults ONNX shape inference if we +# import pytorch before onnx, so we make sure to import onnx first +import onnx # NOQA + +import pytest +import pkg_resources as pk +from finn.custom_op.registry import getCustomOp +from finn.core.onnx_exec import execute_onnx +from finn.transformation.double_to_single_float import DoubleToSingleFloat +from finn.transformation.infer_shapes import InferShapes +from finn.transformation.move_reshape import RemoveCNVtoFCFlatten +from finn.transformation.fold_constants import FoldConstants +from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames +from finn.transformation.streamline import Streamline +from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul +import finn.transformation.streamline.absorb as absorb +from finn.transformation.streamline.reorder import MakeMaxPoolNHWC +import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +from finn.transformation.fpgadataflow.create_dataflow_partition import ( + CreateDataflowPartition, +) +from finn.transformation.fpgadataflow.insert_dwc import InsertDWC +from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( + ReplaceVerilogRelPaths, +) +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from 
finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver +from finn.transformation.fpgadataflow.make_pynq_proj import MakePYNQProject +from finn.transformation.fpgadataflow.synth_pynq_proj import SynthPYNQProject +from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ +from finn.util.basic import pynq_part_map +from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip +from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO +from finn.core.throughput_test import throughput_test_rtlsim + +build_dir = "/tmp/" + os.environ["FINN_INST_NAME"] +test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1") +test_fpga_part = pynq_part_map[test_pynq_board] +target_clk_ns = 10 +mem_mode = "decoupled" + + +def test_end2end_cnv_w2a2_export(): + import brevitas.onnx as bo + + cnv = get_test_model_trained("CNV", 2, 2) + bo.export_finn_onnx( + cnv, (1, 3, 32, 32), build_dir + "/end2end_cnv_w2a2_export.onnx" + ) + + +def test_end2end_cnv_w2a2_import_and_tidy(): + model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_export.onnx") + model = model.transform(DoubleToSingleFloat()) + model = model.transform(InferShapes()) + model = model.transform(FoldConstants()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model.save(build_dir + "/end2end_cnv_w2a2_tidy.onnx") + + +def test_end2end_cnv_w2a2_streamline(): + model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_tidy.onnx") + model = model.transform(Streamline()) + model = model.transform(LowerConvsToMatMul()) + model = model.transform(MakeMaxPoolNHWC()) + model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold()) + model = model.transform(Streamline()) + model.save(build_dir + "/end2end_cnv_w2a2_streamlined.onnx") + + +def test_end2end_cnv_w2a2_convert_to_hls_layers(): + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w2a2_streamlined.onnx" + ) + model = model.transform(to_hls.InferQuantizedStreamingFCLayer(mem_mode)) + model = model.transform(to_hls.InferConvInpGen()) + model = model.transform(to_hls.InferStreamingMaxPool()) + model = model.transform(RemoveCNVtoFCFlatten()) + model.save(build_dir + "/end2end_cnv_w2a2_hls_layers.onnx") + + +def test_end2end_cnv_w2a2_create_dataflow_partition(): + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w2a2_hls_layers.onnx" + ) + parent_model = model.transform(CreateDataflowPartition()) + parent_model.save(build_dir + "/end2end_cnv_w2a2_dataflow_parent.onnx") + sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0] + sdp_node = getCustomOp(sdp_node) + dataflow_model_filename = sdp_node.get_nodeattr("model") + dataflow_model = load_test_checkpoint_or_skip(dataflow_model_filename) + dataflow_model.save(build_dir + "/end2end_cnv_w2a2_dataflow_model.onnx") + + +def test_end2end_cnv_w2a2_fold_and_tlastmarker(): + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w2a2_dataflow_model.onnx" + ) + fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch") + # each tuple is (PE, SIMD, in_fifo_depth) for a layer + folding = [ + (8, 3, 256, "auto"), + (16, 16, 256, "auto"), + (8, 16, 256, "auto"), + (8, 16, 256, "block"), + (4, 8, 214, "auto"), + (1, 8, 2, "auto"), + (1, 2, 126, "distributed"), + (2, 2, 62, "block"), + (5, 1, 6, 
"distributed"), + ] + for fcl, (pe, simd, ififodepth, ramstyle) in zip(fc_layers, folding): + fcl_inst = getCustomOp(fcl) + fcl_inst.set_nodeattr("PE", pe) + fcl_inst.set_nodeattr("SIMD", simd) + fcl_inst.set_nodeattr("inFIFODepth", ififodepth) + fcl_inst.set_nodeattr("ram_style", ramstyle) + + swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator") + swg_idepth = [2, 51, 9, 106, 2, 2] + for i in range(len(swg_layers)): + swg_inst = getCustomOp(swg_layers[i]) + simd = folding[i][1] + swg_inst.set_nodeattr("SIMD", simd) + swg_inst.set_nodeattr("inFIFODepth", swg_idepth[i]) + + model = model.transform(InsertDWC()) + model = model.transform(InsertFIFO()) + model = model.transform(InsertTLastMarker()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(AnnotateResources("estimate")) + model.save(build_dir + "/end2end_cnv_w2a2_folded.onnx") + + +@pytest.mark.slow +@pytest.mark.vivado +def test_end2end_cnv_w2a2_gen_hls_ip(): + model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_folded.onnx") + model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) + model = model.transform(HLSSynthIP()) + model = model.transform(AnnotateResources("hls")) + model.save(build_dir + "/end2end_cnv_w2a2_ipgen.onnx") + + +@pytest.mark.vivado +def test_end2end_cnv_w2a2_ip_stitch(): + model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_ipgen.onnx") + model = model.transform(ReplaceVerilogRelPaths()) + model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) + model.save(build_dir + "/end2end_cnv_w2a2_ipstitch.onnx") + + +@pytest.mark.vivado +def test_end2end_cnv_w2a2_verify_dataflow_part(): + model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_ipstitch.onnx") + x = np.zeros((1, 32, 32, 3), dtype=np.float32) + inp_name = model.graph.input[0].name + out_name = model.graph.output[0].name + inp_dict = {inp_name: x} + # cppsim + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + model = model.transform(SetExecMode("cppsim")) + model.save(build_dir + "/end2end_cnv_w2a2_ipgen_cppsim.onnx") + ret_cppsim = execute_onnx(model, inp_dict, True) + res_cppsim = ret_cppsim[out_name] + # node-by-node rtlsim + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(PrepareRTLSim()) + model.save(build_dir + "/end2end_cnv_w2a2_ipgen_nodebynode_rtlsim.onnx") + ret_rtlsim_nodebynode = execute_onnx(model, inp_dict, True) + res_rtlsim_nodebynode = ret_rtlsim_nodebynode[out_name] + # whole-network (ip-stitched) rtlsim + model.set_metadata_prop("exec_mode", "rtlsim") + model.save(build_dir + "/end2end_cnv_w2a2_ipstitch_whole_rtlsim.onnx") + # this is a particularly long-running test, set liveness thr. 
to unlimited + os.environ["LIVENESS_THRESHOLD"] = "-1" + ret_rtlsim_whole = execute_onnx(model, inp_dict, True) + res_rtlsim_whole = ret_rtlsim_whole[out_name] + assert np.isclose(res_cppsim, res_rtlsim_nodebynode).all() + assert np.isclose(res_cppsim, res_rtlsim_whole).all() + + +@pytest.mark.vivado +def test_end2end_cnv_w2a2_throughput_test_rtlsim(): + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w2a2_ipstitch_whole_rtlsim.onnx" + ) + model.set_metadata_prop("rtlsim_trace", "rtlsim_trace.vcd") + # os.environ["RTLSIM_TRACE_DEPTH"] = "4" + # run through IP-stitched rtlsim with increasing batch sizes and + # check the number of cycles it takes to execute + ret = throughput_test_rtlsim(model, 10) + # TODO check for expected performance + assert ret["cycles"] > 0 + + +@pytest.mark.vivado +def test_end2end_cnv_w2a2_verify_all(): + # use the streamlined model as the "golden" model for right answers + golden = load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w2a2_streamlined.onnx" + ) + iname = golden.graph.input[0].name + oname = golden.graph.output[0].name + # load one of the test vectors + fn = pk.resource_filename("finn", "data/cifar10/cifar10-test-data-class3.npz") + input_tensor = np.load(fn)["arr_0"].astype(np.float32) + input_tensor = input_tensor / 255 + assert input_tensor.shape == (1, 3, 32, 32) + x = input_tensor + # x = np.zeros(ishape, dtype=np.float32) + ret_golden = execute_onnx(golden, {iname: x}, True) + y_golden = ret_golden[oname] + # set up parent+child graph to test + # we'll use models from the previous step as the child model + parent_model = load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w2a2_dataflow_parent.onnx" + ) + iname = parent_model.graph.input[0].name + oname = parent_model.graph.output[0].name + # produce results with cppsim + sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0] + sdp_node = getCustomOp(sdp_node) + load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_ipgen_cppsim.onnx") + sdp_node.set_nodeattr("model", build_dir + "/end2end_cnv_w2a2_ipgen_cppsim.onnx") + ret_cppsim = execute_onnx(parent_model, {iname: x}, True) + y_cppsim = ret_cppsim[oname] + # produce results with node-by-node rtlsim + load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w2a2_ipgen_nodebynode_rtlsim.onnx" + ) + sdp_node.set_nodeattr( + "model", build_dir + "/end2end_cnv_w2a2_ipgen_nodebynode_rtlsim.onnx" + ) + ret_nodebynode_rtlsim = execute_onnx(parent_model, {iname: x}, True) + y_nodebynode_rtlsim = ret_nodebynode_rtlsim[oname] + # produce results with whole-network (stitched ip) rtlsim + load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w2a2_ipstitch_whole_rtlsim.onnx" + ) + sdp_node.set_nodeattr( + "model", build_dir + "/end2end_cnv_w2a2_ipstitch_whole_rtlsim.onnx" + ) + # this is a particularly long-running test, set liveness thr. 
to unlimited + os.environ["LIVENESS_THRESHOLD"] = "-1" + ret_whole_rtlsim = execute_onnx(parent_model, {iname: x}, True) + y_whole_rtlsim = ret_whole_rtlsim[oname] + assert np.isclose(y_golden, y_cppsim).all() + assert np.isclose(y_golden, y_nodebynode_rtlsim).all() + assert np.isclose(y_golden, y_whole_rtlsim).all() + assert np.argmax(y_golden) == 3 + + +@pytest.mark.vivado +def test_end2end_cnv_w2a2_make_pynq_proj(): + model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_ipstitch.onnx") + model = model.transform(MakePYNQProject(test_pynq_board)) + model.save(build_dir + "/end2end_cnv_w2a2_pynq_project.onnx") + + +@pytest.mark.slow +@pytest.mark.vivado +def test_end2end_cnv_w2a2_synth_pynq_project(): + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w2a2_pynq_project.onnx" + ) + model = model.transform(SynthPYNQProject()) + model = model.transform(AnnotateResources("synth")) + model.save(build_dir + "/end2end_cnv_w2a2_synth.onnx") + + +def test_end2end_cnv_w2a2_make_driver(): + model = load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_synth.onnx") + model = model.transform(MakePYNQDriver()) + model.save(build_dir + "/end2end_cnv_w2a2_pynq_driver.onnx") + + +def test_end2end_cnv_w2a2_deploy_on_pynq(): + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w2a2_pynq_driver.onnx" + ) + try: + ip = os.environ["PYNQ_IP"] # no fault for this one; skip if not defined + if ip == "": + pytest.skip("PYNQ board IP address not specified") + username = os.getenv("PYNQ_USERNAME", "xilinx") + password = os.getenv("PYNQ_PASSWORD", "xilinx") + port = os.getenv("PYNQ_PORT", 22) + target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn") + model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir)) + # save the model to be able to link it to the parent + model.save(build_dir + "/end2end_cnv_w2a2_pynq_deploy.onnx") + except KeyError: + pytest.skip("PYNQ board IP address not specified") + + +def test_end2end_cnv_w2a2_run_on_pynq(): + # use the streamlined model as the "golden" model for right answers + golden = load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w2a2_streamlined.onnx" + ) + iname = golden.graph.input[0].name + oname = golden.graph.output[0].name + # load one of the test vectors + fn = pk.resource_filename("finn", "data/cifar10/cifar10-test-data-class3.npz") + input_tensor = np.load(fn)["arr_0"].astype(np.float32) + input_tensor = input_tensor / 255 + assert input_tensor.shape == (1, 3, 32, 32) + x = input_tensor + # run using FINN-based execution + ret_golden = execute_onnx(golden, {iname: x}, True) + y_golden = ret_golden[oname] + # set up parent+child graph to test + # we'll use models from the previous step as the child model + parent_model = load_test_checkpoint_or_skip( + build_dir + "/end2end_cnv_w2a2_dataflow_parent.onnx" + ) + iname = parent_model.graph.input[0].name + oname = parent_model.graph.output[0].name + try: + ip = os.environ["PYNQ_IP"] # NOQA + if ip == "": + pytest.skip("PYNQ board IP address not specified") + # produce results with cppsim + sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0] + sdp_node = getCustomOp(sdp_node) + load_test_checkpoint_or_skip(build_dir + "/end2end_cnv_w2a2_pynq_deploy.onnx") + sdp_node.set_nodeattr("model", build_dir + "/end2end_cnv_w2a2_pynq_deploy.onnx") + ret = execute_onnx(parent_model, {iname: x}, True) + y = ret[oname] + assert np.isclose(y, y_golden).all() + assert np.argmax(y) == 3 + + except KeyError: + pytest.skip("PYNQ 
board IP address not specified") diff --git a/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py b/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..d09c64a1250f78604c1a0a362cf234712de2cf57 --- /dev/null +++ b/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py @@ -0,0 +1,115 @@ +import pytest + +from onnx import TensorProto, helper + +import finn.core.onnx_exec as oxe +from finn.core.datatype import DataType +from finn.core.modelwrapper import ModelWrapper +import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( + ReplaceVerilogRelPaths, +) +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode + +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.infer_data_layouts import InferDataLayouts +from finn.transformation.general import GiveUniqueNodeNames +from finn.util.basic import gen_finn_dt_tensor +from finn.transformation.infer_shapes import InferShapes +import numpy as np + + +def prepare_inputs(input_tensor): + return {"inp": input_tensor} + + +def make_single_maxpool_modelwrapper(onnx_op_name, ishape, idt, pdt, pshape): + + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, ishape) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, ishape) + p0 = helper.make_tensor_value_info("p0", TensorProto.FLOAT, pshape) + + model = helper.make_model( + helper.make_graph( + name="test", + inputs=[inp], + outputs=[outp], + value_info=[p0], + nodes=[helper.make_node(onnx_op_name, ["inp", "p0"], ["outp"])], + ) + ) + + model = ModelWrapper(model) + model.set_initializer("p0", gen_finn_dt_tensor(pdt, pshape)) + model.set_tensor_datatype("inp", idt) + model.transform(InferDataLayouts(), make_deepcopy=False) + model.transform(InferShapes(), make_deepcopy=False) + return model + + +# parameter datatype +@pytest.mark.parametrize("pdt", [DataType.BIPOLAR, DataType.UINT4, DataType.INT2]) +# input datatype +@pytest.mark.parametrize("idt", [DataType.INT32, DataType.UINT4, DataType.INT4]) +# function +@pytest.mark.parametrize("onnx_op_name", ["Add", "Mul"]) +# vector parameter or scalar parameter (broadcast) +@pytest.mark.parametrize("scalar_param", [True, False]) +# execution mode +@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +@pytest.mark.vivado +@pytest.mark.slow +def test_convert_to_hls_channelwise_layer( + pdt, idt, onnx_op_name, scalar_param, exec_mode +): + ifm_ch = 16 + ifm_dim = 5 + ishape = (1, ifm_ch, ifm_dim, ifm_dim) + if scalar_param: + pshape = (1,) + else: + pshape = (1, ifm_ch, 1, 1) + + np.random.seed(0) + model = make_single_maxpool_modelwrapper(onnx_op_name, ishape, idt, pdt, pshape) + + # Since the aren't Data types with a bit width of a non power of 2, + # there are cases where the input won't use it full range. 
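+    # for the 32-bit input datatypes the stimulus below is generated from the
+    # corresponding 16-bit type, so the input deliberately does not cover the
+    # full 32-bit range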
+ if idt == DataType.INT32: + x = gen_finn_dt_tensor(DataType.INT16, (1, ifm_ch, ifm_dim, ifm_dim)) + elif idt == DataType.UINT32: + x = gen_finn_dt_tensor(DataType.UINT16, (1, ifm_ch, ifm_dim, ifm_dim)) + else: + x = gen_finn_dt_tensor(idt, (1, ifm_ch, ifm_dim, ifm_dim)) + + input_dict = prepare_inputs(x) + y_expected = oxe.execute_onnx(model, input_dict)["outp"] + + new_model = model.transform(to_hls.InferChannelwiseLinearLayer()) + new_model = new_model.transform(GiveUniqueNodeNames()) + + if exec_mode == "cppsim": + new_model = new_model.transform(PrepareCppSim()) + new_model = new_model.transform(CompileCppSim()) + new_model = new_model.transform(SetExecMode("cppsim")) + elif exec_mode == "rtlsim": + new_model = new_model.transform(SetExecMode("rtlsim")) + new_model = new_model.transform(GiveUniqueNodeNames()) + new_model = new_model.transform(PrepareIP("xc7z020clg400-1", 5)) + new_model = new_model.transform(HLSSynthIP()) + new_model = new_model.transform(ReplaceVerilogRelPaths()) + new_model = new_model.transform(PrepareRTLSim()) + else: + raise Exception("Unknown exec_mode") + + ctx_produced = oxe.execute_onnx( + new_model, input_dict, return_full_exec_context=True + ) + y_produced = ctx_produced["outp"] + + assert (y_produced == y_expected).all() + assert new_model.graph.node[1].op_type == "ChannelwiseOp_Batch" diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py index ee65326ec57fb7fa7fa0490a8980dbabb8efc13c..22c356a5869b25fcc7ae3ef0164ed61b53ef232c 100644 --- a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py +++ b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py @@ -5,10 +5,15 @@ import pytest from finn.core.datatype import DataType from finn.transformation.infer_shapes import InferShapes from finn.transformation.infer_datatypes import InferDataTypes -from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames -from finn.transformation.infer_data_layouts import InferDataLayouts +from finn.transformation.general import GiveUniqueNodeNames from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( + ReplaceVerilogRelPaths, +) import finn.core.onnx_exec as oxe from finn.core.modelwrapper import ModelWrapper from finn.util.basic import gen_finn_dt_tensor @@ -17,47 +22,40 @@ import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.custom_op.im2col import compute_conv_output_dim +# conv_config kernel_size,stride, pad -@pytest.mark.parametrize("padding", [True, False]) -@pytest.mark.parametrize("kernel_size", [3, 5]) + +@pytest.mark.parametrize( + "conv_config", [(1, 2, 0), (1, 3, 0), (3, 2, 1), (3, 1, 0), (3, 1, 1), (5, 2, 1)] +) +@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) @pytest.mark.slow @pytest.mark.vivado -def test_convert_to_hls_conv_layer(padding, kernel_size): - - assert ( - kernel_size % 2 != 0 - ), """test_convert_to_hls_conv_layer test only - supports odd kernel_size""" - +def test_convert_to_hls_conv_layer(conv_config, 
exec_mode): + kernel_size, stride, pad = conv_config np.random.seed(0) - padding = True idt = DataType.UINT4 in_feature_dim = 7 - in_chn = 3 + in_chn = 16 + out_chn = 20 - stages = 1 # just one convolution - - out_feature_dim = ( - in_feature_dim if padding else in_feature_dim - (kernel_size // 2 * 2) * stages - ) + out_feature_dim = compute_conv_output_dim(in_feature_dim, kernel_size, stride, pad) input_shape = [1, in_chn, in_feature_dim, in_feature_dim] - output_shape = [1, in_chn, out_feature_dim, out_feature_dim] + output_shape = [1, out_chn, out_feature_dim, out_feature_dim] - conv_param_shape = [in_chn, in_chn, kernel_size, kernel_size] + conv_param_shape = [out_chn, in_chn, kernel_size, kernel_size] + conv_weight_dt = DataType.UINT4 conv_config = {} conv_config["dilations"] = [1, 1] conv_config["group"] = 1 conv_config["kernel_shape"] = [kernel_size, kernel_size] - if padding: - pad = kernel_size // 2 - conv_config["pads"] = [pad, pad, pad, pad] - else: - conv_config["pads"] = [0, 0, 0, 0] - conv_config["strides"] = [1, 1] + conv_config["pads"] = [pad, pad, pad, pad] + conv_config["strides"] = [stride, stride] top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape) top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape) @@ -80,27 +78,35 @@ def test_convert_to_hls_conv_layer(padding, kernel_size): model = ModelWrapper(modelproto) model.set_tensor_datatype("top_in", idt) model.set_tensor_datatype("top_out", idt) - model.set_tensor_datatype("p1", DataType.UINT4) + model.set_tensor_datatype("p1", conv_weight_dt) + model.set_initializer("p1", gen_finn_dt_tensor(conv_weight_dt, conv_param_shape)) model = model.transform(InferShapes()) - model.set_initializer( - "p1", np.round(np.random.rand(*conv_param_shape).astype(np.float32) * 16) - ) - - model.set_tensor_datatype(model.graph.input[0].name, idt) - model = model.transform(InferShapes()) - model = model.transform(InferDataLayouts()) - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(GiveReadableTensorNames()) model = model.transform(InferDataTypes()) new_model = model.transform(LowerConvsToMatMul()) new_model = new_model.transform(to_hls.InferConvInpGen()) - new_model = new_model.transform(PrepareCppSim()) - new_model = new_model.transform(CompileCppSim()) - new_model = new_model.transform(SetExecMode("cppsim")) + new_model = new_model.transform(GiveUniqueNodeNames()) + new_model = new_model.transform(InferShapes()) + new_model = new_model.transform(InferDataTypes()) + + if exec_mode == "cppsim": + new_model = new_model.transform(PrepareCppSim()) + new_model = new_model.transform(CompileCppSim()) + new_model = new_model.transform(SetExecMode("cppsim")) + elif exec_mode == "rtlsim": + new_model = new_model.transform(SetExecMode("rtlsim")) + new_model = new_model.transform(GiveUniqueNodeNames()) + new_model = new_model.transform(PrepareIP("xc7z020clg400-1", 5)) + new_model = new_model.transform(HLSSynthIP()) + new_model = new_model.transform(ReplaceVerilogRelPaths()) + new_model = new_model.transform(PrepareRTLSim()) + else: + raise Exception("Unknown exec_mode") x = gen_finn_dt_tensor(idt, input_shape) inp_dict = {model.graph.input[0].name: x} assert oxe.compare_execution(model, new_model, inp_dict) + if kernel_size == 1 and stride > 1 and pad == 0: + assert new_model.graph.node[1].op_type == "DownSampler" diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py b/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py index 
48803c9614f53a3a149c6eaac4289d10086513a5..20e3ee08d7ffdd013a89d26bb71d86ccc554a5b4 100644 --- a/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py +++ b/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py @@ -51,7 +51,7 @@ from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.custom_op.registry import getCustomOp -export_onnx_path_cnv = "test_output_cnv.onnx" +export_onnx_path_cnv = "test_convert_to_hls_layers_cnv.onnx" @pytest.mark.vivado diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_fc.py b/tests/fpgadataflow/test_convert_to_hls_layers_fc.py index e261a3114853bf24bdb4c931c46ff92eea4150dd..d77065ad9396d0cc8dd57a39ed823fffcb30ee47 100644 --- a/tests/fpgadataflow/test_convert_to_hls_layers_fc.py +++ b/tests/fpgadataflow/test_convert_to_hls_layers_fc.py @@ -52,8 +52,7 @@ from finn.transformation.streamline.round_thresholds import RoundAndClipThreshol from finn.util.test import get_test_model_trained -export_onnx_path = "test_output_tfc.onnx" -export_onnx_path_cnv = "test_output_cnv.onnx" +export_onnx_path = "test_convert_to_hls_layers_fc.onnx" @pytest.mark.vivado diff --git a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..c9f78dcea1a1ce364d0657ad64de7d440d41b822 --- /dev/null +++ b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py @@ -0,0 +1,160 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
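+# Test for to_hls.InferPool_Batch: a single ONNX MaxPool node is converted to
+# the Pool_Batch HLS layer (plus a ConvolutionInputGenerator where needed) and
+# the cppsim/rtlsim results are checked against the original MaxPool output.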
+ +import pytest + +from onnx import TensorProto, helper +import numpy as np +import finn.core.onnx_exec as oxe +from finn.core.datatype import DataType +from finn.core.modelwrapper import ModelWrapper +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +from finn.transformation.general import GiveUniqueNodeNames +from finn.custom_op.registry import getCustomOp +from finn.util.basic import gen_finn_dt_tensor +from finn.transformation.infer_shapes import InferShapes + + +def make_single_maxpool_modelwrapper(k, stride, pad, ifm_ch, ifm_dim, ofm_dim, idt): + odt = idt + inp = helper.make_tensor_value_info( + "inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim, ifm_dim] + ) + outp = helper.make_tensor_value_info( + "outp", TensorProto.FLOAT, [1, ifm_ch, ofm_dim, ofm_dim] + ) + + mp_node = helper.make_node( + "MaxPool", + ["inp"], + ["outp"], + kernel_shape=[k, k], + pads=[pad, pad, pad, pad], + strides=[stride, stride], + ) + graph = helper.make_graph( + nodes=[mp_node], name="mp_graph", inputs=[inp], outputs=[outp] + ) + + model = helper.make_model(graph, producer_name="mp-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", odt) + model = model.transform(InferShapes()) + + return model + + +def prepare_inputs(input_tensor): + return {"inp": input_tensor} + + +# input datatype +@pytest.mark.parametrize("idt", [DataType.UINT4, DataType.INT4]) +# pool configuration: ( k,stride, pad, ifm_dim ) +@pytest.mark.parametrize( + "pool_config", [(3, 2, 0, 5), (3, 2, 1, 5), (2, 2, 0, 8), (5, 2, 2, 7)] +) +# input channels +@pytest.mark.parametrize("ifm_ch", [1, 4, 20]) +# number of out channel computed in parallel +@pytest.mark.parametrize("pe", [1, 4, 20]) +# execution mode +@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +# pool type +@pytest.mark.parametrize("op_type", ["MaxPool"]) +@pytest.mark.slow +@pytest.mark.vivado +def test_convert_to_hls_pool_batch(idt, pool_config, ifm_ch, pe, exec_mode, op_type): + k, stride, pad, ifm_dim = pool_config + + if ifm_ch % pe != 0: + pytest.skip("ifm_ch%pe != 0. Skipping") + + if pad != 0 and idt.signed(): + pytest.skip("No support for pal_val != 0. 
Skipping") + + np.random.seed(0) + ofm_dim = int(((ifm_dim + 2 * pad - k) / stride) + 1) + + x = gen_finn_dt_tensor(idt, (1, ifm_ch, ifm_dim, ifm_dim)) + # prepare input data + input_dict = prepare_inputs(x) + if op_type == "MaxPool": + model = make_single_maxpool_modelwrapper( + k, stride, pad, ifm_ch, ifm_dim, ofm_dim, idt + ) + else: + assert False, "{} is not a supported op_type".format(op_type) + + y_expected = oxe.execute_onnx(model, input_dict)["outp"] + + new_model = model.transform(to_hls.InferPool_Batch()) + new_model = new_model.transform(GiveUniqueNodeNames()) + + if ifm_ch != pe: + new_model = new_model.transform(to_hls.InferConvInpGen()) + # Folding + for n in new_model.graph.node: + if n.op_type == "ConvolutionInputGenerator": + inst = getCustomOp(n) + inst.set_nodeattr("SIMD", pe) + elif n.op_type == "Pool_Batch": + inst = getCustomOp(n) + inst.set_nodeattr("PE", pe) + + if exec_mode == "cppsim": + new_model = new_model.transform(SetExecMode("cppsim")) + new_model = new_model.transform(PrepareCppSim()) + new_model = new_model.transform(CompileCppSim()) + elif exec_mode == "rtlsim": + new_model = new_model.transform(SetExecMode("rtlsim")) + new_model = new_model.transform(GiveUniqueNodeNames()) + new_model = new_model.transform(PrepareIP("xc7z020clg400-1", 5)) + new_model = new_model.transform(HLSSynthIP()) + new_model = new_model.transform(PrepareRTLSim()) + else: + raise Exception("Unknown exec_mode") + + # execute new_model + y_produced = oxe.execute_onnx(new_model, input_dict)["outp"] + assert (y_produced == y_expected).all() + if stride != k: + if pad == 0 or ifm_ch == pe: + assert len(new_model.graph.node) == 4 + else: + assert len(new_model.graph.node) == 5 + else: + assert len(new_model.graph.node) == 1 diff --git a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..2ed352e28981552b186bb778b94dcbc07471e14b --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py @@ -0,0 +1,156 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +import numpy as np +from onnx import TensorProto, helper + +import finn.core.onnx_exec as oxe +from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation +from finn.core.datatype import DataType +from finn.core.modelwrapper import ModelWrapper +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.general import GiveUniqueNodeNames +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.util.basic import gen_finn_dt_tensor +from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( + ReplaceVerilogRelPaths, +) + + +def make_modelwrapper(C, pe, idt, odt, pdt, func, vecs): + NumChannels = C.shape[0] + + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, vecs + [NumChannels]) + outp = helper.make_tensor_value_info( + "outp", TensorProto.FLOAT, vecs + [NumChannels] + ) + + node_inp_list = ["inp", "const"] + + node = helper.make_node( + "ChannelwiseOp_Batch", + node_inp_list, + ["outp"], + domain="finn", + backend="fpgadataflow", + NumChannels=NumChannels, + Func=func, + PE=pe, + inputDataType=idt.name, + outputDataType=odt.name, + paramDataType=pdt.name, + numInputVectors=vecs, + ) + graph = helper.make_graph(nodes=[node], name="graph", inputs=[inp], outputs=[outp]) + + model = helper.make_model(graph, producer_name="model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", odt) + + model.set_tensor_datatype("const", idt) + model.set_initializer("const", C) + return model + + +# activation: None or DataType +@pytest.mark.parametrize("act", [DataType.INT8]) +# input datatype +@pytest.mark.parametrize("idt", [DataType.INT4]) +# param datatype +@pytest.mark.parametrize("pdt", [DataType.INT4]) +# folding, -1 is maximum possible +@pytest.mark.parametrize("nf", [-1, 2]) +# number of input features +@pytest.mark.parametrize("ich", [16]) +# vecs +@pytest.mark.parametrize("vecs", [[1], [1, 7, 7]]) +# function +@pytest.mark.parametrize("func", ["add", "mul"]) +# execution mode +@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +@pytest.mark.vivado +@pytest.mark.slow +def test_fpgadataflow_channelwise_ops(idt, act, pdt, nf, ich, func, vecs, exec_mode): + if nf == -1: + nf = ich + pe = ich // nf + assert ich % pe == 0 + + # generate input and param data + x = gen_finn_dt_tensor(idt, tuple(vecs + [ich])) + # C = np.random.randint(idt.min(), idt.max() + 1, ich).astype(np.float32) + C = gen_finn_dt_tensor(pdt, (ich)) + + odt = act + + model = make_modelwrapper(C, pe, idt, odt, pdt, func, vecs) + + if exec_mode == "cppsim": + model = model.transform(PrepareCppSim()) + model = 
model.transform(CompileCppSim()) + model = model.transform(SetExecMode("cppsim")) + elif exec_mode == "rtlsim": + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(ReplaceVerilogRelPaths()) + model = model.transform(PrepareRTLSim()) + else: + raise Exception("Unknown exec_mode") + + # package input data as dictionary + input_dict = {"inp": x} + + oshape = model.get_tensor_shape("outp") + + C_reshaped = np.broadcast_to(C.flatten(), x.shape) + if func == "add": + y = x + C_reshaped + elif func == "mul": + y = x * C_reshaped + + y_expected = y.reshape(oshape) + # execute model + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + + y_produced = y_produced.reshape(y_expected.shape) + + assert (y_produced == y_expected).all(), "cppsim failed" + + if exec_mode == "rtlsim": + hls_synt_res_est = model.analysis(hls_synth_res_estimation) + assert "ChannelwiseOp_Batch_0" in hls_synt_res_est diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py index 9d6390b2673e5d2c0e72748183ac04ed222d078e..5ff3da87228a2a32a41226bb46e0b16b1a44df50 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py +++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py @@ -23,7 +23,7 @@ test_fpga_part = pynq_part_map[test_pynq_board] target_clk_ns = 10 -def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style): +def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_style): assert pad_style == 2, "only pad_style == 2 supported in hlslib" assert padding > 0, "Output dim should be greater than input dim" odim = idim + padding @@ -47,6 +47,7 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style): inputDataType=str(idt.name), PaddingStyle=pad_style, numInputVectors=1, + SIMD=simd, ) graph = helper.make_graph( @@ -63,11 +64,13 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style): # input image dimension -@pytest.mark.parametrize("idim", [8, 16]) +@pytest.mark.parametrize("idim", [8]) # number of rows and number of cols to add @pytest.mark.parametrize("pad", [2, 3]) # number of channels -@pytest.mark.parametrize("num_ch", [1, 2]) +@pytest.mark.parametrize("num_ch", [2, 4]) +# Input parallelism +@pytest.mark.parametrize("simd", [1, 2]) # PaddingStyle: selects behavior when (odim-idim)%2 != 0 @pytest.mark.parametrize("pad_style", [2]) # FINN input datatype @@ -76,14 +79,15 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style): @pytest.mark.parametrize("mode", ["cppsim", "rtlsim"]) @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_fmpadding(idim, pad, num_ch, pad_style, idt, mode): - +def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, pad_style, idt, mode): + if num_ch % simd != 0: + pytest.skip(" num_ch % simd != 0, skipping") # generate input data x = gen_finn_dt_tensor(idt, [1, idim, idim, num_ch]) input_dict = {"inp": x} odim = idim + pad - model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, idt, pad_style) + model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, simd, idt, pad_style) model = model.transform(InferShapes()) model = model.transform(SetExecMode(mode)) model = model.transform(GiveUniqueNodeNames()) diff --git a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py index 
b830693c32afe629dd6fc70868d0bddacac4c887..a9f5bf5ffa1f816b82ef701800e92249056b7c74 100644 --- a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py +++ b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py @@ -54,6 +54,10 @@ from finn.util.basic import gen_finn_dt_tensor, pynq_part_map from finn.util.fpgadataflow import pyverilate_stitched_ip from finn.util.test import load_test_checkpoint_or_skip from finn.transformation.fpgadataflow.synth_ooc import SynthOutOfContext +from finn.transformation.infer_data_layouts import InferDataLayouts +from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA +from finn.transformation.fpgadataflow.floorplan import Floorplan + test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1") test_fpga_part = pynq_part_map[test_pynq_board] @@ -390,3 +394,19 @@ def test_fpgadataflow_ipstitch_remote_execution(): assert np.isclose(outp["outp"], x).all() except KeyError: pytest.skip("PYNQ board IP address not specified") + + +def test_fpgadataflow_ipstitch_iodma_floorplan(): + model = create_one_fc_model() + if model.graph.node[0].op_type == "StreamingDataflowPartition": + sdp_node = getCustomOp(model.graph.node[0]) + assert sdp_node.__class__.__name__ == "StreamingDataflowPartition" + assert os.path.isfile(sdp_node.get_nodeattr("model")) + model = load_test_checkpoint_or_skip(sdp_node.get_nodeattr("model")) + model = model.transform(InferDataLayouts()) + model = model.transform(InsertIODMA()) + model = model.transform(Floorplan()) + assert getCustomOp(model.graph.node[0]).get_nodeattr("partition_id") == 0 + assert getCustomOp(model.graph.node[1]).get_nodeattr("partition_id") == 2 + assert getCustomOp(model.graph.node[2]).get_nodeattr("partition_id") == 1 + model.save(ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_iodma_floorplan.onnx") diff --git a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py index 38f792ed3cdd52044b28b4c19ac0603da4e502e6..398a17132a2ef6c92e600102ff5c0b71a1f65aaa 100644 --- a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py +++ b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py @@ -92,7 +92,7 @@ def test_res_estimate(): model = model.transform(GiveUniqueNodeNames()) prod_resource_estimation = model.analysis(res_estimation) expect_resource_estimation = { - "StreamingFCLayer_Batch_0": {"BRAM_18K": 1, "LUT": 304.4} + "StreamingFCLayer_Batch_0": {"BRAM_18K": 1, 'BRAM_efficiency': 0.001736111111111111, "LUT": 304.4} } assert check_two_dict_for_equality( diff --git a/tests/pynq/test_pynq_performance_end2end.py b/tests/pynq/test_pynq_performance_end2end.py index 66a93a190061e0142637be19bb2ea841d192745a..3b6ea86741b8adefce4faaa65b791f1d213cf3ae 100644 --- a/tests/pynq/test_pynq_performance_end2end.py +++ b/tests/pynq/test_pynq_performance_end2end.py @@ -10,7 +10,7 @@ from finn.core.throughput_test import throughput_test build_dir = "/tmp/" + os.environ["FINN_INST_NAME"] -@pytest.mark.parametrize("end2end_example", ["tfc_w1a1", "cnv_w1a1"]) +@pytest.mark.parametrize("end2end_example", ["tfc_w1a1", "cnv_w1a1", "cnv_w2a2"]) @pytest.mark.slow def test_pynq_performance_end2end(end2end_example): model = load_test_checkpoint_or_skip( diff --git a/tests/transformation/streamline/test_streamline_cnv.py b/tests/transformation/streamline/test_streamline_cnv.py index 56dcd26076ec0a5fba6e9be6acac7f5e13572c3d..103967dfb6b86cc6e2ce2bc9ab78249d8945d47d 100644 --- a/tests/transformation/streamline/test_streamline_cnv.py +++ b/tests/transformation/streamline/test_streamline_cnv.py @@ -44,9 
+44,9 @@ from finn.transformation.double_to_single_float import DoubleToSingleFloat export_onnx_path = make_build_dir("test_streamline_cnv_") # act bits -@pytest.mark.parametrize("abits", [1]) +@pytest.mark.parametrize("abits", [1, 2]) # weight bits -@pytest.mark.parametrize("wbits", [1]) +@pytest.mark.parametrize("wbits", [1, 2]) # network topology / size @pytest.mark.parametrize("size", ["CNV"]) def test_streamline_cnv(size, wbits, abits): @@ -74,6 +74,7 @@ def test_streamline_cnv(size, wbits, abits): # model.save("orig_cnv.onnx") model = model.transform(Streamline()) # model.save("streamlined_cnv.onnx") + assert len(model.graph.node) == 23 produced_ctx = oxe.execute_onnx(model, input_dict, True) produced = produced_ctx[model.graph.output[0].name] assert np.isclose(expected, produced, atol=1e-3).all() diff --git a/tests/transformation/test_absorb_mul_into_topk.py b/tests/transformation/test_absorb_mul_into_topk.py new file mode 100644 index 0000000000000000000000000000000000000000..1394220f7c336ccea8fe9c494734c4175bf2e847 --- /dev/null +++ b/tests/transformation/test_absorb_mul_into_topk.py @@ -0,0 +1,108 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
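+# Test for the AbsorbScalarMulIntoTopK streamlining transformation: a positive
+# scalar Mul in front of an inserted TopK node should be absorbed into it
+# without changing the returned indices, while negative or non-scalar Mul
+# parameters must leave the graph untouched.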
+import pytest + +import numpy as np +from onnx import TensorProto, helper + +from finn.core.modelwrapper import ModelWrapper +from finn.transformation.infer_shapes import InferShapes +from finn.transformation.infer_datatypes import InferDataTypes +from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames +from finn.transformation.insert_topk import InsertTopK +from finn.transformation.streamline.absorb import AbsorbScalarMulIntoTopK +import finn.core.onnx_exec as oxe + +# parameter to indicate if mul parameter is negative or positive +@pytest.mark.parametrize("mul_positive", [True, False]) +# parameter to indicate if mul parameter is scalar or not +@pytest.mark.parametrize("scalar", [True, False]) +def test_absorb_mul_into_topk(mul_positive, scalar): + if scalar is True: + shape = [1] + else: + shape = [1, 1, 1, 1000] + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, 1, 1, 1000]) + a0 = helper.make_tensor_value_info("a0", TensorProto.FLOAT, shape) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, 1, 1, 1000]) + + mul_node = helper.make_node("Mul", ["inp", "a0"], ["outp"]) + mul_graph = helper.make_graph( + nodes=[mul_node], + name="mul-graph", + inputs=[inp], + outputs=[outp], + value_info=[a0], + ) + + model = helper.make_model(mul_graph, producer_name="mul_model") + model = ModelWrapper(model) + # initialize values + if mul_positive is True: + a0_values = np.random.uniform(low=0.1, high=1, size=tuple(shape)).astype( + np.float32 + ) + else: + a0_values = np.random.uniform(low=-1, high=-0.1, size=tuple(shape)).astype( + np.float32 + ) + model.set_initializer("a0", a0_values) + model = model.transform(InsertTopK()) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model_transformed = model.transform(AbsorbScalarMulIntoTopK()) + + # compare execution results + inp_values = np.random.uniform(low=-10, high=10, size=(1, 1, 1, 1000)).astype( + np.float32 + ) + idict = {"global_in": inp_values} + odict = oxe.execute_onnx(model, idict, True) + y_indices = odict["global_out"] + y_values = odict["TopK_0_out0"] + odict = oxe.execute_onnx(model_transformed, idict, True) + y_tr_indices = odict["global_out"] + y_tr_values = odict["TopK_0_out0"] + + # the indices stay the same, if the model is transformed or not + assert (y_indices == y_tr_indices).all() + + if scalar is True and mul_positive is True: + # the values change if the model was transformed + assert (y_values != y_tr_values).all() + + # check for new order + assert model.graph != model_transformed.graph + assert len(model.graph.node) - 1 == len(model_transformed.graph.node) + assert model_transformed.graph.node[0].op_type == "TopK" + + else: + assert (y_values == y_tr_values).all() + assert model.graph == model_transformed.graph diff --git a/tests/transformation/test_absorb_transp_into_flatten.py b/tests/transformation/test_absorb_transp_into_flatten.py new file mode 100644 index 0000000000000000000000000000000000000000..fbfa15277717c554da01e38608601997407803b2 --- /dev/null +++ b/tests/transformation/test_absorb_transp_into_flatten.py @@ -0,0 +1,99 @@ +import pytest + +import numpy as np +from onnx import TensorProto, helper + +from finn.core.modelwrapper import ModelWrapper +import finn.core.data_layout as DataLayout +from finn.transformation.infer_shapes import InferShapes +from finn.transformation.infer_datatypes import InferDataTypes 
+from finn.transformation.infer_data_layouts import InferDataLayouts +from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames +from finn.transformation.streamline.absorb import AbsorbTransposeIntoFlatten +import finn.core.onnx_exec as oxe + +# permutation of transpose node +@pytest.mark.parametrize("perm", [[0, 2, 3, 1], [0, 1, 3, 2], [3, 2, 0, 1]]) +# reshape or flatten +@pytest.mark.parametrize("shape", [None, [1, -1], [-1, 1]]) +# input shape +@pytest.mark.parametrize("ishape", [[1, 1, 1, 4], [2, 4, 1, 1], [1, 2, 2, 4]]) +# datalayout +@pytest.mark.parametrize("data_layout", ["NCHW", "NHWC"]) +def test_absorb_transp_into_flatten(perm, shape, ishape, data_layout): + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, ishape) + transp_node = helper.make_node("Transpose", ["inp"], ["transp_out"], perm=perm) + dummy_in = np.random.uniform(low=0, high=1, size=tuple(ishape)).astype(np.float32) + if shape is None: + shape_node = helper.make_node("Flatten", ["transp_out"], ["outp"]) + dummy_in = dummy_in.transpose(tuple(perm)) + oshape = dummy_in.reshape(dummy_in.shape[0], -1).shape + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, oshape) + shape0 = None + else: + shape0 = helper.make_tensor_value_info("shape0", TensorProto.FLOAT, shape) + shape_node = helper.make_node("Reshape", ["transp_out", "shape0"], ["outp"]) + oshape = dummy_in.transpose(tuple(perm)).reshape(tuple(shape)).shape + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, oshape) + + graph = helper.make_graph( + nodes=[transp_node, shape_node], + name="absorb-transpose-graph", + inputs=[inp], + outputs=[outp], + ) + + model = helper.make_model(graph, producer_name="absorb_transpose_model") + model = ModelWrapper(model) + if shape is not None: + model.graph.value_info.append(shape0) + model.set_initializer("shape0", np.asarray(shape)) + if data_layout == "NCHW": + model.set_tensor_layout("inp", DataLayout.NCHW) + else: + model.set_tensor_layout("inp", DataLayout.NHWC) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + model = model.transform(InferDataLayouts()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model.save("test.onnx") + model_transformed = model.transform(AbsorbTransposeIntoFlatten()) + model_transformed.save("test2.onnx") + + # verify transformation + inp_values = np.random.uniform(low=-1, high=1, size=tuple(ishape)).astype( + np.float32 + ) + idict = {model.graph.input[0].name: inp_values} + assert oxe.compare_execution(model, model_transformed, idict) + + # only some of the parameter combinations lead to a graph that will be changed when + # AbsorbTransposeIntoFlatten is applied + + if shape == [-1, 1]: # not a flatten operation, so the graph will not be changed + assert model.graph == model_transformed.graph + + elif perm == [ + 3, + 2, + 0, + 1, + ]: # the first dimension is also part of the transpose operation + # so the graph will not be changed + assert model.graph == model_transformed.graph + + # the following cases are the ones in which the model is transformed + # because we tested the parameters shape and perm befire we can only consider ishape + # and data_layout (the transformed model should only contain a "Flatten" node) + elif ishape == [1, 1, 1, 4] and data_layout == "NHWC": + assert model_transformed.graph.node[0].op_type == "Flatten" + + elif ishape == [2, 4, 1, 1] and data_layout == "NCHW" and shape is None: + # If the first dimension of 
the input tensor is not 1, flatten and + # reshape (with shape = [1, -1]) would lead to different results + assert model_transformed.graph.node[0].op_type == "Flatten" + + # all other cases lead to an unchanged model + else: + assert model.graph == model_transformed.graph diff --git a/tests/transformation/test_change_datalayout.py b/tests/transformation/test_change_datalayout.py new file mode 100644 index 0000000000000000000000000000000000000000..66459d574957575e61ec1bec631fb7030a27cca1 --- /dev/null +++ b/tests/transformation/test_change_datalayout.py @@ -0,0 +1,112 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
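+# Test for ChangeDataLayoutQuantAvgPool2d: the NCHW QuantAvgPool2d node is
+# expected to be wrapped in a pair of Transpose nodes and switched to NHWC,
+# with execution results identical to the original graph.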
+import pytest +from onnx import helper, TensorProto + +from finn.custom_op.maxpoolnhwc import compute_pool_output_dim +from finn.core.modelwrapper import ModelWrapper +from finn.core.datatype import DataType +import finn.core.data_layout as DataLayout +from finn.transformation.change_datalayout import ChangeDataLayoutQuantAvgPool2d +from finn.transformation.infer_shapes import InferShapes +from finn.transformation.infer_datatypes import InferDataTypes +from finn.transformation.infer_data_layouts import InferDataLayouts +from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames +from finn.util.basic import gen_finn_dt_tensor +from finn.util.basic import get_by_name +import finn.core.onnx_exec as oxe + +# stride +@pytest.mark.parametrize("s", [1, 2]) +# kernel +@pytest.mark.parametrize("k", [3, 4]) +# ibits +@pytest.mark.parametrize("ibits", [4, 8]) +# obits +@pytest.mark.parametrize("obits", [2, 4]) +# signed +@pytest.mark.parametrize("signed", [False, True]) +# channels +@pytest.mark.parametrize("c", [2, 3]) +# input dimension +@pytest.mark.parametrize("idim", [6, 7]) +def test_change_datalayout_quantavgpool(s, k, ibits, obits, signed, c, idim): + n = 1 + odim = compute_pool_output_dim(idim, k, s) + # determine input FINN datatype + if signed is True: + prefix = "INT" + else: + prefix = "UINT" + dt_name = prefix + str(ibits) + dtype = DataType[dt_name] + + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [n, c, idim, idim]) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [n, c, odim, odim]) + + node = helper.make_node( + "QuantAvgPool2d", + ["inp"], + ["outp"], + domain="finn", + stride=s, + kernel=k, + ibits=ibits, + obits=obits, + signed=signed, + data_layout="NCHW", + ) + graph = helper.make_graph( + nodes=[node], name="single-quantavgpool", inputs=[inp], outputs=[outp] + ) + + model = helper.make_model(graph) + model = ModelWrapper(model) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + model = model.transform(InferDataLayouts()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model_transformed = model.transform(ChangeDataLayoutQuantAvgPool2d()) + model_transformed = model_transformed.transform(InferShapes()) + model_transformed = model_transformed.transform(InferDataTypes()) + model_transformed = model_transformed.transform(InferDataLayouts()) + model_transformed = model_transformed.transform(GiveUniqueNodeNames()) + model_transformed = model_transformed.transform(GiveReadableTensorNames()) + inp_values = gen_finn_dt_tensor(dtype, [n, c, idim, idim]) + idict = {"inp": inp_values} + assert oxe.compare_execution(model, model_transformed, idict) + assert len(model.graph.node) + 2 == len(model_transformed.graph.node) + assert model_transformed.graph.node[-1].op_type == "Transpose" + assert model_transformed.graph.node[0].op_type == "Transpose" + # check if QuantAvgPool2d node has datalayout set correctly + node = model_transformed.graph.node[1] + d_layout = get_by_name(node.attribute, "data_layout").s.decode("UTF-8") + assert d_layout == "NHWC" + assert model_transformed.get_tensor_layout(node.input[0]) == DataLayout.NHWC + assert model_transformed.get_tensor_layout(node.output[0]) == DataLayout.NHWC diff --git a/tests/transformation/test_conv_lowering.py b/tests/transformation/test_conv_lowering.py index 73891ded1b9691c7c48a2075ad6ca4668fcf6bfe..16c574b29b55e314b06661b28e4bb869bd6b7996 100644 --- 
a/tests/transformation/test_conv_lowering.py +++ b/tests/transformation/test_conv_lowering.py @@ -41,7 +41,7 @@ from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul from finn.transformation.double_to_single_float import DoubleToSingleFloat import finn.core.onnx_exec as oxe -export_onnx_path = "test_output_cnv.onnx" +export_onnx_path = "test_conv_lowering.onnx" def test_conv_lowering_cnv_w1a1(): diff --git a/tests/transformation/test_fold_constants.py b/tests/transformation/test_fold_constants.py index 685c14a98b9031096aaf5b244c4f484d4f308bca..a976ffd62bce744a474a6fac2a61a6478526777f 100644 --- a/tests/transformation/test_fold_constants.py +++ b/tests/transformation/test_fold_constants.py @@ -40,7 +40,7 @@ from finn.transformation.fold_constants import FoldConstants from finn.transformation.infer_shapes import InferShapes from finn.util.test import get_test_model_untrained -export_onnx_path = "test_output_lfc.onnx" +export_onnx_path = "test_fold_constants.onnx" def test_const_folding(): diff --git a/tests/transformation/test_infer_data_layouts.py b/tests/transformation/test_infer_data_layouts.py index fccc7813da6f98c8af4ade7ae562c99b32247a8b..d6d9920043114c78e970842aee5955e3150cf526 100644 --- a/tests/transformation/test_infer_data_layouts.py +++ b/tests/transformation/test_infer_data_layouts.py @@ -44,7 +44,7 @@ import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls from finn.transformation.infer_data_layouts import InferDataLayouts import finn.core.data_layout as DataLayout -export_onnx_path_cnv = "test_output_cnv.onnx" +export_onnx_path_cnv = "test_infer_data_layouts.onnx" def test_infer_data_layouts(): diff --git a/tests/transformation/test_infer_datatypes.py b/tests/transformation/test_infer_datatypes.py index e3db40289c4318894cf5ad41c2f67b3bff501db9..097ae03f6153843fbb7956a72b38431559d5d0f1 100644 --- a/tests/transformation/test_infer_datatypes.py +++ b/tests/transformation/test_infer_datatypes.py @@ -38,7 +38,7 @@ from finn.transformation.infer_datatypes import InferDataTypes from finn.transformation.infer_shapes import InferShapes from finn.util.test import get_test_model_trained -export_onnx_path = "test_output_lfc.onnx" +export_onnx_path = "test_infer_datatypes.onnx" def test_infer_datatypes(): diff --git a/tests/transformation/test_linear_past_eltwise.py b/tests/transformation/test_linear_past_eltwise.py index b77f59779a5e8559f80e017d13b66bcb67249830..4cff5e5e1d40986a006cc02186fce21a907c2ef1 100644 --- a/tests/transformation/test_linear_past_eltwise.py +++ b/tests/transformation/test_linear_past_eltwise.py @@ -41,7 +41,7 @@ from finn.transformation.double_to_single_float import DoubleToSingleFloat import pytest -export_onnx_path = "test_scalar_past_eltwise.onnx" +export_onnx_path = "test_linear_past_eltwise.onnx" # construct a synthetic graph to test: # topk insertion, topk conversion to hls, add conversion to hls diff --git a/tests/transformation/test_move_chw_add_past_conv.py b/tests/transformation/test_move_chw_add_past_conv.py new file mode 100644 index 0000000000000000000000000000000000000000..b626f7e5b8564739ec383aaddfc262d642bf47cc --- /dev/null +++ b/tests/transformation/test_move_chw_add_past_conv.py @@ -0,0 +1,109 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +import numpy as np +from onnx import helper, TensorProto + +from finn.core.modelwrapper import ModelWrapper +from finn.transformation.infer_shapes import InferShapes +from finn.transformation.streamline.reorder import MoveAddPastConv +from finn.custom_op.im2col import compute_conv_output_dim +import finn.core.onnx_exec as oxe + + +# input dimension +@pytest.mark.parametrize("idim", [4, 7]) +# kernel size +@pytest.mark.parametrize("k", [2, 3]) +# stride +@pytest.mark.parametrize("s", [1, 2]) +# input channels +@pytest.mark.parametrize("ich", [2, 4]) +# output channels +@pytest.mark.parametrize("och", [2, 3]) +def test_move_chw_add_past_conv(idim, k, s, ich, och): + odim = compute_conv_output_dim(idim, k, s) + + ishape = [1, ich, idim, idim] + oshape = [1, och, odim, odim] + add_param_shape = [1, ich, 1, 1] + conv_param_shape = [och, ich, k, k] + + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, ishape) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, oshape) + a0 = helper.make_tensor_value_info("a0", TensorProto.FLOAT, add_param_shape) + a1 = helper.make_tensor_value_info("a1", TensorProto.FLOAT, conv_param_shape) + + conv_config = {} + conv_config["dilations"] = [1, 1] + conv_config["group"] = 1 + conv_config["kernel_shape"] = [k, k] + conv_config["pads"] = [0, 0, 0, 0] + conv_config["strides"] = [s, s] + + add_node = helper.make_node("Add", ["inp", "a0"], ["add_out"]) + conv_node = helper.make_node("Conv", ["add_out", "a1"], ["outp"], **conv_config) + + model = helper.make_model( + helper.make_graph( + nodes=[add_node, conv_node], + name="move-add-graph", + inputs=[inp], + outputs=[outp], + value_info=[a0, a1], + ) + ) + + model = ModelWrapper(model) + # initialize model + a0_values = np.random.uniform(low=0, high=1, size=tuple(add_param_shape)).astype( + np.float32 + ) + model.set_initializer("a0", a0_values) + a1_values = np.random.uniform(low=0, high=1, size=tuple(conv_param_shape)).astype( + np.float32 + ) + model.set_initializer("a1", a1_values) + + model = 
model.transform(InferShapes()) + + # execution before transformation + inp_values = np.random.uniform(low=0, high=1, size=tuple(ishape)).astype(np.float32) + idict = {model.graph.input[0].name: inp_values} + odict = oxe.execute_onnx(model, idict) + y_before = odict[model.graph.output[0].name] + + model = model.transform(MoveAddPastConv()) + odict = oxe.execute_onnx(model, idict) + y_after = odict[model.graph.output[0].name] + + assert np.isclose(y_before, y_after).all() + assert model.graph.node[0].op_type == "Conv" + assert model.graph.node[1].op_type == "Add" diff --git a/tests/transformation/test_move_flatten_past_affine.py b/tests/transformation/test_move_flatten_past_affine.py new file mode 100644 index 0000000000000000000000000000000000000000..b2d5e51613d41f3f2db3dabcef7b982ec2816b19 --- /dev/null +++ b/tests/transformation/test_move_flatten_past_affine.py @@ -0,0 +1,106 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import pytest
+
+import numpy as np
+from onnx import TensorProto, helper
+
+from finn.core.modelwrapper import ModelWrapper
+from finn.core.datatype import DataType
+import finn.core.data_layout as DataLayout
+from finn.util.basic import gen_finn_dt_tensor
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames
+from finn.transformation.streamline.reorder import MoveFlattenPastAffine
+import finn.core.onnx_exec as oxe
+
+# data layout
+@pytest.mark.parametrize("data_layout", [DataLayout.NHWC, DataLayout.NCHW])
+# batch size
+@pytest.mark.parametrize("batch_size", [1, 2])
+def test_move_flatten_past_affine(data_layout, batch_size):
+    if data_layout == DataLayout.NHWC:
+        ishape = [batch_size, 1, 1, 1024]
+        oshape = [batch_size, 1000]
+    else:
+        ishape = [batch_size, 1024, 1, 1]
+        oshape = [batch_size, 1000]
+
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, ishape)
+    a0 = helper.make_tensor_value_info("a0", TensorProto.FLOAT, [1024, 1000])
+    a1 = helper.make_tensor_value_info("a1", TensorProto.FLOAT, [])
+    a2 = helper.make_tensor_value_info("a2", TensorProto.FLOAT, [1000])
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, oshape)
+
+    flatten_node = helper.make_node("Flatten", ["inp"], ["flatten_out"])
+    matmul_node = helper.make_node("MatMul", ["flatten_out", "a0"], ["matmul_out"])
+    mul_node = helper.make_node("Mul", ["matmul_out", "a1"], ["mul_out"])
+    add_node = helper.make_node("Add", ["mul_out", "a2"], ["outp"])
+
+    graph = helper.make_graph(
+        nodes=[flatten_node, matmul_node, mul_node, add_node],
+        name="move-reshape-graph",
+        inputs=[inp],
+        outputs=[outp],
+        value_info=[a0, a1, a2],
+    )
+
+    model = helper.make_model(graph, producer_name="move_reshape_model")
+    model = ModelWrapper(model)
+
+    # initialize values
+    a0_values = gen_finn_dt_tensor(DataType.TERNARY, [1024, 1000])
+    model.set_initializer("a0", a0_values)
+    a1_values = np.random.uniform(low=0.1, high=0.99, size=(1)).astype(np.float32)
+    model.set_initializer("a1", a1_values)
+    a2_values = np.random.uniform(low=-1, high=1, size=(1000)).astype(np.float32)
+    model.set_initializer("a2", a2_values)
+
+    model.set_tensor_datatype("inp", DataType.INT2)
+    model.set_tensor_layout("inp", data_layout)
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    model = model.transform(InferDataLayouts())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+
+    # compare execution before and after transformation
+    inp_values = gen_finn_dt_tensor(DataType.INT2, ishape)
+    idict = {model.graph.input[0].name: inp_values}
+    model_transformed = model.transform(MoveFlattenPastAffine())
+    assert oxe.compare_execution(model, model_transformed, idict)
+
+    # depending on data layout check if graph is transformed or not
+    if data_layout == DataLayout.NHWC:
+        # check if nodes have new order in transformed graph
+        assert model.graph != model_transformed.graph
+        assert model_transformed.graph.node[-1].op_type == "Flatten"
+    else:
+        assert model.graph == model_transformed.graph
diff --git a/tests/transformation/test_move_flatten_past_topk.py b/tests/transformation/test_move_flatten_past_topk.py
new file mode 100644
index 0000000000000000000000000000000000000000..65da92c22dbe9f6b1c5a49172ffae59fa6e98607
--- /dev/null
+++ b/tests/transformation/test_move_flatten_past_topk.py
@@ -0,0 +1,89 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import pytest
+
+from onnx import TensorProto, helper
+
+from finn.core.modelwrapper import ModelWrapper
+from finn.core.datatype import DataType
+import finn.core.data_layout as DataLayout
+from finn.util.basic import gen_finn_dt_tensor
+from finn.transformation.insert_topk import InsertTopK
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames
+from finn.transformation.streamline.reorder import MoveFlattenPastTopK
+import finn.core.onnx_exec as oxe
+
+# data layout
+@pytest.mark.parametrize("data_layout", [DataLayout.NHWC, DataLayout.NCHW])
+# batch size
+@pytest.mark.parametrize("batch_size", [1, 2])
+def test_move_flatten_past_topk(data_layout, batch_size):
+    if data_layout == DataLayout.NHWC:
+        ishape = [batch_size, 1, 1, 1024]
+        oshape = [batch_size, 1024]
+    else:
+        ishape = [batch_size, 1024, 1, 1]
+        oshape = [batch_size, 1024]
+
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, ishape)
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, oshape)
+
+    flatten_node = helper.make_node("Flatten", ["inp"], ["outp"])
+
+    graph = helper.make_graph(
+        nodes=[flatten_node], name="move-flatten-graph", inputs=[inp], outputs=[outp],
+    )
+
+    model = helper.make_model(graph, producer_name="move_flatten_model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", DataType.INT2)
+    model.set_tensor_layout("inp", data_layout)
+    model = model.transform(InsertTopK())
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    model = model.transform(InferDataLayouts())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+
+    # compare execution before and after transformation
+    inp_values =
gen_finn_dt_tensor(DataType.INT2, ishape) + idict = {model.graph.input[0].name: inp_values} + model_transformed = model.transform(MoveFlattenPastTopK()) + assert oxe.compare_execution(model, model_transformed, idict) + + # depending on data layout check if graph is transformed or not + if data_layout == DataLayout.NHWC: + # check if nodes have new order in transformed graph + assert model.graph != model_transformed.graph + assert model_transformed.graph.node[-1].op_type == "Flatten" + else: + assert model.graph == model_transformed.graph diff --git a/tests/transformation/test_move_mul_past_dw_conv.py b/tests/transformation/test_move_mul_past_dw_conv.py new file mode 100644 index 0000000000000000000000000000000000000000..1ae8fbfe89986d58d3d71f5f8735a98469d9d1e3 --- /dev/null +++ b/tests/transformation/test_move_mul_past_dw_conv.py @@ -0,0 +1,93 @@ +import pytest + +from onnx import helper, TensorProto +from finn.custom_op.im2col import compute_conv_output_dim +import finn.core.onnx_exec as oxe +from finn.core.datatype import DataType +from finn.core.modelwrapper import ModelWrapper +from finn.transformation.infer_datatypes import InferDataTypes +from finn.transformation.infer_shapes import InferShapes +from finn.util.basic import gen_finn_dt_tensor +from finn.transformation.streamline.reorder import MoveMulPastDWConv + + +# input dimension +@pytest.mark.parametrize("ifm_dim", [4, 7]) +# input channels +@pytest.mark.parametrize("ifm_ch", [2, 3]) +# kernel size +@pytest.mark.parametrize("k", [2, 3]) +# stride +@pytest.mark.parametrize("stride", [1, 2]) +# padding +@pytest.mark.parametrize("pad_amt", [0, 1]) +# depthwise +@pytest.mark.parametrize("dw", [0, 1]) +def test_move_mul_past_dw_conv(ifm_dim, ifm_ch, k, stride, pad_amt, dw): + if dw == 1: + ofm_ch = ifm_ch + groups = ifm_ch + W_shape = [ofm_ch, 1, k, k] + else: + ofm_ch = ifm_ch + 2 + groups = 1 + W_shape = [ofm_ch, ifm_ch, k, k] + + ofm_dim = compute_conv_output_dim(ifm_dim, k, stride, pad_amt) + + # set up onnx model + inp = helper.make_tensor_value_info( + "inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim, ifm_dim] + ) + mul = helper.make_tensor_value_info("mul", TensorProto.FLOAT, [1, ifm_ch, 1, 1]) + W = helper.make_tensor_value_info("W", TensorProto.FLOAT, W_shape) + outp = helper.make_tensor_value_info( + "outp", TensorProto.FLOAT, [1, ofm_ch, ofm_dim, ofm_dim] + ) + + Mul_node = helper.make_node("Mul", ["inp", "mul"], ["mul_out"]) + + Conv_node = helper.make_node( + "Conv", + ["mul_out", "W"], + ["outp"], + group=groups, + kernel_shape=[k, k], + pads=[pad_amt, pad_amt, pad_amt, pad_amt], + strides=[stride, stride], + ) + + graph = helper.make_graph( + nodes=[Mul_node, Conv_node], + name="mulpastconv_graph", + inputs=[inp], + outputs=[outp], + value_info=[mul, W], + ) + + model = helper.make_model(graph, producer_name="mulpastconv-model") + model = ModelWrapper(model) + inp_values = gen_finn_dt_tensor(DataType.INT2, [1, ifm_ch, ifm_dim, ifm_dim]) + mul_values = gen_finn_dt_tensor(DataType.INT2, [1, ifm_ch, 1, 1]) + W_values = gen_finn_dt_tensor(DataType.INT2, W_shape) + model.set_initializer("W", W_values) + model.set_initializer("mul", mul_values) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + idict = {"inp": inp_values} + odict = oxe.execute_onnx(model, idict, True) + out_before = odict["outp"] + + # move channelwise multiplication past depthwise conv + model_transformed = model.transform(MoveMulPastDWConv()) + odict = oxe.execute_onnx(model_transformed, idict, True) + out_after = 
odict["outp"] + + assert (out_before == out_after).all() + + if dw == 0: + assert model.graph.node[0].op_type == model_transformed.graph.node[0].op_type + assert model.graph.node[1].op_type == model_transformed.graph.node[1].op_type + else: + assert model.graph.node[0].op_type == model_transformed.graph.node[1].op_type + assert model.graph.node[1].op_type == model_transformed.graph.node[0].op_type diff --git a/tests/transformation/test_move_scalar_past_conv.py b/tests/transformation/test_move_scalar_past_conv.py index 0f50642d2b9d1583030630cb4927c2b86667e71a..94fee7907d1ed1cccbf95520e903c7d9b43d8f7d 100644 --- a/tests/transformation/test_move_scalar_past_conv.py +++ b/tests/transformation/test_move_scalar_past_conv.py @@ -7,14 +7,14 @@ import finn.core.onnx_exec as ox from finn.core.modelwrapper import ModelWrapper from finn.transformation.infer_shapes import InferShapes from finn.transformation.streamline import ( - MoveScalarAddPastConv, + MoveAddPastConv, MoveScalarMulPastConv, ) @pytest.mark.parametrize("padding", [False, True]) @pytest.mark.parametrize( - "test_args", [("Add", MoveScalarAddPastConv()), ("Mul", MoveScalarMulPastConv())], + "test_args", [("Add", MoveAddPastConv()), ("Mul", MoveScalarMulPastConv())], ) def test_move_scalar_past_conv(test_args, padding): scalar_op = test_args[0] @@ -83,8 +83,8 @@ def test_move_scalar_past_conv(test_args, padding): assert new_model.graph.node[2].op_type == "Conv" else: assert new_model.graph.node[0].op_type == "Conv" - assert new_model.graph.node[1].op_type == scalar_op - assert new_model.graph.node[2].op_type == "Conv" + assert new_model.graph.node[1].op_type == "Conv" + assert new_model.graph.node[2].op_type == scalar_op else: assert new_model.graph.node[0].op_type == "Conv" assert new_model.graph.node[1].op_type == "Conv" @@ -92,7 +92,7 @@ def test_move_scalar_past_conv(test_args, padding): @pytest.mark.parametrize( - "test_args", [("Add", MoveScalarAddPastConv()), ("Mul", MoveScalarMulPastConv())], + "test_args", [("Add", MoveAddPastConv()), ("Mul", MoveScalarMulPastConv())], ) def test_move_scalar_past_conv_only_if_linear(test_args): scalar_op = test_args[0] diff --git a/tests/transformation/test_move_transpose_past_scalar_mul.py b/tests/transformation/test_move_transpose_past_scalar_mul.py new file mode 100644 index 0000000000000000000000000000000000000000..e434fc7d4f683120176e18a2bfa9da99d9ee0b0e --- /dev/null +++ b/tests/transformation/test_move_transpose_past_scalar_mul.py @@ -0,0 +1,82 @@ +import pytest + +import numpy as np +from onnx import TensorProto, helper + +from finn.core.modelwrapper import ModelWrapper +import finn.core.data_layout as DataLayout +from finn.transformation.infer_shapes import InferShapes +from finn.transformation.infer_datatypes import InferDataTypes +from finn.transformation.infer_data_layouts import InferDataLayouts +from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames +from finn.transformation.streamline.reorder import MoveTransposePastScalarMul +import finn.core.onnx_exec as oxe + +# permutation of transpose node +@pytest.mark.parametrize("perm", [[0, 2, 3, 1], [0, 1, 3, 2], [3, 2, 0, 1]]) +# scalar mul +@pytest.mark.parametrize("scalar", [True, False]) +# data layout +@pytest.mark.parametrize("data_layout", [None, DataLayout.NHWC, DataLayout.NCHW]) +def test_move_transpose_past_scalar_mul(perm, scalar, data_layout): + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, 2, 3, 4]) + # to determine out_size we need to calculate with "perm" for this test case 
+ dummy_in = np.random.uniform(low=0, high=1, size=(1, 2, 3, 4)).astype(np.float32) + out_size = dummy_in.transpose(tuple(perm)).shape + + if scalar is True: + a0_size = [] + else: + a0_size = out_size + a0 = helper.make_tensor_value_info("a0", TensorProto.FLOAT, a0_size) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, out_size) + transp_node = helper.make_node("Transpose", ["inp"], ["transp_out"], perm=perm) + mul_node = helper.make_node("Mul", ["transp_out", "a0"], ["outp"]) + + graph = helper.make_graph( + nodes=[transp_node, mul_node], + name="mv-transpose-graph", + inputs=[inp], + outputs=[outp], + value_info=[a0], + ) + + model = helper.make_model(graph, producer_name="mv_transpose_model") + model = ModelWrapper(model) + + # initialize values + a0_values = np.random.uniform(low=0, high=1, size=tuple(a0_size)).astype(np.float32) + model.set_initializer("a0", a0_values) + if data_layout is not None: + model.set_tensor_layout("inp", data_layout) + model = model.transform(InferDataLayouts()) + + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + + # compare execution before and after transformation + inp_values = np.random.uniform(low=0, high=1, size=(1, 2, 3, 4)).astype(np.float32) + idict = {model.graph.input[0].name: inp_values} + model_transformed = model.transform(MoveTransposePastScalarMul()) + assert oxe.compare_execution(model, model_transformed, idict) + + # check if order changed + if scalar is True and data_layout is not None: + assert model_transformed.graph.node[0] != model.graph.node[0] + assert model_transformed.graph.node[1] != model.graph.node[1] + assert model_transformed.graph.node[0].op_type == "Mul" + assert model_transformed.graph.node[1].op_type == "Transpose" + mul_input = model_transformed.graph.node[0].input[0] + mul_output = model_transformed.graph.node[0].output[0] + assert model_transformed.get_tensor_layout(mul_input) == data_layout + assert model_transformed.get_tensor_layout(mul_output) == data_layout + else: + assert model_transformed.graph.node[0] == model.graph.node[0] + assert model_transformed.graph.node[1] == model.graph.node[1] + if data_layout is not None: + mul_input = model_transformed.graph.node[1].input[0] + mul_output = model_transformed.graph.node[1].output[0] + assert model_transformed.get_tensor_layout(mul_input) != data_layout + assert model_transformed.get_tensor_layout(mul_output) != data_layout diff --git a/tests/transformation/test_remove_identity_ops.py b/tests/transformation/test_remove_identity_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..536c1ab0b48fa44388da23f45b528da3c5f3b2f2 --- /dev/null +++ b/tests/transformation/test_remove_identity_ops.py @@ -0,0 +1,81 @@ +import pytest + +import numpy as np +from onnx import helper, TensorProto +import finn.core.onnx_exec as oxe +from finn.core.datatype import DataType +from finn.core.modelwrapper import ModelWrapper +from finn.transformation.infer_datatypes import InferDataTypes +from finn.transformation.infer_shapes import InferShapes +from finn.transformation.streamline.remove import RemoveIdentityOps +from finn.util.basic import gen_finn_dt_tensor + + +def insert_identity_op(model, op): + if op in ["Add", "Sub"]: + val = np.asarray([0.0], dtype=np.float32) + elif op in ["Mul", "Div"]: + val = np.asarray([1.0], dtype=np.float32) + else: + return + + identity_node = helper.make_node(op, ["div_out", 
"value"], ["ident_out"]) + graph = model.graph + graph.node.insert(3, identity_node) + graph.node[-1].input[0] = "ident_out" + model.set_initializer("value", val) + + return model + + +# identity operations to be inserted +@pytest.mark.parametrize("op", ["Add", "Sub", "Mul", "Div"]) +def test_remove_identity_ops(op): + + # set up onnx model + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, 4, 1, 1]) + mul = helper.make_tensor_value_info("mul", TensorProto.FLOAT, []) + shape = helper.make_tensor_value_info("shape", TensorProto.FLOAT, [2]) + div = helper.make_tensor_value_info("div", TensorProto.FLOAT, []) + matmul = helper.make_tensor_value_info("matmul", TensorProto.FLOAT, [4, 2]) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, 2]) + + mul_node = helper.make_node("Mul", ["inp", "mul"], ["mul_out"]) + reshape_node = helper.make_node("Reshape", ["mul_out", "shape"], ["reshape_out"]) + div_node = helper.make_node("Div", ["reshape_out", "div"], ["div_out"]) + matmul_node = helper.make_node("MatMul", ["div_out", "matmul"], ["outp"]) + + graph = helper.make_graph( + nodes=[mul_node, reshape_node, div_node, matmul_node], + name="identity-graph", + inputs=[inp], + outputs=[outp], + value_info=[mul, shape, div, matmul], + ) + + model = helper.make_model(graph, producer_name="mulpastconv-model") + model = ModelWrapper(model) + inp_values = gen_finn_dt_tensor(DataType.INT2, [1, 4, 1, 1]) + mul_values = np.random.uniform(low=0.1, high=0.99, size=(1)).astype(np.float32) + shape_values = np.asarray([1, -1], dtype=np.int64) + div_values = np.random.uniform(low=0.1, high=0.99, size=(1)).astype(np.float32) + matmul_values = gen_finn_dt_tensor(DataType.INT2, [4, 2]) + model.set_initializer("mul", mul_values) + model.set_initializer("shape", shape_values) + model.set_initializer("div", div_values) + model.set_initializer("matmul", matmul_values) + insert_identity_op(model, op) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + idict = {"inp": inp_values} + odict = oxe.execute_onnx(model, idict) + out_before = odict["outp"] + num_of_nodes_before = len(model.graph.node) + + model = model.transform(RemoveIdentityOps()) + num_of_nodes_after = len(model.graph.node) + assert num_of_nodes_before - 1 == num_of_nodes_after + + odict = oxe.execute_onnx(model, idict) + out_after = odict["outp"] + assert (out_before == out_after).all() diff --git a/tests/transformation/test_sign_to_thres.py b/tests/transformation/test_sign_to_thres.py index b10840df37a695986e54c0bdaa68baa0538f90f2..a92f839e5f6ca8b45eadf939fa35973ac153e0b1 100644 --- a/tests/transformation/test_sign_to_thres.py +++ b/tests/transformation/test_sign_to_thres.py @@ -40,8 +40,7 @@ from finn.transformation.infer_shapes import InferShapes from finn.transformation.streamline import ConvertSignToThres from finn.util.test import get_test_model_trained -export_onnx_path = "test_output_lfc.onnx" -transformed_onnx_path = "test_output_lfc_transformed.onnx" +export_onnx_path = "test_sign_to_thres.onnx" def test_sign_to_thres(): diff --git a/tests/transformation/test_topk_insert.py b/tests/transformation/test_topk_insert.py index 1af0f255d8fb1af8a6e571518f18d831aa71298b..a18e63384150f140cb63ec7b438283eb4797266c 100644 --- a/tests/transformation/test_topk_insert.py +++ b/tests/transformation/test_topk_insert.py @@ -18,7 +18,7 @@ from pkgutil import get_data import pytest -export_onnx_path = "test_output_lfc.onnx" +export_onnx_path = "test_topk_insert.onnx" @pytest.mark.parametrize("k", [1, 5, 
10])