diff --git a/.github/workflows/quicktest-dev-pr.yml b/.github/workflows/quicktest-dev-pr.yml new file mode 100644 index 0000000000000000000000000000000000000000..cd59a629405c748187cdf478c0bdb0694c58c79f --- /dev/null +++ b/.github/workflows/quicktest-dev-pr.yml @@ -0,0 +1,21 @@ +name: QuicktestPRAgainstDev + +on: + pull_request: + branches: [ dev ] + push: + branches: [ dev ] + + +jobs: + + test: + name: Run quicktest on PR branch + runs-on: ubuntu-latest + + steps: + - name: checkout + uses: actions/checkout@v2 + + - name: DockerRunQuicktest + run: sh run-docker.sh quicktest diff --git a/AUTHORS.rst b/AUTHORS.rst index e231e61d38991e11e2e43a7c9a3a78c50c878244..eb1e06e54b7eb6deedd3e7f8392bb3aa257e7dc6 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -6,3 +6,5 @@ Contributors * Jakoba Petri-Koenig (@auphelia) * Andrea Rigoni (@AndreaRigoni) * Hendrik Borras (@HenniOVP) +* Lucian Petrica (@quetric) +* Tobias Alonso (@Tobi-Alonso) diff --git a/docker/Dockerfile.finn_ci b/docker/Dockerfile.finn_ci index 0d610ec66a5f433d156f4e8da976767ce6458aef..2668927602ebb8de5fdc3d7c25b20a0c8c4a2e55 100644 --- a/docker/Dockerfile.finn_ci +++ b/docker/Dockerfile.finn_ci @@ -47,7 +47,7 @@ RUN git clone https://github.com/Xilinx/brevitas.git /workspace/brevitas # CNPY RUN git clone https://github.com/rogersce/cnpy.git /workspace/cnpy # FINN hlslib -RUN git clone https://github.com/maltanar/finn-hlslib.git /workspace/finn-hlslib +RUN git clone https://github.com/Xilinx/finn-hlslib.git /workspace/finn-hlslib # PyVerilator RUN git clone https://github.com/maltanar/pyverilator /workspace/pyverilator # PYNQ-HelloWorld diff --git a/docker/Dockerfile.finn_dev b/docker/Dockerfile.finn_dev index 1c2cb19d14137b866b55417522fdebb8e0d7ad90..1200c7d5d15bbd62e15f19f84e70d5fe0b8aca28 100644 --- a/docker/Dockerfile.finn_dev +++ b/docker/Dockerfile.finn_dev @@ -76,7 +76,7 @@ RUN git clone https://github.com/Xilinx/brevitas.git /workspace/brevitas # CNPY RUN git clone https://github.com/rogersce/cnpy.git /workspace/cnpy # FINN hlslib -RUN git clone https://github.com/maltanar/finn-hlslib.git /workspace/finn-hlslib +RUN git clone https://github.com/Xilinx/finn-hlslib.git /workspace/finn-hlslib # PyVerilator RUN git clone https://github.com/maltanar/pyverilator /workspace/pyverilator # PYNQ-HelloWorld diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh index 30dae7d2bd37516a887ba5ca20c1398af75905f3..ef6b65cedf12cd07df391c5045519d11f7ff6db0 100644 --- a/docker/finn_entrypoint.sh +++ b/docker/finn_entrypoint.sh @@ -15,7 +15,7 @@ gecho () { # the repos themselves are cloned in the Dockerfile BREVITAS_COMMIT=093de7d138c6715dbcaf82a9e1d530069327ad98 CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4 -HLSLIB_COMMIT=6b88db826bb023937506913a23d964775a7606af +HLSLIB_COMMIT=13e9b0772a27a3a1efc40c878d8e78ed09efb716 PYVERILATOR_COMMIT=1d89cb0d4e0c97469cc6352c611f876ec13edfa6 PYNQSHELL_COMMIT=0c82a61b0ec1a07fa275a14146233824ded7a13d diff --git a/requirements.txt b/requirements.txt index 2427f9490a3dd5a7ffe0e0a8cf2ad19af0934cdf..6b8e4d02c8ca1dcdbe607aabdccd27cec8056332 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,5 @@ pre-commit pyverilator scipy sphinx +toposort wget diff --git a/run-docker.sh b/run-docker.sh index e1f17e728204217ff3caa6e486b2daae16d6d271..e07556716db335421f57a390f1e6a17168ac058b 100755 --- a/run-docker.sh +++ b/run-docker.sh @@ -96,6 +96,8 @@ gecho "Port-forwarding for Netron $NETRON_PORT:$NETRON_PORT" gecho "Vivado IP cache dir is at $VIVADO_IP_CACHE" gecho "Using default PYNQ board 
$PYNQ_BOARD" +DOCKER_INTERACTIVE="" + if [ "$1" = "test" ]; then gecho "Running test suite (all tests)" DOCKER_CMD="python setup.py test" @@ -108,6 +110,7 @@ elif [ "$1" = "notebook" ]; then else gecho "Running container only" DOCKER_CMD="bash" + DOCKER_INTERACTIVE="-it" fi # Build the FINN Docker image @@ -123,7 +126,7 @@ docker build -f docker/Dockerfile.finn_dev --tag=$DOCKER_TAG \ # Launch container with current directory mounted # important to pass the --init flag here for correct Vivado operation, see: # https://stackoverflow.com/questions/55733058/vivado-synthesis-hangs-in-docker-container-spawned-by-jenkins -docker run -t --rm --name $DOCKER_INST_NAME -it --init \ +docker run -t --rm --name $DOCKER_INST_NAME $DOCKER_INTERACTIVE --init \ --hostname $DOCKER_INST_NAME \ -e "XILINX_VIVADO=$VIVADO_PATH" \ -e "SHELL=/bin/bash" \ diff --git a/src/finn/core/modelwrapper.py b/src/finn/core/modelwrapper.py index cdf99dc3bd8b698bec60d79ef6e34640ac3b740c..646add188c5d475cf37ccd33cf24d29d61754ae1 100644 --- a/src/finn/core/modelwrapper.py +++ b/src/finn/core/modelwrapper.py @@ -259,11 +259,10 @@ class ModelWrapper: def find_producer(self, tensor_name): """Finds and returns the node that produces the tensor with given name.""" - ret = None for x in self._model_proto.graph.node: if tensor_name in x.output: - ret = x - return ret + return x + return None def find_upstream(self, tensor_name, finder_fxn): """Follow the producer chain upstream, calling finder_fxn on each upstream @@ -333,6 +332,22 @@ class ModelWrapper: else: return None + def is_fork_node(self, node): + """Checks if the given node is a fork, that is, the node has multiple + direct successors""" + direct_successors = self.find_direct_successors(node) + is_fork = False if direct_successors is None else (len(direct_successors) > 1) + return is_fork + + def is_join_node(self, node): + """Checks if the given node is a join, that is, the node has multiple + direct predecessors""" + direct_predecessors = self.find_direct_predecessors(node) + is_join = ( + False if direct_predecessors is None else (len(direct_predecessors) > 1) + ) + return is_join + def get_all_tensor_names(self): """Returns a list of all (input, output and value_info) tensor names in the graph.""" @@ -494,3 +509,41 @@ class ModelWrapper: qa.tensor_name = tensor_name qa.quant_parameter_tensor_names.append(dt) qnt_annotations.append(qa) + + def get_tensor_sparsity(self, tensor_name): + """Returns the sparsity of a given tensor as dictionary.""" + graph = self._model_proto.graph + qnt_annotations = graph.quantization_annotation + ret = util.get_by_name(qnt_annotations, tensor_name, "tensor_name") + if ret is not None: + ret = util.get_by_name( + ret.quant_parameter_tensor_names, "tensor_sparsity", "key" + ) + if ret is not None: + return eval(ret.value) + return None + + def set_tensor_sparsity(self, tensor_name, sparsity_dict): + """Sets the sparsity annotation of a tensor with given name.""" + graph = self._model_proto.graph + qnt_annotations = graph.quantization_annotation + ret = util.get_by_name(qnt_annotations, tensor_name, "tensor_name") + if ret is not None: + ret_ts = util.get_by_name( + ret.quant_parameter_tensor_names, "tensor_sparsity", "key" + ) + if ret_ts is not None: + ret_ts.value = str(sparsity_dict) + else: + ts = onnx.StringStringEntryProto() + ts.key = "tensor_sparsity" + ts.value = str(sparsity_dict) + ret.quant_parameter_tensor_names.append(ts) + else: + qa = onnx.TensorAnnotation() + dt = onnx.StringStringEntryProto() + dt.key = "tensor_sparsity" + 
dt.value = str(sparsity_dict) + qa.tensor_name = tensor_name + qa.quant_parameter_tensor_names.append(dt) + qnt_annotations.append(qa) diff --git a/src/finn/core/remote_exec.py b/src/finn/core/remote_exec.py index 335dfec04e4abee41f914c5d912ce291a0d31a91..a533e4d36629f57f7c4a576570d75a1e051de5be 100644 --- a/src/finn/core/remote_exec.py +++ b/src/finn/core/remote_exec.py @@ -79,6 +79,12 @@ def remote_exec(model, execution_context): bash_command = ["/bin/bash", "-c", cmd] process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE) process_compile.communicate() + # remove stale output file from local dir, if any + try: + os.remove("{}/output.npy".format(deployment_dir)) + except FileNotFoundError: + pass + # copy generated output to local cmd = "sshpass -p {} scp -P{} {}@{}:{}/{}/output.npy {}".format( pynq_password, pynq_port, diff --git a/src/finn/core/throughput_test.py b/src/finn/core/throughput_test.py index c82d540e29fc59b92a22bf011e823a9f8c076843..8d3dabcf8af51327d5d951464c6d9b36e2f67497 100644 --- a/src/finn/core/throughput_test.py +++ b/src/finn/core/throughput_test.py @@ -30,10 +30,11 @@ import os import subprocess -def throughput_test(model): +def throughput_test(model, batchsize=1000): """Runs the throughput test for the given model remotely on the pynq board. The metadata properties related to the pynq board have to be set. - Returns a dictionary with results of the throughput test""" + Returns a dictionary with results of the throughput test. Returns None + if the test fails.""" pynq_ip = model.get_metadata_prop("pynq_ip") pynq_port = int(model.get_metadata_prop("pynq_port")) @@ -47,7 +48,8 @@ def throughput_test(model): cmd = ( "sshpass -p {} ssh {}@{} -p {} " '"cd {}/{}; echo "{}" | ' - 'sudo -S python3.6 driver.py --exec_mode="throughput_test" --batchsize=1000"' + 'sudo -S python3.6 driver.py --exec_mode="throughput_test" --batchsize=%d"' + % batchsize ).format( pynq_password, pynq_username, @@ -61,6 +63,12 @@ def throughput_test(model): process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE) process_compile.communicate() + # remove any pre-existing metrics file + try: + os.remove("{}/nw_metrics.txt".format(deployment_dir)) + except FileNotFoundError: + pass + cmd = "sshpass -p {} scp -P{} {}@{}:{}/{}/nw_metrics.txt {}".format( pynq_password, pynq_port, @@ -74,7 +82,9 @@ def throughput_test(model): process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE) process_compile.communicate() - with open("{}/nw_metrics.txt".format(deployment_dir), "r") as file: - res = eval(file.read()) - - return res + try: + with open("{}/nw_metrics.txt".format(deployment_dir), "r") as file: + res = eval(file.read()) + return res + except FileNotFoundError: + return None diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index a99d62fd18a958d37fef6b3e1939ef97e859b0b2..a688898f4a43b33fd3f07cda12144b84829e451f 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -40,6 +40,7 @@ from finn.util.basic import ( from finn.util.fpgadataflow import ( IPGenBuilder, pyverilate_get_liveness_threshold_cycles, + rtlsim_multi_io, ) from . import templates @@ -109,6 +110,31 @@ class HLSCustomOp(CustomOp): ) return verilog_file + def get_all_verilog_paths(self): + "Return list of all folders containing Verilog code for this node." 
+ + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + assert ( + code_gen_dir != "" + ), """Node attribute "code_gen_dir_ipgen" is + not set. Please run HLSSynthIP first.""" + verilog_path = "{}/project_{}/sol1/impl/verilog/".format( + code_gen_dir, self.onnx_node.name + ) + # default impl only returns the HLS verilog codegen dir + return [verilog_path] + + def get_all_verilog_filenames(self): + "Return list of all Verilog files used for this node." + + verilog_files = [] + verilog_paths = self.get_all_verilog_paths() + for verilog_path in verilog_paths: + for f in os.listdir(verilog_path): + if f.endswith(".v"): + verilog_files += [f] + return verilog_files + def prepare_rtlsim(self): """Creates a Verilator emulation library for the RTL code generated for this node, sets the rtlsim_so attribute to its path and returns @@ -116,24 +142,15 @@ class HLSCustomOp(CustomOp): if PyVerilator is None: raise ImportError("Installation of PyVerilator is required.") - # ensure that code is generated - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - assert ( - code_gen_dir != "" - ), """Node attribute "code_gen_dir_ipgen" is - not set. Please run HLSSynthIP first.""" - verilog_file = self.get_verilog_top_filename() - assert os.path.isfile(verilog_file), "Cannot find top-level Verilog file." + verilog_paths = self.get_all_verilog_paths() + verilog_files = self.get_all_verilog_filenames() # build the Verilator emu library sim = PyVerilator.build( - verilog_file, + verilog_files, build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), - verilog_path=[ - "{}/project_{}/sol1/impl/verilog/".format( - code_gen_dir, self.onnx_node.name - ) - ], + verilog_path=verilog_paths, trace_depth=get_rtlsim_trace_depth(), + top_module_name=self.get_verilog_top_module_name(), ) # save generated lib filename in attribute self.set_nodeattr("rtlsim_so", sim.lib._name) @@ -202,6 +219,7 @@ class HLSCustomOp(CustomOp): self.code_gen_dict["$CLKPERIOD$"] = [str(clk)] self.code_gen_dict["$EXTRA_DIRECTIVES$"] = self.ipgen_extra_directives() + template = self.ipgentcl_template for key in self.code_gen_dict: @@ -217,7 +235,7 @@ class HLSCustomOp(CustomOp): def ipgen_extra_directives(self): "Return a list of extra tcl directives for HLS synthesis." return [] - + def ipgen_singlenode_code(self): """Builds the bash script for ip generation using the IPGenBuilder from finn.util.fpgadataflow.""" @@ -302,14 +320,24 @@ Found no codegen dir for this node, did you run the prepare_cppsim transformatio ) def npy_to_dynamic_output(self, context): - """Reads the output from a .npy file and saves it at the right place in - the context dictionary.""" - # TODO support multi-output nodes as needed + """Reads the output from an output.npy file generated from cppsim and + places its content into the context dictionary.""" node = self.onnx_node code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") output = np.load("{}/output.npy".format(code_gen_dir)) context[node.output[0]] = output + def npy_to_dynamic_outputs(self, context, npy_list): + """Reads the output from .npy files generated from cppsim and places + their content into the context dictionary. 
+ npy_list is a list specifying which files to read, and its order must + match the order of node outputs.""" + node = self.onnx_node + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + for i in range(len(npy_list)): + output = np.load("{}/{}".format(code_gen_dir, npy_list[i])) + context[node.output[i]] = output + def exec_precompiled_singlenode_model(self): """Executes precompiled executable.""" executable_path = self.get_nodeattr("executable_path") @@ -405,6 +433,16 @@ compilation transformations? sim.stop_vcd_trace() return outputs + def rtlsim_multi_io(self, sim, io_dict): + "Run rtlsim for this node, supports multiple i/o streams." + + trace_file = self.get_nodeattr("rtlsim_trace") + if trace_file == "default": + trace_file = self.onnx_node.name + ".vcd" + num_out_values = self.get_number_output_values() + total_cycle_count = rtlsim_multi_io(sim, io_dict, num_out_values, trace_file) + self.set_nodeattr("sim_cycles", total_cycle_count) + def execute_node(self, context, graph): """Executes single node using cppsim or rtlsim.""" mode = self.get_nodeattr("exec_mode") diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py index e4d106068d4d128c66b2ce5f3d6c925dfe414b90..3e40ad70208909551365c51324153859ccc79ceb 100644 --- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py @@ -41,10 +41,19 @@ from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy # output 0 is the output tensor, shape NHWC: # = (1, OFMDim, OFMDim, (ConvKernelDim^2)*IFMChannels) +# note: the actual data layout produced by the hlslib kernels is different +# for depthwise and non-depthwise ops. +# * non-depthwise SWG: (1, OFMDim, OFMDim, K, K, IFMChannels/SIMD, SIMD) +# * depthwise SWG: (1, OFMDim, OFMDim, IFMChannels/SIMD, K, K, SIMD) +# see test_fpgadataflow_slidingwindow.py for an example of how to transform +# between the two layouts + class ConvolutionInputGenerator(HLSCustomOp): - """Class that corresponds to finn-hlslib ConvolutionInputGenerator - (sliding window) function.""" + """Class that corresponds to one of the finn-hlslib ConvolutionInputGenerator + (sliding window) function variants. Depending on the combination of + attributes (e.g. 
depthwise or not, whether k % stride is 0) a different + variant will be picked for the actual HLS implementation.""" def __init__(self, onnx_node): super().__init__(onnx_node) @@ -60,6 +69,7 @@ class ConvolutionInputGenerator(HLSCustomOp): # FINN DataTypes for inputs, weights, outputs "inputDataType": ("s", True, ""), "outputDataType": ("s", True, ""), + "depthwise": ("i", False, 0), # FPGA resource type for ConvolutionInputGenerator input buffer # auto -- let Vivado HLS decide # block -- use BRAM @@ -106,7 +116,6 @@ class ConvolutionInputGenerator(HLSCustomOp): pad = 0 ofm_dim = compute_conv_output_dim(ifm_dim, k, stride, pad) assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" - assert k % stride == 0, "stride must divide kernel size k" wf = int((k * k * ifm_ch) // simd) folded_oshape = (1, ofm_dim, ofm_dim, wf, simd) return folded_oshape @@ -305,12 +314,35 @@ class ConvolutionInputGenerator(HLSCustomOp): def docompute(self): node = self.onnx_node - self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}<ConvKernelDim1, IFMChannels1, Input_precision1, IFMDim1, - OFMDim1, SIMD1, Stride1> (in0, out, numReps);""".format( - node.op_type - ) - ] + ram_style = self.get_nodeattr("ram_style") + map_to_hls_ram_style = { + "auto": "ap_resource_dflt()", + "block": "ap_resource_bram()", + "distributed": "ap_resource_lutram()", + "ultra": "ap_resource_uram()", + } + hls_ram_style = map_to_hls_ram_style[ram_style] + hls_call = node.op_type + # check if non optimized ConvolutionInputGenerator is needed + k = self.get_nodeattr("ConvKernelDim") + stride = self.get_nodeattr("Stride") + if k % stride != 0: + hls_call += "_kernel_stride" + + if self.get_nodeattr("depthwise") == 1: + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{}_dws<ConvKernelDim1, IFMChannels1, Input_precision1, IFMDim1, + OFMDim1, SIMD1, Stride1> (in0, out, numReps, {});""".format( + hls_call, hls_ram_style + ) + ] + else: + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{}<ConvKernelDim1, IFMChannels1, Input_precision1, IFMDim1, + OFMDim1, SIMD1, Stride1> (in0, out, numReps, {});""".format( + hls_call, hls_ram_style + ) + ] def dataoutstrm(self): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -356,17 +388,3 @@ class ConvolutionInputGenerator(HLSCustomOp): self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE ap_ctrl_none port=return" ) - - def ipgen_extra_directives(self): - # add directive to control input buffer memory resources - ram_style = self.get_nodeattr("ram_style") - map_to_hls_ram_style = { - "auto": "RAM_2P", - "block": "RAM_2P_BRAM", - "distributed": "RAM_2P_LUTRAM", - "ultra": "RAM_2P_URAM", - } - hls_ram_style = map_to_hls_ram_style[ram_style] - directive = "set_directive_resource -core %s " % hls_ram_style - directive += "ConvolutionInputGenerator inputBuf" - return [directive] diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..54051af5e0387081a23e1f8fa77ec9e363098830 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py @@ -0,0 +1,361 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. 
+# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os + +import numpy as np + +from finn.core.datatype import DataType +from finn.custom_op.fpgadataflow import HLSCustomOp +from onnx import TensorProto, helper +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class DuplicateStreams_Batch(HLSCustomOp): + """Class that corresponds to finn-hlslib function of the same name.""" + + def __init__(self, onnx_node): + super().__init__(onnx_node) + + def get_nodeattr_types(self): + my_attrs = { + "NumChannels": ("i", True, 0), + "PE": ("i", True, 0), + # FINN DataTypes for input + "inputDataType": ("s", True, ""), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_normal_input_shape(self): + ch = self.get_nodeattr("NumChannels") + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [ch]) + return ishape + + def get_folded_input_shape(self): + ch = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + vecs = list(self.get_nodeattr("numInputVectors")) + assert ch % pe == 0, "PE must divide NumChannels" + folds = int(ch / pe) + folded_ishape = tuple(vecs + [folds, pe]) + return folded_ishape + + def get_normal_output_shape(self): + return self.get_normal_input_shape() + + def get_folded_output_shape(self): + return self.get_folded_input_shape() + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input shape." 
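As a minimal sketch (not part of the patch) of how the normal and folded shapes above relate, assuming illustrative values NumChannels=64, PE=8 and numInputVectors=[1, 4, 4]:

import numpy as np

# hypothetical attribute values, for illustration only
num_channels, pe, vecs = 64, 8, [1, 4, 4]
folds = num_channels // pe                    # 8 PE-sized groups per vector

normal_ishape = tuple(vecs + [num_channels])  # (1, 4, 4, 64)
folded_ishape = tuple(vecs + [folds, pe])     # (1, 4, 4, 8, 8)

x = np.random.randn(*normal_ishape).astype(np.float32)
x_folded = x.reshape(folded_ishape)           # same data, PE elements streamed per cycle
assert x_folded.reshape(normal_ishape).shape == normal_ishape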
+ # implement tensor with correct shape + values = np.random.randn(*oshape).astype(np.float32) + split_input = np.concatenate((values, values), axis=0) + return helper.make_node( + "Split", + inputs=[split_input], + outputs=[self.onnx_node.output[0], self.onnx_node.output[0]], + value=helper.make_tensor( + name="const_tensor", data_type=TensorProto.FLOAT, axis=0 + ), + ) + + def infer_node_datatype(self, model): + odt = self.get_output_datatype() + model.set_tensor_datatype(self.onnx_node.output[0], odt) + + def verify_node(self): + info_messages = [] + # verify that "domain" is set to "finn" + domain_value = self.onnx_node.domain + if domain_value == "finn": + info_messages.append("Attribute domain is set correctly") + else: + info_messages.append('Attribute domain should be set to "finn"') + + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify that all necessary attributes exist + try: + self.get_nodeattr("code_gen_dir_cppsim") + self.get_nodeattr("executable_path") + self.get_nodeattr("NumChannels") + self.get_nodeattr("PE") + self.get_nodeattr("inputDataType") + info_messages.append("All necessary attributes exist") + except Exception: + info_messages.append( + """The required GlobalAccPool_Batch attributes do not exist.""" + ) + + return info_messages + + def get_input_datatype(self): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType")] + + def get_output_datatype(self): + """Returns FINN DataType of output.""" + return DataType[self.get_nodeattr("inputDataType")] + + def get_instream_width(self): + """Returns input stream width.""" + ibits = self.get_input_datatype().bitwidth() + pe = self.get_nodeattr("PE") + in_width = pe * ibits + return in_width + + def get_outstream_width(self): + """Returns output stream width.""" + obits = self.get_output_datatype().bitwidth() + pe = self.get_nodeattr("PE") + out_width = pe * obits + return out_width + + def get_number_output_values(self): + return 2 * np.prod(self.get_folded_output_shape()[1:-1]) + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + folded_oshape = self.get_folded_output_shape() + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert inp.shape == exp_ishape, """Input shape doesn't match expected shape.""" + export_idt = self.get_input_datatype() + # reshape input into folded form + inp = inp.reshape(folded_ishape) + # make copy before saving array + reshaped_input = inp.copy() + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_outputs(context, ["output0.npy", "output1.npy"]) + assert ( + context[node.output[0]].shape == folded_oshape + ), "cppsim \ + did not produce expected folded output shape" + assert ( + context[node.output[1]].shape == folded_oshape + ), "cppsim \ + did not produce expected folded output shape" + context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) + context[node.output[1]] = context[node.output[1]].reshape(*exp_oshape) + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_dict = { + "inputs": {"in0": rtlsim_inp}, + "outputs": {"out0": [], "out1": []}, + } + self.rtlsim_multi_io(sim, rtlsim_dict) + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_shape = self.get_folded_output_shape() + + out_npy_path = "{}/output0.npy".format(code_gen_dir) + rtlsim_output_to_npy( + rtlsim_dict["outputs"]["out0"], + out_npy_path, + odt, + out_shape, + packed_bits, + target_bits, + ) + # load and reshape output 0 + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + + out_npy_path = "{}/output1.npy".format(code_gen_dir) + rtlsim_output_to_npy( + rtlsim_dict["outputs"]["out1"], + out_npy_path, + odt, + out_shape, + packed_bits, + target_bits, + ) + # load and reshape output 1 + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[1]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output0 shape doesn't match expected shape.""" + assert ( + context[node.output[1]].shape == exp_oshape + ), """Output1 shape doesn't match expected shape.""" + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"'] + + def defines(self, var): + self.code_gen_dict["$DEFINES$"] = [] + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0);' + % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + ) + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width()) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> out0 ("out0");'.format(self.get_outstream_width()) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> out1 ("out1");'.format(self.get_outstream_width()) + ) + + def docompute(self): + self.code_gen_dict["$DOCOMPUTE$"] = [ + """DuplicateStreams_Batch<{}, {}> (in0, out0, out1, 1);""".format( + self.get_outstream_width(), self.get_number_output_values() // 2, + ) + ] + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output0.npy" % code_gen_dir + npy_out1 = "%s/output1.npy" % code_gen_dir + oshape = self.get_folded_output_shape() + oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") + + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out0, %s, "%s");' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + oshape_cpp_str, + npy_out, + ) + ] + + self.code_gen_dict["$DATAOUTSTREAM$"] += [ + 'apintstream2npy<%s, %s, %d, %s>(out1, %s, "%s");' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + oshape_cpp_str, + npy_out1, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream<ap_uint<{}>> &in0, + hls::stream<ap_uint<{}>> &out0, + hls::stream<ap_uint<{}>> &out1)""".format( + self.onnx_node.name, + self.get_instream_width(), + self.get_outstream_width(), + self.get_outstream_width(), + ) + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out0") + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out1") + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE ap_ctrl_none port=return" + ) diff --git a/src/finn/custom_op/fpgadataflow/fmpadding.py b/src/finn/custom_op/fpgadataflow/fmpadding.py new file mode 100644 
index 0000000000000000000000000000000000000000..fa321dfa65d14b67fa218fb6a49f602ddab8d57e --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/fmpadding.py @@ -0,0 +1,302 @@ +import os +import numpy as np +from onnx import TensorProto, helper +from finn.core.datatype import DataType +from finn.custom_op.fpgadataflow import HLSCustomOp +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class FMPadding_Batch(HLSCustomOp): + """Corresponds to finn-hlslib FMPadding_Batch function. + Pads input image by given amount.""" + + def __init__(self, onnx_node): + super().__init__(onnx_node) + + def get_nodeattr_types(self): + my_attrs = { + # spatial size of input images + "ImgDim": ("i", True, 0), + # total padding (per dimension) to apply + "Padding": ("i", True, 2), + # number of channels in input image + "NumChannels": ("i", True, 0), + # FINN input datatype + "inputDataType": ("s", True, ""), + # controls distribution of padded pixels + # in case of uneven padding -- see FMPadding fxn + # in hlslib + "PaddingStyle": ("i", False, 2), + # shape describing input vecs per execution + "numInputVectors": ("i", False, 1), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_padded_odim(self): + "Return the padded spatial size of the output." + + idim = self.get_nodeattr("ImgDim") + pad = self.get_nodeattr("Padding") + return idim + pad + + def get_normal_input_shape(self): + idim = self.get_nodeattr("ImgDim") + num_ch = self.get_nodeattr("NumChannels") + + ishape = (1, idim, idim, num_ch) + return ishape + + def get_normal_output_shape(self): + odim = self.get_padded_odim() + num_ch = self.get_nodeattr("NumChannels") + + oshape = (1, odim, odim, num_ch) + return oshape + + def get_folded_input_shape(self): + # even though there is no folding in the current hlslib op, + # insert a time multiplexing axis to remain compatible with the + # shapes produced by the rest of the dataflow pipeline + ret = list(self.get_normal_input_shape()) + ret.insert(-1, 1) + return tuple(ret) + + def get_folded_output_shape(self): + # even though there is no folding in the current hlslib op, + # insert a time multiplexing axis to remain compatible with the + # shapes produced by the rest of the dataflow pipeline + ret = list(self.get_normal_output_shape()) + ret.insert(-1, 1) + return tuple(ret) + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input shape for FMPadding_Batch." 
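As a rough sketch (not part of the patch) of how the padded output shape follows from ImgDim, Padding and NumChannels; the values below are made up, and PaddingStyle only controls how an uneven padding amount is split between the two sides:

import numpy as np

# hypothetical attribute values, for illustration only
img_dim, padding, num_channels = 4, 2, 3
out_dim = img_dim + padding                   # get_padded_odim(): 4 + 2 = 6

ishape = (1, img_dim, img_dim, num_channels)  # (1, 4, 4, 3), NHWC
oshape = (1, out_dim, out_dim, num_channels)  # (1, 6, 6, 3)

# zero-pad spatially, splitting the total padding across both sides
pad_before = padding // 2
pad_after = padding - pad_before
x = np.random.randn(*ishape).astype(np.float32)
x_padded = np.pad(x, ((0, 0), (pad_before, pad_after), (pad_before, pad_after), (0, 0)))
assert x_padded.shape == oshape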
+ # implement tensor with correct shape + values = np.random.randn(*oshape).astype(np.float32) + return helper.make_node( + "Constant", + inputs=[], + outputs=[self.onnx_node.output[0]], + value=helper.make_tensor( + name="const_tensor", + data_type=TensorProto.FLOAT, + dims=values.shape, + vals=values.flatten().astype(float), + ), + ) + + def infer_node_datatype(self, model): + node = self.onnx_node + # data type stays the same + dtype = model.get_tensor_datatype(node.input[0]) + exp_idtype = self.get_input_datatype() + assert dtype == exp_idtype, "Unexpected datatype for FMPadding_Batch" + model.set_tensor_datatype(node.output[0], dtype) + + def verify_node(self): + pass + + def get_input_datatype(self): + """Returns FINN DataType of input.""" + ret = DataType[self.get_nodeattr("inputDataType")] + # the hlslib op always pads with zeros, so ensure that the DataType + # is able to represent zeros + assert ret.allowed(0), "FMPadding_Batch DataType must support zero" + return ret + + def get_output_datatype(self): + """Returns FINN DataType of output. (Same as input datatype)""" + return self.get_input_datatype() + + def get_instream_width(self): + ibits = self.get_input_datatype().bitwidth() + num_ch = self.get_nodeattr("NumChannels") + + return ibits * num_ch + + def get_outstream_width(self): + obits = self.get_output_datatype().bitwidth() + num_ch = self.get_nodeattr("NumChannels") + + return obits * num_ch + + def get_number_output_values(self): + folded_oshape = self.get_folded_output_shape() + return np.prod(folded_oshape[:-1]) + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"'] + + def defines(self, var): + self.code_gen_dict["$DEFINES$"] = [ + """#define ImgDim1 {}\n#define OutputDim1 {}\n + #define Padding1 {}\n#define NumChannels1 {}\n + #define PaddingStyle1 {}\n#define numReps {}\n""".format( + self.get_nodeattr("ImgDim"), + self.get_padded_odim(), + self.get_nodeattr("Padding"), + self.get_nodeattr("NumChannels"), + self.get_nodeattr("PaddingStyle"), + self.get_nodeattr("numInputVectors"), + ) + ] + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + if dtype == DataType.BIPOLAR: + # use binary for bipolar storage + dtype = DataType.BINARY + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0);' + % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + ) + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width()) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width()) + ) + + def docompute(self): + in_t = self.get_input_datatype().get_hls_datatype_str() + node = self.onnx_node + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{}<ImgDim1, OutputDim1, Padding1, NumChannels1, + {}, PaddingStyle1> (in0, out, numReps);""".format( + node.op_type, in_t + ) + ] + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + if dtype == DataType.BIPOLAR: + # use binary for bipolar 
storage + dtype = DataType.BINARY + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + oshape = self.get_folded_output_shape() + oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") + + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + oshape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)" + % (self.onnx_node.name, packed_hls_type, packed_hls_type) + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out") + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE ap_ctrl_none port=return" + ) + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_oshape = self.get_folded_output_shape() + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input shape doesn't + match expected shape (1, ImgDim, ImgDim, NumChannels).""" + export_idt = self.get_input_datatype() + + # no reshaping for input since assuming no folding on input + # make copy before saving array + inp = inp.copy() + np.save(os.path.join(code_gen_dir, "input_0.npy"), inp) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == folded_oshape + ), "cppsim did not produce expected folded output shape" + context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output shape doesn't match expected shape + (1, OutputDim, OutputDim, NumChannels).""" diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py index f650442401b49f1ad0a602b6b2ad3e50fbb5e5c2..9b73ba1e100aa83fd19aa8799195c99891fca3fd 100644 --- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py +++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py @@ -513,40 +513,44 @@ class StreamingFCLayer_Batch(HLSCustomOp): elif mem_mode == "decoupled": """Saves weights in corresponding file format for cppsim or rtlsim""" # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD) - # and save as unflipped weight tensor to be able to differentiate between - # flipped an unflipped weight tensor (has to be flipped for cppsim) - weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3)) - # flip PE dimension and reverse SIMD flip for saving weights in .npy - weight_tensor_flipped = np.flip(weight_tensor_unflipped, axis=-2) - weight_tensor_flipped = np.flip(weight_tensor_flipped, axis=-1) + # reverse SIMD flip for saving weights in .npy + weight_tensor_simd_flipped = np.flip(weight_tensor_unflipped, axis=-1) + # PE flip for saving weights in .dat + weight_tensor_pe_flipped = np.flip(weight_tensor_unflipped, axis=-2) - # reshape weight tensor (flipped and unflipped) to desired shape + # reshape weight tensor (simd_flipped and pe_flipped) to desired shape pe = self.get_nodeattr("PE") simd = self.get_nodeattr("SIMD") - # unflipped - weight_tensor_unflipped = weight_tensor_unflipped.reshape(1, -1, pe * simd) - weight_tensor_unflipped = weight_tensor_unflipped.copy() + # simd_flipped + weight_tensor_simd_flipped = weight_tensor_simd_flipped.reshape( + 1, -1, pe * simd + ) + weight_tensor_simd_flipped = weight_tensor_simd_flipped.copy() # flipped - weight_tensor_flipped = weight_tensor_flipped.reshape(1, -1, pe * simd) - weight_tensor_flipped = weight_tensor_flipped.copy() + weight_tensor_pe_flipped = weight_tensor_pe_flipped.reshape( + 1, -1, pe * simd + ) + weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy() """Saves weights into .npy file""" - np.save(os.path.join(code_gen_dir, "weights.npy"), weight_tensor_flipped) + np.save( + os.path.join(code_gen_dir, "weights.npy"), weight_tensor_simd_flipped + ) """Saves weights into .dat file""" # convert weight values into hexstring weight_width = self.get_weightstream_width() # pad to nearest 4 bits to get hex strings weight_width_padded = roundup_to_integer_multiple(weight_width, 4) - weight_tensor_unflipped = pack_innermost_dim_as_hex_string( - weight_tensor_unflipped, export_wdt, weight_width_padded, prefix="" + weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( + weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix="" ) - weight_stream_len = np.prod(weight_tensor_unflipped.shape) + weight_stream_len = np.prod(weight_tensor_pe_flipped.shape) factor = math.ceil(weight_stream_len / 1024) # add zeroes to pad out file to 1024 entries - weight_stream = weight_tensor_unflipped.flatten() + weight_stream = weight_tensor_pe_flipped.flatten() pad_amt = (factor * 1024) - weight_stream_len weight_stream = np.pad( weight_stream, (0, pad_amt), mode="constant", constant_values="0" diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py 
b/src/finn/custom_op/fpgadataflow/thresholding_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..fa33c70218fab16f106da45e296f0d59ae4ea606 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py @@ -0,0 +1,551 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from math import ceil +import os + +import numpy as np + +from onnx import TensorProto, helper +from finn.core.datatype import DataType +from finn.custom_op.fpgadataflow import HLSCustomOp +from finn.util.basic import interleave_matrix_outer_dim_from_partitions +from finn.util.data_packing import ( + npy_to_rtlsim_input, + numpy_to_hls_code, + rtlsim_output_to_npy, +) +from . import templates + +# ONNX i/o tensor shape assumptions for Thresholding: +# input 0 is the input tensor, shape (..., NumChannels) +# input 1 is the threshold tensor, shape (NumChannels, n_thres) +# output 0 is the output tensor, shape (..., NumChannels) - same as input +# the ... 
here can be any shape (representing groups of vectors) + + +class Thresholding_Batch(HLSCustomOp): + """Class that corresponds to finn-hls Thresholding_Batch function.""" + + def __init__(self, onnx_node): + super().__init__(onnx_node) + self.decoupled_wrapper = templates.decoupled_wrapper + + def get_nodeattr_types(self): + my_attrs = { + "PE": ("i", True, 0), + "NumChannels": ("i", True, 0), + # string defining memory type + "ram_style": ("s", False, "distributed"), + # FINN DataTypes for inputs, weights, outputs + "inputDataType": ("s", True, ""), + "outputDataType": ("s", True, ""), + # input and output FIFO depths + "inFIFODepth": ("i", False, 0), + "outFIFODepth": ("i", False, 0), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def calc_tmem(self): + """Calculates and returns TMEM.""" + mh = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + return mh // pe + + def make_shape_compatible_op(self, model): + oshape = self.get_normal_output_shape() + # implement tensor with correct shape + values = np.random.randn(*oshape).astype(np.float32) + return helper.make_node( + "Constant", + inputs=[], + outputs=[self.onnx_node.output[0]], + value=helper.make_tensor( + name="const_tensor", + data_type=TensorProto.FLOAT, + dims=values.shape, + vals=values.flatten().astype(float), + ), + ) + + def infer_node_datatype(self, model): + node = self.onnx_node + # check input datatype against property + idt_name = self.get_input_datatype().name + exp_idt_name = self.get_nodeattr("inputDataType") + assert exp_idt_name == idt_name, "Bad input DataType for Thresholding layer" + # set output datatype from property + odt = self.get_output_datatype() + model.set_tensor_datatype(node.output[0], odt) + + def verify_node(self): + info_messages = [] + # verify that "domain" is set to "finn" + domain_value = self.onnx_node.domain + if domain_value == "finn": + info_messages.append("Attribute domain is set correctly") + else: + info_messages.append('Attribute domain should be set to "finn"') + + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify that all necessary attributes exist + # TODO collect automatically from get_nodeattr_types + try: + self.get_nodeattr("code_gen_dir_cppsim") + self.get_nodeattr("executable_path") + self.get_nodeattr("NumChannels") + self.get_nodeattr("PE") + self.get_nodeattr("inputDataType") + self.get_nodeattr("outputDataType") + info_messages.append("All necessary attributes exist") + except Exception: + info_messages.append( + """The required Threshold_Batch attributes do not exist.""" + ) + + return info_messages + + def bram_estimation(self): + """Calculates BRAM cost if resource set to BRAM""" + style = self.get_nodeattr("ram_style") + P = self.get_nodeattr("PE") + idt = self.get_input_datatype() + A = idt.bitwidth() + tmem = self.calc_tmem() + + if style == "block" and tmem > 1: + return int(ceil(A * P / 16)) * int(ceil(tmem / 1024)) + else: + return 0 + + def lut_estimation(self): + """Calculates LUT cost, taking memory resource type into 
account """ + # TODO add in/out FIFO contributions + style = self.get_nodeattr("ram_style") + P = self.get_nodeattr("PE") + idt = self.get_input_datatype() + A = idt.bitwidth() + tmem = self.calc_tmem() + # cost of comparators + comparator_cost = A * P + # cost of LUTRAM + if style == "distributed" and tmem > 1: + lutram_cost = P * A * int(ceil(tmem / 64)) + else: + lutram_cost = 0 + # total cost + return comparator_cost + lutram_cost + + def get_input_datatype(self): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType")] + + def get_output_datatype(self): + """Returns FINN DataType of output.""" + return DataType[self.get_nodeattr("outputDataType")] + + def get_instream_width(self): + i_bits = self.get_input_datatype().bitwidth() + return i_bits * self.get_nodeattr("PE") + + def get_outstream_width(self): + o_bits = self.get_output_datatype().bitwidth() + return o_bits * self.get_nodeattr("PE") + + def get_folded_input_shape(self): + ich = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + fold = ich // pe + vecs = list(self.get_nodeattr("numInputVectors")) + folded_input_shape = tuple(vecs + [fold, pe]) + return folded_input_shape + + def get_folded_output_shape(self): + # same shape as input + return self.get_folded_input_shape() + + def get_normal_input_shape(self): + ich = self.get_nodeattr("NumChannels") + vecs = list(self.get_nodeattr("numInputVectors")) + normal_input_shape = tuple(vecs + [ich]) + return normal_input_shape + + def get_normal_output_shape(self): + # same shape as input + return self.get_normal_input_shape() + + def get_number_output_values(self): + nf = np.prod(self.get_folded_output_shape()[:-1]) + return nf + + def get_template_param_values(self): + """Returns the template parameter values according to input, output and weight + data types.""" + ret = dict() + inp_hls_str = self.get_input_datatype().get_hls_datatype_str() + out_hls_str = self.get_output_datatype().get_hls_datatype_str() + # fill in TSrcI + ret["TSrcI"] = "Slice<%s>" % inp_hls_str + # fill in TDstI + ret["TDstI"] = "Slice<%s>" % out_hls_str + + return ret + + def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): + """Convert the original numpy weight matrix orig_weight_matrix into + a form suitable for passing to the hlslib call: + * ensure MH % PE == 0 + * for unsigned inputs, ensure thresholds are positive + * interleave rows between PEs + * reshape into (PE, TMEM, n_thres_steps) and return + """ + mh = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + tmem = mh // pe + assert mh % pe == 0, "Requirement NumChannels divisable by PE is violated." 
+ assert ( + orig_thres_matrix.ndim == 2 + ), """Threshold matrix dimension is + not as expected (2).""" + n_thres_steps = orig_thres_matrix.shape[1] + if not self.get_input_datatype().signed(): + # ensure all thresholds are nonnegative + assert (orig_thres_matrix >= 0).all() + # ensure all thresholds are integer + assert (orig_thres_matrix.astype(np.int32) == orig_thres_matrix).all() + ret = orig_thres_matrix + # ensure channels = mh , duplicating if necessary + if ret.shape[0] == 1: + ret = np.tile(ret, (mh, 1)) + assert ( + ret.shape[0] == mh + ), "Channels of threshold matrix are not as expected (mh)" + # distribute rows between PEs + ret = interleave_matrix_outer_dim_from_partitions(ret, pe) + assert ( + ret.shape[0] == pe + ), """First dimension after distribution of the + rows between PEs is not as expected (pe)""" + assert ( + ret.shape[1] == tmem + ), """Second dimension after distribution of the + rows between PEs is not as expected (tmem)""" + assert ( + ret.shape[2] == n_thres_steps + ), """Third dimension after distribution of the + rows between PEs is not as expected (n_thres_steps)""" + return ret.reshape(1, pe, tmem, n_thres_steps) + + def generate_params(self, model, path): + code_gen_dir = path + # save thresholds in thresh.h + thresholds = model.get_initializer(self.onnx_node.input[1]) + + threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + tdt = DataType.INT32 + thresholds_hls_code = numpy_to_hls_code( + threshold_tensor, tdt, "thresholds", False, True + ) + # write thresholds into thresh.h + f_thresh = open("{}/thresh.h".format(code_gen_dir), "w") + tdt_hls = tdt.get_hls_datatype_str() + # use binary to export bipolar activations + export_odt = self.get_output_datatype() + if self.get_output_datatype() == DataType.BIPOLAR: + export_odt = DataType.BINARY + odt_hls = export_odt.get_hls_datatype_str() + f_thresh.write( + "static ThresholdsActivation<{},{},{},{},{},{},{}> threshs \ + = ".format( + self.calc_tmem(), + self.get_nodeattr("PE"), + threshold_tensor.shape[-1], + tdt_hls, + odt_hls, + export_odt.min(), + "std::less_equal<%s>" % tdt_hls, + ) + ) + f_thresh.write(thresholds_hls_code) + f_thresh.close() + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + + # TODO ensure codegen dir exists + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + # create a npy file fore each input of the node (in_ind is input index) + in_ind = 0 + for inputs in node.input: + # it is assumed that the first input of the node is the data input + # the second input are the weights + # the third input are the thresholds + if in_ind == 0: + assert ( + str(context[inputs].dtype) == "float32" + ), """Input datatype is + not float32 as expected.""" + expected_inp_shape = self.get_folded_input_shape() + reshaped_input = context[inputs].reshape(expected_inp_shape) + if self.get_input_datatype() == DataType.BIPOLAR: + # store bipolar activations as binary + reshaped_input = (reshaped_input + 1) / 2 + export_idt = DataType.BINARY + else: + export_idt = self.get_input_datatype() + # make copy before saving the array + reshaped_input = reshaped_input.copy() + np.save( + os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), + reshaped_input, + ) + elif in_ind > 2: + raise Exception("Unexpected input found for StreamingFCLayer") + in_ind += 1 + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + # reinterpret binary output as bipolar where needed + if self.get_output_datatype() == DataType.BIPOLAR: + out = context[node.output[0]] + out = 2 * out - 1 + context[node.output[0]] = out + assert ( + context[node.output[0]].shape == self.get_folded_output_shape() + ), """Output shape is not as expected""" + # reshape output to have expected shape + oshape = self.get_normal_output_shape() + context[node.output[0]] = context[node.output[0]].reshape(*oshape) + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + output = self.rtlsim(sim, inp) + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + + # load and reshape output + output = np.load(out_npy_path) + oshape = self.get_normal_output_shape() + output = np.asarray([output], dtype=np.float32).reshape(*oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "activations.hpp"'] + self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] + + # TODO check and add whatever missing + def defines(self, var): + numInputVectors = list(self.get_nodeattr("numInputVectors")) + numReps = numInputVectors[0] + self.code_gen_dict["$DEFINES$"] = [ + """#define NumChannels1 {}\n #define PE1 {}\n #define numReps {}""".format( + self.get_nodeattr("NumChannels"), self.get_nodeattr("PE"), numReps, + ) + ] + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + # note: the innermost dim is reversed for the input + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0, false);' + % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + ) + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width()) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width()) + ) + + def docompute(self): + tmpl_args = self.get_template_param_values() + # TODO: why put some template parameters into defines and not others? + # should ImgDim be defined or just filled in here like we do now? 
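+        # note: imgdim below is derived from the folded input shape; a 3-dim
+        # folded shape has no spatial dimension (imgdim = 1), while a 5-dim
+        # folded shape carries the spatial size in dimension 1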
+ node = self.onnx_node + ishape = self.get_folded_input_shape() + if len(ishape) == 3: + imgdim = 1 + elif len(ishape) == 5: + imgdim = ishape[1] + else: + raise Exception("""Unexpeted input shape""") + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{}<{}, NumChannels1, PE1, {}, {}> + (in0, out, threshs, numReps);""".format( + node.op_type, imgdim, tmpl_args["TSrcI"], tmpl_args["TDstI"], + ) + ] + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + if dtype == DataType.BIPOLAR: + # use binary for bipolar storage + dtype = DataType.BINARY + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + shape = self.get_folded_output_shape() + shape_cpp_str = str(shape).replace("(", "{").replace(")", "}") + + # note: the innermost dim is not reversed for the output + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + shape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream<ap_uint<{}>> &in0, + hls::stream<ap_uint<{}>> &out + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.get_outstream_width(), + ) + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"] + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out") + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE ap_ctrl_none port=return" + ) + + # the threshold tensor is acc_type [PE][TMEM][N_THRES] + # partition for parallel access along PE and N_THRES + # dimensions (dims 1 and 3) + self.code_gen_dict["$PRAGMAS$"].append( + ( + "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " + "complete dim=1" + ) + ) + self.code_gen_dict["$PRAGMAS$"].append( + ( + "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " + "complete dim=3" + ) + ) + # set resource type + ram_style = self.get_nodeattr("ram_style") + pe = self.get_nodeattr("PE") + ich = self.get_nodeattr("NumChannels") + # if PE less than NumChannels, assign cores according to ram_style; + # otherwise if PE == NumChannels, Vivado HLS will unroll to FFs + if pe < ich: + if ram_style == "distributed": + self.code_gen_dict["$PRAGMAS$"].append( + ( + "#pragma HLS RESOURCE variable=threshs.m_thresholds " + "core=ROM_2P_LUTRAM" + ) + ) + elif ram_style == "block": + self.code_gen_dict["$PRAGMAS$"].append( + ( + "#pragma HLS RESOURCE variable=threshs.m_thresholds " + "core=ROM_2P_BRAM" + ) + ) + else: + raise Exception( + """Invalid value for attribute ram_style! 
Is currently set to: {} + has to be set to one of ("block", "distributed")""".format( + ram_style + ) + ) diff --git a/src/finn/custom_op/im2col.py b/src/finn/custom_op/im2col.py index 16446c15d46ee7996162f864708f7fde6cfedaf3..82a6b140f7af1be4e5c0f429d077b99c7865383e 100644 --- a/src/finn/custom_op/im2col.py +++ b/src/finn/custom_op/im2col.py @@ -21,8 +21,6 @@ def get_im2col_indices_nchw( """Returns im2col indices.""" # First figure out what the size of the output should be N, C, H, W = x_shape - assert (H + 2 * padding - field_height) % stride_y == 0 - assert (W + 2 * padding - field_width) % stride_x == 0 out_height = compute_conv_output_dim(H, field_height, stride_y, padding) out_width = compute_conv_output_dim(W, field_width, stride_x, padding) @@ -70,6 +68,9 @@ def im2col_indices_nchw( # * ifm is the number of input channels # * k is the convolutional kernel size +# note: for the innermost (dot product) dimension of k*k*ifm, we +# assume an internal ordering (k, k, ifm) + class Im2Col(CustomOp): def get_nodeattr_types(self): diff --git a/src/finn/custom_op/registry.py b/src/finn/custom_op/registry.py index 2dbff507c7b84a430da0d45563af6ab99f1971e8..2dae826cf9712bef17d0053a0878c41ef51fec36 100644 --- a/src/finn/custom_op/registry.py +++ b/src/finn/custom_op/registry.py @@ -44,9 +44,12 @@ from finn.custom_op.fpgadataflow.streamingdatawidthconverter_batch import ( StreamingDataWidthConverter_Batch, ) from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch +from finn.custom_op.fpgadataflow.fmpadding import FMPadding_Batch +from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch from finn.custom_op.fpgadataflow.addstreams_batch import AddStreams_Batch from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch from finn.custom_op.quantavgpool2d import QuantAvgPool2d +from finn.custom_op.fpgadataflow.duplicatestreams_batch import DuplicateStreams_Batch # create a mapping of all known CustomOp names and classes custom_op = {} @@ -63,9 +66,12 @@ custom_op["MaxPoolNHWC"] = MaxPoolNHWC custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch custom_op["StreamingFIFO"] = StreamingFIFO custom_op["GlobalAccPool_Batch"] = GlobalAccPool_Batch +custom_op["FMPadding_Batch"] = FMPadding_Batch +custom_op["Thresholding_Batch"] = Thresholding_Batch custom_op["AddStreams_Batch"] = AddStreams_Batch custom_op["LabelSelect_Batch"] = LabelSelect_Batch custom_op["QuantAvgPool2d"] = QuantAvgPool2d +custom_op["DuplicateStreams_Batch"] = DuplicateStreams_Batch def getCustomOp(node): diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py index dbd98623c4cdf5baca9fa9c137debf8be0f70981..d421a5f3ef8ca980b399087de1482b2ae913da1b 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py @@ -26,13 +26,14 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-from onnx import helper +from onnx import helper, TensorProto from finn.core.datatype import DataType from finn.transformation import Transformation from finn.custom_op.registry import getCustomOp from finn.transformation.infer_shapes import InferShapes from finn.transformation.infer_datatypes import InferDataTypes +import finn.core.data_layout as DataLayout class InferConvInpGen(Transformation): @@ -58,27 +59,61 @@ class InferConvInpGen(Transformation): ifm_ch = i2c_in_shape[-1] ifm_dim = i2c_in_shape[1] ofm_dim = i2c_out_shape[1] - # if padding enabled, ensure pad_val supported by DataType + + # default params for ConvolutionInputGenerator + ConvInpGen_node_idx = node_ind + ConvInpGen_input = i2c_input + ConvInpGen_idim = ifm_dim + if pad > 0: + # if padding enabled, ensure pad_val supported by DataType assert dt.allowed(pad_val), "Im2Col DataType must support pad_val" + + odim_padding = ifm_dim + 2 * pad + + padding_out = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + (1, odim_padding, odim_padding, ifm_ch), + ) + graph.value_info.append(padding_out) + padding_out = padding_out.name + model.set_tensor_datatype(padding_out, dt) + + ConvInpGen_node_idx += 1 + ConvInpGen_input = padding_out + ConvInpGen_idim = odim_padding + + padding_node = helper.make_node( + "FMPadding_Batch", + [i2c_input], + [padding_out], + domain="finn", + backend="fpgadataflow", + ImgDim=ifm_dim, + Padding=2 * pad, + NumChannels=ifm_ch, + inputDataType=dt.name, + ) + graph.node.insert(node_ind, padding_node) + # create equivalent ConvolutionInputGenerator node - # TODO support padding - new_node = helper.make_node( + ConvInpGen_node = helper.make_node( "ConvolutionInputGenerator", - [i2c_input], + [ConvInpGen_input], [i2c_output], domain="finn", backend="fpgadataflow", ConvKernelDim=k, IFMChannels=ifm_ch, - IFMDim=ifm_dim, + IFMDim=ConvInpGen_idim, OFMDim=ofm_dim, SIMD=ifm_ch, Stride=stride, inputDataType=dt.name, outputDataType=dt.name, ) - graph.node.insert(node_ind, new_node) + graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) # remove old nodes graph.node.remove(n) graph_modified = True @@ -398,3 +433,59 @@ class InferQuantizedStreamingFCLayer(Transformation): model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) return (model, graph_modified) + + +class InferThresholdingLayer(Transformation): + """Convert any MultiThreshold into a standalone thresholding HLS layer.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + node_ind += 1 + if node.op_type == "MultiThreshold": + thl_input = node.input[0] + thl_threshold = node.input[1] + thl_output = node.output[0] + thl_in_shape = model.get_tensor_shape(thl_input) + idt = model.get_tensor_datatype(thl_input) + + # skip conversion for layers with float input + if not idt.is_integer(): + continue + + # skip conversion if input is not NHWC or NC + thl_in_layout = model.get_tensor_layout(thl_input) + if thl_in_layout != DataLayout.NHWC and thl_in_layout != DataLayout.NC: + continue + + # now safe to assume number of channels is in last dimension + ifc = int(thl_in_shape[-1]) + # create node with no parallelization first + pe = 1 + assert ifc % pe == 0, "Requirement IFC divisable by PE is violated." 
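+                # note: the MultiThreshold node is replaced one-to-one below; its
+                # data input and threshold initializer are reused as the two
+                # inputs of the new Thresholding_Batch node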
+ + odt = model.get_tensor_datatype(thl_output) + # create and insert new StreamingFCLayer node + new_node = helper.make_node( + "Thresholding_Batch", + [thl_input, thl_threshold], + [thl_output], + domain="finn", + backend="fpgadataflow", + NumChannels=ifc, + PE=pe, + inputDataType=idt.name, + outputDataType=odt.name, + numInputVectors=list(thl_in_shape[:-1]), + ) + graph.node.insert(node_ind, new_node) + # remove old node + graph.node.remove(node) + graph_modified = True + + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py index 049ede5064d252bd6391184c4227e5367a8c1e2b..18d3db18da089a5dda4dbb6d97180dd4a20613b5 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py @@ -107,6 +107,13 @@ class MakePYNQDriver(Transformation): driver = driver.replace("$OUTPUT_SHAPE_FOLDED$", mss(o_tensor_shape_folded)) driver = driver.replace("$OUTPUT_SHAPE_PACKED$", mss(o_tensor_shape_packed)) + # clock settings for driver + clk_ns = float(model.get_metadata_prop("clk_ns")) + fclk_mhz = 1 / (clk_ns * 0.001) + # TODO change according to PYNQ board? + driver = driver.replace("$CLK_NAME$", "fclk0_mhz") + driver = driver.replace("$CLOCK_FREQ_MHZ$", str(fclk_mhz)) + with open(driver_py, "w") as f: f.write(driver) # copy all the dependencies into the driver folder diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py index 55ecb57decd2ac4fa08331b5ebbcb7fd2f0cd5c6..ab9fd03251819aee72f74cc0c1fa17b99b1e05a4 100644 --- a/src/finn/transformation/fpgadataflow/templates.py +++ b/src/finn/transformation/fpgadataflow/templates.py @@ -91,7 +91,7 @@ cd %s pynq_driver_template = """ import argparse - +import os from pynq import Overlay import numpy as np from pynq import allocate @@ -101,6 +101,7 @@ from finn.util.data_packing import ( packed_bytearray_to_finnpy ) from finn.core.datatype import DataType +from pynq.ps import Clocks class FINNAccelDriver(): def __init__(self, N, bitfile): @@ -118,8 +119,12 @@ class FINNAccelDriver(): self.oshape_folded = $OUTPUT_SHAPE_FOLDED$ self.ishape_packed = $INPUT_SHAPE_PACKED$ # datatype np.uint8 self.oshape_packed = $OUTPUT_SHAPE_PACKED$ # datatype np.uint8 + # clock frequency + self.fclk_mhz = $CLOCK_FREQ_MHZ$ # load bitfile and set up accelerator self.ol = Overlay(bitfile) + # set the clock frequency as specified by user during transformations + Clocks.$CLK_NAME$ = self.fclk_mhz self.dma = self.ol.axi_dma_0 self.ctrl_regs = self.ol.resize_accel_0 # neuron folding factor of output = iterations per sample @@ -202,6 +207,12 @@ if __name__ == "__main__": # for the remote execution the data from the input npy file has to be loaded, # packed and copied to the PYNQ buffer if exec_mode == "execute": + # remove old output file to prevent reusing old output + # in case execution fails + try: + os.remove(outputfile) + except FileNotFoundError: + pass # load desired input .npy file ibuf_normal = np.load(inputfile) ibuf_folded = finnDriver.fold_input(ibuf_normal) @@ -212,10 +223,15 @@ if __name__ == "__main__": # for the throughput test the runtime of the network has to be measured if exec_mode == "throughput_test": - # measure runtime of network - start = time.time() + # remove old metrics file + try: + os.remove("nw_metrics.txt") + except 
FileNotFoundError: + pass # dictionary for results of throughput test res={} + # measure runtime of network + start = time.time() # execute accelerator finnDriver.execute() @@ -228,6 +244,8 @@ if __name__ == "__main__": res["throughput[images/s]"] = N / runtime res["DRAM_in_bandwidth[Mb/s]"] = np.prod(finnDriver.ishape_packed)*0.000001 / runtime res["DRAM_out_bandwidth[Mb/s]"] = np.prod(finnDriver.oshape_packed)*0.000001 / runtime + res["fclk[mhz]"] = Clocks.fclk0_mhz + res["N"] = N file = open("nw_metrics.txt", "w") file.write(str(res)) file.close() diff --git a/src/finn/transformation/general.py b/src/finn/transformation/general.py index 53c73e1dc4fe0bfab53e3f126add992cb338c11d..488391740fc25f1f7caa657adc9ed55bdc2f9722 100644 --- a/src/finn/transformation/general.py +++ b/src/finn/transformation/general.py @@ -28,6 +28,7 @@ import finn.util.basic as util from finn.transformation import Transformation +from toposort import toposort_flatten class GiveUniqueNodeNames(Transformation): @@ -81,6 +82,93 @@ class GiveReadableTensorNames(Transformation): return (model, False) +class GiveUniqueParameterTensors(Transformation): + """Make every parameter tensor unique. The aim is to avoid affecting + other nodes apart from the one the system is currently operating on.""" + + def apply(self, model): + graph = model.graph + graph_modified = False + seen_parameters = [] + for n in graph.node: + # copy inputs since they may be modified + node_inputs_list = [x for x in n.input] + for input_idx, node_input in enumerate(node_inputs_list): + # check if it's a parameter + input_init = model.get_initializer(node_input) + if input_init is None: + # dynamic input + continue + + # check if repeated + if node_input not in seen_parameters: + # first occurance + seen_parameters += [node_input] + continue + + new_param_name = model.make_new_valueinfo_name() + + model.set_initializer(new_param_name, input_init) + model.set_tensor_datatype( + new_param_name, model.get_tensor_datatype(node_input) + ) + + # point node input to new tensor + n.input[input_idx] = new_param_name + + return (model, graph_modified) + + +class SortGraph(Transformation): + """ Returns the model with its node list sorted topologically. + Any ONNX graph to be executed must have a topologically sorted node list, as dictated + by the ONNX standard. 
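+    Typical usage (a minimal sketch, assuming `model` is a ModelWrapper):
+        model = model.transform(SortGraph())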
+ """ + + # Notes on SortGraph performance: + # benchmark in tests/transformation/test_sort_graph.py + # + # The algorithm doesn't move initializers so its performance should only depend on + # the number of nodes + # + # Relative order of magnitudes for time per step: + # - Gather graph structure: base + # - Sort nodes: 0.1 of base + # - Remove and insert in order : 0.001 of base + # + # Notes: + # Remove nodes and insert them in order: + # Probably this is faster than copying initializers and more robust in general + + def apply(self, model): + # Gather graph structure + graph_dependencies = {} + node_list = [ + n for n in model.graph.node + ] # I also need the list to remove the nodes + for node_idx, n in enumerate(node_list): + node_pred = model.find_direct_predecessors(n) + if node_pred is None: + # Will also eliminate nodes that are floating around for some reason + continue + + node_dependencies = [node_list.index(pred) for pred in node_pred] + graph_dependencies[node_idx] = set(node_dependencies) + + # Sort nodes + sorted_node_indexes = toposort_flatten(graph_dependencies) + + # Remove nodes and insert them in order + # Can't remove nodes before if I want to use model.find_direct_predecessors() + for n in node_list: + model.graph.node.remove(n) + + for new_idx, sorted_idx in enumerate(sorted_node_indexes): + model.graph.node.insert(new_idx, node_list[sorted_idx]) + + return model, False + + class ConvertSubToAdd(Transformation): """Convert subtract-a-constant nodes to add-a-constant nodes.""" diff --git a/src/finn/transformation/streamline/absorb.py b/src/finn/transformation/streamline/absorb.py index 0d709297a9132b15b51435b7ab4b51ce55c7e9f3..dbcf97361017144174f9fbfca35a84361b5abd26 100644 --- a/src/finn/transformation/streamline/absorb.py +++ b/src/finn/transformation/streamline/absorb.py @@ -46,7 +46,11 @@ class AbsorbAddIntoMultiThreshold(Transformation): graph_modified = False for n in graph.node: node_ind += 1 - if n.op_type == "Add": + if ( + n.op_type == "Add" + and not model.is_fork_node(n) + and not model.is_join_node(n) + ): consumer = model.find_consumer(n.output[0]) if consumer is not None and consumer.op_type == "MultiThreshold": add_weight_name = n.input[1] @@ -83,7 +87,11 @@ class AbsorbMulIntoMultiThreshold(Transformation): graph_modified = False for n in graph.node: node_ind += 1 - if n.op_type == "Mul": + if ( + n.op_type == "Mul" + and not model.is_fork_node(n) + and not model.is_join_node(n) + ): mul_weight_name = n.input[1] A = model.get_initializer(mul_weight_name) assert A is not None, "Initializer for mul weights is not set." 
diff --git a/src/finn/transformation/streamline/collapse_repeated.py b/src/finn/transformation/streamline/collapse_repeated.py index aa059747b602bc6b659bc8b53b1f18988bba1ef0..67824ad4f633983b93e3178d03118927a1ddd85b 100644 --- a/src/finn/transformation/streamline/collapse_repeated.py +++ b/src/finn/transformation/streamline/collapse_repeated.py @@ -48,9 +48,17 @@ class CollapseRepeatedOp(Transformation): graph_modified = False for n in graph.node: node_ind += 1 - if n.op_type == self.op_name: + if ( + n.op_type == self.op_name + and not model.is_fork_node(n) + and not model.is_join_node(n) + ): consumer = model.find_consumer(n.output[0]) - if consumer is not None and consumer.op_type == self.op_name: + if ( + consumer is not None + and consumer.op_type == self.op_name + and not model.is_join_node(consumer) + ): op0_param_name = n.input[1] op1_param_name = consumer.input[1] op0_param = model.get_initializer(op0_param_name) diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py index 1886c785705161c3a13493de44dc3f3f86463f4f..0b6259a61d3eb67b7b38d4c6939019ce2893a875 100644 --- a/src/finn/transformation/streamline/reorder.py +++ b/src/finn/transformation/streamline/reorder.py @@ -36,8 +36,9 @@ from finn.util.basic import get_by_name class MoveAddPastMul(Transformation): - """Move add operations past multiply operations. The aim is to have them - next to each other such that they can be collapsed into a single add.""" + """Move add operations past multiply operations on linear segments of the graph. + The aim is to have them next to each other such that they can be collapsed into + a single add.""" def apply(self, model): graph = model.graph @@ -45,9 +46,17 @@ class MoveAddPastMul(Transformation): graph_modified = False for n in graph.node: node_ind += 1 - if n.op_type == "Add": + if ( + n.op_type == "Add" + and not model.is_fork_node(n) + and not model.is_join_node(n) + ): consumer = model.find_consumer(n.output[0]) - if consumer is not None and consumer.op_type == "Mul": + if ( + consumer is not None + and consumer.op_type == "Mul" + and not model.is_join_node(consumer) + ): # have: (x) -> add(,B) -> (x+B) -> mul(,A) -> (xA+BA) # want: (x) -> mul(,A) -> (xA) -> add(,BA) -> (xA+BA) # assume input 0 is from the previous layer, input 1 is the @@ -63,12 +72,16 @@ class MoveAddPastMul(Transformation): end_name = consumer.output[0] # compute new param value for add BA = B * A + # make and insert new nodes new_mul = oh.make_node( - "Mul", [start_name, mul_weight_name], [middle_name] + "Mul", + [start_name, mul_weight_name], + [middle_name], + name=consumer.name, ) new_add = oh.make_node( - "Add", [middle_name, add_weight_name], [end_name] + "Add", [middle_name, add_weight_name], [end_name], name=n.name ) graph.node.insert(node_ind, new_mul) graph.node.insert(node_ind + 1, new_add) @@ -78,6 +91,7 @@ class MoveAddPastMul(Transformation): graph.node.remove(n) graph.node.remove(consumer) graph_modified = True + model = model.transform(InferShapes()) return (model, graph_modified) @@ -92,9 +106,17 @@ class MoveScalarMulPastMatMul(Transformation): graph_modified = False for n in graph.node: node_ind += 1 - if n.op_type == "Mul": + if ( + n.op_type == "Mul" + and not model.is_fork_node(n) + and not model.is_join_node(n) + ): consumer = model.find_consumer(n.output[0]) - if consumer is not None and consumer.op_type == "MatMul": + if ( + consumer is not None + and consumer.op_type == "MatMul" + and not model.is_join_node(consumer) + ): mul_weight_name = 
n.input[1] matmul_weight_name = consumer.input[1] A = model.get_initializer(mul_weight_name) @@ -109,10 +131,16 @@ class MoveScalarMulPastMatMul(Transformation): # if the mul is scalar, we can simply swap the order of ops # make and insert new nodes new_matmul = oh.make_node( - "MatMul", [start_name, matmul_weight_name], [middle_name] + "MatMul", + [start_name, matmul_weight_name], + [middle_name], + name=consumer.name, ) new_mul = oh.make_node( - "Mul", [middle_name, mul_weight_name], [end_name] + "Mul", + [middle_name, mul_weight_name], + [end_name], + name=n.name, ) graph.node.insert(node_ind, new_matmul) graph.node.insert(node_ind + 1, new_mul) @@ -135,9 +163,17 @@ class MoveScalarAddPastMatMul(Transformation): graph_modified = False for n in graph.node: node_ind += 1 - if n.op_type == "Add": + if ( + n.op_type == "Add" + and not model.is_fork_node(n) + and not model.is_join_node(n) + ): consumer = model.find_consumer(n.output[0]) - if consumer is not None and consumer.op_type == "MatMul": + if ( + consumer is not None + and consumer.op_type == "MatMul" + and not model.is_join_node(consumer) + ): add_weight_name = n.input[1] matmul_weight_name = consumer.input[1] A = model.get_initializer(add_weight_name) @@ -155,10 +191,16 @@ class MoveScalarAddPastMatMul(Transformation): # update the add weight model.set_initializer(add_weight_name, Anew) new_matmul = oh.make_node( - "MatMul", [start_name, matmul_weight_name], [middle_name] + "MatMul", + [start_name, matmul_weight_name], + [middle_name], + name=consumer.name, ) new_add = oh.make_node( - "Add", [middle_name, add_weight_name], [end_name] + "Add", + [middle_name, add_weight_name], + [end_name], + name=n.name, ) graph.node.insert(node_ind, new_matmul) graph.node.insert(node_ind + 1, new_add) @@ -181,9 +223,17 @@ class MoveScalarAddPastConv(Transformation): graph_modified = False for n in graph.node: node_ind += 1 - if n.op_type == "Add": + if ( + n.op_type == "Add" + and not model.is_fork_node(n) + and not model.is_join_node(n) + ): consumer = model.find_consumer(n.output[0]) - if consumer is not None and consumer.op_type == "Conv": + if ( + consumer is not None + and consumer.op_type == "Conv" + and not model.is_join_node(consumer) + ): conv_node = consumer add_node = n add_weight_name = n.input[1] @@ -194,7 +244,12 @@ class MoveScalarAddPastConv(Transformation): start_name = n.input[0] end_name = consumer.output[0] conv_out_shape = model.get_tensor_shape(end_name) - if all(x == 1 for x in A.shape): + + using_padding = True + pads = list(get_by_name(consumer.attribute, "pads").ints) + if sum(pads) == 0: + using_padding = False + if all(x == 1 for x in A.shape) and not using_padding: # create a tensor filled with the add constant, in # the shape expected by the convolution conv_in_const = np.zeros(conv_in_shape, dtype=np.float32) @@ -206,7 +261,8 @@ class MoveScalarAddPastConv(Transformation): execute_node(conv_node, exec_ctx, model.graph) # retrieve the conv output Anew = exec_ctx[end_name] - # strip out repetition + + # strip out repetition if no padding Anew = Anew[0, :, 0, 0].reshape(1, -1, 1, 1) # update the add weight model.set_initializer(add_weight_name, Anew) @@ -224,6 +280,7 @@ class MoveScalarAddPastConv(Transformation): graph.node.remove(add_node) graph.node.insert(node_ind, add_node) graph_modified = True + model = model.transform(InferShapes()) return (model, graph_modified) @@ -238,9 +295,17 @@ class MoveScalarMulPastConv(Transformation): graph_modified = False for n in graph.node: node_ind += 1 - if n.op_type == "Mul": + 
if ( + n.op_type == "Mul" + and not model.is_fork_node(n) + and not model.is_join_node(n) + ): consumer = model.find_consumer(n.output[0]) - if consumer is not None and consumer.op_type == "Conv": + if ( + consumer is not None + and consumer.op_type == "Conv" + and not model.is_join_node(consumer) + ): mul_weight_name = n.input[1] A = model.get_initializer(mul_weight_name) assert A is not None, "Initializer for mul weights is not set." @@ -379,3 +444,90 @@ class MakeMaxPoolNHWC(Transformation): graph.node.insert(node_ind - 1, consumer) graph_modified = True return (model, graph_modified) + + +class MoveOpPastFork(Transformation): + """Move node operations past graph forks. Used when a node before a fork + can be merged with nodes in the branches + """ + + def __init__(self, op_name_list): + super().__init__() + self.ops_to_move = op_name_list + + def apply(self, model): + graph = model.graph + graph_modified = False + nodes = [n for n in graph.node] + node_ind = 0 + for n in nodes: + node_ind += 1 + if ( + n.op_type in self.ops_to_move + and model.is_fork_node(n) + and not model.is_join_node(n) + ): + + # Restrict this transform to operations with constant parameters + # Assuming parameters is in input 1 + op_init_param = model.get_initializer(n.input[1]) + if op_init_param is None: + continue + + # Check case when branches are empty and go + # to the same node + consumers = model.find_consumers(n.output[0]) + unique_consumer = True + for consum_node in consumers[1:]: + if consumers[0] != consum_node: + unique_consumer = False + break + + if unique_consumer: + continue + + for consumer_node in consumers[1:]: + # create new node + new_param_name = model.make_new_valueinfo_name() + new_output_tensor_name = model.make_new_valueinfo_name() + new_node = oh.make_node( + n.op_type, + [n.input[0], new_param_name], + [new_output_tensor_name], + ) + graph.node.insert(node_ind, new_node) + node_ind += 1 + model.set_initializer(new_param_name, op_init_param) + + # change consumer input tensor + graph.node.remove(consumer_node) + for idx, consumer_input in enumerate(consumer_node.input): + if consumer_input == n.output[0]: + consumer_node.input[idx] = new_output_tensor_name + break + else: + raise Exception( + "Consumer should have the current node output as input" + ) + + graph.node.insert(node_ind, consumer_node) + + graph_modified = True + + model = model.transform(InferShapes()) + return (model, graph_modified) + + +class MoveAddPastFork(MoveOpPastFork): + def __init__(self): + super().__init__(["Add"]) + + +class MoveMulPastFork(MoveOpPastFork): + def __init__(self): + super().__init__(["Mul"]) + + +class MoveLinearPastFork(MoveOpPastFork): + def __init__(self): + super().__init__(["Add", "Mul"]) diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index bc413bf665e96be1d58a5de13b0744fd6a80f855..3880bb9591e27af5fe9d063dba2485d304e4db54 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -43,6 +43,13 @@ pynq_part_map["Pynq-Z1"] = "xc7z020clg400-1" pynq_part_map["Pynq-Z2"] = "xc7z020clg400-1" pynq_part_map["ZCU104"] = "xczu7ev-ffvc1156-2-e" +# native AXI HP port width (in bits) for PYNQ boards +pynq_native_port_width = dict() +pynq_native_port_width["Pynq-Z1"] = 64 +pynq_native_port_width["Pynq-Z2"] = 64 +pynq_native_port_width["Ultra96"] = 128 +pynq_native_port_width["ZCU104"] = 128 + def get_rtlsim_trace_depth(): """Return the trace depth for rtlsim via PyVerilator. 
Controllable diff --git a/src/finn/util/fpgadataflow.py b/src/finn/util/fpgadataflow.py index 9a2708439c0fed1e25c0d955af21cd2e9e705446..d1669444e55cb0fddb2690e51849c4603d47d32c 100644 --- a/src/finn/util/fpgadataflow.py +++ b/src/finn/util/fpgadataflow.py @@ -83,14 +83,27 @@ def pyverilate_stitched_ip(model): def file_to_dir(x): return os.path.dirname(os.path.realpath(x)) + def file_to_basename(x): + return os.path.basename(os.path.realpath(x)) + all_verilog_dirs = list(map(file_to_dir, all_verilog_srcs)) - top_verilog = model.get_metadata_prop("wrapper_filename") + all_verilog_files = list( + set( + filter( + lambda x: x.endswith(".v"), + list(map(file_to_basename, all_verilog_srcs)), + ) + ) + ) + top_module_name = model.get_metadata_prop("wrapper_filename") + top_module_name = file_to_basename(top_module_name).strip(".v") build_dir = make_build_dir("pyverilator_ipstitched_") sim = PyVerilator.build( - top_verilog, + all_verilog_files, verilog_path=all_verilog_dirs, build_dir=build_dir, trace_depth=get_rtlsim_trace_depth(), + top_module_name=top_module_name, ) return sim @@ -114,3 +127,91 @@ def is_fpgadataflow_node(node): is_node = True return is_node + + +def rtlsim_multi_io(sim, io_dict, num_out_values, trace_file=""): + """Runs the pyverilator simulation by passing the input values to the simulation, + toggle the clock and observing the execution time. Function contains also an + observation loop that can abort the simulation if no output value is produced + after a set number of cycles. Can handle multiple i/o streams. See function + implementation for details on how the top-level signals should be named. + + sim: the PyVerilator object for simulation + io_dict: a dict of dicts in the following format: + {"inputs" : {"in0" : <input_data>, "in1" : <input_data>}, + "outputs" : {"out0" : [], "out1" : []} } + <input_data> is a list of Python arbitrary-precision ints indicating + what data to push into the simulation, and the output lists are + similarly filled when the simulation is complete + num_out_values: number of total values to be read from the simulation to + finish the simulation and return. 
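+    trace_file: (optional) path to a VCD file; if set to a non-empty string,
+        a waveform trace of the simulation is written to that file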
+ + returns: number of clock cycles elapsed for completion + + """ + + if trace_file != "": + sim.start_vcd_trace(trace_file) + + for outp in io_dict["outputs"]: + sim.io[outp + "_V_V_TREADY"] = 1 + + # observe if output is completely calculated + # total_cycle_count will contain the number of cycles the calculation ran + output_done = False + total_cycle_count = 0 + output_count = 0 + old_output_count = 0 + + # avoid infinite looping of simulation by aborting when there is no change in + # output values after 100 cycles + no_change_count = 0 + liveness_threshold = pyverilate_get_liveness_threshold_cycles() + + while not (output_done): + for inp in io_dict["inputs"]: + inputs = io_dict["inputs"][inp] + sim.io[inp + "_V_V_TVALID"] = 1 if len(inputs) > 0 else 0 + sim.io[inp + "_V_V_TDATA"] = inputs[0] if len(inputs) > 0 else 0 + if sim.io[inp + "_V_V_TREADY"] == 1 and sim.io[inp + "_V_V_TVALID"] == 1: + inputs = inputs[1:] + io_dict["inputs"][inp] = inputs + + for outp in io_dict["outputs"]: + outputs = io_dict["outputs"][outp] + if sim.io[outp + "_V_V_TVALID"] == 1 and sim.io[outp + "_V_V_TREADY"] == 1: + outputs = outputs + [sim.io[outp + "_V_V_TDATA"]] + output_count += 1 + io_dict["outputs"][outp] = outputs + + sim.io.ap_clk = 1 + sim.io.ap_clk = 0 + + total_cycle_count = total_cycle_count + 1 + + if output_count == old_output_count: + no_change_count = no_change_count + 1 + else: + no_change_count = 0 + old_output_count = output_count + + # check if all expected output words received + if output_count == num_out_values: + output_done = True + + # end sim on timeout + if no_change_count == liveness_threshold: + if trace_file != "": + sim.flush_vcd_trace() + sim.stop_vcd_trace() + raise Exception( + "Error in simulation! Takes too long to produce output. " + "Consider setting the LIVENESS_THRESHOLD env.var. to a " + "larger value." 
+ ) + + if trace_file != "": + sim.flush_vcd_trace() + sim.stop_vcd_trace() + + return total_cycle_count diff --git a/tests/core/test_modelwrapper.py b/tests/core/test_modelwrapper.py index d1da6934a5db07aabe41a9ca40b5de497b6460a1..5fa9b23bad5c5b67f65530c55f862f889c07b1ac 100644 --- a/tests/core/test_modelwrapper.py +++ b/tests/core/test_modelwrapper.py @@ -73,6 +73,11 @@ def test_modelwrapper(): inp_layout = DataLayout.NCHW model.set_tensor_layout(inp_name, inp_layout) assert model.get_tensor_layout(inp_name) == inp_layout + inp_sparsity = model.get_tensor_sparsity(inp_name) + assert inp_sparsity is None + inp_sparsity = {"dw": {"kernel_shape": 3}} + model.set_tensor_sparsity(inp_name, inp_sparsity) + assert model.get_tensor_sparsity(inp_name) == inp_sparsity os.remove(export_onnx_path) @@ -127,3 +132,45 @@ def test_modelwrapper_graph_order(): assert model.get_node_index(Round_node) == 1 assert model.get_node_index(Ceil_node) == 2 assert model.get_node_index(Add_node) == 3 + + +def test_modelwrapper_detect_forks_n_joins(): + # create small network with properties to be tested + Neg_node = onnx.helper.make_node("Neg", inputs=["in1"], outputs=["neg1"]) + Round_node = onnx.helper.make_node("Round", inputs=["neg1"], outputs=["round1"]) + + Ceil_node = onnx.helper.make_node("Ceil", inputs=["neg1"], outputs=["ceil1"]) + Add_node = onnx.helper.make_node( + "Add", inputs=["round1", "ceil1"], outputs=["out1"] + ) + + in1 = onnx.helper.make_tensor_value_info("in1", onnx.TensorProto.FLOAT, [4, 4]) + out1 = onnx.helper.make_tensor_value_info("out1", onnx.TensorProto.FLOAT, [4, 4]) + + graph = onnx.helper.make_graph( + nodes=[Neg_node, Round_node, Ceil_node, Add_node], + name="simple_graph", + inputs=[in1], + outputs=[out1], + value_info=[ + onnx.helper.make_tensor_value_info("neg1", onnx.TensorProto.FLOAT, [4, 4]), + onnx.helper.make_tensor_value_info( + "round1", onnx.TensorProto.FLOAT, [4, 4] + ), + onnx.helper.make_tensor_value_info("ceil1", onnx.TensorProto.FLOAT, [4, 4]), + ], + ) + + onnx_model = onnx.helper.make_model(graph, producer_name="simple-model") + model = ModelWrapper(onnx_model) + + # test + assert model.is_fork_node(Neg_node) + assert not model.is_fork_node(Round_node) + assert not model.is_fork_node(Ceil_node) + assert not model.is_fork_node(Add_node) + + assert not model.is_join_node(Neg_node) + assert not model.is_join_node(Round_node) + assert not model.is_join_node(Ceil_node) + assert model.is_join_node(Add_node) diff --git a/tests/end2end/test_end2end_cnv_w1a1.py b/tests/end2end/test_end2end_cnv_w1a1.py index e6d1fc4efd61c01654ee88638698215d23a82eb3..c3359dcc82650bf0e9e8a5bc5276f5ca770ee96c 100644 --- a/tests/end2end/test_end2end_cnv_w1a1.py +++ b/tests/end2end/test_end2end_cnv_w1a1.py @@ -76,7 +76,7 @@ from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO build_dir = "/tmp/" + os.environ["FINN_INST_NAME"] test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1") test_fpga_part = pynq_part_map[test_pynq_board] -target_clk_ns = 5 +target_clk_ns = 10 mem_mode = "decoupled" diff --git a/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py b/tests/end2end/test_end2end_tfc_w1a1.py similarity index 98% rename from tests/end2end/test_end2end_tfc_w1a1_throughput_test.py rename to tests/end2end/test_end2end_tfc_w1a1.py index 1ba149687bb80a0f977115bd380a09f70eef23f1..15c1c41b006c6f87d79a0e7eb6a4458838de5fd2 100644 --- a/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py +++ b/tests/end2end/test_end2end_tfc_w1a1.py @@ -41,7 +41,6 @@ import onnx.numpy_helper as nph 
import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls import finn.transformation.streamline.absorb as absorb from finn.core.onnx_exec import execute_onnx -from finn.core.throughput_test import throughput_test from finn.custom_op.registry import getCustomOp from finn.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount from finn.transformation.fold_constants import FoldConstants @@ -332,9 +331,6 @@ def test_end2end_tfc_w1a1_run_on_pynq(): ret = execute_onnx(parent_model, {iname: x}, True) y = ret[oname] assert np.isclose(y, y_golden).all() - child_model = load_test_checkpoint_or_skip(sdp_node.get_nodeattr("model")) - res = throughput_test(child_model) - assert res is not None except KeyError: pytest.skip("PYNQ board IP address not specified") diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..ee65326ec57fb7fa7fa0490a8980dbabb8efc13c --- /dev/null +++ b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py @@ -0,0 +1,106 @@ +from onnx import TensorProto, helper +import numpy as np +import pytest + +from finn.core.datatype import DataType +from finn.transformation.infer_shapes import InferShapes +from finn.transformation.infer_datatypes import InferDataTypes +from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames +from finn.transformation.infer_data_layouts import InferDataLayouts +from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul + +import finn.core.onnx_exec as oxe +from finn.core.modelwrapper import ModelWrapper +from finn.util.basic import gen_finn_dt_tensor +import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls + +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode + + +@pytest.mark.parametrize("padding", [True, False]) +@pytest.mark.parametrize("kernel_size", [3, 5]) +@pytest.mark.slow +@pytest.mark.vivado +def test_convert_to_hls_conv_layer(padding, kernel_size): + + assert ( + kernel_size % 2 != 0 + ), """test_convert_to_hls_conv_layer test only + supports odd kernel_size""" + + np.random.seed(0) + padding = True + idt = DataType.UINT4 + + in_feature_dim = 7 + in_chn = 3 + + stages = 1 # just one convolution + + out_feature_dim = ( + in_feature_dim if padding else in_feature_dim - (kernel_size // 2 * 2) * stages + ) + + input_shape = [1, in_chn, in_feature_dim, in_feature_dim] + output_shape = [1, in_chn, out_feature_dim, out_feature_dim] + + conv_param_shape = [in_chn, in_chn, kernel_size, kernel_size] + + conv_config = {} + conv_config["dilations"] = [1, 1] + conv_config["group"] = 1 + conv_config["kernel_shape"] = [kernel_size, kernel_size] + if padding: + pad = kernel_size // 2 + conv_config["pads"] = [pad, pad, pad, pad] + else: + conv_config["pads"] = [0, 0, 0, 0] + conv_config["strides"] = [1, 1] + + top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape) + top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape) + value_info = [ + helper.make_tensor_value_info("p1", TensorProto.FLOAT, conv_param_shape) + ] + + modelproto = helper.make_model( + helper.make_graph( + name="conv_test", + inputs=[top_in], + outputs=[top_out], + value_info=value_info, + nodes=[ + helper.make_node("Conv", ["top_in", "p1"], ["top_out"], 
**conv_config) + ], + ) + ) + + model = ModelWrapper(modelproto) + model.set_tensor_datatype("top_in", idt) + model.set_tensor_datatype("top_out", idt) + model.set_tensor_datatype("p1", DataType.UINT4) + + model = model.transform(InferShapes()) + model.set_initializer( + "p1", np.round(np.random.rand(*conv_param_shape).astype(np.float32) * 16) + ) + + model.set_tensor_datatype(model.graph.input[0].name, idt) + model = model.transform(InferShapes()) + model = model.transform(InferDataLayouts()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataTypes()) + + new_model = model.transform(LowerConvsToMatMul()) + new_model = new_model.transform(to_hls.InferConvInpGen()) + + new_model = new_model.transform(PrepareCppSim()) + new_model = new_model.transform(CompileCppSim()) + new_model = new_model.transform(SetExecMode("cppsim")) + + x = gen_finn_dt_tensor(idt, input_shape) + inp_dict = {model.graph.input[0].name: x} + assert oxe.compare_execution(model, new_model, inp_dict) diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py b/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py index e03090f0581eebf68cac7baffb6888a6992df68d..48803c9614f53a3a149c6eaac4289d10086513a5 100644 --- a/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py +++ b/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py @@ -39,6 +39,7 @@ from finn.core.modelwrapper import ModelWrapper from finn.transformation.fold_constants import FoldConstants from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames from finn.transformation.infer_shapes import InferShapes +from finn.transformation.infer_data_layouts import InferDataLayouts from finn.transformation.streamline import Streamline from finn.util.test import get_test_model_trained from finn.transformation.double_to_single_float import DoubleToSingleFloat @@ -54,7 +55,9 @@ export_onnx_path_cnv = "test_output_cnv.onnx" @pytest.mark.vivado -def test_convert_to_hls_layers_cnv_w1a1(): +# Standalone or fused thresholding-based activation +@pytest.mark.parametrize("fused_activation", [True, False]) +def test_convert_to_hls_layers_cnv_w1a1(fused_activation): cnv = get_test_model_trained("CNV", 1, 1) bo.export_finn_onnx(cnv, (1, 3, 32, 32), export_onnx_path_cnv) model = ModelWrapper(export_onnx_path_cnv) @@ -69,6 +72,7 @@ def test_convert_to_hls_layers_cnv_w1a1(): model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold()) model = model.transform(ConvertBipolarMatMulToXnorPopcount()) model = model.transform(Streamline()) + model = model.transform(InferDataLayouts()) # model.save("golden.onnx") # load one of the test vectors fn = pk.resource_filename("finn", "data/cifar10/cifar10-test-data-class3.npz") @@ -80,6 +84,10 @@ def test_convert_to_hls_layers_cnv_w1a1(): expected_ctx = oxe.execute_onnx(model, input_dict, True) expected = expected_ctx[model.graph.output[0].name] + # if we infer thresholding first, all MultiThresholds get converted to HLS + # subsequently, the FC inference will generate passthrough MVAUs + if not fused_activation: + model = model.transform(to_hls.InferThresholdingLayer()) model = model.transform(to_hls.InferBinaryStreamingFCLayer()) model = model.transform(to_hls.InferQuantizedStreamingFCLayer()) for node in model.graph.node: @@ -102,7 +110,12 @@ def test_convert_to_hls_layers_cnv_w1a1(): model = model.transform(to_hls.InferStreamingMaxPool()) # check topology status finn_nodes = model.get_finn_nodes() - assert len(finn_nodes) == 18 + 
if fused_activation: + assert len(finn_nodes) == 18 + else: + assert len(finn_nodes) == 26 + thr_nodes = model.get_nodes_by_op_type("Thresholding_Batch") + assert len(thr_nodes) == 8 non_finn_nodes = model.get_non_finn_nodes() assert len(non_finn_nodes) == 4 exp_non_finn_nodes = ["Transpose", "Reshape", "Mul", "Add"] diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py index 5051bf34dc690daf8b6186859d3717cc8e217eee..b5fc85caf274edc9e7afc52df962862fa8a99ba3 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py @@ -78,7 +78,7 @@ def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, simd, stride, i def make_single_slidingwindow_modelwrapper( - k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt + k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt, dw=0 ): odt = idt inp = helper.make_tensor_value_info( @@ -102,6 +102,7 @@ def make_single_slidingwindow_modelwrapper( Stride=stride, inputDataType=idt.name, outputDataType=odt.name, + depthwise=dw, ) graph = helper.make_graph( nodes=[SlidingWindow_node], @@ -126,25 +127,29 @@ def prepare_inputs(input_tensor): # input datatype @pytest.mark.parametrize("idt", [DataType.BIPOLAR, DataType.INT2]) # kernel size -@pytest.mark.parametrize("k", [2, 4]) +@pytest.mark.parametrize("k", [2, 3]) # input dimension -@pytest.mark.parametrize("ifm_dim", [4, 6, 8]) +@pytest.mark.parametrize("ifm_dim", [6, 8]) # input channels -@pytest.mark.parametrize("ifm_ch", [2, 4]) # , 2, 3, 4]) +@pytest.mark.parametrize("ifm_ch", [2, 4]) # Stride @pytest.mark.parametrize("stride", [1, 2]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) # input channel parallelism ("SIMD") @pytest.mark.parametrize("simd", [1, 2]) +# depthwise +@pytest.mark.parametrize("dw", [0, 1]) @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_slidingwindow(idt, k, ifm_dim, ifm_ch, stride, exec_mode, simd): +def test_fpgadataflow_slidingwindow( + idt, k, ifm_dim, ifm_ch, stride, exec_mode, simd, dw +): ofm_dim = int(((ifm_dim - k) / stride) + 1) x = gen_finn_dt_tensor(idt, (1, ifm_dim, ifm_dim, ifm_ch)) model = make_single_slidingwindow_modelwrapper( - k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt + k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt, dw ) if exec_mode == "cppsim": @@ -168,6 +173,12 @@ def test_fpgadataflow_slidingwindow(idt, k, ifm_dim, ifm_ch, stride, exec_mode, k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt ) y_expected = oxe.execute_onnx(golden, input_dict)["outp"] - # if idt == DataType.BIPOLAR: - # y_expected = 2 * y_expected - 1 - assert (y_produced == y_expected).all() + if dw == 0: + assert (y_produced == y_expected).all() + else: + y_expected = y_expected.reshape( + 1, ofm_dim, ofm_dim, k * k, ifm_ch // simd, simd + ) + y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5) + y_expected = y_expected.reshape(1, ofm_dim, ofm_dim, ifm_ch * k * k) + assert (y_produced == y_expected).all() diff --git a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py new file mode 100644 index 0000000000000000000000000000000000000000..4fb84be59333ef0e696204c9064fcf77e35b5d9b --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py @@ -0,0 +1,127 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +from onnx import TensorProto, helper + +import finn.core.onnx_exec as oxe +from finn.core.datatype import DataType +from finn.core.modelwrapper import ModelWrapper +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.general import GiveUniqueNodeNames +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.util.basic import gen_finn_dt_tensor +from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( + ReplaceVerilogRelPaths, +) + + +def make_dupstreams_modelwrapper(ch, pe, idim, idt): + shape = [1, idim, idim, ch] + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape) + outp0 = helper.make_tensor_value_info("outp0", TensorProto.FLOAT, shape) + outp1 = helper.make_tensor_value_info("outp1", TensorProto.FLOAT, shape) + + dupstrm_node = helper.make_node( + "DuplicateStreams_Batch", + ["inp"], + ["outp0", "outp1"], + domain="finn", + backend="fpgadataflow", + NumChannels=ch, + PE=pe, + inputDataType=idt.name, + numInputVectors=[1, idim, idim], + ) + graph = helper.make_graph( + nodes=[dupstrm_node], name="graph", inputs=[inp], outputs=[outp0, outp1] + ) + + model = helper.make_model(graph, producer_name="addstreams-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", idt) + + return model + + +def prepare_inputs(input_tensor, idt): + return {"inp": input_tensor} + + +# data type +@pytest.mark.parametrize("idt", [DataType.INT4, DataType.UINT16]) +# channels +@pytest.mark.parametrize("ch", [64]) +# folding +@pytest.mark.parametrize("fold", [-1, 2, 1]) +# image dimension +@pytest.mark.parametrize("imdim", [7]) +# execution mode +@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +@pytest.mark.vivado +def 
test_fpgadataflow_duplicatestreams(idt, ch, fold, imdim, exec_mode): + if fold == -1: + pe = 1 + else: + pe = ch // fold + assert ch % pe == 0 + + # generate input data + x = gen_finn_dt_tensor(idt, (1, imdim, imdim, ch)) + + model = make_dupstreams_modelwrapper(ch, pe, imdim, idt) + + if exec_mode == "cppsim": + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + model = model.transform(SetExecMode("cppsim")) + elif exec_mode == "rtlsim": + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(ReplaceVerilogRelPaths()) + model = model.transform(PrepareRTLSim()) + else: + raise Exception("Unknown exec_mode") + + # prepare input data and execute + input_dict = prepare_inputs(x, idt) + output_dict = oxe.execute_onnx(model, input_dict) + y0 = output_dict["outp0"] + y1 = output_dict["outp1"] + expected_y = x + + assert (y0 == expected_y).all(), exec_mode + " failed" + assert (y1 == expected_y).all(), exec_mode + " failed" diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py new file mode 100644 index 0000000000000000000000000000000000000000..9d6390b2673e5d2c0e72748183ac04ed222d078e --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py @@ -0,0 +1,121 @@ +import pytest +import os +import numpy as np + +from onnx import TensorProto, helper +from finn.core.datatype import DataType +from finn.core.modelwrapper import ModelWrapper +from finn.util.basic import gen_finn_dt_tensor +import finn.core.onnx_exec as oxe +from finn.transformation.infer_shapes import InferShapes +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.general import GiveUniqueNodeNames +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim + +from finn.util.basic import pynq_part_map + +test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1") +test_fpga_part = pynq_part_map[test_pynq_board] +target_clk_ns = 10 + + +def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style): + assert pad_style == 2, "only pad_style == 2 supported in hlslib" + assert padding > 0, "Output dim should be greater than input dim" + odim = idim + padding + + inp = helper.make_tensor_value_info( + "inp", TensorProto.FLOAT, [1, idim, idim, num_ch] + ) + outp = helper.make_tensor_value_info( + "outp", TensorProto.FLOAT, [1, odim, odim, num_ch] + ) + + FMPadding = helper.make_node( + "FMPadding_Batch", + ["inp"], + ["outp"], + domain="finn", + backend="fpgadataflow", + ImgDim=idim, + Padding=padding, + NumChannels=num_ch, + inputDataType=str(idt.name), + PaddingStyle=pad_style, + numInputVectors=1, + ) + + graph = helper.make_graph( + nodes=[FMPadding], name="fmpadding_graph", inputs=[inp], outputs=[outp] + ) + + model = helper.make_model(graph, producer_name="fmpadding-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", idt) + + return model + + +# input image dimension +@pytest.mark.parametrize("idim", [8, 16]) +# number of rows and number of cols to 
add +@pytest.mark.parametrize("pad", [2, 3]) +# number of channels +@pytest.mark.parametrize("num_ch", [1, 2]) +# PaddingStyle: selects behavior when (odim-idim)%2 != 0 +@pytest.mark.parametrize("pad_style", [2]) +# FINN input datatype +@pytest.mark.parametrize("idt", [DataType.INT2, DataType.INT4]) +# execution mode +@pytest.mark.parametrize("mode", ["cppsim", "rtlsim"]) +@pytest.mark.slow +@pytest.mark.vivado +def test_fpgadataflow_fmpadding(idim, pad, num_ch, pad_style, idt, mode): + + # generate input data + x = gen_finn_dt_tensor(idt, [1, idim, idim, num_ch]) + input_dict = {"inp": x} + odim = idim + pad + + model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, idt, pad_style) + model = model.transform(InferShapes()) + model = model.transform(SetExecMode(mode)) + model = model.transform(GiveUniqueNodeNames()) + if mode == "cppsim": + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + elif mode == "rtlsim": + model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + expected_oshape = (1, odim, odim, num_ch) + assert y_produced.shape == expected_oshape + + # calculate reference + # calculate correct pad according to parameters + if pad_style == 2: + if pad % 2 == 0: + pad_up = pad // 2 + pad_left = pad // 2 + else: + pad_up = pad // 2 + 1 + pad_left = pad // 2 + 1 + else: + pad_up = pad // 2 + pad_left = pad // 2 + + pad_down = pad - pad_up + pad_right = pad - pad_left + + y_expected = np.pad( + x, ((0, 0), (pad_up, pad_down), (pad_left, pad_right), (0, 0)), "constant" + ) + + assert (y_produced == y_expected).all() diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py new file mode 100644 index 0000000000000000000000000000000000000000..50b990f13494f22e985406791445b406e9946147 --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -0,0 +1,154 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +import numpy as np +from onnx import TensorProto, helper + +import finn.core.onnx_exec as oxe +from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation +from finn.core.datatype import DataType +from finn.core.modelwrapper import ModelWrapper +from finn.custom_op.multithreshold import multithreshold +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.general import GiveUniqueNodeNames +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.util.basic import gen_finn_dt_tensor +from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( + ReplaceVerilogRelPaths, +) + + +def make_single_thresholding_modelwrapper(T, pe, idt, odt): + NumChannels = T.shape[0] + + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, NumChannels]) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, NumChannels]) + + node_inp_list = ["inp", "thresh"] + + Thresholding_node = helper.make_node( + "Thresholding_Batch", + node_inp_list, + ["outp"], + domain="finn", + backend="fpgadataflow", + NumChannels=NumChannels, + PE=pe, + inputDataType=idt.name, + outputDataType=odt.name, + ) + graph = helper.make_graph( + nodes=[Thresholding_node], + name="thresholding_graph", + inputs=[inp], + outputs=[outp], + ) + + model = helper.make_model(graph, producer_name="thresholding-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", odt) + + model.set_tensor_datatype("thresh", idt) + model.set_initializer("thresh", T) + return model + + +# activation: None or DataType +@pytest.mark.parametrize("act", [DataType.INT4, DataType.BIPOLAR]) +# input datatype +@pytest.mark.parametrize("idt", [DataType.INT16, DataType.UINT16]) +# folding, -1 is maximum possible +@pytest.mark.parametrize("nf", [-1, 2, 1]) +# number of input features +@pytest.mark.parametrize("ich", [16]) +# execution mode +@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +@pytest.mark.vivado +@pytest.mark.slow +def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode): + if nf == -1: + nf = ich + pe = ich // nf + assert ich % pe == 0 + + # generate input data + x = gen_finn_dt_tensor(idt, (1, ich)) + + odt = act + n_steps = act.get_num_possible_values() - 1 + T = np.random.randint(idt.min(), idt.max() + 1, (ich, n_steps)).astype(np.float32) + # provide non-decreasing thresholds + T = np.sort(T, axis=1) + + model = make_single_thresholding_modelwrapper(T, pe, idt, odt) + + if exec_mode == "cppsim": + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + model = 
model.transform(SetExecMode("cppsim")) + elif exec_mode == "rtlsim": + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(ReplaceVerilogRelPaths()) + model = model.transform(PrepareRTLSim()) + else: + raise Exception("Unknown exec_mode") + + # package input data as dictionary + input_dict = {"inp": x} + + y = multithreshold(x, T) + if act == DataType.BIPOLAR: + # binary to bipolar + y = 2 * y - 1 + else: + # signed offset + y += act.min() + + oshape = model.get_tensor_shape("outp") + y_expected = y.reshape(oshape) + # execute model + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + + y_produced = y_produced.reshape(y_expected.shape) + + assert (y_produced == y_expected).all(), "cppsim failed" + + if exec_mode == "rtlsim": + hls_synt_res_est = model.analysis(hls_synth_res_estimation) + assert "Thresholding_Batch_0" in hls_synt_res_est diff --git a/tests/pynq/test_pynq_performance_end2end.py b/tests/pynq/test_pynq_performance_end2end.py new file mode 100644 index 0000000000000000000000000000000000000000..66a93a190061e0142637be19bb2ea841d192745a --- /dev/null +++ b/tests/pynq/test_pynq_performance_end2end.py @@ -0,0 +1,65 @@ +import os + +import pytest +import numpy as np +from scipy.stats import linregress +import warnings +from finn.util.test import load_test_checkpoint_or_skip +from finn.core.throughput_test import throughput_test + +build_dir = "/tmp/" + os.environ["FINN_INST_NAME"] + + +@pytest.mark.parametrize("end2end_example", ["tfc_w1a1", "cnv_w1a1"]) +@pytest.mark.slow +def test_pynq_performance_end2end(end2end_example): + model = load_test_checkpoint_or_skip( + build_dir + "/end2end_%s_pynq_deploy.onnx" % end2end_example + ) + try: + ip = os.environ["PYNQ_IP"] # NOQA + board = os.environ["PYNQ_BOARD"] # NOQA + if ip == "" or board == "": + pytest.skip("PYNQ board or IP address not specified") + ret = dict() + # try a range of batch sizes, some may fail due to insufficient DMA + # buffers + bsize_range_in = [2 ** i for i in range(16)] + bsize_range = [] + for bsize in bsize_range_in: + res = throughput_test(model, bsize) + if res is not None: + ret[bsize] = res + bsize_range.append(bsize) + else: + # assume we reached largest possible N + break + + y = [ret[key]["runtime[ms]"] for key in bsize_range] + lrret = linregress(bsize_range, y) + ret_str = "" + ret_str += "\n" + "%s Throughput Test Results" % end2end_example + ret_str += "\n" + "-----------------------------" + ret_str += "\n" + "From linear regression:" + ret_str += "\n" + "Invocation overhead: %f ms" % lrret.intercept + ret_str += "\n" + "Time per sample: %f ms" % lrret.slope + ret_str += "\n" + "Raw data:" + + ret_str += "\n" + "{:<8} {:<16} {:<16} {:<16} {:<16} {:<16}".format( + "N", "runtime[ms]", "fclk[mhz]", "fps", "DRAM rd[Mb/s]", "DRAM wr[Mb/s]" + ) + for k in bsize_range: + v = ret[k] + ret_str += "\n" + "{:<8} {:<16} {:<16} {:<16} {:<16} {:<16}".format( + k, + np.round(v["runtime[ms]"], 4), + v["fclk[mhz]"], + np.round(v["throughput[images/s]"], 2), + np.round(v["DRAM_in_bandwidth[Mb/s]"], 2), + np.round(v["DRAM_out_bandwidth[Mb/s]"], 2), + ) + ret_str += "\n" + "-----------------------------" + warnings.warn(ret_str) + + except KeyError: + pytest.skip("PYNQ board or IP address not specified") diff --git a/tests/pynq/test_pynq_performance_fifo.py b/tests/pynq/test_pynq_performance_fifo.py new file mode 100644 index 
0000000000000000000000000000000000000000..1d4542473c4b58d3baa62f4123fd0f2f76954d95 --- /dev/null +++ b/tests/pynq/test_pynq_performance_fifo.py @@ -0,0 +1,128 @@ +import os + +import pytest + +import numpy as np +from onnx import TensorProto, helper + +from finn.core.datatype import DataType +from finn.core.modelwrapper import ModelWrapper +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker +from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ +from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver +from finn.transformation.fpgadataflow.make_pynq_proj import MakePYNQProject +from finn.transformation.fpgadataflow.synth_pynq_proj import SynthPYNQProject +import finn.transformation.fpgadataflow.replace_verilog_relpaths as rvp +from finn.transformation.general import GiveUniqueNodeNames +from finn.util.basic import pynq_part_map, pynq_native_port_width +from finn.core.throughput_test import throughput_test +from scipy.stats import linregress +import warnings + + +def make_single_fifo_modelwrapper(Shape, Depth, fld_shape, finn_dtype): + + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, Shape) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, Shape) + + FIFO_node = helper.make_node( + "StreamingFIFO", + ["inp"], + ["outp"], + domain="finn", + backend="fpgadataflow", + depth=Depth, + folded_shape=fld_shape, + dataType=str(finn_dtype.name), + ) + + graph = helper.make_graph( + nodes=[FIFO_node], name="fifo_graph", inputs=[inp], outputs=[outp] + ) + + model = helper.make_model(graph, producer_name="fifo-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", finn_dtype) + model.set_tensor_datatype("outp", finn_dtype) + + return model + + +@pytest.mark.vivado +@pytest.mark.slow +def test_pynq_performance_fifo(): + try: + ip = os.environ["PYNQ_IP"] # NOQA + board = os.environ["PYNQ_BOARD"] # NOQA + if ip == "" or board == "": + pytest.skip("PYNQ board or IP address not specified") + fifo_width = pynq_native_port_width[board] + shape = (1, fifo_width) + folded_shape = (1, 1, fifo_width) + depth = 16 + clk_ns = 10 + dtype = DataType.BIPOLAR + fpga_part = pynq_part_map[board] + username = os.getenv("PYNQ_USERNAME", "xilinx") + password = os.getenv("PYNQ_PASSWORD", "xilinx") + port = os.getenv("PYNQ_PORT", 22) + target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn") + + model = make_single_fifo_modelwrapper(shape, depth, folded_shape, dtype) + model = model.transform(InsertTLastMarker()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP(fpga_part, clk_ns)) + model = model.transform(HLSSynthIP()) + model = model.transform(rvp.ReplaceVerilogRelPaths()) + model = model.transform(CreateStitchedIP(fpga_part, clk_ns)) + model = model.transform(MakePYNQProject(board)) + model = model.transform(SynthPYNQProject()) + model = model.transform(MakePYNQDriver()) + model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir)) + + ret = dict() + # try a range of batch sizes, some may fail due to insufficient DMA + # buffers + bsize_range_in = [2 ** i for i in range(20)] + bsize_range = [] + for bsize in bsize_range_in: + res = throughput_test(model, bsize) + if res is not None: + ret[bsize] = res + bsize_range.append(bsize) 
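+ # this batch size ran successfully; it becomes one of the data points for the
+ # linear regression of runtime vs. N computed further below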
+ else: + # assume we reached largest possible N + break + + y = [ret[key]["runtime[ms]"] for key in bsize_range] + lrret = linregress(bsize_range, y) + ret_str = "" + ret_str += "\n" + "FIFO Throughput Test Results" + ret_str += "\n" + "-----------------------------" + ret_str += "\n" + "From linear regression:" + ret_str += "\n" + "Invocation overhead: %f ms" % lrret.intercept + ret_str += "\n" + "Time per sample: %f ms" % lrret.slope + ret_str += "\n" + "Raw data:" + + ret_str += "\n" + "{:<8} {:<16} {:<16} {:<16} {:<16} {:<16}".format( + "N", "runtime[ms]", "fclk[mhz]", "fps", "DRAM rd[Mb/s]", "DRAM wr[Mb/s]" + ) + for k in bsize_range: + v = ret[k] + ret_str += "\n" + "{:<8} {:<16} {:<16} {:<16} {:<16} {:<16}".format( + k, + np.round(v["runtime[ms]"], 4), + v["fclk[mhz]"], + np.round(v["throughput[images/s]"], 2), + np.round(v["DRAM_in_bandwidth[Mb/s]"], 2), + np.round(v["DRAM_out_bandwidth[Mb/s]"], 2), + ) + ret_str += "\n" + "-----------------------------" + warnings.warn(ret_str) + + except KeyError: + pytest.skip("PYNQ board or IP address not specified") diff --git a/tests/transformation/test_collapse_repeated_op.py b/tests/transformation/test_collapse_repeated_op.py index 01d932ece0be4b0beb7ad6094284ec3efb1e525e..b74d868f9b921c35ff9f596c811583f45f761374 100644 --- a/tests/transformation/test_collapse_repeated_op.py +++ b/tests/transformation/test_collapse_repeated_op.py @@ -34,6 +34,7 @@ import finn.core.onnx_exec as ox from finn.core.modelwrapper import ModelWrapper from finn.transformation.infer_shapes import InferShapes from finn.transformation.streamline import CollapseRepeatedAdd, CollapseRepeatedMul +import pytest def test_collapse_repeated_op(): @@ -67,3 +68,60 @@ def test_collapse_repeated_op(): new_model = new_model.transform(CollapseRepeatedMul()) inp_dict = {"top_in": np.asarray([-1.0, 1.0], dtype=np.float32)} assert ox.compare_execution(model, new_model, inp_dict) + assert len(new_model.graph.node) == 2 + assert new_model.graph.node[0].op_type == "Add" + assert new_model.graph.node[1].op_type == "Mul" + + +@pytest.mark.parametrize( + "test_args", [("Add", CollapseRepeatedAdd()), ("Mul", CollapseRepeatedMul())], +) +def test_collapse_repeated_only_if_linear(test_args): + scalar_op = test_args[0] + transf_fxn = test_args[1] + + input_shape = [4, 4] + output_shape = input_shape + + top_in = oh.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape) + top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape) + + value_info = [oh.make_tensor_value_info("p1", TensorProto.FLOAT, [1])] + value_info += [oh.make_tensor_value_info("p2", TensorProto.FLOAT, [1])] + value_info += [oh.make_tensor_value_info("p3", TensorProto.FLOAT, [1])] + value_info += [oh.make_tensor_value_info("p4", TensorProto.FLOAT, [1])] + value_info += [oh.make_tensor_value_info("p5", TensorProto.FLOAT, [1])] + + modelproto = oh.make_model( + oh.make_graph( + name="test", + inputs=[top_in], + outputs=[top_out], + value_info=value_info, + nodes=[ + oh.make_node(scalar_op, ["top_in", "p2"], ["t1"]), + oh.make_node(scalar_op, ["t1", "p1"], ["t2"]), + oh.make_node(scalar_op, ["t2", "p3"], ["t3"]), + oh.make_node(scalar_op, ["t2", "p4"], ["t4"]), + oh.make_node(scalar_op, ["t3", "t4"], ["t5"]), + oh.make_node(scalar_op, ["t5", "p5"], ["top_out"]), + ], + ) + ) + model = ModelWrapper(modelproto) + model = model.transform(InferShapes()) + + np.random.seed(0) + model.set_initializer("p1", *np.random.rand(1).astype(np.float32)) + model.set_initializer("p2", 
*np.random.rand(1).astype(np.float32)) + model.set_initializer("p3", *np.random.rand(1).astype(np.float32)) + model.set_initializer("p4", *np.random.rand(1).astype(np.float32)) + model.set_initializer("p5", *np.random.rand(1).astype(np.float32)) + + # Transform + new_model = model.transform(transf_fxn) + + # Test + inp_dict = {"top_in": np.random.rand(*input_shape).astype(np.float32)} + assert ox.compare_execution(model, new_model, inp_dict) + assert len(new_model.graph.node) == 5 diff --git a/tests/transformation/test_general_transformation.py b/tests/transformation/test_general_transformation.py index 33b6041a170f3c0de8f741ef3ecb28682f6429ea..153af378eb3e07d5824f114fd194730048fb4953 100644 --- a/tests/transformation/test_general_transformation.py +++ b/tests/transformation/test_general_transformation.py @@ -31,6 +31,12 @@ from pkgutil import get_data from finn.core.modelwrapper import ModelWrapper from finn.transformation.general import GiveUniqueNodeNames +import numpy as np +import onnx +import finn.core.onnx_exec as oxe +from finn.transformation.infer_shapes import InferShapes +from finn.transformation.general import GiveUniqueParameterTensors + def test_give_unique_node_names(): raw_m = get_data("finn", "data/onnx/mnist-conv/model.onnx") @@ -39,3 +45,76 @@ def test_give_unique_node_names(): assert model.graph.node[0].name == "Reshape_0" assert model.graph.node[1].name == "Conv_0" assert model.graph.node[11].name == "Add_2" + + +def test_give_unique_parameter_tensors(): + + # Create model + input_shape = [4, 4] + in1 = onnx.helper.make_tensor_value_info("in1", onnx.TensorProto.FLOAT, input_shape) + out1 = onnx.helper.make_tensor_value_info( + "out1", onnx.TensorProto.FLOAT, input_shape + ) + + graph_nodes = [] + graph_nodes += [ + onnx.helper.make_node("Add", inputs=["in1", "param1"], outputs=["t1"]) + ] + + graph_nodes += [ + onnx.helper.make_node("Sum", inputs=["t1", "param1", "param1"], outputs=["t2"]) + ] + + graph_nodes += [ + onnx.helper.make_node("Sum", inputs=["t2", "param2", "param1"], outputs=["t3"]) + ] + + graph_nodes += [ + onnx.helper.make_node("Add", inputs=["t3", "param1"], outputs=["out1"]) + ] + + onnx_graph = onnx.helper.make_graph( + nodes=graph_nodes, name="simple_graph", inputs=[in1], outputs=[out1], + ) + + onnx_model = onnx.helper.make_model(onnx_graph, producer_name="simple-model") + model = ModelWrapper(onnx_model) + + # Set param values + np.random.seed(0) + param1 = np.random.rand(*input_shape).astype(np.float32) + param2 = np.random.rand(*input_shape).astype(np.float32) + model.set_initializer("param1", param1) + model.set_initializer("param2", param2) + model = model.transform(InferShapes()) + + # Apply transformation + new_model = model.transform(GiveUniqueParameterTensors()) + new_model = new_model.transform(InferShapes()) + + # Test + # Breaks the model? + input_tensor = np.random.rand(*input_shape).astype(np.float32) + input_dict = {"in1": input_tensor} + + # run original + expected_context = oxe.execute_onnx(model, input_dict) + expected_output = expected_context[model.graph.output[0].name] + + # run modified + produced_context = oxe.execute_onnx(new_model, input_dict) + produced_output = produced_context[new_model.graph.output[0].name] + + assert np.isclose( + expected_output, produced_output, atol=1e-8 + ).all(), " GiveUniqueParameterTensors() transform breaks the model" + + # Does the job? 
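+ # after GiveUniqueParameterTensors every parameter input (inputs [1:] of each
+ # node) should reference a distinct tensor: the graph above has 6 parameter
+ # inputs in total (1 + 2 + 2 + 1), so 6 unique names are expected here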
+ param_set = set() + param_cnt = 0 + for n in new_model.graph.node: + for i in range(1, len(n.input)): + param_set |= {n.input[i]} + param_cnt += 1 + + assert len(param_set) == param_cnt, " There are still parameters reused" diff --git a/tests/transformation/test_move_add_past_mul.py b/tests/transformation/test_move_add_past_mul.py index a0516d6fb2ff985fc112185ce99ad8facd841caf..163b9d310a5f12bd0b854f9aa46f53a549bf109e 100644 --- a/tests/transformation/test_move_add_past_mul.py +++ b/tests/transformation/test_move_add_past_mul.py @@ -60,6 +60,9 @@ def test_move_add_past_mul_single(): new_model = model.transform(MoveAddPastMul()) inp_dict = {"top_in": np.asarray([-1.0, 1.0], dtype=np.float32)} assert ox.compare_execution(model, new_model, inp_dict) + assert new_model.graph.node[0].op_type == "Mul" + assert new_model.graph.node[1].op_type == "Add" + assert new_model.graph.node[0].output[0] == new_model.graph.node[1].input[0] def test_move_add_past_mul_multi(): @@ -92,3 +95,50 @@ def test_move_add_past_mul_multi(): new_model = model.transform(MoveAddPastMul()) inp_dict = {"top_in": np.asarray([-1.0, 1.0], dtype=np.float32)} assert ox.compare_execution(model, new_model, inp_dict) + assert new_model.graph.node[0].op_type == "Mul" + assert new_model.graph.node[1].op_type == "Mul" + assert new_model.graph.node[2].op_type == "Add" + assert new_model.graph.node[3].op_type == "Add" + for i in range(len(new_model.graph.node) - 1): + assert new_model.graph.node[i].output[0] == new_model.graph.node[i + 1].input[0] + + +def test_move_add_past_mul_only_if_linear(): + top_in = oh.make_tensor_value_info("top_in", TensorProto.FLOAT, [2]) + top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, [2]) + + value_info = [oh.make_tensor_value_info("add1_param", TensorProto.FLOAT, [1])] + value_info += [oh.make_tensor_value_info("mul1_param", TensorProto.FLOAT, [1])] + value_info += [oh.make_tensor_value_info("mul2_param", TensorProto.FLOAT, [1])] + value_info += [oh.make_tensor_value_info("mul3_param", TensorProto.FLOAT, [1])] + modelproto = oh.make_model( + oh.make_graph( + name="test", + inputs=[top_in], + outputs=[top_out], + value_info=value_info, + nodes=[ + oh.make_node("Add", ["top_in", "add1_param"], ["t1"]), + oh.make_node("Mul", ["t1", "mul1_param"], ["fork"]), + oh.make_node("Mul", ["fork", "mul2_param"], ["t3"]), + oh.make_node("Add", ["t3", "fork"], ["t4"]), + oh.make_node("Mul", ["t4", "mul3_param"], ["top_out"]), + ], + ) + ) + model = ModelWrapper(modelproto) + model = model.transform(InferShapes()) + + np.random.seed(0) + model.set_initializer("add1_param", np.random.rand(2).astype(np.float32)) + model.set_initializer("mul1_param", np.random.rand(2).astype(np.float32)) + model.set_initializer("mul2_param", np.random.rand(2).astype(np.float32)) + model.set_initializer("mul3_param", np.random.rand(2).astype(np.float32)) + new_model = model.transform(MoveAddPastMul()) + inp_dict = {"top_in": np.random.rand(2).astype(np.float32)} + assert ox.compare_execution(model, new_model, inp_dict) + assert new_model.graph.node[0].op_type == "Mul" + assert new_model.graph.node[1].op_type == "Add" + assert new_model.graph.node[2].op_type == "Mul" + assert new_model.graph.node[3].op_type == "Add" + assert new_model.graph.node[4].op_type == "Mul" diff --git a/tests/transformation/test_move_past_fork.py b/tests/transformation/test_move_past_fork.py new file mode 100644 index 0000000000000000000000000000000000000000..f3d37bd60c9e2580ca4499daafa8693f39fec810 --- /dev/null +++ 
b/tests/transformation/test_move_past_fork.py @@ -0,0 +1,79 @@ +from onnx import TensorProto, helper +import numpy as np + +import finn.core.onnx_exec as oxe +from finn.core.modelwrapper import ModelWrapper +from finn.transformation.streamline.reorder import MoveLinearPastFork +from finn.transformation.infer_shapes import InferShapes + +import pytest + + +@pytest.mark.parametrize("ch", [64, 1]) +# ifmdim +@pytest.mark.parametrize("ifmdim", [-1, 7]) +def test_move_past_fork(ch, ifmdim): + # generate test vectors of correct shape + if ifmdim == -1: + input_shape = (1, ch) + else: + input_shape = (1, ch, ifmdim, ifmdim) + + top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape) + top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, input_shape) + + num_of_params = 8 + value_info = [] + for i in range(num_of_params): + value_info += [ + helper.make_tensor_value_info("p" + str(i), TensorProto.FLOAT, input_shape) + ] + + add_1_to_move = helper.make_node("Add", ["top_in", "p0"], ["fork1"]) + mul_1_to_move = helper.make_node("Mul", ["t5", "p4"], ["fork2"]) + add_2_to_move = helper.make_node("Add", ["fork2", "p5"], ["t6"]) + mul_1_not_to_move = helper.make_node("Mul", ["t8", "p7"], ["fork3"]) + modelproto = helper.make_model( + helper.make_graph( + name="test", + inputs=[top_in], + outputs=[top_out], + value_info=value_info, + nodes=[ + # fork1 + add_1_to_move, + helper.make_node("Mul", ["fork1", "p1"], ["t2"]), + helper.make_node("Mul", ["fork1", "p2"], ["t3"]), + helper.make_node("Add", ["t2", "t3"], ["t4"]), + helper.make_node("Add", ["t4", "p3"], ["t5"]), + # fork2 + mul_1_to_move, + add_2_to_move, + helper.make_node("Add", ["fork2", "p6"], ["t7"]), + helper.make_node("Add", ["t6", "t7"], ["t8"]), + # empty branches: do nothing + mul_1_not_to_move, + helper.make_node("Add", ["fork3", "fork3"], ["top_out"]), + ], + ) + ) + model = ModelWrapper(modelproto) + model = model.transform(InferShapes()) + + np.random.seed(0) + for i in range(num_of_params): + model.set_initializer( + "p" + str(i), np.random.rand(*input_shape).astype(np.float32) + ) + + # Transform + new_model = model.transform(MoveLinearPastFork()) + inp_dict = {"top_in": np.random.rand(*input_shape).astype(np.float32)} + + # Test + assert oxe.compare_execution(model, new_model, inp_dict) + assert not new_model.is_fork_node(add_1_to_move) + assert not new_model.is_fork_node(mul_1_to_move) + assert not new_model.is_fork_node(add_2_to_move) + assert new_model.is_fork_node(mul_1_not_to_move) + assert len(new_model.graph.node) == 14 diff --git a/tests/transformation/test_move_scalar_past_conv.py b/tests/transformation/test_move_scalar_past_conv.py new file mode 100644 index 0000000000000000000000000000000000000000..0f50642d2b9d1583030630cb4927c2b86667e71a --- /dev/null +++ b/tests/transformation/test_move_scalar_past_conv.py @@ -0,0 +1,166 @@ +import numpy as np +import onnx.helper as oh +import pytest +from onnx import TensorProto + +import finn.core.onnx_exec as ox +from finn.core.modelwrapper import ModelWrapper +from finn.transformation.infer_shapes import InferShapes +from finn.transformation.streamline import ( + MoveScalarAddPastConv, + MoveScalarMulPastConv, +) + + +@pytest.mark.parametrize("padding", [False, True]) +@pytest.mark.parametrize( + "test_args", [("Add", MoveScalarAddPastConv()), ("Mul", MoveScalarMulPastConv())], +) +def test_move_scalar_past_conv(test_args, padding): + scalar_op = test_args[0] + transf_fxn = test_args[1] + + in_feature_dim = 7 + in_chn = 3 + + stages = 2 + 
kernel_size = 3 + + out_feature_dim = ( + in_feature_dim if padding else in_feature_dim - (kernel_size // 2 * 2) * stages + ) + + input_shape = [1, in_chn, in_feature_dim, in_feature_dim] + output_shape = [1, in_chn, out_feature_dim, out_feature_dim] + + conv_param_shape = [in_chn, in_chn, kernel_size, kernel_size] + + conv_config = {} + conv_config["dilations"] = [1, 1] + conv_config["group"] = 1 + conv_config["kernel_shape"] = [kernel_size, kernel_size] + if padding: + conv_config["pads"] = [1, 1, 1, 1] + else: + conv_config["pads"] = [0, 0, 0, 0] + conv_config["strides"] = [1, 1] + + top_in = oh.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape) + top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape) + + value_info = [oh.make_tensor_value_info("p1", TensorProto.FLOAT, [1])] + value_info += [oh.make_tensor_value_info("p2", TensorProto.FLOAT, conv_param_shape)] + value_info += [oh.make_tensor_value_info("p3", TensorProto.FLOAT, conv_param_shape)] + + modelproto = oh.make_model( + oh.make_graph( + name="test", + inputs=[top_in], + outputs=[top_out], + value_info=value_info, + nodes=[ + oh.make_node(scalar_op, ["top_in", "p1"], ["t1"]), + oh.make_node("Conv", ["t1", "p2"], ["t2"], **conv_config), + oh.make_node("Conv", ["t2", "p3"], ["top_out"], **conv_config), + ], + ) + ) + model = ModelWrapper(modelproto) + model = model.transform(InferShapes()) + + np.random.seed(0) + model.set_initializer("p1", *np.random.rand(1).astype(np.float32)) + model.set_initializer("p2", np.random.rand(*conv_param_shape).astype(np.float32)) + model.set_initializer("p3", np.random.rand(*conv_param_shape).astype(np.float32)) + new_model = model.transform(transf_fxn) + inp_dict = {"top_in": np.random.rand(*input_shape).astype(np.float32)} + + assert ox.compare_execution(model, new_model, inp_dict) + if scalar_op == "Add": + if padding: + assert new_model.graph.node[0].op_type == scalar_op + assert new_model.graph.node[1].op_type == "Conv" + assert new_model.graph.node[2].op_type == "Conv" + else: + assert new_model.graph.node[0].op_type == "Conv" + assert new_model.graph.node[1].op_type == scalar_op + assert new_model.graph.node[2].op_type == "Conv" + else: + assert new_model.graph.node[0].op_type == "Conv" + assert new_model.graph.node[1].op_type == "Conv" + assert new_model.graph.node[2].op_type == scalar_op + + +@pytest.mark.parametrize( + "test_args", [("Add", MoveScalarAddPastConv()), ("Mul", MoveScalarMulPastConv())], +) +def test_move_scalar_past_conv_only_if_linear(test_args): + scalar_op = test_args[0] + transf_fxn = test_args[1] + + in_feature_dim = 7 + in_chn = 1 + padding = False + stages = 3 + kernel_size = 3 + + out_feature_dim = ( + in_feature_dim if padding else in_feature_dim - (kernel_size // 2 * 2) * stages + ) + + input_shape = [1, in_chn, in_feature_dim, in_feature_dim] + output_shape = [1, in_chn, out_feature_dim, out_feature_dim] + + conv_param_shape = [in_chn, in_chn, kernel_size, kernel_size] + + conv_config = {} + conv_config["dilations"] = [1, 1] + conv_config["group"] = 1 + conv_config["kernel_shape"] = [kernel_size, kernel_size] + conv_config["pads"] = [0, 0, 0, 0] + conv_config["strides"] = [1, 1] + + top_in = oh.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape) + top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape) + + value_info = [oh.make_tensor_value_info("p1", TensorProto.FLOAT, [1])] + value_info += [oh.make_tensor_value_info("p2", TensorProto.FLOAT, conv_param_shape)] + value_info += 
[oh.make_tensor_value_info("p3", TensorProto.FLOAT, conv_param_shape)] + value_info += [oh.make_tensor_value_info("p4", TensorProto.FLOAT, conv_param_shape)] + value_info += [oh.make_tensor_value_info("p5", TensorProto.FLOAT, conv_param_shape)] + + modelproto = oh.make_model( + oh.make_graph( + name="test", + inputs=[top_in], + outputs=[top_out], + value_info=value_info, + nodes=[ + oh.make_node("Conv", ["top_in", "p2"], ["t1"], **conv_config), + oh.make_node(scalar_op, ["t1", "p1"], ["t2"]), + oh.make_node("Conv", ["t2", "p3"], ["t3"], **conv_config), + oh.make_node("Conv", ["t2", "p4"], ["t4"], **conv_config), + oh.make_node(scalar_op, ["t3", "t4"], ["t5"]), + oh.make_node("Conv", ["t5", "p5"], ["top_out"], **conv_config), + ], + ) + ) + model = ModelWrapper(modelproto) + model = model.transform(InferShapes()) + + np.random.seed(0) + model.set_initializer("p1", *np.random.rand(1).astype(np.float32)) + model.set_initializer("p2", np.random.rand(*conv_param_shape).astype(np.float32)) + model.set_initializer("p3", np.random.rand(*conv_param_shape).astype(np.float32)) + model.set_initializer("p4", np.random.rand(*conv_param_shape).astype(np.float32)) + model.set_initializer("p5", np.random.rand(*conv_param_shape).astype(np.float32)) + new_model = model.transform(transf_fxn) + inp_dict = {"top_in": np.random.rand(*input_shape).astype(np.float32)} + + assert ox.compare_execution(model, new_model, inp_dict) + assert new_model.graph.node[0].op_type == "Conv" + assert new_model.graph.node[1].op_type == scalar_op + assert new_model.graph.node[2].op_type == "Conv" + assert new_model.graph.node[3].op_type == "Conv" + assert new_model.graph.node[4].op_type == scalar_op + assert new_model.graph.node[5].op_type == "Conv" diff --git a/tests/transformation/test_move_scalar_past_matmul.py b/tests/transformation/test_move_scalar_past_matmul.py index 896527e82d8cfa869cb979d1102904c70703a14c..e432dbf4ec1a38551609e5914e2d44968a020908 100644 --- a/tests/transformation/test_move_scalar_past_matmul.py +++ b/tests/transformation/test_move_scalar_past_matmul.py @@ -27,6 +27,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import numpy as np +import pytest import onnx.helper as oh from onnx import TensorProto @@ -99,3 +100,56 @@ def test_move_scalar_add_past_matmul(): assert new_model.graph.node[0].op_type == "MatMul" assert new_model.graph.node[1].op_type == "Add" assert new_model.graph.node[0].output[0] == new_model.graph.node[1].input[0] + + +@pytest.mark.parametrize( + "test_args", + [("Add", MoveScalarAddPastMatMul()), ("Mul", MoveScalarMulPastMatMul())], +) +def test_move_scalar_past_matmul_only_if_linear(test_args): + scalar_op = test_args[0] + transf_fxn = test_args[1] + input_shape = [1, 2] + matmul_shape = [2, 2] + top_in = oh.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape) + top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, input_shape) + + p1 = oh.make_tensor_value_info("p1", TensorProto.FLOAT, [1, 1]) + p2 = oh.make_tensor_value_info("p2", TensorProto.FLOAT, matmul_shape) + p3 = oh.make_tensor_value_info("p3", TensorProto.FLOAT, matmul_shape) + p4 = oh.make_tensor_value_info("p4", TensorProto.FLOAT, matmul_shape) + modelproto = oh.make_model( + oh.make_graph( + name="test", + inputs=[top_in], + outputs=[top_out], + value_info=[p1, p2, p3, p4], + nodes=[ + oh.make_node(scalar_op, ["top_in", "p1"], ["t1"]), + oh.make_node("MatMul", ["t1", "p2"], ["fork"]), + oh.make_node("MatMul", ["fork", "p3"], ["t3"]), + oh.make_node(scalar_op, ["t3", "fork"], ["t4"]), + oh.make_node("MatMul", ["t4", "p4"], ["top_out"]), + ], + ) + ) + model = ModelWrapper(modelproto) + model = model.transform(InferShapes()) + + np.random.seed(0) + model.set_initializer("p1", np.random.rand(1, 1).astype(np.float32)) + model.set_initializer("p2", np.random.rand(*matmul_shape).astype(np.float32)) + model.set_initializer("p3", np.random.rand(*matmul_shape).astype(np.float32)) + model.set_initializer("p4", np.random.rand(*matmul_shape).astype(np.float32)) + + # Transform + new_model = model.transform(transf_fxn) + + # Test + inp_dict = {"top_in": np.random.rand(*input_shape).astype(np.float32)} + assert ox.compare_execution(model, new_model, inp_dict) + assert new_model.graph.node[0].op_type == "MatMul" + assert new_model.graph.node[1].op_type == scalar_op + assert new_model.graph.node[2].op_type == "MatMul" + assert new_model.graph.node[3].op_type == scalar_op + assert new_model.graph.node[4].op_type == "MatMul" diff --git a/tests/transformation/test_sort_graph.py b/tests/transformation/test_sort_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..05842504c13b144bb34e8084fb12b5086fa84115 --- /dev/null +++ b/tests/transformation/test_sort_graph.py @@ -0,0 +1,150 @@ +from onnx import TensorProto, helper +import numpy as np + +from finn.core.modelwrapper import ModelWrapper +from finn.transformation.general import SortGraph +from finn.transformation.infer_shapes import InferShapes +import pytest +import finn.analysis.topology as ta + + +def make_randomly_sorted_linear_model(num_of_nodes, seed=None): + if seed is not None: + np.random.seed(seed) + + ch = 2 + ifmdim = 16 + input_shape = (1, ch, ifmdim, ifmdim) + + top_in = helper.make_tensor_value_info("t0", TensorProto.FLOAT, input_shape) + top_out = helper.make_tensor_value_info( + "t" + str(num_of_nodes), TensorProto.FLOAT, input_shape + ) + + value_info = [] + nodes = [] + for i in range(num_of_nodes): + nodes += [ + helper.make_node("Add", ["t" + str(i), "p" + str(i)], ["t" + str(i + 1)]) + ] + value_info += [ + helper.make_tensor_value_info("p" + str(i), TensorProto.FLOAT, input_shape) + ] + + nodes = 
np.random.permutation(nodes) + + modelproto = helper.make_model( + helper.make_graph( + name="test", + inputs=[top_in], + outputs=[top_out], + value_info=value_info, + nodes=nodes, + ) + ) + model = ModelWrapper(modelproto) + model = model.transform(InferShapes()) + + for i in range(num_of_nodes): + model.set_initializer( + "p" + str(i), np.random.rand(*input_shape).astype(np.float32) + ) + + return model + + +@pytest.mark.parametrize("num_of_nodes", [64]) +def test_sort_linear_graph(num_of_nodes): + model = make_randomly_sorted_linear_model(num_of_nodes, seed=0) + new_model = model.transform(SortGraph()) + + # Test + ret = new_model.analysis(ta.nodes_topologically_sorted) + assert ret["nodes_topologically_sorted"], "Nodes are not topologically sorted." + + +def test_sort_nonlinear_graph(): + ch = 2 + ifmdim = 16 + input_shape = (1, ch, ifmdim, ifmdim) + + top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape) + top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, input_shape) + + num_of_params = 8 + value_info = [] + for i in range(num_of_params): + value_info += [ + helper.make_tensor_value_info("p" + str(i), TensorProto.FLOAT, input_shape) + ] + + modelproto = helper.make_model( + helper.make_graph( + name="test", + inputs=[top_in], + outputs=[top_out], + value_info=value_info, + nodes=[ + # Not sorted nodes + helper.make_node("Mul", ["fork1", "p2"], ["t3"]), + helper.make_node("Add", ["t4", "p3"], ["t5"]), + helper.make_node("Add", ["t2", "t3"], ["t4"]), + helper.make_node("Add", ["t6", "t7"], ["t8"]), + helper.make_node("Add", ["fork3", "fork3"], ["top_out"]), + helper.make_node("Mul", ["t5", "p4"], ["fork2"]), + helper.make_node("Add", ["top_in", "p0"], ["fork1"]), + helper.make_node("Mul", ["fork1", "p1"], ["t2"]), + helper.make_node("Add", ["fork2", "p5"], ["t6"]), + helper.make_node("Add", ["fork2", "p6"], ["t7"]), + helper.make_node("Mul", ["t8", "p7"], ["fork3"]), + ], + ) + ) + model = ModelWrapper(modelproto) + model = model.transform(InferShapes()) + + np.random.seed(0) + for i in range(num_of_params): + model.set_initializer( + "p" + str(i), np.random.rand(*input_shape).astype(np.float32) + ) + + new_model = model.transform(SortGraph()) + + # Test + ret = new_model.analysis(ta.nodes_topologically_sorted) + assert ret["nodes_topologically_sorted"], "Nodes are not topologically sorted." + + +if __name__ == "__main__": + import time + + sizes = [10, 50, 100, 500, 1000] + times = [] + reps = 10 + + print("SortGraph performance test:") + print("Test sizes", sizes) + print("Repetitions per size:", reps) + for sz in sizes: + acc_time = 0 + print(" Testing size ", sz) + for i in range(reps): + # it should take the same time even with the sorted one + # but better new model each time as it is a more general approach + model = make_randomly_sorted_linear_model(sz) # new model as seed is None + bef = time.time() + new_model = model.transform(SortGraph(), make_deepcopy=False) + acc_time += time.time() - bef + + times += [acc_time / reps] + + # print csv + print("\nnum_of_nodes, seconds") + for sz, tm in zip(sizes, times): + print("{:12d}, {:6.4e}".format(sz, tm)) + + # plot + # import matplotlib.pyplot as plt + # plt.plot(sizes,times,"--o") + # plt.grid(True)