diff --git a/finn-rtllib/memstream/hdl/Q_srl.v b/finn-rtllib/memstream/hdl/Q_srl.v index 6c619c51ceb4a99a077fc61c52ce81763cfd27f5..b4e89628a44bb1f55c3445ee8e6866beada23585 100644 --- a/finn-rtllib/memstream/hdl/Q_srl.v +++ b/finn-rtllib/memstream/hdl/Q_srl.v @@ -74,31 +74,7 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count); parameter depth = 16; // - greatest #items in queue (2 <= depth <= 256) parameter width = 16; // - width of data (i_d, o_d) - `define LOG2 ( (((depth)) ==0) ? 0 /* - depth==0 LOG2=0 */ \ - : (((depth-1)>>0)==0) ? 0 /* - depth<=1 LOG2=0 */ \ - : (((depth-1)>>1)==0) ? 1 /* - depth<=2 LOG2=1 */ \ - : (((depth-1)>>2)==0) ? 2 /* - depth<=4 LOG2=2 */ \ - : (((depth-1)>>3)==0) ? 3 /* - depth<=8 LOG2=3 */ \ - : (((depth-1)>>4)==0) ? 4 /* - depth<=16 LOG2=4 */ \ - : (((depth-1)>>5)==0) ? 5 /* - depth<=32 LOG2=5 */ \ - : (((depth-1)>>6)==0) ? 6 /* - depth<=64 LOG2=6 */ \ - : (((depth-1)>>7)==0) ? 7 /* - depth<=128 LOG2=7 */ \ - : 8) /* - depth<=256 LOG2=8 */ - -// parameter addrwidth = LOG2; // - width of queue addr - - parameter addrwidth = - ( (((depth)) ==0) ? 0 // - depth==0 LOG2=0 - : (((depth-1)>>0)==0) ? 0 // - depth<=1 LOG2=0 - : (((depth-1)>>1)==0) ? 1 // - depth<=2 LOG2=1 - : (((depth-1)>>2)==0) ? 2 // - depth<=4 LOG2=2 - : (((depth-1)>>3)==0) ? 3 // - depth<=8 LOG2=3 - : (((depth-1)>>4)==0) ? 4 // - depth<=16 LOG2=4 - : (((depth-1)>>5)==0) ? 5 // - depth<=32 LOG2=5 - : (((depth-1)>>6)==0) ? 6 // - depth<=64 LOG2=6 - : (((depth-1)>>7)==0) ? 
7 // - depth<=128 LOG2=7 - : 8) // - depth<=256 LOG2=8 - ; + parameter addrwidth = $clog2(depth); input clock; input reset; diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py new file mode 100644 index 0000000000000000000000000000000000000000..798bbd335f0028d1103d992fd2b8b9cd30bbb6e1 --- /dev/null +++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py @@ -0,0 +1,217 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
def set_signal(sim, keyw, value):
    """Drive every simulator input whose name contains ``keyw`` to ``value``.

    ``sim.inputs`` is iterated as a sequence of entries whose first element
    is the port name; each matching name is written through ``sim.io``.
    Matching is by substring, so e.g. ``"tvalid"`` hits every input port
    with that suffix.
    """
    for port in sim.inputs:
        port_name = port[0]
        if keyw in port_name:
            sim.io[port_name] = value


def optimize_depth(depth):
    """Round a measured FIFO occupancy up to the depth that gets configured.

    Tiers:
      * up to 2      -> 2
      * up to 32     -> 32
      * up to 1024   -> next power of two
      * above 1024   -> next multiple of 1024

    The result is never smaller than ``depth``; a FIFO sized from it can
    always hold the occupancy that was observed.
    """
    if depth <= 2:
        return 2
    if depth <= 32:
        return 32
    if depth <= 1024:
        return int(2 ** math.ceil(math.log2(depth)))
    # Round up to whole 1024-entry blocks. The multiplication matters:
    # returning only the block count would shrink e.g. 1025 down to 2.
    return int(math.ceil(depth / 1024)) * 1024
class SetFIFODepths(Transformation):
    """Determines minimum depths of StreamingFIFOs through RTLSim.

    We assume we get a dataflow partition (all nodes are dataflow, no FIFOs).
    All work happens on a deep copy of the model: every FIFO depth is first
    set very high (2 ** 14), DWCs and FIFOs are inserted, the design is
    stitched and simulated with constant input data, and the maximum
    occupancy seen in each FIFO is recorded. Those maxima are rounded with
    optimize_depth and written to the inFIFODepth / outFIFODepth attributes
    of the corresponding nodes of the *original* model, which is returned.

    fpgapart and clk_ns are forwarded to PrepareIP and CreateStitchedIP.
    """

    # oversized depth given to every FIFO before the measurement run
    _INITIAL_DEPTH = 2 ** 14

    def __init__(self, fpgapart, clk_ns=10.0):
        super().__init__()
        self.fpgapart = fpgapart
        self.clk_ns = clk_ns

    def apply(self, model):
        """Measure FIFO occupancy on a copy and annotate the original model.

        Returns ``(orig_model, False)``; the second element is always False.
        """
        orig_model = model

        # work on a copy of the model
        model, fifos = self._insert_oversized_fifos(copy.deepcopy(model))

        # performance estimates, then everything needed for stitched rtlsim
        model = model.transform(AnnotateCycles())
        perf = model.analysis(dataflow_performance)
        latency = perf["critical_path_cycles"]
        max_cycles = perf["max_cycles"]
        model = model.transform(PrepareIP(self.fpgapart, self.clk_ns))
        model = model.transform(HLSSynthIP())
        model = model.transform(CreateStitchedIP(self.fpgapart, self.clk_ns))
        model.set_metadata_prop("exec_mode", "rtlsim")

        # calculate input frequency (number of cycles for each input word)
        first_node = getCustomOp(model.graph.node[0])
        folded_ishape = first_node.get_folded_input_shape()
        ncycles_per_input = math.ceil(
            max_cycles / (np.prod(folded_ishape) / folded_ishape[-1])
        )

        # set sufficiently large threshold for 1 image to fully execute and exit
        ncycles = int(latency + max_cycles)

        sim = pyverilate_stitched_ip(model)
        self._record_max_occupancy(sim, fifos, ncycles, ncycles_per_input)

        # tweak and apply depths to original model
        node_depths = self._depths_per_node(fifos)
        for node in orig_model.graph.node:
            if node.name in node_depths:
                depths = node_depths[node.name]
                node_inst = getCustomOp(node)
                node_inst.set_nodeattr(
                    "inFIFODepth", optimize_depth(depths["inFIFODepth"])
                )
                node_inst.set_nodeattr(
                    "outFIFODepth", optimize_depth(depths["outFIFODepth"])
                )

        return (orig_model, False)

    def _insert_oversized_fifos(self, model):
        """Give every node oversized FIFOs, insert DWCs/FIFOs, index the FIFOs.

        Mutates ``model`` (callers pass the working copy). Returns the
        transformed model and a dict mapping each StreamingFIFO name to
        ``{"depth": 0, "consumer": <name or None>, "producer": <name or None>}``.
        """
        depth = self._INITIAL_DEPTH

        for node in model.graph.node:
            node_inst = getCustomOp(node)
            node_inst.set_nodeattr("inFIFODepth", depth)
            node_inst.set_nodeattr("outFIFODepth", depth)
            # change external to decoupled and warn user;
            # this way we are sure we have exactly one input/output
            if node_inst.onnx_node.op_type == "StreamingFCLayer_Batch":
                if node_inst.get_nodeattr("mem_mode") == "external":
                    node_inst.set_nodeattr("mem_mode", "decoupled")
                    warnings.warn(
                        "Changed mem_mode from external to decoupled for "
                        + node_inst.onnx_node.name
                    )

        # insert stream infrastructure (DWC/FIFO)
        model = model.transform(InsertDWC())
        model = model.transform(InsertFIFO())
        model = model.transform(GiveUniqueNodeNames())

        # gather FIFO names, check they are of expected depth
        fifos = {}
        for node in model.graph.node:
            if node.op_type != "StreamingFIFO":
                continue
            consumers = model.find_consumers(node.output[0])
            producer = model.find_producer(node.input[0])
            fifos[node.name] = {
                "depth": 0,
                "consumer": None if consumers is None else consumers[0].name,
                "producer": None if producer is None else producer.name,
            }
            # if model came in with FIFOs, the depths will not have been updated
            fifo_inst = getCustomOp(node)
            if fifo_inst.get_nodeattr("depth") != depth:
                fifo_inst.set_nodeattr("depth", depth)

        return model, fifos

    @staticmethod
    def _record_max_occupancy(sim, fifos, ncycles, ncycles_per_input):
        """Clock ``sim`` for ``ncycles`` and track each FIFO's peak fill level.

        Updates ``fifos[name]["depth"]`` in place with the largest count seen.
        """
        _reset_rtlsim(sim)
        _toggle_clk(sim)

        # set all input valids to 0 and output readies to 1
        # set input data to some constant
        set_signal(sim, "tvalid", 0)
        set_signal(sim, "tready", 1)
        set_signal(sim, "tdata", 0)

        for remaining in range(ncycles, 0, -1):
            _toggle_clk(sim)
            # assert valid on one cycle out of every ncycles_per_input
            if remaining % ncycles_per_input == 0:
                set_signal(sim, "tvalid", 1)
            else:
                set_signal(sim, "tvalid", 0)

            # check/update all fifo counts
            for name, info in fifos.items():
                fifo_signals = sim.internals["finn_design_i"][name]["inst"][
                    name + "_" + name
                ]
                state = fifo_signals["state"]
                addr = fifo_signals["addr"]
                # NOTE(review): state == 2 is presumably the FIFO's
                # "more than one item" state, with addr counting the extra
                # items; confirm against the FIFO RTL.
                if state == 2:
                    count = addr + 2
                else:
                    count = state
                if count > info["depth"]:
                    info["depth"] = count

    @staticmethod
    def _depths_per_node(fifos):
        """Fold per-FIFO peaks into per-node inFIFODepth / outFIFODepth maxima.

        A FIFO contributes to the outFIFODepth of its producer and to the
        inFIFODepth of its consumer; a missing (None) side is skipped.
        """
        per_node = {}
        for info in fifos.values():
            sides = (
                (info["producer"], "outFIFODepth"),
                (info["consumer"], "inFIFODepth"),
            )
            for node_name, attr in sides:
                if node_name is None:
                    continue
                entry = per_node.setdefault(
                    node_name, {"inFIFODepth": 0, "outFIFODepth": 0}
                )
                entry[attr] = max(entry[attr], info["depth"])
        return per_node
model.save(get_checkpoint_name(topology, wbits, abits, "fifodepth_" + kind)) + + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.parametrize("kind", ["zynq", "alveo"]) + def test_ipstitch_rtlsim(self, topology, wbits, abits, kind): + prev_chkpt_name = get_checkpoint_name( + topology, wbits, abits, "fifodepth_" + kind + ) + model = load_test_checkpoint_or_skip(prev_chkpt_name) + test_fpga_part = get_build_env(kind, target_clk_ns)["part"] model = model.transform(InsertDWC()) model = model.transform(InsertFIFO()) model = model.transform(GiveUniqueNodeNames()) @@ -326,7 +339,9 @@ class TestEnd2End: "rtlsim_trace", "%s_w%da%d.vcd" % (topology, wbits, abits) ) os.environ["RTLSIM_TRACE_DEPTH"] = "3" - rtlsim_chkpt = get_checkpoint_name(topology, wbits, abits, "ipstitch_rtlsim_" + kind) + rtlsim_chkpt = get_checkpoint_name( + topology, wbits, abits, "ipstitch_rtlsim_" + kind + ) model.save(rtlsim_chkpt) parent_chkpt = get_checkpoint_name(topology, wbits, abits, "dataflow_parent") (input_tensor_npy, output_tensor_npy) = get_golden_io_pair( @@ -342,7 +357,9 @@ class TestEnd2End: @pytest.mark.vivado @pytest.mark.parametrize("kind", ["zynq", "alveo"]) def test_throughput_rtlsim(self, topology, wbits, abits, kind): - prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "ipstitch_rtlsim_" + kind) + prev_chkpt_name = get_checkpoint_name( + topology, wbits, abits, "ipstitch_rtlsim_" + kind + ) model = load_test_checkpoint_or_skip(prev_chkpt_name) n_nodes = len(model.graph.node) perf_est = model.analysis(dataflow_performance) @@ -361,7 +378,9 @@ class TestEnd2End: def test_build(self, topology, wbits, abits, kind): if kind == "alveo" and ("VITIS_PATH" not in os.environ): pytest.skip("VITIS_PATH not set") - prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "ipgen_" + kind) + prev_chkpt_name = get_checkpoint_name( + topology, wbits, abits, "fifodepth_" + kind + ) model = load_test_checkpoint_or_skip(prev_chkpt_name) cfg = get_build_env(kind, target_clk_ns) 
model = model.transform(cfg["build_fxn"])