diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh index 6dcf99e7589e4e0e6c50b626ea53948f6153ae3c..e13f0d0211ce4c140c8ccba1a4d4832cf1fc2a17 100644 --- a/docker/finn_entrypoint.sh +++ b/docker/finn_entrypoint.sh @@ -12,11 +12,11 @@ gecho () { # checkout the correct dependency repo commits # the repos themselves are cloned in the Dockerfile -FINN_BASE_COMMIT=ceb1219b5aba396dde41967a929e1f08887653ce +FINN_BASE_COMMIT=c4d8885e38a55f9bb7424bde76d35a3e000c5a7e BREVITAS_COMMIT=6ffefa8dbf37fdb0f44c994f34604c29fadb16b0 CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4 HLSLIB_COMMIT=cfafe11a93b79ab1af7529d68f08886913a6466e -PYVERILATOR_COMMIT=c97a5ba41bbc7c419d6f25c74cdf3bdc3393174f +PYVERILATOR_COMMIT=06c29ecf3ba0361e3d0a75c98f6918ba67bf0e27 OMX_COMMIT=1bae737669901e762f581af73348332b5c4b2ada gecho "Setting up known-good commit versions for FINN dependencies" diff --git a/finn-rtllib/memstream/hdl/Q_srl.v b/finn-rtllib/memstream/hdl/Q_srl.v index 6c619c51ceb4a99a077fc61c52ce81763cfd27f5..b4e89628a44bb1f55c3445ee8e6866beada23585 100644 --- a/finn-rtllib/memstream/hdl/Q_srl.v +++ b/finn-rtllib/memstream/hdl/Q_srl.v @@ -74,31 +74,7 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count); parameter depth = 16; // - greatest #items in queue (2 <= depth <= 256) parameter width = 16; // - width of data (i_d, o_d) - `define LOG2 ( (((depth)) ==0) ? 0 /* - depth==0 LOG2=0 */ \ - : (((depth-1)>>0)==0) ? 0 /* - depth<=1 LOG2=0 */ \ - : (((depth-1)>>1)==0) ? 1 /* - depth<=2 LOG2=1 */ \ - : (((depth-1)>>2)==0) ? 2 /* - depth<=4 LOG2=2 */ \ - : (((depth-1)>>3)==0) ? 3 /* - depth<=8 LOG2=3 */ \ - : (((depth-1)>>4)==0) ? 4 /* - depth<=16 LOG2=4 */ \ - : (((depth-1)>>5)==0) ? 5 /* - depth<=32 LOG2=5 */ \ - : (((depth-1)>>6)==0) ? 6 /* - depth<=64 LOG2=6 */ \ - : (((depth-1)>>7)==0) ? 7 /* - depth<=128 LOG2=7 */ \ - : 8) /* - depth<=256 LOG2=8 */ - -// parameter addrwidth = LOG2; // - width of queue addr - - parameter addrwidth = - ( (((depth)) ==0) ? 
0 // - depth==0 LOG2=0 - : (((depth-1)>>0)==0) ? 0 // - depth<=1 LOG2=0 - : (((depth-1)>>1)==0) ? 1 // - depth<=2 LOG2=1 - : (((depth-1)>>2)==0) ? 2 // - depth<=4 LOG2=2 - : (((depth-1)>>3)==0) ? 3 // - depth<=8 LOG2=3 - : (((depth-1)>>4)==0) ? 4 // - depth<=16 LOG2=4 - : (((depth-1)>>5)==0) ? 5 // - depth<=32 LOG2=5 - : (((depth-1)>>6)==0) ? 6 // - depth<=64 LOG2=6 - : (((depth-1)>>7)==0) ? 7 // - depth<=128 LOG2=7 - : 8) // - depth<=256 LOG2=8 - ; + parameter addrwidth = $clog2(depth); input clock; input reset; diff --git a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py index f9a9dc4340b18578550a9c453d90de86234d1cad..95ecc5f10525456e7f5a6d838e0850adaee5415f 100644 --- a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py +++ b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py @@ -48,7 +48,7 @@ class FMPadding_Batch(HLSCustomOp): simd = self.get_nodeattr("SIMD") batch_size = self.get_nodeattr("numInputVectors") exp_cycles = (channels / simd) * batch_size * odim * odim - return exp_cycles + return int(exp_cycles) def get_normal_input_shape(self): idim = self.get_nodeattr("ImgDim") diff --git a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py index 1a75858880a072345ef942ca91feabf0bec9ab36..56f1a9d56d9da7057e3cbe61f3d92877e58087d6 100644 --- a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py +++ b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py @@ -187,7 +187,7 @@ class GlobalAccPool_Batch(HLSCustomOp): ch = self.get_nodeattr("NumChannels") pe = self.get_nodeattr("PE") folds = int(ch / pe) - return np.prod(self.get_folded_input_shape()[:-1]) + folds + return int(np.prod(self.get_folded_input_shape()[:-1]) + folds) def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py index 
e2f96395ad74255ad67549255608cd52737e97d9..6b422ed17267f110d97a95cad166baf6f9aee890 100644 --- a/src/finn/custom_op/fpgadataflow/streamingfifo.py +++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py @@ -30,6 +30,7 @@ import numpy as np from shutil import copy import subprocess import math +import warnings from finn.custom_op.fpgadataflow import HLSCustomOp from finn.core.datatype import DataType @@ -178,14 +179,11 @@ class StreamingFIFO(HLSCustomOp): depth = self.get_nodeattr("depth") # depth has to be between 2 and 256 with the current # StreamingFIFO implementation - assert ( - depth >= 2 - ), """Depth is too low. Please set node attribute "depth" to a value - between 2 and 256""" - assert ( - depth <= 256 - ), """Depth is too high. Please set node attribute "depth" to a value - between 2 and 256""" + assert depth >= 2, """Depth is too low""" + if depth > 256 and self.get_nodeattr("impl_style") == "rtl": + warnings.warn( + "Depth is high, set between 2 and 256 for efficient SRL implementation" + ) # derive normal shape from folded shape # StreamingFIFOs are inserted in between fpgadataflow nodes # the folded shape could be for example (1, nf, pe) @@ -424,7 +422,6 @@ class StreamingFIFO(HLSCustomOp): else: return (math.ceil(depth / 4096)) * (math.ceil(W / 72)) - def bram_efficiency_estimation(self): depth = self.get_nodeattr("depth") W = self.get_instream_width() @@ -451,3 +448,9 @@ class StreamingFIFO(HLSCustomOp): return int(address_luts + ram_luts) + def prepare_rtlsim(self): + assert self.get_nodeattr("impl_style") != "vivado", ( + "StreamingFIFO impl_style " + "cannot be vivado for rtlsim. Only impl_style=rtl supported." 
+ ) + super().prepare_rtlsim() diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py index 4c772358648f402467cee628afe410d7bce83ede..53bcab993b25173c8620d7f4a6694a8efaf74c4d 100644 --- a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py +++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py @@ -99,7 +99,7 @@ class StreamingMaxPool_Batch(HLSCustomOp): # derived from StreamingMaxPool_Batch loop nest k = self.get_nodeattr("PoolDim") ifm_dim = self.get_nodeattr("ImgDim") - return ifm_dim * (ifm_dim + (ifm_dim / k)) + return int(ifm_dim * (ifm_dim + (ifm_dim / k))) def get_instream_width(self): dt_bits = self.get_input_datatype().bitwidth() diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py index 1c26642888f29e8dd08046e4de01ae8fa62b10e7..a3056aaa15a5f00cdc7b33f5dba83820c76dfa10 100644 --- a/src/finn/transformation/fpgadataflow/insert_fifo.py +++ b/src/finn/transformation/fpgadataflow/insert_fifo.py @@ -4,6 +4,7 @@ from onnx import helper as oh from finn.custom_op.registry import getCustomOp from finn.transformation.base import Transformation from finn.util.fpgadataflow import is_fpgadataflow_node +import warnings import numpy as np @@ -56,66 +57,81 @@ class InsertFIFO(Transformation): for n in graph.node: node_ind += 1 if _suitable_node(n): - n_output = n.output[0] - consumer = model.find_consumer(n_output) - if _suitable_node(consumer) is True: - n0 = getCustomOp(n) - # determine fifo node attributes - fld_shape = n0.get_folded_output_shape() - dtype = n0.get_output_datatype() - - # check if folded_shape of output of first node and - # input of the second node is equal - n1 = getCustomOp(consumer) - fld_shape_2 = n1.get_folded_input_shape() - assert _suitable_folded_shapes( - fld_shape, fld_shape_2 - ), """The - folded output shape of the first node is not the same as the - folded output shape of the second 
node. A streaming fifo can't - be implemented in between these nodes.""" - - # check if outFIFOdepth attribute of first node - # and inFIFOdepth attribute of consumer node is equal - n0_depth = n0.get_nodeattr("outFIFODepth") - n1_depth = n1.get_nodeattr("inFIFODepth") - if n0_depth == n1_depth: - fifo_depth = n0_depth - elif n0_depth != n1_depth: - fifo_depth = max(n0_depth, n1_depth) - - if fifo_depth > 2: - # assumption: HLS streaming components already have - # depth-2 FIFOs on inputs and outputs, so no point - # creating additional small FIFOs in between -- - # we only create the larger FIFOs specified - # create fifo node - fifo_output_tensor = oh.make_tensor_value_info( - model.make_new_valueinfo_name(), - TensorProto.FLOAT, - n0.get_normal_output_shape(), + for n_output in n.output: + consumers = model.find_consumers(n_output) + if consumers is None: + continue + if len(consumers) > 1: + warnings.warn( + n.name + + ": HLS node with fan-out higher than 1 cannot be stitched" ) - graph.value_info.append(fifo_output_tensor) - model.set_tensor_datatype(fifo_output_tensor.name, dtype) - - fifo_node = oh.make_node( - "StreamingFIFO", - [n_output], - [fifo_output_tensor.name], - domain="finn", - backend="fpgadataflow", - depth=fifo_depth, - folded_shape=fld_shape, - dataType=str(dtype.name), - ) - # insert fifo - graph.node.insert(node_ind + 1, fifo_node) - # set fifo output tensor as new input tensor of second node - consumer.input[0] = fifo_output_tensor.name - # ensure created FIFO depth is reflected on both sides - n0.set_nodeattr("outFIFODepth", fifo_depth) - n1.set_nodeattr("inFIFODepth", fifo_depth) - graph_modified = True + consumer = consumers[0] + if _suitable_node(consumer) is True: + n0 = getCustomOp(n) + # determine fifo node attributes + fld_shape = n0.get_folded_output_shape() + dtype = n0.get_output_datatype() + + # check if folded_shape of output of first node and + # input of the second node is equal + n1 = getCustomOp(consumer) + for idx, inp in 
enumerate(consumer.input): + if inp == n_output: + if idx == 0: + fld_shape_2 = n1.get_folded_input_shape() + else: + fld_shape_2 = n1.get_folded_input_shape(ind=idx) + assert _suitable_folded_shapes( + fld_shape, fld_shape_2 + ), """The + folded output shape of the first node is not the same as the + folded output shape of the second node. A streaming fifo can't + be implemented in between these nodes.""" + + # check if outFIFOdepth attribute of first node + # and inFIFOdepth attribute of consumer node is equal + n0_depth = n0.get_nodeattr("outFIFODepth") + n1_depth = n1.get_nodeattr("inFIFODepth") + if n0_depth == n1_depth: + fifo_depth = n0_depth + elif n0_depth != n1_depth: + fifo_depth = max(n0_depth, n1_depth) + + if fifo_depth > 2: + # assumption: HLS streaming components already have + # depth-2 FIFOs on inputs and outputs, so no point + # creating additional small FIFOs in between -- + # we only create the larger FIFOs specified + # create fifo node + fifo_output_tensor = oh.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + n0.get_normal_output_shape(), + ) + graph.value_info.append(fifo_output_tensor) + model.set_tensor_datatype(fifo_output_tensor.name, dtype) + + fifo_node = oh.make_node( + "StreamingFIFO", + [n_output], + [fifo_output_tensor.name], + domain="finn", + backend="fpgadataflow", + depth=fifo_depth, + folded_shape=fld_shape, + dataType=str(dtype.name), + ) + # insert fifo + graph.node.insert(node_ind + 1, fifo_node) + # set fifo output tensor as new input tensor of second node + for idx, inp in enumerate(consumer.input): + if inp == n_output: + consumer.input[idx] = fifo_output_tensor.name + # ensure created FIFO depth is reflected on both sides + n0.set_nodeattr("outFIFODepth", fifo_depth) + n1.set_nodeattr("inFIFODepth", fifo_depth) + graph_modified = True if graph_modified is False: # insert FIFO as first node, except when first node is DMA @@ -131,6 +147,10 @@ class InsertFIFO(Transformation): dtype = 
n0.get_input_datatype() fifo_depth = n0.get_nodeattr("inFIFODepth") + if fifo_depth <= 2: + warnings.warn("Overriding input FIFO depth to 32") + fifo_depth = 32 + # create fifo node fifo_output_tensor = oh.make_tensor_value_info( model.make_new_valueinfo_name(), @@ -173,6 +193,10 @@ class InsertFIFO(Transformation): dtype = n0.get_output_datatype() fifo_depth = n0.get_nodeattr("outFIFODepth") + if fifo_depth <= 2: + warnings.warn("Overriding output FIFO depth to 32") + fifo_depth = 32 + # create fifo node fifo_input_tensor = oh.make_tensor_value_info( model.make_new_valueinfo_name(), @@ -180,7 +204,7 @@ class InsertFIFO(Transformation): n0.get_normal_output_shape(), ) graph.value_info.append(fifo_input_tensor) - model.set_tensor_datatype(fifo_output_tensor.name, dtype) + model.set_tensor_datatype(fifo_input_tensor.name, dtype) fifo_node = oh.make_node( "StreamingFIFO", diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py new file mode 100644 index 0000000000000000000000000000000000000000..713148d7fcdfea4411554b6d3b817a14b33a53c6 --- /dev/null +++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py @@ -0,0 +1,392 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import math +import numpy as np +import warnings +from finn.custom_op.registry import getCustomOp +from finn.transformation.base import Transformation +from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles +from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP +from finn.transformation.fpgadataflow.insert_dwc import InsertDWC +from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO +from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames +from finn.core.rtlsim_exec import ( + _reset_rtlsim, + _toggle_clk, +) +from finn.util.fpgadataflow import pyverilate_stitched_ip, is_fpgadataflow_node + + +def reset_implementation(node): + node.set_nodeattr("code_gen_dir_ipgen", "") + node.set_nodeattr("ipgen_path", "") + node.set_nodeattr("ip_path", "") + + +def set_signal(sim, keyw, value): + for i in range(len(sim.inputs)): + input_name = sim.inputs[i][0] + if keyw in input_name: + 
sim.io[input_name] = value + + +def get_signal(sim, keyw): + for i in range(len(sim.outputs)): + output_name = sim.outputs[i][0] + if keyw in output_name: + return sim.io[output_name] + + +def optimize_depth(depth): + if depth <= 2: + return 2 + if depth <= 32: + # Q_srl FIFOs do not benefit from size < 32 + # add some slack + return 32 + # round up to the next power of two for Vivado IP FIFO implementation + return int(2 ** math.ceil(math.log2(depth))) + + +class RemoveShallowFIFOs(Transformation): + """Remove small FIFOs as the streaming components have depth-2 FIFOs on the + input/outputs by default.""" + + # TODO add unit test + + def __init__(self, shallow_threshold=2): + self.shallow_threshold = shallow_threshold + + def apply(self, model): + shallow_fifos = [] + for node in model.graph.node: + if ( + node.op_type == "StreamingFIFO" + and getCustomOp(node).get_nodeattr("depth") <= self.shallow_threshold + ): + # bypass shallow fifos + shallow_fifos.append(node) + consumers = model.find_consumers(node.output[0]) + if consumers is None: + producer = model.find_producer(node.input[0]) + for idx, inp in enumerate(producer.output): + if inp == node.input[0]: + producer.output[idx] = node.output[0] + else: + assert len(consumers) == 1, "Fanout detected from FIFO output" + consumer = consumers[0] + # set fifo input tensor as new input tensor of second node + for idx, inp in enumerate(consumer.input): + if inp == node.output[0]: + consumer.input[idx] = node.input[0] + # now filter out + for node_to_remove in shallow_fifos: + model.graph.node.remove(node_to_remove) + + return (model, False) + + +class CapConvolutionFIFODepths(Transformation): + """Make the size of FIFOs for convolution layers smaller where possible. + Will be automatically called from InsertAndSetFIFODepths if the appropriate + constructor flag is set. 
+ + Constructor arguments: + - max_qsrl_depth : FIFOs deeper than this will use Vivado IP instead of + Verilog FIFOs (Q_srl.v) + + Assumed input graph properties: + - all nodes are fpgadataflow nodes + - FIFOs inserted with InsertAndSetFIFODepths + + Output: + - graph with smaller-depth FIFOs for convolutions + + Background: + The simulation-based rtlsim_exec tends to overestimate the required depth + of FIFOs between the ConvolutionInputGenerator (here called SWG) and the + StreamingFCLayer (here called MVAU). As the SWG has an internal buffer of 1 + image row, we use this as a rule of thumb to set FIFO depth to be no larger + than 1 row. + """ + + # TODO add unit test + + def __init__(self, max_qsrl_depth=256): + super().__init__() + self.max_qsrl_depth = max_qsrl_depth + + def apply(self, model): + # TODO move this to own transformation + for node in model.graph.node: + # look for following pattern: + # ConvolutionInputGenerator -> StreamingFIFO -> StreamingFCLayer + if node.op_type == "StreamingFIFO": + fifo_prod = model.find_producer(node.input[0]) + fifo_cons = model.find_consumer(node.output[0]) + if fifo_prod is None: + continue + if fifo_prod.op_type != "ConvolutionInputGenerator": + continue + if fifo_cons is None: + continue + if fifo_cons.op_type != "StreamingFCLayer_Batch": + continue + op_inst = getCustomOp(node) + depth = op_inst.get_nodeattr("depth") + # SWG has an internal buffer of 1 row, so we use this as a + # rule of thumb to set FIFO depth to be no larger than 1 row + (bs, h, w, ifold, simd) = op_inst.get_folded_input_shape() + new_depth = optimize_depth(w * ifold) + new_depth = min(new_depth, depth) + op_inst.set_nodeattr("depth", new_depth) + # Set FIFO implementation/ram styles + if new_depth > self.max_qsrl_depth: + op_inst.set_nodeattr("impl_style", "vivado") + op_inst.set_nodeattr("ram_style", "auto") + else: + op_inst.set_nodeattr("impl_style", "rtl") + + return (model, False) + + +class InsertAndSetFIFODepths(Transformation): + 
"""Insert appropriate-depth StreamingFIFOs through RTLSim that preserve + throughput in the created accelerator. + + Constructor arguments: + - clk_ns : clock period (used for IP preparation) + - max_qsrl_depth : FIFOs deeper than this will use Vivado IP instead of + Verilog FIFOs (Q_srl.v) + - max_depth : how deep the "max"-sized FIFOs initially inserted will be + - swg_exception : call CapConvolutionFIFODepths to make convolution FIFOs + smaller where appropriate + + Assumed input graph properties: + - all nodes are fpgadataflow nodes + - no FIFOs inserted, + - (inFIFODepth/outFIFODepth attrs will be ignored) + + Output: + - graph with appropriate-depth FIFOs inserted + + Background: + Even with all FINN HLS fpgadataflow layers appropriately parallelized, it is + necessary to insert FIFOs between them to prevent stalls due to bursty + behavior. The sizes of those FIFOs are hard to predict analytically, so + we do the following: + - insert very deep (default 16k deep) FIFOs between all fpgadataflow nodes + - create stitched design + - run through rtlsim with stream of multiple random input images (to fill pipeline) + - keep track of observed maximum occupancy for each FIFO during rtlsim + - when sim finished, update each FIFO depth to maximum observed occupancy + and set inFIFODepth/outFIFODepth attrs to 0 on relevant nodes + """ + + def __init__( + self, + fpgapart, + clk_ns=10.0, + max_qsrl_depth=256, + max_depth=2 ** 14, + swg_exception=True, + ): + super().__init__() + self.fpgapart = fpgapart + self.clk_ns = clk_ns + self.max_qsrl_depth = max_qsrl_depth + self.max_depth = max_depth + self.swg_exception = swg_exception + + def apply(self, model): + # change external to decoupled and warn user + # this way we are sure we have exactly one input/output + modified_fc_nodes = [] + for node in model.graph.node: + # verify assumptions + assert is_fpgadataflow_node(node), "Found non-fpgadataflow node: " + str( + node + ) + assert node.op_type != "StreamingFIFO", "Found 
existing StreamingFIFO node" + node = getCustomOp(node) + node.set_nodeattr("inFIFODepth", self.max_depth) + node.set_nodeattr("outFIFODepth", self.max_depth) + if node.onnx_node.op_type == "StreamingFCLayer_Batch": + mmode = node.get_nodeattr("mem_mode") + if mmode == "external": + modified_fc_nodes.append(node.onnx_node.name) + node.set_nodeattr("mem_mode", "decoupled") + reset_implementation(node) + warnings.warn( + "Changed mem_mode from external to decoupled for " + + node.onnx_node.name + ) + + # insert stream infrastructure (DWC/FIFO) + model = model.transform(InsertDWC()) + model = model.transform(InsertFIFO()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + + # gather FIFO names, check they are of expected depth + fifos = {} + for node in model.graph.node: + if node.op_type == "StreamingFIFO": + fifos[node.name] = 0 + node = getCustomOp(node) + # check depths and fix as necessary + if node.get_nodeattr("depth") != self.max_depth: + node.set_nodeattr("depth", self.max_depth) + + # insert FIFOs and do all transformations for RTLsim + model = model.transform(AnnotateCycles()) + perf = model.analysis(dataflow_performance) + latency = perf["critical_path_cycles"] + max_cycles = perf["max_cycles"] + model = model.transform(PrepareIP(self.fpgapart, self.clk_ns)) + model = model.transform(HLSSynthIP()) + model = model.transform(CreateStitchedIP(self.fpgapart, self.clk_ns)) + model.set_metadata_prop("exec_mode", "rtlsim") + + # calculate input frequency (number of cycles for each input word) + first_node = getCustomOp(model.graph.node[0]) + ncycles_per_input = max( + 1, + int( + math.ceil( + perf["max_cycles"] + / ( + np.prod(first_node.get_folded_input_shape()) + / first_node.get_folded_input_shape()[-1] + ) + ) + ), + ) + + # set sufficiently large threshold for 1 image to fully execute and exit + ncycles = int(latency + max_cycles) + + # prepare pyverilator model + sim = pyverilate_stitched_ip(model) + + 
_reset_rtlsim(sim) + _toggle_clk(sim) + + # set all input valids to 0 and output readies to 1 + # set input data to some constant + set_signal(sim, "tvalid", 0) + set_signal(sim, "tready", 1) + set_signal(sim, "tdata", 0) + + output_detected = False + while ncycles > 0: + _toggle_clk(sim) + # set/unset valids + if ncycles % ncycles_per_input == 0: + set_signal(sim, "tvalid", 1) + else: + set_signal(sim, "tvalid", 0) + + # check/update all fifo counts + for key in fifos: + current_state = sim.internals["finn_design_i"][key]["inst"][ + key + "_" + key + ]["state"] + current_addr = sim.internals["finn_design_i"][key]["inst"][ + key + "_" + key + ]["addr"] + if current_state == 2: + current_count = current_addr + 2 + else: + current_count = current_state + if current_count > fifos[key]: + fifos[key] = current_count + + # since latency estimation is very pessimistic, detect first output + # and fast-forward the sim + if get_signal(sim, "tvalid") != 0 and not output_detected: + ncycles = max_cycles + output_detected = True + else: + ncycles = ncycles - 1 + + if not output_detected: + warnings.warn( + "No output detected, calculated FIFO depths may not be correct" + ) + + # Apply depths back into the model; + # also set in/outFIFODepth to zero for non-FIFO + # nodes, preventing further FIFO insertion + for node in model.graph.node: + # set FIFO depth, reset FIFO implementation, + # and set implementation/ram styles + if node.op_type == "StreamingFIFO": + assert node.name in fifos, "FIFO node not found in size dictionary" + # set depth of FIFO + depth = optimize_depth(fifos[node.name]) + node_inst = getCustomOp(node) + node_inst.set_nodeattr("depth", depth) + # Set FIFO implementation/ram styles + if depth > self.max_qsrl_depth: + node_inst.set_nodeattr("impl_style", "vivado") + node_inst.set_nodeattr("ram_style", "auto") + else: + node_inst.set_nodeattr("impl_style", "rtl") + # reset implementation + reset_implementation(node_inst) + del fifos[node.name] + else: + 
getCustomOp(node).set_nodeattr("inFIFODepth", 0) + getCustomOp(node).set_nodeattr("outFIFODepth", 0) + # for every FC node we changed from external to decoupled, + # change back and reset implementation + if node.op_type == "StreamingFCLayer_Batch": + if node.name in modified_fc_nodes: + node_inst = getCustomOp(node) + node_inst.set_nodeattr("mem_mode", "external") + reset_implementation(node_inst) + modified_fc_nodes.remove(node.name) + + assert ( + len(modified_fc_nodes) == 0 and len(fifos.keys()) == 0 + ), "FIFO/FC nodes left untouched after model reconfiguration" + + # handle custom sizing for SWG FIFOs if desired + if self.swg_exception: + model = model.transform( + CapConvolutionFIFODepths(max_qsrl_depth=self.max_qsrl_depth) + ) + # remove shallow FIFOs + model = model.transform(RemoveShallowFIFOs()) + + return (model, False) diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py index 4eed1a260974e4f842e9e93756caff135c5fbdde..7a428b8592e0e67dd8561f1425482a006a79479a 100644 --- a/tests/end2end/test_end2end_bnn_pynq.py +++ b/tests/end2end/test_end2end_bnn_pynq.py @@ -82,8 +82,8 @@ from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.insert_dwc import InsertDWC -from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles +from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance from finn.core.modelwrapper import ModelWrapper from scipy.stats import linregress @@ -128,19 +128,17 @@ def update_dashboard_data(topology, wbits, abits, key, val): def fold_tfc(model): fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch") - # 
(PE, SIMD, in_fifo_depth, out_fifo_depth, ramstyle) for each layer + # (PE, SIMD, ramstyle) for each layer config = [ - (16, 49, 16, 64, "block"), - (8, 8, 64, 64, "auto"), - (8, 8, 64, 64, "auto"), - (10, 8, 64, 10, "distributed"), + (16, 49, "block"), + (8, 8, "auto"), + (8, 8, "auto"), + (10, 8, "distributed"), ] - for fcl, (pe, simd, ififo, ofifo, ramstyle) in zip(fc_layers, config): + for fcl, (pe, simd, ramstyle) in zip(fc_layers, config): fcl_inst = getCustomOp(fcl) fcl_inst.set_nodeattr("PE", pe) fcl_inst.set_nodeattr("SIMD", simd) - fcl_inst.set_nodeattr("inFIFODepth", ififo) - fcl_inst.set_nodeattr("outFIFODepth", ofifo) fcl_inst.set_nodeattr("ram_style", ramstyle) # set parallelism for input quantizer to be same as first layer's SIMD inp_qnt_node = model.get_nodes_by_op_type("Thresholding_Batch")[0] @@ -151,62 +149,56 @@ def fold_tfc(model): def fold_cnv_large(model): fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch") - # each tuple is (PE, SIMD, in_fifo_depth) for a layer + # each tuple is (PE, SIMD) for a layer folding = [ - (16, 3, 256), - (32, 32, 256), - (16, 32, 256), - (16, 32, 256), - (4, 32, 214), - (1, 32, 2), - (1, 4, 126), - (1, 8, 62), - (5, 1, 6), + (16, 3), + (32, 32), + (16, 32), + (16, 32), + (4, 32), + (1, 32), + (1, 4), + (1, 8), + (5, 1), ] - for fcl, (pe, simd, ififodepth) in zip(fc_layers, folding): + for fcl, (pe, simd) in zip(fc_layers, folding): fcl_inst = getCustomOp(fcl) fcl_inst.set_nodeattr("PE", pe) fcl_inst.set_nodeattr("SIMD", simd) - fcl_inst.set_nodeattr("inFIFODepth", ififodepth) swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator") - swg_idepth = [2, 51, 9, 106, 2, 2] for i in range(len(swg_layers)): swg_inst = getCustomOp(swg_layers[i]) simd = folding[i][1] swg_inst.set_nodeattr("SIMD", simd) - swg_inst.set_nodeattr("inFIFODepth", swg_idepth[i]) return model def fold_cnv_small(model): fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch") - # each tuple is (PE, SIMD, 
in_fifo_depth) for a layer + # each tuple is (PE, SIMD) for a layer folding = [ - (8, 3, 256, "auto"), - (16, 16, 256, "auto"), - (8, 16, 256, "auto"), - (8, 16, 256, "block"), - (4, 8, 214, "auto"), - (1, 8, 2, "auto"), - (1, 2, 126, "distributed"), - (2, 2, 62, "block"), - (5, 1, 6, "distributed"), + (8, 3, "auto"), + (16, 16, "auto"), + (8, 16, "auto"), + (8, 16, "block"), + (4, 8, "auto"), + (1, 8, "auto"), + (1, 2, "distributed"), + (2, 2, "block"), + (5, 1, "distributed"), ] - for fcl, (pe, simd, ififodepth, ramstyle) in zip(fc_layers, folding): + for fcl, (pe, simd, ramstyle) in zip(fc_layers, folding): fcl_inst = getCustomOp(fcl) fcl_inst.set_nodeattr("PE", pe) fcl_inst.set_nodeattr("SIMD", simd) - fcl_inst.set_nodeattr("inFIFODepth", ififodepth) fcl_inst.set_nodeattr("ram_style", ramstyle) swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator") - swg_idepth = [2, 51, 9, 106, 2, 2] for i in range(len(swg_layers)): swg_inst = getCustomOp(swg_layers[i]) simd = folding[i][1] swg_inst.set_nodeattr("SIMD", simd) - swg_inst.set_nodeattr("inFIFODepth", swg_idepth[i]) return model @@ -446,19 +438,41 @@ class TestEnd2End: model = model.transform(HLSSynthIP()) model.save(get_checkpoint_name(topology, wbits, abits, "ipgen_" + kind)) + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.parametrize("kind", ["zynq", "alveo"]) + def test_set_fifo_depths(self, topology, wbits, abits, kind): + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "ipgen_" + kind) + model = load_test_checkpoint_or_skip(prev_chkpt_name) + test_fpga_part = get_build_env(kind, target_clk_ns)["part"] + model = model.transform(InsertAndSetFIFODepths(test_fpga_part, target_clk_ns)) + fifo_layers = model.get_nodes_by_op_type("StreamingFIFO") + assert len(fifo_layers) > 0 + hls_layers = model.get_finn_nodes() + for node in hls_layers: + if node.op_type != "StreamingFIFO": + op_inst = getCustomOp(node) + assert op_inst.get_nodeattr("inFIFODepth") == 0 + assert 
op_inst.get_nodeattr("outFIFODepth") == 0 + model.save(get_checkpoint_name(topology, wbits, abits, "fifodepth_" + kind)) + @pytest.mark.slow @pytest.mark.vivado @pytest.mark.parametrize("kind", ["zynq"]) def test_ipstitch_rtlsim(self, topology, wbits, abits, kind): - prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "ipgen_" + kind) + prev_chkpt_name = get_checkpoint_name( + topology, wbits, abits, "fifodepth_" + kind + ) model = load_test_checkpoint_or_skip(prev_chkpt_name) test_fpga_part = get_build_env(kind, target_clk_ns)["part"] model = model.transform(InsertDWC()) - model = model.transform(InsertFIFO()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(AnnotateCycles()) perf = model.analysis(dataflow_performance) latency = perf["critical_path_cycles"] + # rtlsim only supports impl_style=rtl for StreamingFIFO, ensure that + for fifo_layer in model.get_nodes_by_op_type("StreamingFIFO"): + getCustomOp(fifo_layer).set_nodeattr("impl_style", "rtl") model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) @@ -533,7 +547,9 @@ class TestEnd2End: def test_build(self, topology, wbits, abits, kind): if kind == "alveo" and ("VITIS_PATH" not in os.environ): pytest.skip("VITIS_PATH not set") - prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "ipgen_" + kind) + prev_chkpt_name = get_checkpoint_name( + topology, wbits, abits, "fifodepth_" + kind + ) model = load_test_checkpoint_or_skip(prev_chkpt_name) cfg = get_build_env(kind, target_clk_ns) model = model.transform(cfg["build_fxn"])