diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh
index 6dcf99e7589e4e0e6c50b626ea53948f6153ae3c..e13f0d0211ce4c140c8ccba1a4d4832cf1fc2a17 100644
--- a/docker/finn_entrypoint.sh
+++ b/docker/finn_entrypoint.sh
@@ -12,11 +12,11 @@ gecho () {
 
 # checkout the correct dependency repo commits
 # the repos themselves are cloned in the Dockerfile
-FINN_BASE_COMMIT=ceb1219b5aba396dde41967a929e1f08887653ce
+FINN_BASE_COMMIT=c4d8885e38a55f9bb7424bde76d35a3e000c5a7e
 BREVITAS_COMMIT=6ffefa8dbf37fdb0f44c994f34604c29fadb16b0
 CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4
 HLSLIB_COMMIT=cfafe11a93b79ab1af7529d68f08886913a6466e
-PYVERILATOR_COMMIT=c97a5ba41bbc7c419d6f25c74cdf3bdc3393174f
+PYVERILATOR_COMMIT=06c29ecf3ba0361e3d0a75c98f6918ba67bf0e27
 OMX_COMMIT=1bae737669901e762f581af73348332b5c4b2ada
 
 gecho "Setting up known-good commit versions for FINN dependencies"
diff --git a/finn-rtllib/memstream/hdl/Q_srl.v b/finn-rtllib/memstream/hdl/Q_srl.v
index 6c619c51ceb4a99a077fc61c52ce81763cfd27f5..b4e89628a44bb1f55c3445ee8e6866beada23585 100644
--- a/finn-rtllib/memstream/hdl/Q_srl.v
+++ b/finn-rtllib/memstream/hdl/Q_srl.v
@@ -74,31 +74,7 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count);
    parameter depth = 16;   // - greatest #items in queue  (2 <= depth <= 256)
    parameter width = 16;   // - width of data (i_d, o_d)
 
-   `define LOG2 (  (((depth))     ==0) ? 0	/* - depth==0   LOG2=0 */ \
-		 : (((depth-1)>>0)==0) ? 0	/* - depth<=1   LOG2=0 */ \
-		 : (((depth-1)>>1)==0) ? 1	/* - depth<=2   LOG2=1 */ \
-		 : (((depth-1)>>2)==0) ? 2	/* - depth<=4   LOG2=2 */ \
-		 : (((depth-1)>>3)==0) ? 3	/* - depth<=8   LOG2=3 */ \
-		 : (((depth-1)>>4)==0) ? 4	/* - depth<=16  LOG2=4 */ \
-		 : (((depth-1)>>5)==0) ? 5	/* - depth<=32  LOG2=5 */ \
-		 : (((depth-1)>>6)==0) ? 6	/* - depth<=64  LOG2=6 */ \
-		 : (((depth-1)>>7)==0) ? 7	/* - depth<=128 LOG2=7 */ \
-		 :                       8)	/* - depth<=256 LOG2=8 */
-
-// parameter addrwidth = LOG2;			// - width of queue addr
-
-   parameter addrwidth =
-		(  (((depth))     ==0) ? 0	// - depth==0   LOG2=0
-		 : (((depth-1)>>0)==0) ? 0	// - depth<=1   LOG2=0
-		 : (((depth-1)>>1)==0) ? 1	// - depth<=2   LOG2=1
-		 : (((depth-1)>>2)==0) ? 2	// - depth<=4   LOG2=2
-		 : (((depth-1)>>3)==0) ? 3	// - depth<=8   LOG2=3
-		 : (((depth-1)>>4)==0) ? 4	// - depth<=16  LOG2=4
-		 : (((depth-1)>>5)==0) ? 5	// - depth<=32  LOG2=5
-		 : (((depth-1)>>6)==0) ? 6	// - depth<=64  LOG2=6
-		 : (((depth-1)>>7)==0) ? 7	// - depth<=128 LOG2=7
-		 :                       8)	// - depth<=256 LOG2=8
-		 ;
+   parameter addrwidth = $clog2(depth);
 
    input     clock;
    input     reset;
diff --git a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
index f9a9dc4340b18578550a9c453d90de86234d1cad..95ecc5f10525456e7f5a6d838e0850adaee5415f 100644
--- a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
@@ -48,7 +48,7 @@ class FMPadding_Batch(HLSCustomOp):
         simd = self.get_nodeattr("SIMD")
         batch_size = self.get_nodeattr("numInputVectors")
         exp_cycles = (channels / simd) * batch_size * odim * odim
-        return exp_cycles
+        return int(exp_cycles)
 
     def get_normal_input_shape(self):
         idim = self.get_nodeattr("ImgDim")
diff --git a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
index 1a75858880a072345ef942ca91feabf0bec9ab36..56f1a9d56d9da7057e3cbe61f3d92877e58087d6 100644
--- a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
@@ -187,7 +187,7 @@ class GlobalAccPool_Batch(HLSCustomOp):
         ch = self.get_nodeattr("NumChannels")
         pe = self.get_nodeattr("PE")
         folds = int(ch / pe)
-        return np.prod(self.get_folded_input_shape()[:-1]) + folds
+        return int(np.prod(self.get_folded_input_shape()[:-1]) + folds)
 
     def execute_node(self, context, graph):
         mode = self.get_nodeattr("exec_mode")
diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py
index e2f96395ad74255ad67549255608cd52737e97d9..6b422ed17267f110d97a95cad166baf6f9aee890 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfifo.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py
@@ -30,6 +30,7 @@ import numpy as np
 from shutil import copy
 import subprocess
 import math
+import warnings
 
 from finn.custom_op.fpgadataflow import HLSCustomOp
 from finn.core.datatype import DataType
@@ -178,14 +179,11 @@ class StreamingFIFO(HLSCustomOp):
         depth = self.get_nodeattr("depth")
         # depth has to be between 2 and 256 with the current
         # StreamingFIFO implementation
-        assert (
-            depth >= 2
-        ), """Depth is too low. Please set node attribute "depth" to a value
-        between 2 and 256"""
-        assert (
-            depth <= 256
-        ), """Depth is too high. Please set node attribute "depth" to a value
-        between 2 and 256"""
+        assert depth >= 2, """Depth is too low"""
+        if depth > 256 and self.get_nodeattr("impl_style") == "rtl":
+            warnings.warn(
+                "Depth is high, set between 2 and 256 for efficient SRL implementation"
+            )
         # derive normal shape from folded shape
         # StreamingFIFOs are inserted in between fpgadataflow nodes
         # the folded shape could be for example (1, nf, pe)
@@ -424,7 +422,6 @@ class StreamingFIFO(HLSCustomOp):
         else:
             return (math.ceil(depth / 4096)) * (math.ceil(W / 72))
 
-
     def bram_efficiency_estimation(self):
         depth = self.get_nodeattr("depth")
         W = self.get_instream_width()
@@ -451,3 +448,9 @@ class StreamingFIFO(HLSCustomOp):
 
         return int(address_luts + ram_luts)
 
+    def prepare_rtlsim(self):
+        """Refuse impl_style=vivado, then run the inherited rtlsim preparation."""
+        why = "StreamingFIFO impl_style cannot be vivado for rtlsim. "
+        why += "Only impl_style=rtl supported."
+        assert self.get_nodeattr("impl_style") != "vivado", why
+        super().prepare_rtlsim()
diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
index 4c772358648f402467cee628afe410d7bce83ede..53bcab993b25173c8620d7f4a6694a8efaf74c4d 100644
--- a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
@@ -99,7 +99,7 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         # derived from StreamingMaxPool_Batch loop nest
         k = self.get_nodeattr("PoolDim")
         ifm_dim = self.get_nodeattr("ImgDim")
-        return ifm_dim * (ifm_dim + (ifm_dim / k))
+        return int(ifm_dim * (ifm_dim + (ifm_dim / k)))
 
     def get_instream_width(self):
         dt_bits = self.get_input_datatype().bitwidth()
diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py
index 1c26642888f29e8dd08046e4de01ae8fa62b10e7..a3056aaa15a5f00cdc7b33f5dba83820c76dfa10 100644
--- a/src/finn/transformation/fpgadataflow/insert_fifo.py
+++ b/src/finn/transformation/fpgadataflow/insert_fifo.py
@@ -4,6 +4,7 @@ from onnx import helper as oh
 from finn.custom_op.registry import getCustomOp
 from finn.transformation.base import Transformation
 from finn.util.fpgadataflow import is_fpgadataflow_node
+import warnings
 import numpy as np
 
 
@@ -56,66 +57,81 @@ class InsertFIFO(Transformation):
         for n in graph.node:
             node_ind += 1
             if _suitable_node(n):
-                n_output = n.output[0]
-                consumer = model.find_consumer(n_output)
-                if _suitable_node(consumer) is True:
-                    n0 = getCustomOp(n)
-                    # determine fifo node attributes
-                    fld_shape = n0.get_folded_output_shape()
-                    dtype = n0.get_output_datatype()
-
-                    # check if folded_shape of output of first node and
-                    # input of the second node is equal
-                    n1 = getCustomOp(consumer)
-                    fld_shape_2 = n1.get_folded_input_shape()
-                    assert _suitable_folded_shapes(
-                        fld_shape, fld_shape_2
-                    ), """The
-                    folded output shape of the first node is not the same as the
-                    folded output shape of the second node. A streaming fifo can't
-                    be implemented in between these nodes."""
-
-                    # check if outFIFOdepth attribute of first node
-                    # and inFIFOdepth attribute of consumer node is equal
-                    n0_depth = n0.get_nodeattr("outFIFODepth")
-                    n1_depth = n1.get_nodeattr("inFIFODepth")
-                    if n0_depth == n1_depth:
-                        fifo_depth = n0_depth
-                    elif n0_depth != n1_depth:
-                        fifo_depth = max(n0_depth, n1_depth)
-
-                    if fifo_depth > 2:
-                        # assumption: HLS streaming components already have
-                        # depth-2 FIFOs on inputs and outputs, so no point
-                        # creating additional small FIFOs in between --
-                        # we only create the larger FIFOs specified
-                        # create fifo node
-                        fifo_output_tensor = oh.make_tensor_value_info(
-                            model.make_new_valueinfo_name(),
-                            TensorProto.FLOAT,
-                            n0.get_normal_output_shape(),
+                for n_output in n.output:
+                    consumers = model.find_consumers(n_output)
+                    if consumers is None:
+                        continue
+                    if len(consumers) > 1:
+                        warnings.warn(
+                            n.name
+                            + ": HLS node with fan-out higher than 1 cannot be stitched"
                         )
-                        graph.value_info.append(fifo_output_tensor)
-                        model.set_tensor_datatype(fifo_output_tensor.name, dtype)
-
-                        fifo_node = oh.make_node(
-                            "StreamingFIFO",
-                            [n_output],
-                            [fifo_output_tensor.name],
-                            domain="finn",
-                            backend="fpgadataflow",
-                            depth=fifo_depth,
-                            folded_shape=fld_shape,
-                            dataType=str(dtype.name),
-                        )
-                        # insert fifo
-                        graph.node.insert(node_ind + 1, fifo_node)
-                        # set fifo output tensor as new input tensor of second node
-                        consumer.input[0] = fifo_output_tensor.name
-                        # ensure created FIFO depth is reflected on both sides
-                        n0.set_nodeattr("outFIFODepth", fifo_depth)
-                        n1.set_nodeattr("inFIFODepth", fifo_depth)
-                        graph_modified = True
+                    consumer = consumers[0]
+                    if _suitable_node(consumer) is True:
+                        n0 = getCustomOp(n)
+                        # determine fifo node attributes
+                        fld_shape = n0.get_folded_output_shape()
+                        dtype = n0.get_output_datatype()
+
+                        # check if folded_shape of output of first node and
+                        # input of the second node is equal
+                        n1 = getCustomOp(consumer)
+                        for idx, inp in enumerate(consumer.input):
+                            if inp == n_output:
+                                if idx == 0:
+                                    fld_shape_2 = n1.get_folded_input_shape()
+                                else:
+                                    fld_shape_2 = n1.get_folded_input_shape(ind=idx)
+                        assert _suitable_folded_shapes(
+                            fld_shape, fld_shape_2
+                        ), """The
+                        folded output shape of the first node is not the same as the
+                        folded output shape of the second node. A streaming fifo can't
+                        be implemented in between these nodes."""
+
+                        # check if outFIFOdepth attribute of first node
+                        # and inFIFOdepth attribute of consumer node is equal
+                        n0_depth = n0.get_nodeattr("outFIFODepth")
+                        n1_depth = n1.get_nodeattr("inFIFODepth")
+                        if n0_depth == n1_depth:
+                            fifo_depth = n0_depth
+                        elif n0_depth != n1_depth:
+                            fifo_depth = max(n0_depth, n1_depth)
+
+                        if fifo_depth > 2:
+                            # assumption: HLS streaming components already have
+                            # depth-2 FIFOs on inputs and outputs, so no point
+                            # creating additional small FIFOs in between --
+                            # we only create the larger FIFOs specified
+                            # create fifo node
+                            fifo_output_tensor = oh.make_tensor_value_info(
+                                model.make_new_valueinfo_name(),
+                                TensorProto.FLOAT,
+                                n0.get_normal_output_shape(),
+                            )
+                            graph.value_info.append(fifo_output_tensor)
+                            model.set_tensor_datatype(fifo_output_tensor.name, dtype)
+
+                            fifo_node = oh.make_node(
+                                "StreamingFIFO",
+                                [n_output],
+                                [fifo_output_tensor.name],
+                                domain="finn",
+                                backend="fpgadataflow",
+                                depth=fifo_depth,
+                                folded_shape=fld_shape,
+                                dataType=str(dtype.name),
+                            )
+                            # insert fifo
+                            graph.node.insert(node_ind + 1, fifo_node)
+                            # set fifo output tensor as new input tensor of second node
+                            for idx, inp in enumerate(consumer.input):
+                                if inp == n_output:
+                                    consumer.input[idx] = fifo_output_tensor.name
+                            # ensure created FIFO depth is reflected on both sides
+                            n0.set_nodeattr("outFIFODepth", fifo_depth)
+                            n1.set_nodeattr("inFIFODepth", fifo_depth)
+                            graph_modified = True
 
         if graph_modified is False:
             # insert FIFO as first node, except when first node is DMA
@@ -131,6 +147,10 @@ class InsertFIFO(Transformation):
                 dtype = n0.get_input_datatype()
                 fifo_depth = n0.get_nodeattr("inFIFODepth")
 
+                if fifo_depth <= 2:
+                    warnings.warn("Overriding input FIFO depth to 32")
+                    fifo_depth = 32
+
                 # create fifo node
                 fifo_output_tensor = oh.make_tensor_value_info(
                     model.make_new_valueinfo_name(),
@@ -173,6 +193,10 @@ class InsertFIFO(Transformation):
                 dtype = n0.get_output_datatype()
                 fifo_depth = n0.get_nodeattr("outFIFODepth")
 
+                if fifo_depth <= 2:
+                    warnings.warn("Overriding output FIFO depth to 32")
+                    fifo_depth = 32
+
                 # create fifo node
                 fifo_input_tensor = oh.make_tensor_value_info(
                     model.make_new_valueinfo_name(),
@@ -180,7 +204,7 @@ class InsertFIFO(Transformation):
                     n0.get_normal_output_shape(),
                 )
                 graph.value_info.append(fifo_input_tensor)
-                model.set_tensor_datatype(fifo_output_tensor.name, dtype)
+                model.set_tensor_datatype(fifo_input_tensor.name, dtype)
 
                 fifo_node = oh.make_node(
                     "StreamingFIFO",
diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
new file mode 100644
index 0000000000000000000000000000000000000000..713148d7fcdfea4411554b6d3b817a14b33a53c6
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
@@ -0,0 +1,392 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import math
+import numpy as np
+import warnings
+from finn.custom_op.registry import getCustomOp
+from finn.transformation.base import Transformation
+from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles
+from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
+from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
+from finn.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames
+from finn.core.rtlsim_exec import (
+    _reset_rtlsim,
+    _toggle_clk,
+)
+from finn.util.fpgadataflow import pyverilate_stitched_ip, is_fpgadataflow_node
+
+
+def reset_implementation(node):
+    """Blank the node's code-generation / IP path attributes."""
+    for attr_name in ("code_gen_dir_ipgen", "ipgen_path", "ip_path"):
+        node.set_nodeattr(attr_name, "")
+
+
+def set_signal(sim, keyw, value):
+    """Write value to every sim input whose name contains keyw."""
+    for port in sim.inputs:
+        if keyw in port[0]:
+            sim.io[port[0]] = value
+
+
+def get_signal(sim, keyw):
+    """Return the first sim output whose name contains keyw, else None."""
+    for port in sim.outputs:
+        if keyw in port[0]:
+            return sim.io[port[0]]
+
+
+def optimize_depth(depth):
+    """Snap a requested FIFO depth to 2, 32, or the next power of two."""
+    if depth <= 2:
+        return 2
+    if depth <= 32:
+        # Q_srl FIFOs do not benefit from size < 32, so add some slack
+        return 32
+    # round up to the next power of two for Vivado IP FIFO implementation
+    return int(2 ** math.ceil(math.log2(depth)))
+
+
+class RemoveShallowFIFOs(Transformation):
+    """Remove small FIFOs as the streaming components have depth-2 FIFOs on the
+    input/outputs by default."""
+
+    # TODO add unit test
+
+    def __init__(self, shallow_threshold=2):
+        super().__init__()
+        self.shallow_threshold = shallow_threshold
+
+    def apply(self, model):
+        shallow_fifos = []
+        for node in model.graph.node:
+            if node.op_type != "StreamingFIFO":
+                continue
+            if getCustomOp(node).get_nodeattr("depth") > self.shallow_threshold:
+                continue
+            shallow_fifos.append(node)
+            consumers = model.find_consumers(node.output[0])
+            if consumers is None:
+                # nothing reads the FIFO: its producer takes over the output name
+                producer = model.find_producer(node.input[0])
+                for idx, out in enumerate(producer.output):
+                    if out == node.input[0]:
+                        producer.output[idx] = node.output[0]
+                continue
+            assert len(consumers) == 1, "Fanout detected from FIFO output"
+            # the sole consumer reads the FIFO's input tensor directly instead
+            for idx, inp in enumerate(consumers[0].input):
+                if inp == node.output[0]:
+                    consumers[0].input[idx] = node.input[0]
+        # unlink the bypassed FIFOs only after the scan, not while iterating
+        for node_to_remove in shallow_fifos:
+            model.graph.node.remove(node_to_remove)
+
+        return (model, False)
+
+
+class CapConvolutionFIFODepths(Transformation):
+    """Make the size of FIFOs for convolution layers smaller where possible.
+    Will be automatically called from InsertAndSetFIFODepths if the appropriate
+    constructor flag is set.
+
+    Constructor arguments:
+    - max_qsrl_depth : FIFOs deeper than this will use Vivado IP instead of
+                       Verilog FIFOs (Q_srl.v)
+
+    Assumed input graph properties:
+    - all nodes are fpgadataflow nodes
+    - FIFOs inserted with InsertAndSetFIFODepths
+
+    Output:
+    - graph with smaller-depth FIFOs for convolutions
+
+    Background:
+    The simulation-based rtlsim_exec tends to overestimate the required depth
+    of FIFOs between the ConvolutionInputGenerator (here called SWG) and the
+    StreamingFCLayer (here called MVAU). As the SWG has an internal buffer of 1
+    image row, we use this as a rule of thumb to set FIFO depth to be no larger
+    than 1 row.
+    """
+
+    # TODO add unit test
+
+    def __init__(self, max_qsrl_depth=256):
+        super().__init__()
+        self.max_qsrl_depth = max_qsrl_depth
+
+    def apply(self, model):
+        # only FIFO node attributes are edited here; no nodes are added or removed
+        for node in model.graph.node:
+            # look for following pattern:
+            # ConvolutionInputGenerator -> StreamingFIFO -> StreamingFCLayer
+            if node.op_type == "StreamingFIFO":
+                fifo_prod = model.find_producer(node.input[0])
+                fifo_cons = model.find_consumer(node.output[0])
+                if fifo_prod is None:
+                    continue
+                if fifo_prod.op_type != "ConvolutionInputGenerator":
+                    continue
+                if fifo_cons is None:
+                    continue
+                if fifo_cons.op_type != "StreamingFCLayer_Batch":
+                    continue
+                op_inst = getCustomOp(node)
+                depth = op_inst.get_nodeattr("depth")
+                # SWG has an internal buffer of 1 row, so we use this as a
+                # rule of thumb to set FIFO depth to be no larger than 1 row
+                (bs, h, w, ifold, simd) = op_inst.get_folded_input_shape()
+                new_depth = optimize_depth(w * ifold)
+                new_depth = min(new_depth, depth)
+                op_inst.set_nodeattr("depth", new_depth)
+                # Set FIFO implementation/ram styles
+                if new_depth > self.max_qsrl_depth:
+                    op_inst.set_nodeattr("impl_style", "vivado")
+                    op_inst.set_nodeattr("ram_style", "auto")
+                else:
+                    op_inst.set_nodeattr("impl_style", "rtl")
+
+        return (model, False)
+
+
+class InsertAndSetFIFODepths(Transformation):
+    """Insert appropriate-depth StreamingFIFOs through RTLSim that preserve
+    throughput in the created accelerator.
+
+    Constructor arguments:
+    - fpgapart, clk_ns : FPGA part and clock period (used for IP preparation)
+    - max_qsrl_depth : FIFOs deeper than this will use Vivado IP instead of
+                       Verilog FIFOs (Q_srl.v)
+    - max_depth : how deep the "max"-sized FIFOs initially inserted will be
+    - swg_exception : call CapConvolutionFIFODepths to make convolution FIFOs
+                        smaller where appropriate
+
+    Assumed input graph properties:
+    - all nodes are fpgadataflow nodes
+    - no FIFOs inserted,
+    - (inFIFODepth/outFIFODepth attrs will be ignored)
+
+    Output:
+    - graph with appropriate-depth FIFOs inserted
+
+    Background:
+    Even with all FINN HLS fpgadataflow layers appropriately parallelized, it is
+    necessary to insert FIFOs between them to prevent stalls due to bursty
+    behavior. The sizes of those FIFOs are hard to predict analytically, so
+    we do the following:
+    - insert very deep (default 16k deep) FIFOs between all fpgadataflow nodes
+    - create stitched design
+    - run through rtlsim with stream of multiple random input images (to fill pipeline)
+    - keep track of observed maximum occupancy for each FIFO during rtlsim
+    - when sim finished, update each FIFO depth to maximum observed occupancy
+      and set inFIFODepth/outFIFODepth attrs to 0 on relevant nodes
+    """
+
+    def __init__(
+        self,
+        fpgapart,
+        clk_ns=10.0,
+        max_qsrl_depth=256,
+        max_depth=2 ** 14,
+        swg_exception=True,
+    ):
+        super().__init__()
+        self.fpgapart = fpgapart
+        self.clk_ns = clk_ns
+        self.max_qsrl_depth = max_qsrl_depth
+        self.max_depth = max_depth
+        self.swg_exception = swg_exception
+
+    def apply(self, model):
+        # change external to decoupled and warn user
+        # this way we are sure we have exactly one input/output
+        modified_fc_nodes = []
+        for node in model.graph.node:
+            # verify assumptions
+            assert is_fpgadataflow_node(node), "Found non-fpgadataflow node: " + str(
+                node
+            )
+            assert node.op_type != "StreamingFIFO", "Found existing StreamingFIFO node"
+            node = getCustomOp(node)
+            node.set_nodeattr("inFIFODepth", self.max_depth)
+            node.set_nodeattr("outFIFODepth", self.max_depth)
+            if node.onnx_node.op_type == "StreamingFCLayer_Batch":
+                mmode = node.get_nodeattr("mem_mode")
+                if mmode == "external":
+                    modified_fc_nodes.append(node.onnx_node.name)
+                    node.set_nodeattr("mem_mode", "decoupled")
+                    reset_implementation(node)
+                    warnings.warn(
+                        "Changed mem_mode from external to decoupled for "
+                        + node.onnx_node.name
+                    )
+
+        # insert stream infrastructure (DWC/FIFO)
+        model = model.transform(InsertDWC())
+        model = model.transform(InsertFIFO())
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(GiveReadableTensorNames())
+
+        # gather FIFO names, check they are of expected depth
+        fifos = {}
+        for node in model.graph.node:
+            if node.op_type == "StreamingFIFO":
+                fifos[node.name] = 0
+                node = getCustomOp(node)
+                # check depths and fix as necessary
+                if node.get_nodeattr("depth") != self.max_depth:
+                    node.set_nodeattr("depth", self.max_depth)
+
+        # annotate cycles and build the stitched IP that rtlsim will simulate
+        model = model.transform(AnnotateCycles())
+        perf = model.analysis(dataflow_performance)
+        latency = perf["critical_path_cycles"]
+        max_cycles = perf["max_cycles"]
+        model = model.transform(PrepareIP(self.fpgapart, self.clk_ns))
+        model = model.transform(HLSSynthIP())
+        model = model.transform(CreateStitchedIP(self.fpgapart, self.clk_ns))
+        model.set_metadata_prop("exec_mode", "rtlsim")
+
+        # calculate input frequency (number of cycles for each input word)
+        first_node = getCustomOp(model.graph.node[0])
+        ncycles_per_input = max(
+            1,
+            int(
+                math.ceil(
+                    perf["max_cycles"]
+                    / (
+                        np.prod(first_node.get_folded_input_shape())
+                        / first_node.get_folded_input_shape()[-1]
+                    )
+                )
+            ),
+        )
+
+        # set sufficiently large threshold for 1 image to fully execute and exit
+        ncycles = int(latency + max_cycles)
+
+        # prepare pyverilator model
+        sim = pyverilate_stitched_ip(model)
+
+        _reset_rtlsim(sim)
+        _toggle_clk(sim)
+
+        # set all input valids to 0 and output readies to 1
+        # set input data to some constant
+        set_signal(sim, "tvalid", 0)
+        set_signal(sim, "tready", 1)
+        set_signal(sim, "tdata", 0)
+
+        output_detected = False
+        while ncycles > 0:
+            _toggle_clk(sim)
+            # set/unset valids
+            if ncycles % ncycles_per_input == 0:
+                set_signal(sim, "tvalid", 1)
+            else:
+                set_signal(sim, "tvalid", 0)
+
+            # check/update all fifo counts
+            for key in fifos:
+                current_state = sim.internals["finn_design_i"][key]["inst"][
+                    key + "_" + key
+                ]["state"]
+                current_addr = sim.internals["finn_design_i"][key]["inst"][
+                    key + "_" + key
+                ]["addr"]
+                if current_state == 2:
+                    current_count = current_addr + 2
+                else:
+                    current_count = current_state
+                if current_count > fifos[key]:
+                    fifos[key] = current_count
+
+            # since latency estimation is very pessimistic, detect first output
+            # and fast-forward the sim
+            if get_signal(sim, "tvalid") != 0 and not output_detected:
+                ncycles = max_cycles
+                output_detected = True
+            else:
+                ncycles = ncycles - 1
+
+        if not output_detected:
+            warnings.warn(
+                "No output detected, calculated FIFO depths may not be correct"
+            )
+
+        # Apply depths back into the model;
+        # also set in/outFIFODepth to zero for non-FIFO
+        # nodes, preventing further FIFO insertion
+        for node in model.graph.node:
+            # set FIFO depth, reset FIFO implementation,
+            # and set implementation/ram styles
+            if node.op_type == "StreamingFIFO":
+                assert node.name in fifos, "FIFO node not found in size dictionary"
+                # set depth of FIFO
+                depth = optimize_depth(fifos[node.name])
+                node_inst = getCustomOp(node)
+                node_inst.set_nodeattr("depth", depth)
+                # Set FIFO implementation/ram styles
+                if depth > self.max_qsrl_depth:
+                    node_inst.set_nodeattr("impl_style", "vivado")
+                    node_inst.set_nodeattr("ram_style", "auto")
+                else:
+                    node_inst.set_nodeattr("impl_style", "rtl")
+                # reset implementation
+                reset_implementation(node_inst)
+                del fifos[node.name]
+            else:
+                getCustomOp(node).set_nodeattr("inFIFODepth", 0)
+                getCustomOp(node).set_nodeattr("outFIFODepth", 0)
+                # for every FC node we changed from external to decoupled,
+                # change back and reset implementation
+                if node.op_type == "StreamingFCLayer_Batch":
+                    if node.name in modified_fc_nodes:
+                        node_inst = getCustomOp(node)
+                        node_inst.set_nodeattr("mem_mode", "external")
+                        reset_implementation(node_inst)
+                        modified_fc_nodes.remove(node.name)
+
+        assert (
+            len(modified_fc_nodes) == 0 and len(fifos.keys()) == 0
+        ), "FIFO/FC nodes left untouched after model reconfiguration"
+
+        # handle custom sizing for SWG FIFOs if desired
+        if self.swg_exception:
+            model = model.transform(
+                CapConvolutionFIFODepths(max_qsrl_depth=self.max_qsrl_depth)
+            )
+        # remove shallow FIFOs
+        model = model.transform(RemoveShallowFIFOs())
+
+        return (model, False)
diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py
index 4eed1a260974e4f842e9e93756caff135c5fbdde..7a428b8592e0e67dd8561f1425482a006a79479a 100644
--- a/tests/end2end/test_end2end_bnn_pynq.py
+++ b/tests/end2end/test_end2end_bnn_pynq.py
@@ -82,8 +82,8 @@ from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
-from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
 from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles
+from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths
 from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
 from finn.core.modelwrapper import ModelWrapper
 from scipy.stats import linregress
@@ -128,19 +128,17 @@ def update_dashboard_data(topology, wbits, abits, key, val):
 
 def fold_tfc(model):
     fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
-    # (PE, SIMD, in_fifo_depth, out_fifo_depth, ramstyle) for each layer
+    # (PE, SIMD, ramstyle) for each layer
     config = [
-        (16, 49, 16, 64, "block"),
-        (8, 8, 64, 64, "auto"),
-        (8, 8, 64, 64, "auto"),
-        (10, 8, 64, 10, "distributed"),
+        (16, 49, "block"),
+        (8, 8, "auto"),
+        (8, 8, "auto"),
+        (10, 8, "distributed"),
     ]
-    for fcl, (pe, simd, ififo, ofifo, ramstyle) in zip(fc_layers, config):
+    for fcl, (pe, simd, ramstyle) in zip(fc_layers, config):
         fcl_inst = getCustomOp(fcl)
         fcl_inst.set_nodeattr("PE", pe)
         fcl_inst.set_nodeattr("SIMD", simd)
-        fcl_inst.set_nodeattr("inFIFODepth", ififo)
-        fcl_inst.set_nodeattr("outFIFODepth", ofifo)
         fcl_inst.set_nodeattr("ram_style", ramstyle)
     # set parallelism for input quantizer to be same as first layer's SIMD
     inp_qnt_node = model.get_nodes_by_op_type("Thresholding_Batch")[0]
@@ -151,62 +149,56 @@ def fold_tfc(model):
 
 def fold_cnv_large(model):
     fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
-    # each tuple is (PE, SIMD, in_fifo_depth) for a layer
+    # each tuple is (PE, SIMD) for a layer
     folding = [
-        (16, 3, 256),
-        (32, 32, 256),
-        (16, 32, 256),
-        (16, 32, 256),
-        (4, 32, 214),
-        (1, 32, 2),
-        (1, 4, 126),
-        (1, 8, 62),
-        (5, 1, 6),
+        (16, 3),
+        (32, 32),
+        (16, 32),
+        (16, 32),
+        (4, 32),
+        (1, 32),
+        (1, 4),
+        (1, 8),
+        (5, 1),
     ]
-    for fcl, (pe, simd, ififodepth) in zip(fc_layers, folding):
+    for fcl, (pe, simd) in zip(fc_layers, folding):
         fcl_inst = getCustomOp(fcl)
         fcl_inst.set_nodeattr("PE", pe)
         fcl_inst.set_nodeattr("SIMD", simd)
-        fcl_inst.set_nodeattr("inFIFODepth", ififodepth)
 
     swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator")
-    swg_idepth = [2, 51, 9, 106, 2, 2]
     for i in range(len(swg_layers)):
         swg_inst = getCustomOp(swg_layers[i])
         simd = folding[i][1]
         swg_inst.set_nodeattr("SIMD", simd)
-        swg_inst.set_nodeattr("inFIFODepth", swg_idepth[i])
     return model
 
 
 def fold_cnv_small(model):
     fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
-    # each tuple is (PE, SIMD, in_fifo_depth) for a layer
+    # each tuple is (PE, SIMD, ramstyle) for a layer
     folding = [
-        (8, 3, 256, "auto"),
-        (16, 16, 256, "auto"),
-        (8, 16, 256, "auto"),
-        (8, 16, 256, "block"),
-        (4, 8, 214, "auto"),
-        (1, 8, 2, "auto"),
-        (1, 2, 126, "distributed"),
-        (2, 2, 62, "block"),
-        (5, 1, 6, "distributed"),
+        (8, 3, "auto"),
+        (16, 16, "auto"),
+        (8, 16, "auto"),
+        (8, 16, "block"),
+        (4, 8, "auto"),
+        (1, 8, "auto"),
+        (1, 2, "distributed"),
+        (2, 2, "block"),
+        (5, 1, "distributed"),
     ]
-    for fcl, (pe, simd, ififodepth, ramstyle) in zip(fc_layers, folding):
+    for fcl, (pe, simd, ramstyle) in zip(fc_layers, folding):
         fcl_inst = getCustomOp(fcl)
         fcl_inst.set_nodeattr("PE", pe)
         fcl_inst.set_nodeattr("SIMD", simd)
-        fcl_inst.set_nodeattr("inFIFODepth", ififodepth)
         fcl_inst.set_nodeattr("ram_style", ramstyle)
 
     swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator")
-    swg_idepth = [2, 51, 9, 106, 2, 2]
     for i in range(len(swg_layers)):
         swg_inst = getCustomOp(swg_layers[i])
         simd = folding[i][1]
         swg_inst.set_nodeattr("SIMD", simd)
-        swg_inst.set_nodeattr("inFIFODepth", swg_idepth[i])
     return model
 
 
@@ -446,19 +438,41 @@ class TestEnd2End:
         model = model.transform(HLSSynthIP())
         model.save(get_checkpoint_name(topology, wbits, abits, "ipgen_" + kind))
 
+    @pytest.mark.slow
+    @pytest.mark.vivado
+    @pytest.mark.parametrize("kind", ["zynq", "alveo"])
+    def test_set_fifo_depths(self, topology, wbits, abits, kind):
+        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "ipgen_" + kind)
+        model = load_test_checkpoint_or_skip(prev_chkpt_name)
+        test_fpga_part = get_build_env(kind, target_clk_ns)["part"]
+        model = model.transform(InsertAndSetFIFODepths(test_fpga_part, target_clk_ns))
+        fifo_layers = model.get_nodes_by_op_type("StreamingFIFO")
+        assert len(fifo_layers) > 0
+        hls_layers = model.get_finn_nodes()
+        for node in hls_layers:
+            if node.op_type != "StreamingFIFO":
+                op_inst = getCustomOp(node)
+                assert op_inst.get_nodeattr("inFIFODepth") == 0
+                assert op_inst.get_nodeattr("outFIFODepth") == 0
+        model.save(get_checkpoint_name(topology, wbits, abits, "fifodepth_" + kind))
+
     @pytest.mark.slow
     @pytest.mark.vivado
     @pytest.mark.parametrize("kind", ["zynq"])
     def test_ipstitch_rtlsim(self, topology, wbits, abits, kind):
-        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "ipgen_" + kind)
+        prev_chkpt_name = get_checkpoint_name(
+            topology, wbits, abits, "fifodepth_" + kind
+        )
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         test_fpga_part = get_build_env(kind, target_clk_ns)["part"]
         model = model.transform(InsertDWC())
-        model = model.transform(InsertFIFO())
         model = model.transform(GiveUniqueNodeNames())
         model = model.transform(AnnotateCycles())
         perf = model.analysis(dataflow_performance)
         latency = perf["critical_path_cycles"]
+        # rtlsim only supports impl_style=rtl for StreamingFIFO, so enforce it here
+        for fifo_layer in model.get_nodes_by_op_type("StreamingFIFO"):
+            getCustomOp(fifo_layer).set_nodeattr("impl_style", "rtl")
         model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
         model = model.transform(HLSSynthIP())
         model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
@@ -533,7 +547,9 @@ class TestEnd2End:
     def test_build(self, topology, wbits, abits, kind):
         if kind == "alveo" and ("VITIS_PATH" not in os.environ):
             pytest.skip("VITIS_PATH not set")
-        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "ipgen_" + kind)
+        prev_chkpt_name = get_checkpoint_name(
+            topology, wbits, abits, "fifodepth_" + kind
+        )
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         cfg = get_build_env(kind, target_clk_ns)
         model = model.transform(cfg["build_fxn"])