diff --git a/finn-rtllib/memstream/hdl/Q_srl.v b/finn-rtllib/memstream/hdl/Q_srl.v
index 6c619c51ceb4a99a077fc61c52ce81763cfd27f5..b4e89628a44bb1f55c3445ee8e6866beada23585 100644
--- a/finn-rtllib/memstream/hdl/Q_srl.v
+++ b/finn-rtllib/memstream/hdl/Q_srl.v
@@ -74,31 +74,7 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count);
    parameter depth = 16;   // - greatest #items in queue  (2 <= depth <= 256)
    parameter width = 16;   // - width of data (i_d, o_d)
 
-   `define LOG2 (  (((depth))     ==0) ? 0	/* - depth==0   LOG2=0 */ \
-		 : (((depth-1)>>0)==0) ? 0	/* - depth<=1   LOG2=0 */ \
-		 : (((depth-1)>>1)==0) ? 1	/* - depth<=2   LOG2=1 */ \
-		 : (((depth-1)>>2)==0) ? 2	/* - depth<=4   LOG2=2 */ \
-		 : (((depth-1)>>3)==0) ? 3	/* - depth<=8   LOG2=3 */ \
-		 : (((depth-1)>>4)==0) ? 4	/* - depth<=16  LOG2=4 */ \
-		 : (((depth-1)>>5)==0) ? 5	/* - depth<=32  LOG2=5 */ \
-		 : (((depth-1)>>6)==0) ? 6	/* - depth<=64  LOG2=6 */ \
-		 : (((depth-1)>>7)==0) ? 7	/* - depth<=128 LOG2=7 */ \
-		 :                       8)	/* - depth<=256 LOG2=8 */
-
-// parameter addrwidth = LOG2;			// - width of queue addr
-
-   parameter addrwidth =
-		(  (((depth))     ==0) ? 0	// - depth==0   LOG2=0
-		 : (((depth-1)>>0)==0) ? 0	// - depth<=1   LOG2=0
-		 : (((depth-1)>>1)==0) ? 1	// - depth<=2   LOG2=1
-		 : (((depth-1)>>2)==0) ? 2	// - depth<=4   LOG2=2
-		 : (((depth-1)>>3)==0) ? 3	// - depth<=8   LOG2=3
-		 : (((depth-1)>>4)==0) ? 4	// - depth<=16  LOG2=4
-		 : (((depth-1)>>5)==0) ? 5	// - depth<=32  LOG2=5
-		 : (((depth-1)>>6)==0) ? 6	// - depth<=64  LOG2=6
-		 : (((depth-1)>>7)==0) ? 7	// - depth<=128 LOG2=7
-		 :                       8)	// - depth<=256 LOG2=8
-		 ;
+   parameter addrwidth = $clog2(depth);
 
    input     clock;
    input     reset;
diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
new file mode 100644
index 0000000000000000000000000000000000000000..798bbd335f0028d1103d992fd2b8b9cd30bbb6e1
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
@@ -0,0 +1,217 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import copy
+import math
+import numpy as np
+import warnings
+from finn.custom_op.registry import getCustomOp
+from finn.transformation import Transformation
+from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles
+from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
+from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.core.rtlsim_exec import (
+    _reset_rtlsim,
+    _toggle_clk,
+)
+from finn.util.fpgadataflow import (
+    pyverilate_stitched_ip,
+)
+
+
+def set_signal(sim, keyw, value):
+    """Write value to every input of sim whose name contains keyw."""
+    matching = [port[0] for port in sim.inputs if keyw in port[0]]
+    for port_name in matching:
+        sim.io[port_name] = value
+
+
+def optimize_depth(depth):
+    """Round depth up to 2, 32, a power of two (<= 1024) or a multiple of 1024."""
+    if depth <= 32:
+        return 2 if depth <= 2 else 32
+    if depth <= 1024:
+        return int(2 ** math.ceil(math.log2(depth)))
+    # round up to a whole multiple of 1024 (not merely the number of multiples)
+    return int(math.ceil(depth / 1024) * 1024)
+
+
+class SetFIFODepths(Transformation):
+    """Determines minimum depths of StreamingFIFOs through RTLSim.
+    Works on a deep copy of the model: DWCs/FIFOs are inserted, FIFO depths are
+    set very high (2**14) and the stitched IP is simulated while tracking the
+    maximum occupancy of each FIFO. The results are applied to the original
+    model as inFIFODepth/outFIFODepth node attributes."""
+
+    def __init__(self, fpgapart, clk_ns=10.0):
+        super().__init__()
+        self.fpgapart = fpgapart  # forwarded to PrepareIP and CreateStitchedIP
+        self.clk_ns = clk_ns  # forwarded to PrepareIP and CreateStitchedIP
+
+    def apply(self, model):
+        """Set in/outFIFODepth on the nodes of model; returns (model, False)."""
+        orig_model = model
+
+        # work on a copy of the model
+        model = copy.deepcopy(model)
+
+        # set in/outFIFODepth to 2**14 on every node; also change external to
+        # decoupled and warn user, so we are sure we have exactly one input/output
+        for node in model.graph.node:
+            node = getCustomOp(node)
+            node.set_nodeattr("inFIFODepth", 2 ** 14)
+            node.set_nodeattr("outFIFODepth", 2 ** 14)
+            if node.onnx_node.op_type == "StreamingFCLayer_Batch":
+                mmode = node.get_nodeattr("mem_mode")
+                if mmode == "external":
+                    node.set_nodeattr("mem_mode", "decoupled")
+                    warnings.warn(
+                        "Changed mem_mode from external to decoupled for "
+                        + node.onnx_node.name
+                    )
+
+        # insert stream infrastructure (DWC/FIFO)
+        model = model.transform(InsertDWC())
+        model = model.transform(InsertFIFO())
+        model = model.transform(GiveUniqueNodeNames())
+
+        # gather FIFO names with their producer/consumer, force depths to 2**14
+        fifos = {}
+        for node in model.graph.node:
+            if node.op_type == "StreamingFIFO":
+                consumer = model.find_consumers(node.output[0])
+                if consumer is not None:
+                    consumer = consumer[0].name  # only the first consumer is kept
+                producer = model.find_producer(node.input[0])
+                if producer is not None:
+                    producer = producer.name
+                fifos[node.name] = {
+                    "depth": 0,  # max occupancy seen so far, updated in sim loop
+                    "consumer": consumer,
+                    "producer": producer,
+                }
+                node = getCustomOp(node)
+                # check depths
+                # if model came in with FIFOs, the depths will not have been updated
+                if node.get_nodeattr("depth") != 2 ** 14:
+                    node.set_nodeattr("depth", 2 ** 14)
+
+        # annotate cycles, generate and stitch the IP, then select rtlsim exec mode
+        model = model.transform(AnnotateCycles())
+        perf = model.analysis(dataflow_performance)
+        latency = perf["critical_path_cycles"]
+        max_cycles = perf["max_cycles"]
+        model = model.transform(PrepareIP(self.fpgapart, self.clk_ns))
+        model = model.transform(HLSSynthIP())
+        model = model.transform(CreateStitchedIP(self.fpgapart, self.clk_ns))
+        model.set_metadata_prop("exec_mode", "rtlsim")
+
+        # calculate input frequency (number of cycles for each input word)
+        first_node = getCustomOp(model.graph.node[0])
+        ncycles_per_input = math.ceil(
+            perf["max_cycles"]
+            / (
+                np.prod(first_node.get_folded_input_shape())
+                / first_node.get_folded_input_shape()[-1]
+            )
+        )
+
+        # set sufficiently large threshold for 1 image to fully execute and exit
+        ncycles = int(latency + max_cycles)
+
+        # prepare pyverilator model
+        sim = pyverilate_stitched_ip(model)
+
+        _reset_rtlsim(sim)
+        _toggle_clk(sim)
+
+        # set all input valids to 0 and output readies to 1
+        # set input data to some constant
+        set_signal(sim, "tvalid", 0)
+        set_signal(sim, "tready", 1)
+        set_signal(sim, "tdata", 0)
+
+        while ncycles > 0:
+            _toggle_clk(sim)
+            # assert tvalid once every ncycles_per_input cycles, deassert otherwise
+            if ncycles % ncycles_per_input == 0:
+                set_signal(sim, "tvalid", 1)
+            else:
+                set_signal(sim, "tvalid", 0)
+
+            # check/update all fifo counts
+            for key in fifos:
+                current_state = sim.internals["finn_design_i"][key]["inst"][
+                    key + "_" + key
+                ]["state"]
+                current_addr = sim.internals["finn_design_i"][key]["inst"][
+                    key + "_" + key
+                ]["addr"]
+                if current_state == 2:  # NOTE(review): presumably Q_srl "more" state
+                    current_count = current_addr + 2
+                else:
+                    current_count = current_state  # state value doubles as the count
+                if current_count > fifos[key]["depth"]:
+                    fifos[key]["depth"] = current_count  # keep the running maximum
+            ncycles = ncycles - 1
+
+        # for each node in the original graph, determine in/outFIFODepth
+        ret = {}
+        for key in fifos:
+            predecessor_node = fifos[key]["producer"]
+            if predecessor_node is not None:
+                if predecessor_node not in ret:
+                    ret[predecessor_node] = {"inFIFODepth": 0, "outFIFODepth": 0}
+                out_depth = ret[predecessor_node]["outFIFODepth"]
+                ret[predecessor_node]["outFIFODepth"] = max(
+                    out_depth, fifos[key]["depth"]
+                )
+
+            succcessor_node = fifos[key]["consumer"]
+            if succcessor_node is not None:
+                if succcessor_node not in ret:
+                    ret[succcessor_node] = {"inFIFODepth": 0, "outFIFODepth": 0}
+                in_depth = ret[succcessor_node]["inFIFODepth"]
+                ret[succcessor_node]["inFIFODepth"] = max(in_depth, fifos[key]["depth"])
+
+        # tweak and apply depths to original model
+        for node in orig_model.graph.node:
+            if node.name in ret:
+                depths = ret[node.name]
+                node = getCustomOp(node)
+                node.set_nodeattr("inFIFODepth", optimize_depth(depths["inFIFODepth"]))
+                node.set_nodeattr(
+                    "outFIFODepth", optimize_depth(depths["outFIFODepth"])
+                )
+
+        return (orig_model, False)
diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py
index 29ecb2c7e49444cecade6d3321aaba3b9add4b9c..890d0db30afaf795f6b4ae439b3989d1f44beb67 100644
--- a/tests/end2end/test_end2end_bnn_pynq.py
+++ b/tests/end2end/test_end2end_bnn_pynq.py
@@ -80,6 +80,7 @@ from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
 from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
 from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles
+from finn.transformation.fpgadataflow.set_fifo_depths import SetFIFODepths
 from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
 from finn.core.modelwrapper import ModelWrapper
 from scipy.stats import linregress
@@ -305,10 +306,22 @@ class TestEnd2End:
     @pytest.mark.slow
     @pytest.mark.vivado
     @pytest.mark.parametrize("kind", ["zynq", "alveo"])
-    def test_ipstitch_rtlsim(self, topology, wbits, abits, kind):
+    def test_set_fifo_depths(self, topology, wbits, abits, kind):
         prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "ipgen_" + kind)
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         test_fpga_part = get_build_env(kind, target_clk_ns)["part"]
+        model = model.transform(SetFIFODepths(test_fpga_part, target_clk_ns))
+        model.save(get_checkpoint_name(topology, wbits, abits, "fifodepth_" + kind))
+
+    @pytest.mark.slow
+    @pytest.mark.vivado
+    @pytest.mark.parametrize("kind", ["zynq", "alveo"])
+    def test_ipstitch_rtlsim(self, topology, wbits, abits, kind):
+        prev_chkpt_name = get_checkpoint_name(
+            topology, wbits, abits, "fifodepth_" + kind
+        )
+        model = load_test_checkpoint_or_skip(prev_chkpt_name)
+        test_fpga_part = get_build_env(kind, target_clk_ns)["part"]
         model = model.transform(InsertDWC())
         model = model.transform(InsertFIFO())
         model = model.transform(GiveUniqueNodeNames())
@@ -326,7 +339,9 @@ class TestEnd2End:
                 "rtlsim_trace", "%s_w%da%d.vcd" % (topology, wbits, abits)
             )
             os.environ["RTLSIM_TRACE_DEPTH"] = "3"
-        rtlsim_chkpt = get_checkpoint_name(topology, wbits, abits, "ipstitch_rtlsim_" + kind)
+        rtlsim_chkpt = get_checkpoint_name(
+            topology, wbits, abits, "ipstitch_rtlsim_" + kind
+        )
         model.save(rtlsim_chkpt)
         parent_chkpt = get_checkpoint_name(topology, wbits, abits, "dataflow_parent")
         (input_tensor_npy, output_tensor_npy) = get_golden_io_pair(
@@ -342,7 +357,9 @@ class TestEnd2End:
     @pytest.mark.vivado
     @pytest.mark.parametrize("kind", ["zynq", "alveo"])
     def test_throughput_rtlsim(self, topology, wbits, abits, kind):
-        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "ipstitch_rtlsim_" + kind)
+        prev_chkpt_name = get_checkpoint_name(
+            topology, wbits, abits, "ipstitch_rtlsim_" + kind
+        )
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         n_nodes = len(model.graph.node)
         perf_est = model.analysis(dataflow_performance)
@@ -361,7 +378,9 @@ class TestEnd2End:
     def test_build(self, topology, wbits, abits, kind):
         if kind == "alveo" and ("VITIS_PATH" not in os.environ):
             pytest.skip("VITIS_PATH not set")
-        prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "ipgen_" + kind)
+        prev_chkpt_name = get_checkpoint_name(
+            topology, wbits, abits, "fifodepth_" + kind
+        )
         model = load_test_checkpoint_or_skip(prev_chkpt_name)
         cfg = get_build_env(kind, target_clk_ns)
         model = model.transform(cfg["build_fxn"])