diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh
index fcd6f9d788d0af1cad6de5259e5e181e76ac96bc..7e13e117859365531f459928b7c664edb3fbf4ce 100644
--- a/docker/finn_entrypoint.sh
+++ b/docker/finn_entrypoint.sh
@@ -16,7 +16,7 @@ gecho () {
 BREVITAS_COMMIT=989cdfdba4700fdd900ba0b25a820591d561c21a
 CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4
 HLSLIB_COMMIT=13e9b0772a27a3a1efc40c878d8e78ed09efb716
-PYVERILATOR_COMMIT=1d89cb0d4e0c97469cc6352c611f876ec13edfa6
+PYVERILATOR_COMMIT=c97a5ba41bbc7c419d6f25c74cdf3bdc3393174f
 PYNQSHELL_COMMIT=0c82a61b0ec1a07fa275a14146233824ded7a13d
 
 
diff --git a/requirements.txt b/requirements.txt
index 6b8e4d02c8ca1dcdbe607aabdccd27cec8056332..b15d86ed89f7b0e76b772ce42aba6481937310b0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,4 +9,5 @@ pyverilator
 scipy
 sphinx
 toposort
+vcdvcd
 wget
diff --git a/src/finn/core/rtlsim_exec.py b/src/finn/core/rtlsim_exec.py
index e5e6d29bd8d8ed23f6a4958856ed1ddea3617175..ad44dab578b396c80af35af2ede031baca798150 100644
--- a/src/finn/core/rtlsim_exec.py
+++ b/src/finn/core/rtlsim_exec.py
@@ -86,9 +86,7 @@ def rtlsim_exec(model, execution_context):
         sim = pyverilate_stitched_ip(model)
         model.set_metadata_prop("rtlsim_so", sim.lib._name)
     else:
-        sim = PyVerilator(rtlsim_so)
-    _reset_rtlsim(sim)
-    _toggle_clk(sim)
+        sim = PyVerilator(rtlsim_so, auto_eval=False)
     ret = _run_rtlsim(sim, packed_input, num_out_values, trace_file)
     packed_output = ret[0]
     model.set_metadata_prop("sim_cycles", str(ret[1]))
@@ -104,18 +102,22 @@ def _reset_rtlsim(sim):
     """Sets reset input in pyverilator to zero, toggles the clock and set it
     back to one"""
     sim.io.ap_rst_n_0 = 0
-    sim.io.ap_clk_0 = 1
-    sim.io.ap_clk_0 = 0
+    _toggle_clk(sim)
+    _toggle_clk(sim)
     sim.io.ap_rst_n_0 = 1
+    _toggle_clk(sim)
+    _toggle_clk(sim)
 
 
 def _toggle_clk(sim):
     """Toggles the clock input in pyverilator once."""
-    sim.io.ap_clk_0 = 1
     sim.io.ap_clk_0 = 0
+    sim.eval()
+    sim.io.ap_clk_0 = 1
+    sim.eval()
 
 
-def _run_rtlsim(sim, inp, num_out_values, trace_file=None):
+def _run_rtlsim(sim, inp, num_out_values, trace_file=None, reset=True):
     """Runs the pyverilator simulation by passing the input values to the simulation,
     toggle the clock and observing the execution time. Argument num_out_values contains
     the number of expected output values, so the simulation is closed after all
@@ -140,6 +142,8 @@ def _run_rtlsim(sim, inp, num_out_values, trace_file=None):
 
     if trace_file is not None:
         sim.start_vcd_trace(trace_file)
+    if reset:
+        _reset_rtlsim(sim)
 
     while not (output_observed):
         sim.io.in0_V_V_0_tvalid = 1 if len(inputs) > 0 else 0
@@ -148,8 +152,7 @@ def _run_rtlsim(sim, inp, num_out_values, trace_file=None):
             inputs = inputs[1:]
         if sim.io.out_r_0_tvalid == 1 and sim.io.out_r_0_tready == 1:
             outputs = outputs + [sim.io.out_r_0_tdata]
-        sim.io.ap_clk_0 = 1
-        sim.io.ap_clk_0 = 0
+        _toggle_clk(sim)
 
         observation_count = observation_count + 1
         no_change_count = no_change_count + 1
diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py
index 66190333ce8d71dafba99aaeae4fb2c973d67410..1f734b548f923341687843c538d1887fcc069bee 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfifo.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py
@@ -110,6 +110,8 @@ class StreamingFIFO(HLSCustomOp):
         ]
         # make instream width a multiple of 8 for axi interface
         in_width = self.get_instream_width_padded()
+        count_width = int(self.get_nodeattr("depth") - 1).bit_length()
+        self.code_gen_dict["$COUNT_RANGE$"] = ["[{}:0]".format(count_width - 1)]
         self.code_gen_dict["$IN_RANGE$"] = ["[{}:0]".format(in_width - 1)]
         self.code_gen_dict["$OUT_RANGE$"] = ["[{}:0]".format(in_width - 1)]
         self.code_gen_dict["$WIDTH$"] = [str(in_width)]
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index 5f526aa2aa1917144c7a048c9d9314aa9288a2d8..1a8216f64bf71b7fb9f1f8becf4732970b5bf451 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -408,6 +408,7 @@ strm_fifo_wrapper = """
 module $TOPNAME$(
 ap_clk,
 ap_rst_n,
+count,
 in0_V_V_TDATA,
 in0_V_V_TVALID,
 in0_V_V_TREADY,
@@ -418,6 +419,7 @@ out_V_V_TREADY
 
 input   ap_clk;
 input   ap_rst_n;
+output $COUNT_RANGE$ count;
 input  $IN_RANGE$ in0_V_V_TDATA;
 input   in0_V_V_TVALID;
 output   in0_V_V_TREADY;
@@ -433,6 +435,7 @@ $LAYER_NAME$
 (
  .clock(ap_clk),
  .reset(!ap_rst_n),
+ .count(count),
  .i_d(in0_V_V_TDATA),
  .i_v(in0_V_V_TVALID),
  .i_r(in0_V_V_TREADY),
diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py
index 3880bb9591e27af5fe9d063dba2485d304e4db54..d3bfb73fe239d7194fab3760555663895a209e84 100644
--- a/src/finn/util/basic.py
+++ b/src/finn/util/basic.py
@@ -56,6 +56,12 @@ def get_rtlsim_trace_depth():
     via the RTLSIM_TRACE_DEPTH environment variable. If the env.var. is
     undefined, the default value of 1 is returned. A trace depth of 1
     will only show top-level signals and yield smaller .vcd files.
+
+    The following depth values are of interest for whole-network stitched IP
+    rtlsim:
+    - level 1 shows top-level input/output streams
+    - level 2 shows per-layer input/output streams
+    - level 3 shows full per-layer I/O including FIFO count signals
     """
 
     try:
diff --git a/src/finn/util/fpgadataflow.py b/src/finn/util/fpgadataflow.py
index d1669444e55cb0fddb2690e51849c4603d47d32c..3fe747a84985b2702ffb1e5855d9071362efebda 100644
--- a/src/finn/util/fpgadataflow.py
+++ b/src/finn/util/fpgadataflow.py
@@ -104,6 +104,7 @@ def pyverilate_stitched_ip(model):
         build_dir=build_dir,
         trace_depth=get_rtlsim_trace_depth(),
         top_module_name=top_module_name,
+        auto_eval=False,
     )
     return sim
 
diff --git a/src/finn/util/vcd.py b/src/finn/util/vcd.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9e244422065314ceb790dc6719b57688ff76828
--- /dev/null
+++ b/src/finn/util/vcd.py
@@ -0,0 +1,184 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from vcdvcd import VCDVCD
+from finn.util.basic import get_num_default_workers
+import multiprocessing as mp
+
+# string patterns to search for to find particular interfaces
+# streaming interfaces
+vname = "TVALID"
+rname = "TREADY"
+# FIFO count signals
+fifo_mod_name = "StreamingFIFO"
+fifo_cname = "count"
+
+
+def list_stream_if(vcd_file):
+    "Return a list of stream interface names from given vcd trace."
+    sig_names = VCDVCD(vcd_file, print_dumps=False, only_sigs=True).get_signals()
+    sig_set = set(sig_names)  # constant-time lookup of the matching ready signal
+    stream_if_names = []
+    for cand_name in filter(lambda x: x.endswith(vname), sig_names):
+        base_name = cand_name[: -len(vname)]  # strip only the trailing suffix
+        if base_name + rname in sig_set:
+            stream_if_names.append(base_name)
+    return stream_if_names
+
+
+def list_fifo_count_signals(vcd_file):
+    "Return the names of all FIFO count signals found in the given vcd trace."
+
+    parsed = VCDVCD(vcd_file, print_dumps=False, only_sigs=True)
+    # a FIFO count signal has both the FIFO module name and the count name in it
+    def is_fifo_count(name):
+        return fifo_cname in name and fifo_mod_name in name
+
+    return list(filter(is_fifo_count, parsed.get_signals()))
+
+
+def get_fifo_count_max(vcd_file, fifo_count_signal):
+    "Return the maximum value of the given FIFO count signal in vcd trace."
+
+    d = VCDVCD(vcd_file, signals=[fifo_count_signal], store_tvs=True).get_data()
+    assert len(d) != 0, "FIFO count signal not found"
+    events = list(d.values())[0]["tv"]
+    max_count = 0
+    for (_time, val) in events:
+        current = int(val, base=2)
+        if current > max_count:
+            max_count = current
+    return max_count
+
+
+def _get_fifo_max(x):  # x is a (signal name, vcd file) tuple
+    return (x[0], get_fifo_count_max(x[1], x[0]))
+
+
+def get_all_fifo_count_max(vcd_file, fifo_count_signals=None):
+    """Return a list of (signal name, max count) tuples. If fifo_count_signals
+    is None, the maxima for all FIFO count signals found in the trace are
+    returned, otherwise it is treated as a list of signal names to process."""
+    if fifo_count_signals is None:
+        fifo_count_signals = list_fifo_count_signals(vcd_file)
+
+    with mp.Pool(get_num_default_workers()) as p:
+        fifo_count_signals = map(lambda x: (x, vcd_file), fifo_count_signals)
+        all_stats = p.map(_get_fifo_max, fifo_count_signals)
+
+    return all_stats
+
+
+def get_stream_if_stats(vcd_file, if_base_name):
+    """Return statistics for given streaming interface in vcd trace in the
+    following dict format:
+
+    <stream_state>: (<num_samples>, <fraction_of_time>),
+
+    where <stream_state> is the combination of (V)alid/(R)eady values,
+    <num_samples> is the approximate number of rising clock edges spent in <state>
+    , and <fraction_of_time> is the fraction of <num_samples> to total
+    amount of time recorded by the trace.
+
+    Example:
+    {"{'V': 0, 'R': 0}": (5, 0.0006060606060606061),
+     "{'V': 1, 'R': 0}": (0, 0.0),
+     "{'V': 0, 'R': 1}": (7605, 0.9218181818181819),
+     "{'V': 1, 'R': 1}": (640, 0.07757575757575758)}
+
+    Here we can see the stream was transmitting values 7.8% of the time,
+    and 92.2% of the time there was no incoming data (valid 0, ready 1)
+    """
+    if_valid = if_base_name + vname
+    if_ready = if_base_name + rname
+    v = VCDVCD(vcd_file, signals=[if_valid], store_tvs=True)
+    endtime = v.get_endtime()
+    v = v.get_data()
+    assert len(v) != 0, "Streaming interface not found"
+    v = list(v.values())[0]["tv"]
+    v = list(map(lambda x: ("V", x[0], x[1]), v))
+    v.append(("V", endtime, "0"))
+    r = VCDVCD(vcd_file, signals=[if_ready], store_tvs=True).get_data()
+    assert len(r) != 0, "Streaming interface not found"
+    r = list(r.values())[0]["tv"]
+    r = list(map(lambda x: ("R", x[0], x[1]), r))
+    r.append(("R", endtime, "0"))
+    events = sorted(v + r, key=lambda x: x[1])
+    ret = {
+        "{'V': 0, 'R': 0}": 0,
+        "{'V': 1, 'R': 0}": 0,
+        "{'V': 0, 'R': 1}": 0,
+        "{'V': 1, 'R': 1}": 0,
+    }
+    status = {"V": 0, "R": 0}
+    last_time = 0
+    total_rising_clock_edges = 0
+    for (sig, time, val) in events:
+        # pyverilator generates 5 time units per sample
+        time = time / 5
+        # pyverilator generates 4 samples per clock period
+        n_rising_clock_edges = int((time - last_time) / 4)
+        # note that the calculation of n_rising_clock_edges is approximate
+        # doing this exactly would require a cycle-by-cycle walkthrough of the
+        # trace, which can take very long
+        ret[str(status)] += n_rising_clock_edges
+        total_rising_clock_edges += n_rising_clock_edges
+        status[sig] = int(val)
+        last_time = time
+
+    # guard against division by zero for traces too short to contain an edge
+    denom = max(total_rising_clock_edges, 1)
+    ret = {state: (n, n / denom) for state, n in ret.items()}
+
+    return ret
+
+
+def _get_stats(x):  # x is an (interface base name, vcd file) tuple
+    return (x[0], get_stream_if_stats(x[1], x[0]))
+
+
+def get_all_stream_if_stats(vcd_file, stream_ifs=None, sort_by="{'V': 1, 'R': 0}"):
+    """Return a list of (interface name, stats) tuples, sorted in ascending order
+    by the fraction for the given sort_by key. If stream_ifs is None, stats for all
+    streaming interfaces are returned, otherwise it is treated as a list of
+    interface names to return the stats for."""
+
+    if stream_ifs is None:
+        stream_ifs = list_stream_if(vcd_file)
+
+    with mp.Pool(get_num_default_workers()) as p:
+        stream_ifs = map(lambda x: (x, vcd_file), stream_ifs)
+        all_stats = p.map(_get_stats, stream_ifs)
+
+    def sort_key(x):
+        stat = x[1]
+        (samples, percent) = stat[sort_by]
+        return percent
+
+    ret = sorted(all_stats, key=sort_key)
+    return ret
diff --git a/tests/end2end/test_end2end_tfc_w1a1.py b/tests/end2end/test_end2end_tfc_w1a1.py
index 15c1c41b006c6f87d79a0e7eb6a4458838de5fd2..13758e01e1df96a79658f5ebc7501c9fb43d0882 100644
--- a/tests/end2end/test_end2end_tfc_w1a1.py
+++ b/tests/end2end/test_end2end_tfc_w1a1.py
@@ -72,6 +72,7 @@ from finn.util.basic import pynq_part_map
 from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip
 from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+import finn.util.vcd as vcd
 
 build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
@@ -197,6 +198,8 @@ def test_end2end_tfc_w1a1_verify_dataflow_part():
     res_rtlsim_nodebynode = ret_rtlsim_nodebynode[out_name]
     # whole-network (ip-stitched) rtlsim
     model.set_metadata_prop("exec_mode", "rtlsim")
+    model.set_metadata_prop("rtlsim_trace", build_dir + "/tfc_w1a1.vcd")
+    os.environ["RTLSIM_TRACE_DEPTH"] = "3"
     model.save(build_dir + "/end2end_tfc_w1a1_ipstitch_whole_rtlsim.onnx")
     ret_rtlsim_whole = execute_onnx(model, inp_dict, True)
     res_rtlsim_whole = ret_rtlsim_whole[out_name]
@@ -204,6 +207,24 @@ def test_end2end_tfc_w1a1_verify_dataflow_part():
     assert np.isclose(res_cppsim, res_rtlsim_whole).all()
 
 
+def test_end2end_tfc_w1a1_verify_fifo_fullness():
+    vcdf = build_dir + "/tfc_w1a1.vcd"  # path set as rtlsim_trace in this file
+    if not os.path.isfile(vcdf):
+        pytest.skip("Cannot find %s, skipping" % vcdf)
+    stream_ifs = vcd.list_stream_if(vcdf)
+    fifos = vcd.list_fifo_count_signals(vcdf)
+    assert len(stream_ifs) == 37
+    assert len(fifos) == 6
+    fifo_max = vcd.get_all_fifo_count_max(vcdf)
+    assert fifo_max[0][0] == "TOP.v.finn_design_i.StreamingFIFO_0.count[3:0]"
+    assert fifo_max[0][1] == 3
+    stream_stat = vcd.get_all_stream_if_stats(vcdf)  # default sort_by key
+    assert (
+        stream_stat[0][0]
+        == "TOP.v.finn_design_i.StreamingDataWidthConverter_Batch_0_out_V_V_"
+    )
+
+
 @pytest.mark.vivado
 def test_end2end_tfc_w1a1_verify_all():
     # use the streamlined model as the "golden" model for right answers