diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
index 4374111f22a12e586c5c5233a7eee096b848b86e..00c25a4a3150a8368405b449fdce04456ccbe88d 100644
--- a/.github/workflows/docker-image.yml
+++ b/.github/workflows/docker-image.yml
@@ -1,17 +1,18 @@
 name: DockerImage
 
 on:
+  pull_request:
+    branches: [ dev ]
   push:
-    branches:
-      - 'dev'
+    branches: [ dev ]
 
 jobs:
   docker:
-    runs-on: ubuntu-18.04
+    runs-on: ubuntu-20.04
     steps:
       -
         name: checkout
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
       -
         name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v1
diff --git a/.github/workflows/quicktest-dev-pr.yml b/.github/workflows/quicktest-dev-pr.yml
index ec92c84665d868b8a4376c82ecdf72395f1367a8..e2ba47ec296f73cfd7c0eede98bac3acd066075a 100644
--- a/.github/workflows/quicktest-dev-pr.yml
+++ b/.github/workflows/quicktest-dev-pr.yml
@@ -11,11 +11,11 @@ jobs:
 
   test:
     name: Run quicktest on PR branch
-    runs-on: ubuntu-18.04
+    runs-on: ubuntu-20.04
 
     steps:
       - name: checkout
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
 
       - name: DockerRunQuicktest
         run: |
diff --git a/docker/Dockerfile.finn b/docker/Dockerfile.finn
index b3c669ec1097745bd30f650ca0b9dacda647c61d..dbafba247679895bcbaf385f0d33946c3f810945 100644
--- a/docker/Dockerfile.finn
+++ b/docker/Dockerfile.finn
@@ -84,7 +84,7 @@ RUN rm requirements.txt
 # extra Python package dependencies (for testing and interaction)
 RUN pip install pygments==2.4.1
 RUN pip install ipykernel==5.5.5
-RUN pip install jupyter==1.0.0
+RUN pip install jupyter==1.0.0 --ignore-installed
 RUN pip install markupsafe==2.0.1
 RUN pip install matplotlib==3.3.1 --ignore-installed
 RUN pip install pytest-dependency==0.5.1
diff --git a/docs/finn/source_code/finn.analysis.fpgadataflow.rst b/docs/finn/source_code/finn.analysis.fpgadataflow.rst
index b52e994ee6033d4c3c1aae6400e20e103455d7b6..57472cb670b6fa6cb95e6c137458d3a522f82f5a 100644
--- a/docs/finn/source_code/finn.analysis.fpgadataflow.rst
+++ b/docs/finn/source_code/finn.analysis.fpgadataflow.rst
@@ -30,6 +30,7 @@ finn.analysis.fpgadataflow.floorplan\_params
    :undoc-members:
    :show-inheritance:
 
+
 finn.analysis.fpgadataflow.hls\_synth\_res\_estimation
 -------------------------------------------------------------
 
@@ -38,14 +39,15 @@ finn.analysis.fpgadataflow.hls\_synth\_res\_estimation
    :undoc-members:
    :show-inheritance:
 
- finn.analysis.fpgadataflow.op\_and\_param\_counts
- --------------------------------------------------
+finn.analysis.fpgadataflow.op\_and\_param\_counts
+--------------------------------------------------
 
- .. automodule:: finn.analysis.fpgadataflow.op_and_param_counts
+.. automodule:: finn.analysis.fpgadataflow.op_and_param_counts
     :members:
     :undoc-members:
     :show-inheritance:
 
+
 finn.analysis.fpgadataflow.post\_synth\_res
 --------------------------------------------------
 
@@ -54,6 +56,7 @@ finn.analysis.fpgadataflow.post\_synth\_res
    :undoc-members:
    :show-inheritance:
 
+
 finn.analysis.fpgadataflow.res\_estimation
 -------------------------------------------------
 
diff --git a/docs/finn/source_code/finn.core.rst b/docs/finn/source_code/finn.core.rst
index 4e3de458e153871d1d5969442af5940dc1673ecd..afa1ecffa08213db6a282076c6fdf59694f9e13e 100644
--- a/docs/finn/source_code/finn.core.rst
+++ b/docs/finn/source_code/finn.core.rst
@@ -37,6 +37,15 @@ qonnx.core.modelwrapper
    :undoc-members:
    :show-inheritance:
 
+qonnx.core.onnx\_exec
+---------------------------
+
+.. automodule:: qonnx.core.onnx_exec
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
 finn.core.onnx\_exec
 ---------------------------
 
diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst
index cc56ea603e589d7000fe5b2b2943e67cdb90c884..fdcf44c6d99561658b727dc64c0a1b98b247c7df 100644
--- a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst
+++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst
@@ -8,7 +8,7 @@ HLS Custom Op Nodes
 Base Class
 ----------
 
-.. automodule:: finn.custom_op.fpgadataflow
+.. automodule:: finn.custom_op.fpgadataflow.hlscustomop
    :members:
    :undoc-members:
    :show-inheritance:
@@ -29,9 +29,25 @@ finn.custom\_op.fpgadataflow.channelwise\_op\_batch
    :undoc-members:
    :show-inheritance:
 
+finn.custom\_op.fpgadataflow.checksum
+--------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.checksum
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+finn.custom\_op.fpgadataflow.concat
+-------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.concat
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 
 finn.custom\_op.fpgadataflow.convolutioninputgenerator
--------------------------------------------------------------
+--------------------------------------------------------
 
 .. automodule:: finn.custom_op.fpgadataflow.convolutioninputgenerator
    :members:
@@ -46,6 +62,15 @@ finn.custom\_op.fpgadataflow.convolutioninputgenerator1d
    :undoc-members:
    :show-inheritance:
 
+
+finn.custom\_op.fpgadataflow.convolutioninputgenerator\_rtl
+------------------------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.convolutioninputgenerator_rtl
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 finn.custom\_op.fpgadataflow.downsampler
 -----------------------------------------
 
@@ -62,6 +87,16 @@ finn.custom\_op.fpgadataflow.duplicatestreams\_batch
    :undoc-members:
    :show-inheritance:
 
+
+finn.custom\_op.fpgadataflow.eltwise
+-------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.eltwise
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
 finn.custom\_op.fpgadataflow.fmpadding\_batch
 -----------------------------------------------
 
@@ -79,7 +114,7 @@ finn.custom\_op.fpgadataflow.globalaccpool\_batch
    :show-inheritance:
 
 finn.custom\_op.fpgadataflow.iodma
------------------------------------------------
+------------------------------------
 
 .. automodule:: finn.custom_op.fpgadataflow.iodma
    :members:
@@ -102,6 +137,15 @@ finn.custom\_op.fpgadataflow.lookup
    :undoc-members:
    :show-inheritance:
 
+finn.custom\_op.fpgadataflow.matrixvectoractivation
+-----------------------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.matrixvectoractivation
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
 finn.custom\_op.fpgadataflow.pool\_batch
 -----------------------------------------------
 
@@ -127,14 +171,6 @@ finn.custom\_op.fpgadataflow.streamingdatawidthconverter\_batch
    :undoc-members:
    :show-inheritance:
 
-finn.custom\_op.fpgadataflow.matrixvectoractivation
------------------------------------------------------------
-
-.. automodule:: finn.custom_op.fpgadataflow.matrixvectoractivation
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
 finn.custom\_op.fpgadataflow.streamingfifo
 -------------------------------------------------
 
diff --git a/docs/finn/source_code/finn.custom_op.rst b/docs/finn/source_code/finn.custom_op.rst
index 20d90a7bb596d6ce5638d9b2d9bae8a5c7e5c723..cdbe957c713ef6916e4ed7baabe09135f71fdeef 100644
--- a/docs/finn/source_code/finn.custom_op.rst
+++ b/docs/finn/source_code/finn.custom_op.rst
@@ -9,6 +9,7 @@ Submodules
    :maxdepth: 2
 
    finn.custom_op.fpgadataflow
+   qonnx.custom_op.channels_last
    qonnx.custom_op.general
 
 Custom Op Nodes
diff --git a/docs/finn/source_code/finn.transformation.fpgadataflow.rst b/docs/finn/source_code/finn.transformation.fpgadataflow.rst
index b1e7075bdcfb675a894f3e66b61d59117e4f078d..9f8ec079309f16daa022e14317ebddfd7758d639 100644
--- a/docs/finn/source_code/finn.transformation.fpgadataflow.rst
+++ b/docs/finn/source_code/finn.transformation.fpgadataflow.rst
@@ -62,6 +62,14 @@ finn.transformation.fpgadataflow.create\_stitched\_ip
    :undoc-members:
    :show-inheritance:
 
+finn.transformation.fpgadataflow.derive\_characteristic
+------------------------------------------------------------
+
+.. automodule:: finn.transformation.fpgadataflow.derive_characteristic
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 finn.transformation.fpgadataflow.externalize\_params
 ------------------------------------------------------------
 
@@ -103,6 +111,17 @@ finn.transformation.fpgadataflow.insert\_fifo
    :undoc-members:
    :show-inheritance:
 
+
+finn.transformation.fpgadataflow.insert\_hook
+----------------------------------------------------
+
+.. automodule:: finn.transformation.fpgadataflow.insert_hook
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
+
 finn.transformation.fpgadataflow.insert\_iodma
 ----------------------------------------------------
 
diff --git a/docs/finn/source_code/finn.transformation.rst b/docs/finn/source_code/finn.transformation.rst
index 6a28eeedb2aa547ba80677864ae9fb8c6aa64097..f42b595a50ec90ef055e2818d66f4b2410c25594 100644
--- a/docs/finn/source_code/finn.transformation.rst
+++ b/docs/finn/source_code/finn.transformation.rst
@@ -20,7 +20,7 @@ Transformation Passes
 Base Class
 ----------
 
-.. automodule:: finn.transformation
+.. automodule:: qonnx.transformation.base
    :members:
    :undoc-members:
    :show-inheritance:
@@ -42,7 +42,7 @@ qonnx.transformation.bipolar\_to\_xnor
    :show-inheritance:
 
 qonnx.transformation.change\_3d\_tensors\_to\_4d
-------------------------------------------------
+-------------------------------------------------
 
 .. automodule:: qonnx.transformation.change_3d_tensors_to_4d
   :members:
@@ -57,8 +57,18 @@ qonnx.transformation.change\_datalayout
   :undoc-members:
   :show-inheritance:
 
+
+qonnx.transformation.channels\_last
+--------------------------------------------
+
+.. automodule:: qonnx.transformation.channels_last
+  :members:
+  :undoc-members:
+  :show-inheritance:
+
+
 qonnx.transformation.create\_generic\_partitions
-------------------------------------------------
+-------------------------------------------------
 
 .. automodule:: qonnx.transformation.create_generic_partitions
   :members:
@@ -171,13 +181,22 @@ qonnx.transformation.merge\_onnx\_models
   :show-inheritance:
 
 
-finn.transformation.move\_reshape
+qonnx.transformation.quant\_constant\_folding
+----------------------------------------------
+
+.. automodule:: qonnx.transformation.quant_constant_folding
+  :members:
+  :undoc-members:
+  :show-inheritance:
+
+
+qonnx.transformation.rebalance\_conv
 ----------------------------------------
 
-.. automodule:: finn.transformation.move_reshape
-   :members:
-   :undoc-members:
-   :show-inheritance:
+.. automodule:: qonnx.transformation.rebalance_conv
+  :members:
+  :undoc-members:
+  :show-inheritance:
 
 qonnx.transformation.remove
 -------------------------------------
@@ -186,3 +205,12 @@ qonnx.transformation.remove
   :members:
   :undoc-members:
   :show-inheritance:
+
+
+finn.transformation.move\_reshape
+----------------------------------------
+
+.. automodule:: finn.transformation.move_reshape
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/finn/source_code/finn.util.rst b/docs/finn/source_code/finn.util.rst
index 8dffa016327c3bbe50f21278c859c83556b2b213..7ba3b252abfa0086a8c0281eb9a792fb239d6ec3 100644
--- a/docs/finn/source_code/finn.util.rst
+++ b/docs/finn/source_code/finn.util.rst
@@ -14,6 +14,15 @@ qonnx.util.basic
    :show-inheritance:
 
 
+qonnx.util.cleanup
+----------------------
+
+.. automodule:: qonnx.util.cleanup
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
 qonnx.util.config
 --------------------
 
@@ -22,6 +31,40 @@ qonnx.util.config
   :undoc-members:
   :show-inheritance:
 
+qonnx.util.exec\_qonnx
+----------------------
+
+.. automodule:: qonnx.util.exec_qonnx
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+qonnx.util.inference\_cost
+--------------------------
+
+.. automodule:: qonnx.util.inference_cost
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+qonnx.util.onnx
+-------------------
+
+.. automodule:: qonnx.util.onnx
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
+qonnx.util.to\_channels\_last
+------------------------------
+
+.. automodule:: qonnx.util.to_channels_last
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
 finn.util.basic
 ----------------------
 
@@ -64,6 +107,15 @@ finn.util.gdrive
   :undoc-members:
   :show-inheritance:
 
+finn.util.hls
+---------------
+
+.. automodule:: finn.util.hls
+  :members:
+  :undoc-members:
+  :show-inheritance:
+
+
 finn.util.imagenet
 -----------------------------
 
@@ -72,14 +124,6 @@ finn.util.imagenet
   :undoc-members:
   :show-inheritance:
 
-qonnx.util.onnx
----------------------
-
-.. automodule:: qonnx.util.onnx
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
 finn.util.platforms
 --------------------
 
diff --git a/docs/finn/source_code/modules.rst b/docs/finn/source_code/modules.rst
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/docs/finn/source_code/qonnx.custom_op.channels_last.rst b/docs/finn/source_code/qonnx.custom_op.channels_last.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3ad10d94a6b34a99e2213994a75b0f063fd3d36f
--- /dev/null
+++ b/docs/finn/source_code/qonnx.custom_op.channels_last.rst
@@ -0,0 +1,41 @@
+**************************
+Custom Op - Channels Last
+**************************
+
+Channels Last Custom Ops
+=========================
+
+qonnx.custom\_op.channels\_last.base\_wrapped\_op
+--------------------------------------------------
+
+.. automodule:: qonnx.custom_op.channels_last.base_wrapped_op
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
+qonnx.custom\_op.channels\_last.batch\_normalization
+------------------------------------------------------
+
+.. automodule:: qonnx.custom_op.channels_last.batch_normalization
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
+qonnx.custom\_op.channels\_last.conv
+--------------------------------------
+
+.. automodule:: qonnx.custom_op.channels_last.conv
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
+qonnx.custom\_op.channels\_last.max\_pool
+------------------------------------------
+
+.. automodule:: qonnx.custom_op.channels_last.max_pool
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/fetch-repos.sh b/fetch-repos.sh
index b0f6400ed142b203b1c9f6d7ea4ac6ababcf34d1..5e668e04499fcf825382dc2785a92dc01c0e7d88 100755
--- a/fetch-repos.sh
+++ b/fetch-repos.sh
@@ -27,7 +27,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-QONNX_COMMIT="f702b17cdb9d5e57f85f43a5d33890647e063de6"
+QONNX_COMMIT="7d50273a4dcccb445fb06f57f6bedc17b3707b35"
 FINN_EXP_COMMIT="9cbd2787b5160e2b44e0e8164a0df1457dbd5366"
 BREVITAS_COMMIT="a5b71d6de1389d3e7db898fef72e014842670f03"
 PYVERILATOR_COMMIT="766e457465f5c0dd315490d7b9cc5d74f9a76f4f"
diff --git a/requirements.txt b/requirements.txt
index 9038a5e8170301421529e0b570482316e4fff20a..348b1afab9deca1547d40cb8d8c54a396befa65d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,6 @@ bitstring==3.1.7
 clize==4.1.1
 dataclasses-json==0.5.7
 docrep==0.2.7
-future==0.18.2
 gspread==3.6.0
 numpy==1.22.0
 onnx==1.11.0
@@ -10,6 +9,7 @@ onnxoptimizer
 onnxruntime==1.11.1
 pre-commit==2.9.2
 protobuf==3.20.2
+psutil==5.9.4
 pyscaffold==3.2.1
 scipy==1.5.2
 setupext-janitor>=1.1.2
diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py
index d3c4156d9b4ccf601d3eea348f6cb61c0d9a6e87..80934d812fd7a165fdaba9c3fb17dc37e5a82d49 100644
--- a/src/finn/builder/build_dataflow_config.py
+++ b/src/finn/builder/build_dataflow_config.py
@@ -259,6 +259,10 @@ class DataflowBuildConfig:
         AutoFIFOSizingMethod
     ] = AutoFIFOSizingMethod.LARGEFIFO_RTLSIM
 
+    #: Avoid using C++ rtlsim for auto FIFO sizing and rtlsim throughput test
+    #: if set to True, always using Python instead
+    force_python_rtlsim: Optional[bool] = False
+
     #: Memory resource type for large FIFOs
     #: Only relevant when `auto_fifo_depths = True`
     large_fifo_mem_style: Optional[LargeFIFOMemStyle] = LargeFIFOMemStyle.AUTO
diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index 5da608c27def8136f9ad11f62b4707452eac3120..956b4fd3be863d74e5c33de2faf69b56ebef9406 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -30,6 +30,7 @@ import json
 import numpy as np
 import os
 import shutil
+import warnings
 from copy import deepcopy
 from distutils.dir_util import copy_tree
 from qonnx.core.modelwrapper import ModelWrapper
@@ -113,6 +114,7 @@ from finn.util.basic import (
     get_rtlsim_trace_depth,
     pyverilate_get_liveness_threshold_cycles,
 )
+from finn.util.pyverilator import verilator_fifosim
 from finn.util.test import execute_parent
 
 
@@ -531,11 +533,20 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
             model = model.transform(GiveUniqueNodeNames())
             model = model.transform(GiveReadableTensorNames())
         elif cfg.auto_fifo_strategy == "largefifo_rtlsim":
+            # multi-in/out streams currently not supported in our C++ verilator driver
+            model_multi_io = len(model.graph.input) > 1 or len(model.graph.output) > 1
+            force_python_sim = model_multi_io or cfg.force_python_rtlsim
+            if model_multi_io:
+                warnings.warn(
+                    "Multi-in/out streams currently not supported "
+                    + "in FINN C++ verilator driver, falling back to Python"
+                )
             model = model.transform(
                 InsertAndSetFIFODepths(
                     cfg._resolve_fpga_part(),
                     cfg._resolve_hls_clk_period(),
                     vivado_ram_style=cfg.large_fifo_mem_style,
+                    force_python_sim=force_python_sim,
                 )
             )
         else:
@@ -632,20 +643,48 @@ def step_measure_rtlsim_performance(model: ModelWrapper, cfg: DataflowBuildConfi
         # prepare ip-stitched rtlsim
         rtlsim_model = deepcopy(model)
         rtlsim_model = prepare_for_stitched_ip_rtlsim(rtlsim_model, cfg)
-        # run with single input to get latency
-        orig_rtlsim_trace_depth = get_rtlsim_trace_depth()
+        # multi-in/out streams currently not supported in our C++ verilator driver
+        model_multi_io = (
+            len(rtlsim_model.graph.input) > 1 or len(rtlsim_model.graph.output) > 1
+        )
+        force_python_rtlsim = cfg.force_python_rtlsim or model_multi_io
+        if model_multi_io:
+            warnings.warn(
+                "Multi-in/out streams currently not supported "
+                + "in FINN C++ verilator driver, falling back to Python"
+            )
         rtlsim_bs = int(cfg.rtlsim_batch_size)
-        assert rtlsim_bs > 0, "rtlsim batch size must be >0"
-        if cfg.verify_save_rtlsim_waveforms:
-            # set depth to 3 for layer-by-layer visibility
-            os.environ["RTLSIM_TRACE_DEPTH"] = "3"
+        if force_python_rtlsim:
+            # run with single input to get latency
+            orig_rtlsim_trace_depth = get_rtlsim_trace_depth()
+            assert rtlsim_bs > 0, "rtlsim batch size must be >0"
+            if cfg.verify_save_rtlsim_waveforms:
+                # set depth to 3 for layer-by-layer visibility
+                os.environ["RTLSIM_TRACE_DEPTH"] = "3"
+                rtlsim_model.set_metadata_prop(
+                    "rtlsim_trace",
+                    "%s/rtlsim_perf_batch_%d.vcd" % (report_dir, rtlsim_bs),
+                )
             rtlsim_model.set_metadata_prop(
-                "rtlsim_trace", "%s/rtlsim_perf_batch_%d.vcd" % (report_dir, rtlsim_bs)
+                "extra_verilator_args", str(["-CFLAGS", "-O3"])
             )
-        rtlsim_model.set_metadata_prop("extra_verilator_args", str(["-CFLAGS", "-O3"]))
-        rtlsim_perf_dict = throughput_test_rtlsim(rtlsim_model, rtlsim_bs)
-        rtlsim_latency = rtlsim_perf_dict["cycles"]
-        rtlsim_perf_dict["latency_cycles"] = rtlsim_latency
+            rtlsim_perf_dict = throughput_test_rtlsim(rtlsim_model, rtlsim_bs)
+            rtlsim_latency = rtlsim_perf_dict["cycles"]
+            rtlsim_perf_dict["latency_cycles"] = rtlsim_latency
+        else:
+            rtlsim_perf_dict = verilator_fifosim(model, rtlsim_bs)
+            # keep keys consistent between the Python and C++-styles
+            cycles = rtlsim_perf_dict["cycles"]
+            clk_ns = float(model.get_metadata_prop("clk_ns"))
+            fclk_mhz = 1 / (clk_ns * 0.001)
+            runtime_s = (cycles * clk_ns) * (10**-9)
+            rtlsim_perf_dict["runtime[ms]"] = runtime_s * 1000
+            rtlsim_perf_dict["throughput[images/s]"] = rtlsim_bs / runtime_s
+            rtlsim_perf_dict["fclk[mhz]"] = fclk_mhz
+            for (key, val) in rtlsim_perf_dict.items():
+                if "max_count" in key:
+                    del rtlsim_perf_dict[key]
+
         with open(report_dir + "/rtlsim_performance.json", "w") as f:
             json.dump(rtlsim_perf_dict, f, indent=2)
         if cfg.verify_save_rtlsim_waveforms:
diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py
index f307be95c30d822dfc517e4c331bd8d82d727997..d1326607aa0dc5c34eef105b2ceb8ed86c1a0458 100644
--- a/src/finn/custom_op/fpgadataflow/hlscustomop.py
+++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py
@@ -43,6 +43,7 @@ from finn.util.basic import (
     pyverilate_get_liveness_threshold_cycles,
 )
 from finn.util.hls import CallHLS
+from finn.util.pyverilator import make_single_source_file
 
 from . import templates
 
@@ -174,7 +175,7 @@ class HLSCustomOp(CustomOp):
         # default impl only returns the HLS verilog codegen dir
         return [verilog_path]
 
-    def get_all_verilog_filenames(self):
+    def get_all_verilog_filenames(self, abspath=False):
         "Return list of all Verilog files used for this node."
 
         verilog_files = []
@@ -182,7 +183,10 @@ class HLSCustomOp(CustomOp):
         for verilog_path in verilog_paths:
             for f in os.listdir(verilog_path):
                 if f.endswith(".v"):
-                    verilog_files += [f]
+                    if abspath:
+                        verilog_files += [verilog_path + "/" + f]
+                    else:
+                        verilog_files += [f]
         return verilog_files
 
     def prepare_rtlsim(self):
@@ -192,13 +196,18 @@ class HLSCustomOp(CustomOp):
 
         if PyVerilator is None:
             raise ImportError("Installation of PyVerilator is required.")
-        verilog_paths = self.get_all_verilog_paths()
-        verilog_files = self.get_all_verilog_filenames()
+
+        verilog_files = self.get_all_verilog_filenames(abspath=True)
+        single_src_dir = make_build_dir("rtlsim_" + self.onnx_node.name + "_")
+        tmp_build_dir = make_build_dir("pyverilator_" + self.onnx_node.name + "_")
+        target_file = single_src_dir + "/" + self.get_verilog_top_module_name() + ".v"
+        make_single_source_file(verilog_files, target_file)
+
         # build the Verilator emu library
         sim = PyVerilator.build(
-            verilog_files,
-            build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"),
-            verilog_path=verilog_paths,
+            self.get_verilog_top_module_name() + ".v",
+            build_dir=tmp_build_dir,
+            verilog_path=[single_src_dir],
             trace_depth=get_rtlsim_trace_depth(),
             top_module_name=self.get_verilog_top_module_name(),
         )
diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
index df9d1f1e70674f7bc91460e154f4e24af08df79c..72128fda4cfe23db4858fe3ffe80a755733954cc 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
@@ -576,6 +576,10 @@ class MatrixVectorActivation(HLSCustomOp):
 
     def minimize_accumulator_width(self, model):
         weights = model.get_initializer(self.onnx_node.input[1])
+        # since in the calculation the values of the weight matrix are used,
+        # for the bipolar case they need to be converted to bipolar
+        if self.get_nodeattr("binaryXnorMode"):
+            weights = 2 * weights - 1
         if len(self.onnx_node.input) > 2:
             thresholds = model.get_initializer(self.onnx_node.input[2])
         else:
@@ -702,10 +706,12 @@ class MatrixVectorActivation(HLSCustomOp):
         of weights.
 
         Arguments:
+
         * weights : numpy array with weights to be put into the file
         * weight_file_mode : one of {hls_header, decoupled_verilog_dat,
           decoupled_runtime}
         * weight_file_name : filename for the weight file to be generated
+
         """
         # convert weights into hlslib-compatible format
         weight_tensor = self.get_hls_compatible_weight_tensor(weights)
diff --git a/src/finn/custom_op/fpgadataflow/pool_batch.py b/src/finn/custom_op/fpgadataflow/pool_batch.py
index 91cd537baeff0c7666bbf3596b46a7412ec2fe4e..813f13e504eae181f4398eccbe40ad66b6e3bf16 100644
--- a/src/finn/custom_op/fpgadataflow/pool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/pool_batch.py
@@ -42,12 +42,13 @@ class Pool_Batch(HLSCustomOp):
     Output shape (BatchSize,OutImgDim,OutImgDim,Channels)
 
     Notes:
-    # The input shape was chosen to be compatible with im2col (only true when there
-    is not folding).
 
-    # The actual data layout produced by the hlslib kernels is different
-    for depthwise ops.
-     * depthwise SWG: (1, OFMDim, OFMDim, IFMChannels/PE, K, K, PE)
+    * The input shape was chosen to be compatible with im2col (only true when there
+      is not folding).
+    * The actual data layout produced by the hlslib kernels is different
+      for depthwise ops.
+
+        * depthwise SWG: (1, OFMDim, OFMDim, IFMChannels/PE, K, K, PE)
 
     Channels can be folded using PE (SIMD from the input perspective)
     """
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
index ec5742376798fa7a31ab1e3e6a13ed1a4a0607ac..03cfbe0e09be092cf9446f29110b01ef0cc1e367 100644
--- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
@@ -354,10 +354,12 @@ class Thresholding_Batch(HLSCustomOp):
         run-time reconfig of weights.
 
         Arguments:
+
         * weights : numpy array with weights to be put into the file
         * weight_file_mode : one of {hls_header, decoupled_verilog_dat,
           decoupled_runtime}
         * weight_file_name : filename for the weight file to be generated
+
         """
         threshold_tensor = self.get_hls_compatible_threshold_tensor(weights)
         tdt = self.get_weight_datatype()
diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
index a411d245a9a397e6d827abfa8ee4a784f207ecd5..d5e29ca22acf89440c3c3a66101bec89d4a66d46 100644
--- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
+++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
@@ -410,10 +410,12 @@ class VectorVectorActivation(HLSCustomOp):
         of weights.
 
         Arguments:
+
         * weights : numpy array with weights to be put into the file
         * weight_file_mode : one of {hls_header, decoupled_verilog_dat,
           decoupled_runtime}
         * weight_file_name : filename for the weight file to be generated
+
         """
         # convert weights into hlslib-compatible format
         weight_tensor = self.get_hls_compatible_weight_tensor(weights)
diff --git a/src/finn/qnn-data/cpp/verilator_fifosim.cpp b/src/finn/qnn-data/cpp/verilator_fifosim.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d0aca9efe77806d31192f35a1d751b32116218f8
--- /dev/null
+++ b/src/finn/qnn-data/cpp/verilator_fifosim.cpp
@@ -0,0 +1,197 @@
+/* Copyright (C) 2022, Advanced Micro Devices, Inc.
+All rights reserved.
+#
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+#
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+#
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+#
+* Neither the name of FINN nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+#
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+#include <iostream>
+#include <fstream>
+#include <cstddef>
+#include <chrono>
+#include "verilated.h"
+#include "verilated_vcd_c.h"
+#include "Vfinn_design_wrapper.h"
+
+#ifdef DEBUG
+#define TRACE(x) x
+#else
+#define TRACE(x) ;
+#endif
+
+using namespace std;
+
+Vfinn_design_wrapper * top;
+
+// code taken from pyverilator_wrapper.cpp generated by PyVerilator
+
+// this is required by verilator for verilog designs using $time
+// main_time is incremented in eval
+double main_time = 0;
+
+double sc_time_stamp() {
+return main_time;
+}
+// function definitions
+// helper functions for basic verilator tasks
+extern "C" { //Open an extern C closed below
+Vfinn_design_wrapper* construct() {
+    Verilated::commandArgs(0, (const char**) nullptr);
+    TRACE(Verilated::traceEverOn(true));
+    Vfinn_design_wrapper* top = new Vfinn_design_wrapper();
+    return top;
+}
+int eval(Vfinn_design_wrapper* top) {
+    top->eval();
+    main_time++;
+    return 0;
+}
+int destruct(Vfinn_design_wrapper* top) {
+    if (top != nullptr) {
+        delete top;
+        top = nullptr;
+    }
+    return 0;
+}
+
+TRACE(
+VerilatedVcdC* tfp;
+VerilatedVcdC* start_vcd_trace(Vfinn_design_wrapper* top, const char* filename) {
+    VerilatedVcdC* tfp = new VerilatedVcdC;
+    top->trace(tfp, 99);
+    tfp->open(filename);
+    return tfp;
+}
+int add_to_vcd_trace(VerilatedVcdC* tfp, int time) {
+    tfp->dump(time);
+    return 0;
+}
+int flush_vcd_trace(VerilatedVcdC* tfp) {
+    tfp->flush();
+    return 0;
+}
+int stop_vcd_trace(VerilatedVcdC* tfp) {
+    tfp->close();
+    return 0;
+}
+)
+
+}
+
+// end of code taken from pyverilator_wrapper.cpp generated by PyVerilator
+
+inline void toggle_clk() {
+    eval(top);
+    top->ap_clk = 1;
+    TRACE(add_to_vcd_trace(tfp, main_time));
+    eval(top);
+    top->ap_clk = 0;
+    TRACE(add_to_vcd_trace(tfp, main_time));
+}
+
+
+void reset() {
+    top->ap_rst_n = 0;
+    for(unsigned i = 0; i < 10; i++) {
+        toggle_clk();
+    }
+    top->ap_rst_n = 1;
+}
+
+int main(int argc, char *argv[]) {
+    top = construct();
+    TRACE(tfp = start_vcd_trace(top, "trace.vcd"));
+    unsigned n_iters_per_input = @ITERS_PER_INPUT@;
+    unsigned n_iters_per_output = @ITERS_PER_OUTPUT@;
+    unsigned n_inputs = @N_INPUTS@;
+    unsigned max_iters = @MAX_ITERS@;
+
+    reset();
+
+    top->m_axis_0_tready = 1;
+    top->s_axis_0_tvalid = 1;
+
+    unsigned n_in_txns = 0, n_out_txns = 0, iters = 0, last_output_at = 0;
+    unsigned latency = 0;
+
+    bool exit_criterion = false;
+
+    cout << "Simulation starting" << endl;
+    cout << "Number of inputs to write " << n_iters_per_input * n_inputs << endl;
+    cout << "Number of outputs to expect " << n_iters_per_output * n_inputs << endl;
+    cout << "No-output timeout clock cycles " << max_iters << endl;
+
+    chrono::steady_clock::time_point begin = chrono::steady_clock::now();
+
+    while(!exit_criterion) {
+        toggle_clk();
+        iters++;
+        if(iters % 1000 == 0) {
+            cout << "Elapsed iters " << iters << " inps " << n_in_txns << " outs " << n_out_txns << endl;
+            chrono::steady_clock::time_point end = chrono::steady_clock::now();
+            cout << "Elapsed since last report = " << chrono::duration_cast<chrono::seconds>(end - begin).count() << "[s]" << endl;
+            begin = end;
+        }
+        if(top->s_axis_0_tready == 1 && top->s_axis_0_tvalid == 1) {
+            n_in_txns++;
+            if(n_in_txns == n_iters_per_input * n_inputs) {
+                top->s_axis_0_tvalid = 0;
+                cout << "All inputs written at cycle " << iters << endl;
+            }
+        }
+        if(top->m_axis_0_tvalid == 1) {
+            n_out_txns++;
+            last_output_at = iters;
+            if(n_out_txns == n_iters_per_output) {
+                latency = iters;
+            }
+        }
+
+        exit_criterion = ((n_in_txns >= n_iters_per_input * n_inputs) && (n_out_txns >= n_iters_per_output * n_inputs)) || ((iters-last_output_at) > max_iters);
+    }
+
+    TRACE(flush_vcd_trace(tfp));
+    TRACE(stop_vcd_trace(tfp));
+
+    cout << "Simulation finished" << endl;
+    cout << "Number of inputs consumed " << n_in_txns << endl;
+    cout << "Number of outputs produced " << n_out_txns << endl;
+    cout << "Number of clock cycles " << iters << endl;
+
+    ofstream results_file;
+    results_file.open("results.txt", ios::out | ios::trunc);
+    results_file << "N_IN_TXNS" << "\t" << n_in_txns << endl;
+    results_file << "N_OUT_TXNS" << "\t" << n_out_txns << endl;
+    results_file << "cycles" << "\t" << iters << endl;
+    results_file << "N" << "\t" << n_inputs << endl;
+    results_file << "latency_cycles" << "\t" << latency << endl;
+@FIFO_DEPTH_LOGGING@
+    results_file.close();
+
+
+
+    destruct(top);
+
+    return 0;
+}
diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py
index 38c2d60c9a09644ceec07e11bf91315ee743e02c..0546643d1220603d40651c45a0c4032dcf5cfaaf 100644
--- a/src/finn/transformation/fpgadataflow/insert_fifo.py
+++ b/src/finn/transformation/fpgadataflow/insert_fifo.py
@@ -71,13 +71,15 @@ class InsertFIFO(Transformation):
     of the subsequent node. max() of these two values sets the FIFO depth.
 
     Constructor arguments:
-    - max_qsrl_depth : FIFOs deeper than this will use Vivado IP instead of
-                       Verilog FIFOs (Q_srl.v)
-    - vivado_ram_style : the StreamingFIFO.ram_style attribute to be used for
-                          large FIFOs implemented by Vivado
-    - create_shallow_fifos : Normally, shallow-depth (<=2) FIFOs won't be created since
-                            HLS streaming interfaces already have a degree of buffering.
-                            Override with this parameter.
+
+    :parameter max_qsrl_depth: FIFOs deeper than this will use Vivado IP
+        instead of Verilog FIFOs (Q_srl.v)
+    :parameter vivado_ram_style: the StreamingFIFO.ram_style attribute
+        to be used for large FIFOs implemented by Vivado
+    :parameter create_shallow_fifos: Normally, shallow-depth (<=2) FIFOs
+        won't be created since HLS streaming interfaces
+        already have a degree of buffering.
+        Override with this parameter.
 
 
     The other node attributes necessary to create a FIFO node are taken from the
diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
index 5b3ead6d67d1f921ca070c01e976cbff43d7962a..80f5d9a09439a4ba4a7409a972987dc09acd8031 100644
--- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py
+++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
@@ -42,7 +42,7 @@ from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
 from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.util.fpgadataflow import is_fpgadataflow_node
-from finn.util.pyverilator import pyverilate_stitched_ip
+from finn.util.pyverilator import pyverilate_stitched_ip, verilator_fifosim
 
 
 def reset_implementation(node):
@@ -72,8 +72,9 @@ def optimize_depth(depth):
         # Q_srl FIFOs do not benefit from size < 32
         # add some slack
         return 32
-    # round to nearest power of two for Vivado IP FIFO implementation
-    return int(2 ** math.ceil(math.log2(depth)))
+    # otherwise leave as is
+    # will be rounded to nearest power of two for Vivado-style FIFO
+    return int(depth)
 
 
 class RemoveShallowFIFOs(Transformation):
@@ -125,14 +126,17 @@ class CapConvolutionFIFODepths(Transformation):
     constructor flag is set.
 
     Constructor arguments:
-    - max_qsrl_depth : FIFOs deeper than this will use Vivado IP instead of
-                       Verilog FIFOs (Q_srl.v)
+
+    :parameter max_qsrl_depth: FIFOs deeper than this will use Vivado IP
+        instead of Verilog FIFOs (Q_srl.v)
 
     Assumed input graph properties:
+
     - all nodes are fpgadataflow nodes
     - FIFOs inserted with InsertAndSetFIFODepths
 
     Output:
+
     - graph with smaller-depth FIFOs for convolutions
 
     Background:
@@ -188,22 +192,25 @@ class InsertAndSetFIFODepths(Transformation):
     throughput in the created accelerator.
 
     Constructor arguments:
-    - clk_ns : clock period (used for IP preparation)
-    - max_qsrl_depth : FIFOs deeper than this will use Vivado IP instead of
-                       Verilog FIFOs (Q_srl.v)
-    - max_depth : how deep the "max"-sized FIFOs initially inserted will be
-                   if set to None, use the tensor size as the depth
-    - swg_exception : call CapConvolutionFIFODepths to make convolution FIFOs
-                        smaller where appropriate
-    - vivado_ram_style : the StreamingFIFO.ram_style attribute to be used for
-                          large FIFOs implemented by Vivado afterwards
+
+    :parameter clk_ns: clock period (used for IP preparation)
+    :parameter max_qsrl_depth: FIFOs deeper than this will use Vivado IP
+        instead of Verilog FIFOs (Q_srl.v)
+    :parameter max_depth: how deep the "max"-sized FIFOs initially inserted
+        will be. If set to None, use the tensor size as the depth
+    :parameter swg_exception: call CapConvolutionFIFODepths to make convolution FIFOs
+        smaller where appropriate
+    :parameter vivado_ram_style: the StreamingFIFO.ram_style attribute to be used
+        for large FIFOs implemented by Vivado afterwards
 
     Assumed input graph properties:
+
     - all nodes are fpgadataflow nodes
     - no FIFOs inserted,
     - (inFIFODepths/outFIFODepths attrs will be ignored)
 
     Output:
+
     - graph with appropriate-depth FIFOs inserted
 
     Background:
@@ -211,12 +218,14 @@ class InsertAndSetFIFODepths(Transformation):
     necessary to insert FIFOs between them to prevent stalls due to bursty
     behavior. The sizes of those FIFOs are hard to predict analytically, so
     we do the following:
+
     - insert deep (=tensor size) FIFOs between all fpgadataflow nodes
     - create stitched design
     - run through rtlsim with stream of multiple random input images (to fill pipeline)
     - keep track of observed maximum occupancy for each FIFO during rtlsim
     - when sim finished, update each FIFO depth to maximum observed occupancy
       and set inFIFODepths/outFIFODepths attrs to 0 on relevant nodes
+
     """
 
     def __init__(
@@ -227,6 +236,7 @@ class InsertAndSetFIFODepths(Transformation):
         max_depth=None,
         swg_exception=True,
         vivado_ram_style="auto",
+        force_python_sim=False,
     ):
         super().__init__()
         self.fpgapart = fpgapart
@@ -235,6 +245,7 @@ class InsertAndSetFIFODepths(Transformation):
         self.max_depth = max_depth
         self.swg_exception = swg_exception
         self.vivado_ram_style = vivado_ram_style
+        self.force_python_sim = force_python_sim
 
     def apply(self, model):
         # these optypes may potentially use external weights
@@ -306,57 +317,75 @@ class InsertAndSetFIFODepths(Transformation):
         model = model.transform(CreateStitchedIP(self.fpgapart, self.clk_ns))
         model.set_metadata_prop("exec_mode", "rtlsim")
 
-        # calculate input frequency (number of cycles for each input word)
-        first_node = getCustomOp(model.graph.node[0])
-        ncycles_per_input = max(
-            1,
-            int(
-                math.ceil(
-                    perf["max_cycles"]
-                    / (
-                        np.prod(first_node.get_folded_input_shape())
-                        / first_node.get_folded_input_shape()[-1]
+        if self.force_python_sim:
+            # do rtlsim in Python for FIFO sizing
+            # calculate input frequency (number of cycles for each input word)
+            first_node = getCustomOp(model.graph.node[0])
+            ncycles_per_input = max(
+                1,
+                int(
+                    math.ceil(
+                        perf["max_cycles"]
+                        / (
+                            np.prod(first_node.get_folded_input_shape())
+                            / first_node.get_folded_input_shape()[-1]
+                        )
                     )
-                )
-            ),
-        )
+                ),
+            )
 
-        # set sufficiently large threshold for 1 image to  fully execute and exit
-        ncycles = int(latency + max_cycles)
+            # set sufficiently large threshold for 1 image to  fully execute and exit
+            ncycles = int(latency + max_cycles)
 
-        # prepare pyverilator model
-        sim = pyverilate_stitched_ip(model)
+            # prepare pyverilator model
+            sim = pyverilate_stitched_ip(model)
 
-        reset_rtlsim(sim)
-        toggle_clk(sim)
+            reset_rtlsim(sim)
+            toggle_clk(sim)
 
-        # set all input valids to 0 and output readies to 1
-        # set input data to some constant
-        set_signal(sim, "tvalid", 0)
-        set_signal(sim, "tready", 1)
-        set_signal(sim, "tdata", 0)
+            # set all input valids to 0 and output readies to 1
+            # set input data to some constant
+            set_signal(sim, "tvalid", 0)
+            set_signal(sim, "tready", 1)
+            set_signal(sim, "tdata", 0)
+
+            output_detected = False
+            while ncycles > 0:
+                toggle_clk(sim)
+                # set/unset valids
+                if ncycles % ncycles_per_input == 0:
+                    set_signal(sim, "tvalid", 1)
+                else:
+                    set_signal(sim, "tvalid", 0)
 
-        output_detected = False
-        while ncycles > 0:
-            toggle_clk(sim)
-            # set/unset valids
-            if ncycles % ncycles_per_input == 0:
-                set_signal(sim, "tvalid", 1)
-            else:
-                set_signal(sim, "tvalid", 0)
+                # since latency estimation is very pessimistic, detect first output
+                # and fast-forward the sim
+                if get_signal(sim, "tvalid") != 0 and not output_detected:
+                    ncycles = max_cycles
+                    output_detected = True
+                else:
+                    ncycles = ncycles - 1
 
-            # since latency estimation is very pessimistic, detect first output
-            # and fast-forward the sim
-            if get_signal(sim, "tvalid") != 0 and not output_detected:
-                ncycles = max_cycles
-                output_detected = True
+            if not output_detected:
+                warnings.warn(
+                    "No output detected, calculated FIFO depths may not be correct"
+                )
+        else:
+            # do rtlsim in C++ for FIFO sizing
+            # determine # inputs for FIFO sizing according to topology type
+            swg_nodes = [
+                x for x in model.graph.node if "ConvolutionInputGenerator" in x.op_type
+            ]
+            if len(swg_nodes) == 0:
+                # MLP, no layer overlap
+                # assuming half the nodes are now FIFOs, use half the # of
+                # nodes as # inputs to drive the imulation
+                n_inputs = int(len(model.graph.node) / 2)
             else:
-                ncycles = ncycles - 1
-
-        if not output_detected:
-            warnings.warn(
-                "No output detected, calculated FIFO depths may not be correct"
-            )
+                # convnet, single input is typically enough to fill entire
+                # layer pipeline due to overlaps
+                n_inputs = 1
+            sim = verilator_fifosim(model, n_inputs)
 
         for ind, node in enumerate(fifo_nodes):
             maxcount_name = "maxcount_%d" % ind
diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py
index e24e24f1f8ebb2873c81617884cd333311d8aea9..2301fccdd4fff6310340ffe1dd8de7732a4f9bd4 100644
--- a/src/finn/transformation/fpgadataflow/set_folding.py
+++ b/src/finn/transformation/fpgadataflow/set_folding.py
@@ -62,17 +62,20 @@ class SetFolding(Transformation):
 
     Notable exceptions and special behavior:
 
-    * When folding dense convolution/FC compute engines ("MVAU"/MatrixVectorActivation),
+    When folding dense convolution/FC compute engines ("MVAU"/MatrixVectorActivation),
     which have two attributes (PE and SIMD):
-        * first increases SIMD while weight stream width per PE is <= mvau_wwidth_max
-          (configurable in the SetFolding initializer, defaults to 36)
-        * then increases PE until the target is met or max PE reached
 
-    * When folding depthwise convolutions ("VVAU"/VectorVectorActivation)
+    * first increases SIMD while weight stream width per PE is <= mvau_wwidth_max
+      (configurable in the SetFolding initializer, defaults to 36)
+    * then increases PE until the target is met or max PE reached
+
+    When folding depthwise convolutions ("VVAU"/VectorVectorActivation)
     or spatial reduction ops (Pool_Batch):
-        * the producer of the node is expected to be a ConvolutionInputGenerator
-        with depthwise=1, whose SIMD value will be set equal to the PE value of
-        its consumer node
+
+    * the producer of the node is expected to be a ConvolutionInputGenerator
+      with depthwise=1, whose SIMD value will be set equal to the PE value of
+      its consumer node
+
     """
 
     def __init__(
diff --git a/src/finn/transformation/fpgadataflow/vitis_build.py b/src/finn/transformation/fpgadataflow/vitis_build.py
index 97da4d41524e7c86fb5a73375d9ea2c2c9aa10dc..e0a5666000fc2aa9599bb7475c1b8dd37489afac 100644
--- a/src/finn/transformation/fpgadataflow/vitis_build.py
+++ b/src/finn/transformation/fpgadataflow/vitis_build.py
@@ -358,16 +358,16 @@ class VitisBuild(Transformation):
     """Best-effort attempt at building the accelerator with Vitis.
     It assumes the model has only fpgadataflow nodes
 
-    fpga_part: string identifying the target FPGA
-    period_ns: target clock period
-    platform: target Alveo platform, one of ["U50", "U200", "U250", "U280"]
-    strategy: Vitis optimization strategy
-    enable_debug: add Chipscope to all AXI interfaces
-    floorplan_file: path to a JSON containing a dictionary with SLR assignments
-                    for each node in the ONNX graph. Must be parse-able by
-                    the ApplyConfig transform.
-    enable_link: enable linking kernels (.xo files), otherwise just synthesize
-                    them independently.
+    :parameter fpga_part: string identifying the target FPGA
+    :parameter period_ns: target clock period
+    :parameter platform: target Alveo platform, one of ["U50", "U200", "U250", "U280"]
+    :parameter strategy: Vitis optimization strategy
+    :parameter enable_debug: add Chipscope to all AXI interfaces
+    :parameter floorplan_file: path to a JSON containing a dictionary with
+        SLR assignments for each node in the ONNX graph.
+        Must be parse-able by the ApplyConfig transform.
+    :parameter enable_link: enable linking kernels (.xo files),
+        otherwise just synthesize them independently.
     """
 
     def __init__(
diff --git a/src/finn/transformation/qonnx/convert_qonnx_to_finn.py b/src/finn/transformation/qonnx/convert_qonnx_to_finn.py
index 967a1276365e4af1a6d617c081b9c04b4710da97..34f11d1e95e6bc3f6a36ce6d878ed493108b3ba6 100644
--- a/src/finn/transformation/qonnx/convert_qonnx_to_finn.py
+++ b/src/finn/transformation/qonnx/convert_qonnx_to_finn.py
@@ -56,12 +56,12 @@ class ConvertQONNXtoFINN(Transformation):
     is not converted to a MultiThreshold node.
 
     :param filter_function: Each candidate Quant and BinaryQant node is first evaluated
-    by this function. If the function returns False,
-    then the node is not converted to a MultiTrheshold node.
-    The function is given the model and candidate node as parameters.
-    Per default a filter function is inserted, which disables the conversion of
-    Quant nodes, which have a bit width of larger than 8.
-    Defaults to: default_filter_function_generator(max_multithreshold_bit_width=8)
+        by this function. If the function returns False,
+        then the node is not converted to a MultiTrheshold node.
+        The function is given the model and candidate node as parameters.
+        Per default a filter function is inserted, which disables the conversion of
+        Quant nodes, which have a bit width of larger than 8.
+        Defaults to: default_filter_function_generator(max_multithreshold_bit_width=8)
     """
 
     def __init__(
diff --git a/src/finn/transformation/qonnx/qonnx_activation_handlers.py b/src/finn/transformation/qonnx/qonnx_activation_handlers.py
index a50a5850779cadf7ab21b9c1c4dfdbb36232af42..9819086d826a51d1df5240d88c4fda8513cc9ba6 100644
--- a/src/finn/transformation/qonnx/qonnx_activation_handlers.py
+++ b/src/finn/transformation/qonnx/qonnx_activation_handlers.py
@@ -52,9 +52,7 @@ class QuantActBaseHandler(ABC):
         self._q_node = quant_node
         self._q_index = quant_node_index
 
-    @property
     @classmethod
-    @abstractmethod
     def valid_predecessor_op_types(self):
         """Defines which op types the preceding node is allowed to have for
         this type of activation.
@@ -284,9 +282,11 @@ class QuantReluHandler(QuantActBaseHandler):
     """Class for converting a quantized relu operation expressed in the QONNX
     dialect to the FINN ONNX dialect."""
 
-    valid_predecessor_op_types = [
-        "Relu",
-    ]
+    @classmethod
+    def valid_predecessor_op_types(self):
+        return [
+            "Relu",
+        ]
 
     def _check_compatibility(self):
         if self._q_node.op_type == "Quant":
@@ -391,15 +391,17 @@ class QuantIdentityHandler(QuantActBaseHandler):
     these are equivalent to quantized identity activations.
     """
 
-    valid_predecessor_op_types = [
-        "BatchNormalization",
-        "Sub",
-        "Add",
-        "Mul",
-        "Div",
-        "DebugMarker",
-        None,
-    ]
+    @classmethod
+    def valid_predecessor_op_types(self):
+        return [
+            "BatchNormalization",
+            "Sub",
+            "Add",
+            "Mul",
+            "Div",
+            "DebugMarker",
+            None,
+        ]
 
     def _check_compatibility(self):
         # Gather parameters to check
diff --git a/src/finn/transformation/qonnx/quant_act_to_multithreshold.py b/src/finn/transformation/qonnx/quant_act_to_multithreshold.py
index 77025ecdf57d5a422992d4163d05c740454986bb..48dda3820deb051bd8a291188f02fe7d1dd2cc0b 100644
--- a/src/finn/transformation/qonnx/quant_act_to_multithreshold.py
+++ b/src/finn/transformation/qonnx/quant_act_to_multithreshold.py
@@ -30,7 +30,10 @@
 import warnings
 from qonnx.transformation.base import Transformation
 
-from finn.transformation.qonnx.qonnx_activation_handlers import QuantActBaseHandler
+from finn.transformation.qonnx.qonnx_activation_handlers import (
+    QuantActBaseHandler,
+    QuantIdentityHandler,
+)
 
 
 def default_filter_function_generator(max_multithreshold_bit_width=8):
@@ -66,8 +69,7 @@ def default_filter_function_generator(max_multithreshold_bit_width=8):
 
 
 class ConvertQuantActToMultiThreshold(Transformation):
-    """
-    Converts Quant nodes in the activation path to MultiThreshold nodes.
+    """Converts Quant nodes in the activation path to MultiThreshold nodes.
 
     The optional keyword argument `filter_function`
     presents a way to control which Quant and BipolarQuant nodes in the activation path
@@ -75,12 +77,12 @@ class ConvertQuantActToMultiThreshold(Transformation):
     is not converted to a MultiThreshold node.
 
     :param filter_function: Each candidate Quant and BinaryQant node is first evaluated
-    by this function. If the function returns False,
-    then the node is not converted to a MultiTrheshold node.
-    The function is given the model and candidate node as parameters.
-    Per default a filter function is inserted, which disables the conversion of
-    Quant nodes, which have a bit width of larger than 8.
-    Defaults to: default_filter_function_generator(max_multithreshold_bit_width=8)
+        by this function. If the function returns False,
+        then the node is not converted to a MultiTrheshold node.
+        The function is given the model and candidate node as parameters.
+        Per default a filter function is inserted, which disables the conversion of
+        Quant nodes, which have a bit width of larger than 8.
+        Defaults to: default_filter_function_generator(max_multithreshold_bit_width=8)
     """
 
     def __init__(
@@ -127,7 +129,7 @@ class ConvertQuantActToMultiThreshold(Transformation):
                 # Check for possible ambiguity in handler selection
                 valid_predecessors = []
                 for cls in QuantActBaseHandler.__subclasses__():
-                    valid_predecessors.extend(cls.valid_predecessor_op_types)
+                    valid_predecessors.extend(cls.valid_predecessor_op_types())
                 if len(valid_predecessors) != len(set(valid_predecessors)):
                     raise RuntimeError(
                         "Two or more activation handlers declare the same "
@@ -138,16 +140,15 @@ class ConvertQuantActToMultiThreshold(Transformation):
 
                 # Try to find a fitting handler for this Quant activation node
                 for handler_cls in QuantActBaseHandler.__subclasses__():
-                    if predecessor_op_type in handler_cls.valid_predecessor_op_types:
+                    if predecessor_op_type in handler_cls.valid_predecessor_op_types():
                         handler = handler_cls(model, n, node_ind)
                         break
                 else:
-                    raise ValueError(
-                        f"Quant nodes in the activation path and with predecessor "
-                        f"nodes of type {predecessor_op_type} are currently not "
-                        f"supported by FINN and can not be converted to "
-                        f"MultiThreshold nodes."
-                    )
+                    # fall back to QuantIdentityHandler here
+                    # it may still not work due to its particular restrictions,
+                    # but better than just erroring out without trying
+                    handler = QuantIdentityHandler(model, n, node_ind)
+
                 model = handler.replace_quant_node()
                 graph_modified = True
                 return (model, graph_modified)
diff --git a/src/finn/util/data_packing.py b/src/finn/util/data_packing.py
index 65478d2540b53443d3f74b44a22fde3defd8ca93..797dad32a2cfeb3e00e224f264d91b5ee0e9247b 100644
--- a/src/finn/util/data_packing.py
+++ b/src/finn/util/data_packing.py
@@ -265,7 +265,7 @@ def numpy_to_hls_code(
     # define a function to convert a single element into a C++ init string
     # a single element can be a hex string if we are using packing
     def elem2str(x):
-        if type(x) == str or type(x) == np.str_ or type(x) == np.str:
+        if type(x) == str or type(x) == np.str_:
             return '%s("%s", 16)' % (hls_dtype, x)
         elif type(x) == np.float32:
             if dtype.is_integer():
diff --git a/src/finn/util/pyverilator.py b/src/finn/util/pyverilator.py
index d7ed3e261fe024b7f054382f12184628d3f3e94c..a00899cf784cbe3985b942af6b3d9a4c14cd8706 100644
--- a/src/finn/util/pyverilator.py
+++ b/src/finn/util/pyverilator.py
@@ -28,33 +28,41 @@
 
 import pkg_resources as pk
 
+import numpy as np
 import os
 import shutil
 from pyverilator import PyVerilator
+from qonnx.custom_op.registry import getCustomOp
 
-from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
+from finn.util.basic import (
+    get_rtlsim_trace_depth,
+    launch_process_helper,
+    make_build_dir,
+)
 
 
-def pyverilate_stitched_ip(
-    model,
-    read_internal_signals=True,
-    disable_common_warnings=True,
-    extra_verilator_args=[],
-):
-    """Given a model with stitched IP, return a PyVerilator sim object.
-    Trace depth is also controllable, see get_rtlsim_trace_depth()
+def make_single_source_file(filtered_verilog_files, target_file):
+    """Dump all Verilog code used by stitched IP into a single file.
+    This is because large models with many files require a verilator
+    command line too long for bash on most systems"""
 
-    :param read_internal_signals  If set, it will be possible to examine the
-        internal (not only port) signals of the Verilog module, but this may
-        slow down compilation and emulation.
+    # concatenate all verilog code into a single file
+    with open(target_file, "w") as wf:
+        for vfile in filtered_verilog_files:
+            with open(vfile) as rf:
+                wf.write("//Added from " + vfile + "\n\n")
+                lines = rf.read()
+                for line in lines.split("\n"):
+                    # break down too-long lines, Verilator complains otherwise
+                    if len(line) > 20000:
+                        line = line.replace("&", "\n&")
+                    wf.write("\n" + line)
 
-    :param disable_common_warnings If set, disable the set of warnings that
-        Vivado-HLS-generated Verilog typically triggers in Verilator
-        (which can be very verbose otherwise)
 
-    """
-    if PyVerilator is None:
-        raise ImportError("Installation of PyVerilator is required.")
+def prepare_stitched_ip_for_verilator(model):
+    """Prepare sources from given stitched IP for verilator simulation, including
+    generating a single source file and replacing certain Vivado infrastructure
+    headers with Verilator-compatible ones"""
 
     vivado_stitch_proj_dir = model.get_metadata_prop("vivado_stitch_proj")
     with open(vivado_stitch_proj_dir + "/all_verilog_srcs.txt", "r") as f:
@@ -67,8 +75,6 @@ def pyverilate_stitched_ip(
         return os.path.basename(os.path.realpath(x))
 
     top_module_file_name = file_to_basename(model.get_metadata_prop("wrapper_filename"))
-    top_module_name = top_module_file_name.strip(".v")
-    build_dir = make_build_dir("pyverilator_ipstitched_")
 
     # dump all Verilog code to a single file
     # this is because large models with many files require
@@ -79,7 +85,7 @@ def pyverilate_stitched_ip(
     # remove duplicates from list by doing list -> set -> list
     src_exts = [".v", ".sv"]
 
-    all_verilog_src_files = list(
+    all_verilog_files = list(
         set(
             filter(
                 lambda x: any(map(lambda y: x.endswith(y), src_exts)), all_verilog_srcs
@@ -87,7 +93,9 @@ def pyverilate_stitched_ip(
         )
     )
 
-    verilog_header_dir = make_build_dir("pyverilator_vh_")
+    verilog_header_dir = vivado_stitch_proj_dir + "/pyverilator_vh"
+    os.makedirs(verilog_header_dir, exist_ok=True)
+
     # use custom version of axis infrastructure vh
     # to enable Verilator to simulate AMD/Xilinx components (e.g DWC)
     custom_vh = pk.resource_filename(
@@ -105,7 +113,7 @@ def pyverilate_stitched_ip(
     # remove all but one instances of regslice_core.v
     filtered_verilog_files = []
     remove_entry = False
-    for vfile in all_verilog_src_files:
+    for vfile in all_verilog_files:
         if "regslice_core" in vfile:
             if not remove_entry:
                 filtered_verilog_files.append(vfile)
@@ -113,17 +121,159 @@ def pyverilate_stitched_ip(
         else:
             filtered_verilog_files.append(vfile)
 
-    # concatenate all verilog code into a single file
-    with open(vivado_stitch_proj_dir + "/" + top_module_file_name, "w") as wf:
-        for vfile in filtered_verilog_files:
-            with open(vfile) as rf:
-                wf.write("//Added from " + vfile + "\n\n")
-                lines = rf.read()
-                for line in lines.split("\n"):
-                    # break down too-long lines, Verilator complains otherwise
-                    if len(line) > 20000:
-                        line = line.replace("&", "\n&")
-                    wf.write("\n" + line)
+    target_file = vivado_stitch_proj_dir + "/" + top_module_file_name
+    make_single_source_file(filtered_verilog_files, target_file)
+
+    return vivado_stitch_proj_dir
+
+
+def verilator_fifosim(model, n_inputs, max_iters=100000000):
+    """Create a Verilator model of stitched IP and use a simple C++
+    driver to drive the input stream. Useful for FIFO sizing, latency
+    and throughput measurement."""
+
+    vivado_stitch_proj_dir = prepare_stitched_ip_for_verilator(model)
+    build_dir = make_build_dir("verilator_fifosim_")
+    fifosim_cpp_fname = pk.resource_filename(
+        "finn.qnn-data", "cpp/verilator_fifosim.cpp"
+    )
+    with open(fifosim_cpp_fname, "r") as f:
+        fifosim_cpp_template = f.read()
+    assert len(model.graph.input) == 1, "Only a single input stream is supported"
+    assert len(model.graph.output) == 1, "Only a single output stream is supported"
+    iname = model.graph.input[0].name
+    first_node = model.find_consumer(iname)
+    oname = model.graph.output[0].name
+    last_node = model.find_producer(oname)
+    assert (first_node is not None) and (
+        last_node is not None
+    ), "Failed to find first/last nodes"
+    fnode_inst = getCustomOp(first_node)
+    lnode_inst = getCustomOp(last_node)
+    ishape_folded = fnode_inst.get_folded_input_shape()
+    oshape_folded = lnode_inst.get_folded_output_shape()
+
+    fifo_log = []
+    fifo_log_templ = '    results_file << "maxcount%s" << "\\t" '
+    fifo_log_templ += "<< to_string(top->maxcount%s) << endl;"
+    fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO")
+    fifo_ind = 0
+    for fifo_node in fifo_nodes:
+        fifo_node = getCustomOp(fifo_node)
+        if fifo_node.get_nodeattr("depth_monitor") == 1:
+            suffix = "" if fifo_ind == 0 else "_%d" % fifo_ind
+            fifo_log.append(fifo_log_templ % (suffix, suffix))
+            fifo_ind += 1
+    fifo_log = "\n".join(fifo_log)
+
+    template_dict = {
+        "ITERS_PER_INPUT": np.prod(ishape_folded[:-1]),
+        "ITERS_PER_OUTPUT": np.prod(oshape_folded[:-1]),
+        "N_INPUTS": n_inputs,
+        "MAX_ITERS": max_iters,
+        "FIFO_DEPTH_LOGGING": fifo_log,
+    }
+
+    for (key, val) in template_dict.items():
+        fifosim_cpp_template = fifosim_cpp_template.replace(f"@{key}@", str(val))
+
+    with open(build_dir + "/verilator_fifosim.cpp", "w") as f:
+        f.write(fifosim_cpp_template)
+
+    which_verilator = shutil.which("verilator")
+    if which_verilator is None:
+        raise Exception("'verilator' executable not found")
+
+    verilator_args = [
+        "perl",
+        which_verilator,
+        "-Wno-fatal",
+        "-Mdir",
+        build_dir,
+        "-y",
+        vivado_stitch_proj_dir,
+        "--CFLAGS",
+        "--std=c++11",
+        "-O3",
+        "--x-assign",
+        "fast",
+        "--x-initial",
+        "fast",
+        "--noassert",
+        "--cc",
+        "finn_design_wrapper.v",
+        "--top-module",
+        "finn_design_wrapper",
+        "--exe",
+        "verilator_fifosim.cpp",
+        "--threads",
+        "4",
+    ]
+
+    proc_env = os.environ.copy()
+    gcc_args = "-O3 -march=native"
+    proc_env["OPT_FAST"] = gcc_args
+    make_args = [
+        "make",
+        "-j4",
+        "-C",
+        build_dir,
+        "-f",
+        "Vfinn_design_wrapper.mk",
+        "Vfinn_design_wrapper",
+    ]
+
+    with open(build_dir + "/compile.sh", "w") as f:
+        f.write("#!/bin/bash" + "\n")
+        f.write("export OPT_FAST='%s'\n" % gcc_args)
+        f.write(" ".join(verilator_args) + "\n")
+        f.write(" ".join(make_args) + "\n")
+
+    launch_process_helper(verilator_args, cwd=build_dir)
+    launch_process_helper(make_args, proc_env=proc_env, cwd=build_dir)
+
+    sim_launch_args = ["./Vfinn_design_wrapper"]
+    launch_process_helper(sim_launch_args, cwd=build_dir)
+
+    with open(build_dir + "/results.txt", "r") as f:
+        results = f.read().strip().split("\n")
+    ret_dict = {}
+    for result_line in results:
+        key, val = result_line.split("\t")
+        ret_dict[key] = int(val)
+    return ret_dict
+
+
+def pyverilate_stitched_ip(
+    model,
+    read_internal_signals=True,
+    disable_common_warnings=True,
+    extra_verilator_args=[],
+):
+    """Given a model with stitched IP, return a PyVerilator sim object.
+    Trace depth is also controllable, see get_rtlsim_trace_depth()
+
+    :param read_internal_signals  If set, it will be possible to examine the
+        internal (not only port) signals of the Verilog module, but this may
+        slow down compilation and emulation.
+
+    :param disable_common_warnings If set, disable the set of warnings that
+        Vivado-HLS-generated Verilog typically triggers in Verilator
+        (which can be very verbose otherwise)
+
+    """
+    if PyVerilator is None:
+        raise ImportError("Installation of PyVerilator is required.")
+
+    vivado_stitch_proj_dir = prepare_stitched_ip_for_verilator(model)
+    verilog_header_dir = vivado_stitch_proj_dir + "/pyverilator_vh"
+
+    def file_to_basename(x):
+        return os.path.basename(os.path.realpath(x))
+
+    top_module_file_name = file_to_basename(model.get_metadata_prop("wrapper_filename"))
+    top_module_name = top_module_file_name.strip(".v")
+    build_dir = make_build_dir("pyverilator_ipstitched_")
 
     verilator_args = []
     # disable common verilator warnings that should be harmless but commonly occur
diff --git a/src/finn/util/test.py b/src/finn/util/test.py
index bfe4aa0bb826c73f6a7c67f025e24764da8c36cc..bd8bde2820fa87ed972d699cae905d7f6cc310ff 100644
--- a/src/finn/util/test.py
+++ b/src/finn/util/test.py
@@ -91,8 +91,8 @@ def soft_verify_topk(invec, idxvec, k):
     """Check that the topK indices provided actually point to the topK largest
     values in the input vector"""
     np_topk = np.flip(invec.flatten().argsort())[:k]
-    soft_expected = invec.flatten()[np_topk.astype(np.int).flatten()]
-    soft_produced = invec.flatten()[idxvec.astype(np.int).flatten()]
+    soft_expected = invec.flatten()[np_topk.astype(np.int_).flatten()]
+    soft_produced = invec.flatten()[idxvec.astype(np.int_).flatten()]
     return (soft_expected == soft_produced).all()
 
 
diff --git a/src/finn/util/vcd.py b/src/finn/util/vcd.py
index aaeb3ab920d1d8fae79c1173582d18cf81d03063..1f77276d5a72e5f886d5f94af8d35121ccadd486 100644
--- a/src/finn/util/vcd.py
+++ b/src/finn/util/vcd.py
@@ -101,19 +101,21 @@ def get_stream_if_stats(vcd_file, if_base_name):
     <stream_state>: (<num_samples>, <fraction_of_time>),
 
     where <stream_state> is the combination of (V)alid/(R)eady values,
-    <num_samples> is the approximate number of rising clock edges spent in <state>
-    , and <fraction_of_time> is the fraction of <num_samples> to total
+    <num_samples> is the approximate number of rising clock edges spent in <state>,
+    and <fraction_of_time> is the fraction of <num_samples> to total
     amount of time recorded by the trace.
 
     Example:
-    {"{'V': 0, 'R': 0}": (5, 0.0006060606060606061),
-     "{'V': 1, 'R': 0}": (0, 0.0),
-     "{'V': 0, 'R': 1}": (7605, 0.9218181818181819),
-     "{'V': 1, 'R': 1}": (640, 0.07757575757575758)}
-
+    {
+    "{'V': 0, 'R': 0}": (5, 0.0006060606060606061),
+    "{'V': 1, 'R': 0}": (0, 0.0),
+    "{'V': 0, 'R': 1}": (7605, 0.9218181818181819),
+    "{'V': 1, 'R': 1}": (640, 0.07757575757575758)
+    }
     Here we can see the stream was transmitting values 7.7% of the time,
     and 9.2% of the time there was no incoming data (valid 0, ready 1)
     """
+
     if_valid = if_base_name + vname
     if_ready = if_base_name + rname
     v = VCDVCD(vcd_file, signals=[if_valid], store_tvs=True)
diff --git a/tests/fpgadataflow/test_fifosizing.py b/tests/fpgadataflow/test_fifosizing.py
index 5fd1439bd055782692bac404622137e166ef5e07..116df98d17985951892cf7f43baf486ab4fd80c8 100644
--- a/tests/fpgadataflow/test_fifosizing.py
+++ b/tests/fpgadataflow/test_fifosizing.py
@@ -49,14 +49,19 @@ def fetch_test_model(topology, wbits=2, abits=2):
 
 @pytest.mark.slow
 @pytest.mark.vivado
-@pytest.mark.fpgadataflow
-def test_fifosizing_linear():
+@pytest.mark.parametrize(
+    "method", ["largefifo_rtlsim_python", "largefifo_rtlsim_cpp", "characterize"]
+)
+def test_fifosizing_linear(method):
+    force_python_rtlsim = "python" in method
+    method_key = "largefifo_rtlsim" if "largefifo_rtlsim" in method else "characterize"
     tmp_output_dir = fetch_test_model("tfc")
     cfg = build_cfg.DataflowBuildConfig(
         output_dir=tmp_output_dir,
         auto_fifo_depths=True,
-        auto_fifo_strategy="characterize",
+        auto_fifo_strategy=method_key,
         target_fps=10000,
+        force_python_rtlsim=force_python_rtlsim,
         synth_clk_period_ns=10.0,
         board="Pynq-Z1",
         rtlsim_batch_size=100,