diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml index 4374111f22a12e586c5c5233a7eee096b848b86e..00c25a4a3150a8368405b449fdce04456ccbe88d 100644 --- a/.github/workflows/docker-image.yml +++ b/.github/workflows/docker-image.yml @@ -1,17 +1,18 @@ name: DockerImage on: + pull_request: + branches: [ dev ] push: - branches: - - 'dev' + branches: [ dev ] jobs: docker: - runs-on: ubuntu-18.04 + runs-on: ubuntu-20.04 steps: - name: checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v1 diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 20f5b48f7acc65ab18702ef2509e9791f919b825..5f03379bbc37ab913f712571c630035dbad84cce 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -16,7 +16,9 @@ jobs: uses: actions/checkout@v3 - name: Setup Python - uses: actions/setup-python@v3 + uses: actions/setup-python@v4 + with: + python-version: '3.8' - name: Run Lint uses: pre-commit/action@v3.0.0 diff --git a/.github/workflows/quicktest-dev-pr.yml b/.github/workflows/quicktest-dev-pr.yml index ec92c84665d868b8a4376c82ecdf72395f1367a8..e2ba47ec296f73cfd7c0eede98bac3acd066075a 100644 --- a/.github/workflows/quicktest-dev-pr.yml +++ b/.github/workflows/quicktest-dev-pr.yml @@ -11,11 +11,11 @@ jobs: test: name: Run quicktest on PR branch - runs-on: ubuntu-18.04 + runs-on: ubuntu-20.04 steps: - name: checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: DockerRunQuicktest run: | diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index dfc83ba618eb905fe5579231542d14d529503ac2..5a7f70f8f69293d8dcef9b64c763aa606d5d73f5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -61,7 +61,7 @@ repos: - id: black language_version: python3 -- repo: https://gitlab.com/pycqa/flake8 +- repo: https://github.com/PyCQA/flake8 rev: 3.9.2 hooks: - id: flake8 diff --git a/docker/Dockerfile.finn b/docker/Dockerfile.finn index b3c669ec1097745bd30f650ca0b9dacda647c61d..dbafba247679895bcbaf385f0d33946c3f810945 100644 --- a/docker/Dockerfile.finn +++ b/docker/Dockerfile.finn @@ -84,7 +84,7 @@ RUN rm requirements.txt # extra Python package dependencies (for testing and interaction) RUN pip install pygments==2.4.1 RUN pip install ipykernel==5.5.5 -RUN pip install jupyter==1.0.0 +RUN pip install jupyter==1.0.0 --ignore-installed RUN pip install markupsafe==2.0.1 RUN pip install matplotlib==3.3.1 --ignore-installed RUN pip install pytest-dependency==0.5.1 diff --git a/docs/finn/source_code/finn.analysis.fpgadataflow.rst b/docs/finn/source_code/finn.analysis.fpgadataflow.rst index b52e994ee6033d4c3c1aae6400e20e103455d7b6..57472cb670b6fa6cb95e6c137458d3a522f82f5a 100644 --- a/docs/finn/source_code/finn.analysis.fpgadataflow.rst +++ b/docs/finn/source_code/finn.analysis.fpgadataflow.rst @@ -30,6 +30,7 @@ finn.analysis.fpgadataflow.floorplan\_params :undoc-members: :show-inheritance: + finn.analysis.fpgadataflow.hls\_synth\_res\_estimation ------------------------------------------------------------- @@ -38,14 +39,15 @@ finn.analysis.fpgadataflow.hls\_synth\_res\_estimation :undoc-members: :show-inheritance: - finn.analysis.fpgadataflow.op\_and\_param\_counts - -------------------------------------------------- +finn.analysis.fpgadataflow.op\_and\_param\_counts +-------------------------------------------------- - .. automodule:: finn.analysis.fpgadataflow.op_and_param_counts +.. 
automodule:: finn.analysis.fpgadataflow.op_and_param_counts :members: :undoc-members: :show-inheritance: + finn.analysis.fpgadataflow.post\_synth\_res -------------------------------------------------- @@ -54,6 +56,7 @@ finn.analysis.fpgadataflow.post\_synth\_res :undoc-members: :show-inheritance: + finn.analysis.fpgadataflow.res\_estimation ------------------------------------------------- diff --git a/docs/finn/source_code/finn.core.rst b/docs/finn/source_code/finn.core.rst index 4e3de458e153871d1d5969442af5940dc1673ecd..afa1ecffa08213db6a282076c6fdf59694f9e13e 100644 --- a/docs/finn/source_code/finn.core.rst +++ b/docs/finn/source_code/finn.core.rst @@ -37,6 +37,15 @@ qonnx.core.modelwrapper :undoc-members: :show-inheritance: +qonnx.core.onnx\_exec +--------------------------- + +.. automodule:: qonnx.core.onnx_exec + :members: + :undoc-members: + :show-inheritance: + + finn.core.onnx\_exec --------------------------- diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst index cc56ea603e589d7000fe5b2b2943e67cdb90c884..fdcf44c6d99561658b727dc64c0a1b98b247c7df 100644 --- a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst +++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst @@ -8,7 +8,7 @@ HLS Custom Op Nodes Base Class ---------- -.. automodule:: finn.custom_op.fpgadataflow +.. automodule:: finn.custom_op.fpgadataflow.hlscustomop :members: :undoc-members: :show-inheritance: @@ -29,9 +29,25 @@ finn.custom\_op.fpgadataflow.channelwise\_op\_batch :undoc-members: :show-inheritance: +finn.custom\_op.fpgadataflow.checksum +-------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.checksum + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.concat +------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.concat + :members: + :undoc-members: + :show-inheritance: + finn.custom\_op.fpgadataflow.convolutioninputgenerator -------------------------------------------------------------- +-------------------------------------------------------- .. automodule:: finn.custom_op.fpgadataflow.convolutioninputgenerator :members: @@ -46,6 +62,15 @@ finn.custom\_op.fpgadataflow.convolutioninputgenerator1d :undoc-members: :show-inheritance: + +finn.custom\_op.fpgadataflow.convolutioninputgenerator\_rtl +------------------------------------------------------------ + +.. automodule:: finn.custom_op.fpgadataflow.convolutioninputgenerator_rtl + :members: + :undoc-members: + :show-inheritance: + finn.custom\_op.fpgadataflow.downsampler ----------------------------------------- @@ -62,6 +87,16 @@ finn.custom\_op.fpgadataflow.duplicatestreams\_batch :undoc-members: :show-inheritance: + +finn.custom\_op.fpgadataflow.eltwise +------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.eltwise + :members: + :undoc-members: + :show-inheritance: + + finn.custom\_op.fpgadataflow.fmpadding\_batch ----------------------------------------------- @@ -79,7 +114,7 @@ finn.custom\_op.fpgadataflow.globalaccpool\_batch :show-inheritance: finn.custom\_op.fpgadataflow.iodma ------------------------------------------------ +------------------------------------ .. automodule:: finn.custom_op.fpgadataflow.iodma :members: @@ -102,6 +137,15 @@ finn.custom\_op.fpgadataflow.lookup :undoc-members: :show-inheritance: +finn.custom\_op.fpgadataflow.matrixvectoractivation +----------------------------------------------------------- + +.. 
automodule:: finn.custom_op.fpgadataflow.matrixvectoractivation + :members: + :undoc-members: + :show-inheritance: + + finn.custom\_op.fpgadataflow.pool\_batch ----------------------------------------------- @@ -127,14 +171,6 @@ finn.custom\_op.fpgadataflow.streamingdatawidthconverter\_batch :undoc-members: :show-inheritance: -finn.custom\_op.fpgadataflow.matrixvectoractivation ------------------------------------------------------------ - -.. automodule:: finn.custom_op.fpgadataflow.matrixvectoractivation - :members: - :undoc-members: - :show-inheritance: - finn.custom\_op.fpgadataflow.streamingfifo ------------------------------------------------- diff --git a/docs/finn/source_code/finn.custom_op.rst b/docs/finn/source_code/finn.custom_op.rst index 20d90a7bb596d6ce5638d9b2d9bae8a5c7e5c723..cdbe957c713ef6916e4ed7baabe09135f71fdeef 100644 --- a/docs/finn/source_code/finn.custom_op.rst +++ b/docs/finn/source_code/finn.custom_op.rst @@ -9,6 +9,7 @@ Submodules :maxdepth: 2 finn.custom_op.fpgadataflow + qonnx.custom_op.channels_last qonnx.custom_op.general Custom Op Nodes diff --git a/docs/finn/source_code/finn.transformation.fpgadataflow.rst b/docs/finn/source_code/finn.transformation.fpgadataflow.rst index b1e7075bdcfb675a894f3e66b61d59117e4f078d..9f8ec079309f16daa022e14317ebddfd7758d639 100644 --- a/docs/finn/source_code/finn.transformation.fpgadataflow.rst +++ b/docs/finn/source_code/finn.transformation.fpgadataflow.rst @@ -62,6 +62,14 @@ finn.transformation.fpgadataflow.create\_stitched\_ip :undoc-members: :show-inheritance: +finn.transformation.fpgadataflow.derive\_characteristic +------------------------------------------------------------ + +.. automodule:: finn.transformation.fpgadataflow.derive_characteristic + :members: + :undoc-members: + :show-inheritance: + finn.transformation.fpgadataflow.externalize\_params ------------------------------------------------------------ @@ -103,6 +111,17 @@ finn.transformation.fpgadataflow.insert\_fifo :undoc-members: :show-inheritance: + +finn.transformation.fpgadataflow.insert\_hook +---------------------------------------------------- + +.. automodule:: finn.transformation.fpgadataflow.insert_hook + :members: + :undoc-members: + :show-inheritance: + + + finn.transformation.fpgadataflow.insert\_iodma ---------------------------------------------------- diff --git a/docs/finn/source_code/finn.transformation.rst b/docs/finn/source_code/finn.transformation.rst index 6a28eeedb2aa547ba80677864ae9fb8c6aa64097..f42b595a50ec90ef055e2818d66f4b2410c25594 100644 --- a/docs/finn/source_code/finn.transformation.rst +++ b/docs/finn/source_code/finn.transformation.rst @@ -20,7 +20,7 @@ Transformation Passes Base Class ---------- -.. automodule:: finn.transformation +.. automodule:: qonnx.transformation.base :members: :undoc-members: :show-inheritance: @@ -42,7 +42,7 @@ qonnx.transformation.bipolar\_to\_xnor :show-inheritance: qonnx.transformation.change\_3d\_tensors\_to\_4d ------------------------------------------------- +------------------------------------------------- .. automodule:: qonnx.transformation.change_3d_tensors_to_4d :members: @@ -57,8 +57,18 @@ qonnx.transformation.change\_datalayout :undoc-members: :show-inheritance: + +qonnx.transformation.channels\_last +-------------------------------------------- + +.. 
automodule:: qonnx.transformation.channels_last + :members: + :undoc-members: + :show-inheritance: + + qonnx.transformation.create\_generic\_partitions ------------------------------------------------- +------------------------------------------------- .. automodule:: qonnx.transformation.create_generic_partitions :members: @@ -171,13 +181,22 @@ qonnx.transformation.merge\_onnx\_models :show-inheritance: -finn.transformation.move\_reshape +qonnx.transformation.quant\_constant\_folding +---------------------------------------------- + +.. automodule:: qonnx.transformation.quant_constant_folding + :members: + :undoc-members: + :show-inheritance: + + +qonnx.transformation.rebalance\_conv ---------------------------------------- -.. automodule:: finn.transformation.move_reshape - :members: - :undoc-members: - :show-inheritance: +.. automodule:: qonnx.transformation.rebalance_conv + :members: + :undoc-members: + :show-inheritance: qonnx.transformation.remove ------------------------------------- @@ -186,3 +205,12 @@ qonnx.transformation.remove :members: :undoc-members: :show-inheritance: + + +finn.transformation.move\_reshape +---------------------------------------- + +.. automodule:: finn.transformation.move_reshape + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/finn/source_code/finn.util.rst b/docs/finn/source_code/finn.util.rst index 8dffa016327c3bbe50f21278c859c83556b2b213..7ba3b252abfa0086a8c0281eb9a792fb239d6ec3 100644 --- a/docs/finn/source_code/finn.util.rst +++ b/docs/finn/source_code/finn.util.rst @@ -14,6 +14,15 @@ qonnx.util.basic :show-inheritance: +qonnx.util.cleanup +---------------------- + +.. automodule:: qonnx.util.cleanup + :members: + :undoc-members: + :show-inheritance: + + qonnx.util.config -------------------- @@ -22,6 +31,40 @@ qonnx.util.config :undoc-members: :show-inheritance: +qonnx.util.exec\_qonnx +---------------------- + +.. automodule:: qonnx.util.exec_qonnx + :members: + :undoc-members: + :show-inheritance: + +qonnx.util.inference\_cost +-------------------------- + +.. automodule:: qonnx.util.inference_cost + :members: + :undoc-members: + :show-inheritance: + +qonnx.util.onnx +------------------- + +.. automodule:: qonnx.util.onnx + :members: + :undoc-members: + :show-inheritance: + + +qonnx.util.to\_channels\_last +------------------------------ + +.. automodule:: qonnx.util.to_channels_last + :members: + :undoc-members: + :show-inheritance: + + finn.util.basic ---------------------- @@ -64,6 +107,15 @@ finn.util.gdrive :undoc-members: :show-inheritance: +finn.util.hls +--------------- + +.. automodule:: finn.util.hls + :members: + :undoc-members: + :show-inheritance: + + finn.util.imagenet ----------------------------- @@ -72,14 +124,6 @@ finn.util.imagenet :undoc-members: :show-inheritance: -qonnx.util.onnx ---------------------- - -.. 
automodule:: qonnx.util.onnx - :members: - :undoc-members: - :show-inheritance: - finn.util.platforms -------------------- diff --git a/docs/finn/source_code/modules.rst b/docs/finn/source_code/modules.rst deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/docs/finn/source_code/qonnx.custom_op.channels_last.rst b/docs/finn/source_code/qonnx.custom_op.channels_last.rst new file mode 100644 index 0000000000000000000000000000000000000000..3ad10d94a6b34a99e2213994a75b0f063fd3d36f --- /dev/null +++ b/docs/finn/source_code/qonnx.custom_op.channels_last.rst @@ -0,0 +1,41 @@ +************************** +Custom Op - Channels Last +************************** + +Channels Last Custom Ops +========================= + +qonnx.custom\_op.channels\_last.base\_wrapped\_op +-------------------------------------------------- + +.. automodule:: qonnx.custom_op.channels_last.base_wrapped_op + :members: + :undoc-members: + :show-inheritance: + + +qonnx.custom\_op.channels\_last.batch\_normalization +------------------------------------------------------ + +.. automodule:: qonnx.custom_op.channels_last.batch_normalization + :members: + :undoc-members: + :show-inheritance: + + +qonnx.custom\_op.channels\_last.conv +-------------------------------------- + +.. automodule:: qonnx.custom_op.channels_last.conv + :members: + :undoc-members: + :show-inheritance: + + +qonnx.custom\_op.channels\_last.max\_pool +------------------------------------------ + +.. automodule:: qonnx.custom_op.channels_last.max_pool + :members: + :undoc-members: + :show-inheritance: diff --git a/fetch-repos.sh b/fetch-repos.sh index b0f6400ed142b203b1c9f6d7ea4ac6ababcf34d1..5e668e04499fcf825382dc2785a92dc01c0e7d88 100755 --- a/fetch-repos.sh +++ b/fetch-repos.sh @@ -27,7 +27,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
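The `QONNX_COMMIT` bump below pulls in, among other things, the channels-last support documented by the new pages above. A minimal sketch of driving that conversion from Python — the transform name follows the `qonnx.transformation.channels_last` module referenced in the docs, so treat it as illustrative rather than authoritative:

```python
# Sketch only: convert an ONNX model to the wrapped channels-last custom ops
# (Conv, MaxPool, BatchNormalization) documented in qonnx.custom_op.channels_last.
# The class name ConvertToChannelsLastAndClean and the filenames are assumptions.
from qonnx.core.modelwrapper import ModelWrapper
from qonnx.transformation.channels_last import ConvertToChannelsLastAndClean

model = ModelWrapper("model.onnx")  # placeholder input model
model = model.transform(ConvertToChannelsLastAndClean())
model.save("model_channels_last.onnx")
```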
-QONNX_COMMIT="f702b17cdb9d5e57f85f43a5d33890647e063de6" +QONNX_COMMIT="7d50273a4dcccb445fb06f57f6bedc17b3707b35" FINN_EXP_COMMIT="9cbd2787b5160e2b44e0e8164a0df1457dbd5366" BREVITAS_COMMIT="a5b71d6de1389d3e7db898fef72e014842670f03" PYVERILATOR_COMMIT="766e457465f5c0dd315490d7b9cc5d74f9a76f4f" diff --git a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb index a2747e3921dc8e5a8427b4d5d9b7f143a57b018f..28155d6f3eacd4dfd77aefbc73fc4ed3ef12f1dd 100644 --- a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb +++ b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb @@ -359,21 +359,21 @@ "fc_layers = model.get_nodes_by_op_type(\"MatrixVectorActivation\")\n", "# each tuple is (PE, SIMD, in_fifo_depth) for a layer\n", "folding = [\n", - " (16, 3, 128),\n", - " (32, 32, 128),\n", - " (16, 32, 128),\n", - " (16, 32, 128),\n", - " (4, 32, 81),\n", - " (1, 32, 2),\n", - " (1, 4, 2),\n", - " (1, 8, 128),\n", - " (5, 1, 3),\n", + " (16, 3, [128]),\n", + " (32, 32, [128]),\n", + " (16, 32, [128]),\n", + " (16, 32, [128]),\n", + " (4, 32, [81]),\n", + " (1, 32, [2]),\n", + " (1, 4, [2]),\n", + " (1, 8, [128]),\n", + " (5, 1, [3]),\n", "]\n", "for fcl, (pe, simd, ififodepth) in zip(fc_layers, folding):\n", " fcl_inst = getCustomOp(fcl)\n", " fcl_inst.set_nodeattr(\"PE\", pe)\n", " fcl_inst.set_nodeattr(\"SIMD\", simd)\n", - " fcl_inst.set_nodeattr(\"inFIFODepth\", ififodepth)\n", + " fcl_inst.set_nodeattr(\"inFIFODepths\", ififodepth)\n", "\n", "# use same SIMD values for the sliding window operators\n", "swg_layers = model.get_nodes_by_op_type(\"ConvolutionInputGenerator\")\n", diff --git a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb index a6f05df30925250df1704afb6f9ff9dc7dc17dc0..c4fc92b97c91d6b1dfadc41ac3c23d014bd9fada 100644 --- a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb +++ b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb @@ -559,17 +559,17 @@ "fc_layers = model.get_nodes_by_op_type(\"MatrixVectorActivation\")\n", "# (PE, SIMD, in_fifo_depth, out_fifo_depth, ramstyle) for each layer\n", "config = [\n", - " (16, 49, 16, 64, \"block\"),\n", - " (8, 8, 64, 64, \"auto\"),\n", - " (8, 8, 64, 64, \"auto\"),\n", - " (10, 8, 64, 10, \"distributed\"),\n", + " (16, 49, [16], [64], \"block\"),\n", + " (8, 8, [64], [64], \"auto\"),\n", + " (8, 8, [64], [64], \"auto\"),\n", + " (10, 8, [64], [10], \"distributed\"),\n", "]\n", "for fcl, (pe, simd, ififo, ofifo, ramstyle) in zip(fc_layers, config):\n", " fcl_inst = getCustomOp(fcl)\n", " fcl_inst.set_nodeattr(\"PE\", pe)\n", " fcl_inst.set_nodeattr(\"SIMD\", simd)\n", - " fcl_inst.set_nodeattr(\"inFIFODepth\", ififo)\n", - " fcl_inst.set_nodeattr(\"outFIFODepth\", ofifo)\n", + " fcl_inst.set_nodeattr(\"inFIFODepths\", ififo)\n", + " fcl_inst.set_nodeattr(\"outFIFODepths\", ofifo)\n", " fcl_inst.set_nodeattr(\"ram_style\", ramstyle)\n", " \n", "# set parallelism for input quantizer to be same as first layer's SIMD\n", @@ -590,7 +590,7 @@ "metadata": {}, "source": [ "Besides PE and SIMD three other node attributes are set. `ram_style` specifies how the weights are to be stored (BRAM, LUTRAM, and so on). It can be selected explicitly or with the option `auto` you can let Vivado decide.\n", - "`inFIFODepth` and `outFIFODepth` specifies the FIFO depths that is needed by the node from the surrounding FIFOs. 
These attributes are used in the transformation 'InsertFIFO' to insert the appropriate FIFOs between the nodes, which will be automatically called as part of the hardware build process.\n",
+    "`inFIFODepths` and `outFIFODepths` specify the FIFO depths that are needed by the node from the surrounding FIFOs. These attributes are used in the transformation 'InsertFIFO' to insert the appropriate FIFOs between the nodes, which will be automatically called as part of the hardware build process.\n",
     "\n",
     "In previous versions of FINN we had to call transformations to insert data width converters, FIFOs and `TLastMarker` manually at this step. This is no longer needed, as all this is taken care of by the `ZynqBuild` or `VitisBuild` transformations."
    ]
diff --git a/requirements.txt b/requirements.txt
index 9038a5e8170301421529e0b570482316e4fff20a..348b1afab9deca1547d40cb8d8c54a396befa65d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,6 @@ bitstring==3.1.7
 clize==4.1.1
 dataclasses-json==0.5.7
 docrep==0.2.7
-future==0.18.2
 gspread==3.6.0
 numpy==1.22.0
 onnx==1.11.0
@@ -10,6 +9,7 @@ onnxoptimizer
 onnxruntime==1.11.1
 pre-commit==2.9.2
 protobuf==3.20.2
+psutil==5.9.4
 pyscaffold==3.2.1
 scipy==1.5.2
 setupext-janitor>=1.1.2
diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py
index d3c4156d9b4ccf601d3eea348f6cb61c0d9a6e87..a38cb6e572d683871a924330742a1859b6fbe75d 100644
--- a/src/finn/builder/build_dataflow_config.py
+++ b/src/finn/builder/build_dataflow_config.py
@@ -253,12 +253,20 @@ class DataflowBuildConfig:
     #: for each FIFO.
     auto_fifo_depths: Optional[bool] = True

+    #: Whether FIFO nodes with depth larger than 32768 will be split.
+    #: Allows very large FIFOs to be configured via the folding_config_file.
+    split_large_fifos: Optional[bool] = False
+
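For context on how the new flag is consumed during a build, a hedged usage sketch follows. Only `split_large_fifos` and `auto_fifo_depths` come from this diff; the remaining fields follow the existing `DataflowBuildConfig` API and carry placeholder values.

```python
# Sketch only: enable automatic FIFO sizing plus the new FIFO splitting.
# Part number, output directory and model filename are placeholders.
from finn.builder.build_dataflow import build_dataflow_cfg
from finn.builder.build_dataflow_config import DataflowBuildConfig

cfg = DataflowBuildConfig(
    output_dir="build_out",        # placeholder output directory
    synth_clk_period_ns=10.0,      # 100 MHz target clock
    fpga_part="xc7z020clg400-1",   # placeholder FPGA part
    auto_fifo_depths=True,         # derive FIFO depths automatically
    split_large_fifos=True,        # new: decompose FIFOs deeper than 32768
    generate_outputs=[],
)
build_dataflow_cfg("model.onnx", cfg)  # placeholder input model
```

     #: When `auto_fifo_depths = True`, select which method will be used for
     #: setting the FIFO sizes.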
auto_fifo_strategy: Optional[ AutoFIFOSizingMethod ] = AutoFIFOSizingMethod.LARGEFIFO_RTLSIM + #: Avoid using C++ rtlsim for auto FIFO sizing and rtlsim throughput test + #: if set to True, always using Python instead + force_python_rtlsim: Optional[bool] = False + #: Memory resource type for large FIFOs #: Only relevant when `auto_fifo_depths = True` large_fifo_mem_style: Optional[LargeFIFOMemStyle] = LargeFIFOMemStyle.AUTO diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 5da608c27def8136f9ad11f62b4707452eac3120..e9ad39961410a283865f3e4520a21353fbdf1cae 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -30,6 +30,7 @@ import json import numpy as np import os import shutil +import warnings from copy import deepcopy from distutils.dir_util import copy_tree from qonnx.core.modelwrapper import ModelWrapper @@ -98,6 +99,7 @@ from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.fpgadataflow.set_fifo_depths import ( InsertAndSetFIFODepths, RemoveShallowFIFOs, + SplitLargeFIFOs, ) from finn.transformation.fpgadataflow.set_folding import SetFolding from finn.transformation.fpgadataflow.synth_ooc import SynthOutOfContext @@ -113,6 +115,7 @@ from finn.util.basic import ( get_rtlsim_trace_depth, pyverilate_get_liveness_threshold_cycles, ) +from finn.util.pyverilator import verilator_fifosim from finn.util.test import execute_parent @@ -531,11 +534,20 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) elif cfg.auto_fifo_strategy == "largefifo_rtlsim": + # multi-in/out streams currently not supported in our C++ verilator driver + model_multi_io = len(model.graph.input) > 1 or len(model.graph.output) > 1 + force_python_sim = model_multi_io or cfg.force_python_rtlsim + if model_multi_io: + warnings.warn( + "Multi-in/out streams currently not supported " + + "in FINN C++ verilator driver, falling back to Python" + ) model = model.transform( InsertAndSetFIFODepths( cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period(), vivado_ram_style=cfg.large_fifo_mem_style, + force_python_sim=force_python_sim, ) ) else: @@ -551,8 +563,6 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(GiveReadableTensorNames()) if cfg.folding_config_file is not None: model = model.transform(ApplyConfig(cfg.folding_config_file)) - # remove any shallow FIFOs - model = model.transform(RemoveShallowFIFOs()) # extract the final configuration and save it as json hw_attrs = [ @@ -569,6 +579,13 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): model, cfg.output_dir + "/final_hw_config.json", hw_attrs ) + # perform FIFO splitting and shallow FIFO removal only after the final config + # json file has been written. otherwise, since these transforms may add/remove + # FIFOs, we get name mismatch problems when trying to reuse the final config. + if cfg.split_large_fifos: + model = model.transform(SplitLargeFIFOs()) + model = model.transform(RemoveShallowFIFOs()) + # after FIFOs are ready to go, call PrepareIP and HLSSynthIP again # this will only run for the new nodes (e.g. 
FIFOs and DWCs) model = model.transform( @@ -632,20 +649,48 @@ def step_measure_rtlsim_performance(model: ModelWrapper, cfg: DataflowBuildConfi # prepare ip-stitched rtlsim rtlsim_model = deepcopy(model) rtlsim_model = prepare_for_stitched_ip_rtlsim(rtlsim_model, cfg) - # run with single input to get latency - orig_rtlsim_trace_depth = get_rtlsim_trace_depth() + # multi-in/out streams currently not supported in our C++ verilator driver + model_multi_io = ( + len(rtlsim_model.graph.input) > 1 or len(rtlsim_model.graph.output) > 1 + ) + force_python_rtlsim = cfg.force_python_rtlsim or model_multi_io + if model_multi_io: + warnings.warn( + "Multi-in/out streams currently not supported " + + "in FINN C++ verilator driver, falling back to Python" + ) rtlsim_bs = int(cfg.rtlsim_batch_size) - assert rtlsim_bs > 0, "rtlsim batch size must be >0" - if cfg.verify_save_rtlsim_waveforms: - # set depth to 3 for layer-by-layer visibility - os.environ["RTLSIM_TRACE_DEPTH"] = "3" + if force_python_rtlsim: + # run with single input to get latency + orig_rtlsim_trace_depth = get_rtlsim_trace_depth() + assert rtlsim_bs > 0, "rtlsim batch size must be >0" + if cfg.verify_save_rtlsim_waveforms: + # set depth to 3 for layer-by-layer visibility + os.environ["RTLSIM_TRACE_DEPTH"] = "3" + rtlsim_model.set_metadata_prop( + "rtlsim_trace", + "%s/rtlsim_perf_batch_%d.vcd" % (report_dir, rtlsim_bs), + ) rtlsim_model.set_metadata_prop( - "rtlsim_trace", "%s/rtlsim_perf_batch_%d.vcd" % (report_dir, rtlsim_bs) + "extra_verilator_args", str(["-CFLAGS", "-O3"]) ) - rtlsim_model.set_metadata_prop("extra_verilator_args", str(["-CFLAGS", "-O3"])) - rtlsim_perf_dict = throughput_test_rtlsim(rtlsim_model, rtlsim_bs) - rtlsim_latency = rtlsim_perf_dict["cycles"] - rtlsim_perf_dict["latency_cycles"] = rtlsim_latency + rtlsim_perf_dict = throughput_test_rtlsim(rtlsim_model, rtlsim_bs) + rtlsim_latency = rtlsim_perf_dict["cycles"] + rtlsim_perf_dict["latency_cycles"] = rtlsim_latency + else: + rtlsim_perf_dict = verilator_fifosim(model, rtlsim_bs) + # keep keys consistent between the Python and C++-styles + cycles = rtlsim_perf_dict["cycles"] + clk_ns = float(model.get_metadata_prop("clk_ns")) + fclk_mhz = 1 / (clk_ns * 0.001) + runtime_s = (cycles * clk_ns) * (10**-9) + rtlsim_perf_dict["runtime[ms]"] = runtime_s * 1000 + rtlsim_perf_dict["throughput[images/s]"] = rtlsim_bs / runtime_s + rtlsim_perf_dict["fclk[mhz]"] = fclk_mhz + for (key, val) in rtlsim_perf_dict.items(): + if "max_count" in key: + del rtlsim_perf_dict[key] + with open(report_dir + "/rtlsim_performance.json", "w") as f: json.dump(rtlsim_perf_dict, f, indent=2) if cfg.verify_save_rtlsim_waveforms: diff --git a/src/finn/custom_op/fpgadataflow/eltwise.py b/src/finn/custom_op/fpgadataflow/eltwise.py index d6284750c73026c09fb7986ffc2517ed9ae3b153..68ed6546c741277bd8e962b6e80eda083cedba9c 100644 --- a/src/finn/custom_op/fpgadataflow/eltwise.py +++ b/src/finn/custom_op/fpgadataflow/eltwise.py @@ -398,7 +398,7 @@ class StreamingEltwise(HLSCustomOp): "StreamingEltwise", self.get_nodeattr("NumChannels"), self.get_nodeattr("PE"), - self.get_number_output_values(), + int(np.prod(self.get_folded_output_shape()[:-2])), slice_in0, slice_in1, slice_out, diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py index f307be95c30d822dfc517e4c331bd8d82d727997..d1326607aa0dc5c34eef105b2ceb8ed86c1a0458 100644 --- a/src/finn/custom_op/fpgadataflow/hlscustomop.py +++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py @@ -43,6 +43,7 @@ 
from finn.util.basic import ( pyverilate_get_liveness_threshold_cycles, ) from finn.util.hls import CallHLS +from finn.util.pyverilator import make_single_source_file from . import templates @@ -174,7 +175,7 @@ class HLSCustomOp(CustomOp): # default impl only returns the HLS verilog codegen dir return [verilog_path] - def get_all_verilog_filenames(self): + def get_all_verilog_filenames(self, abspath=False): "Return list of all Verilog files used for this node." verilog_files = [] @@ -182,7 +183,10 @@ class HLSCustomOp(CustomOp): for verilog_path in verilog_paths: for f in os.listdir(verilog_path): if f.endswith(".v"): - verilog_files += [f] + if abspath: + verilog_files += [verilog_path + "/" + f] + else: + verilog_files += [f] return verilog_files def prepare_rtlsim(self): @@ -192,13 +196,18 @@ class HLSCustomOp(CustomOp): if PyVerilator is None: raise ImportError("Installation of PyVerilator is required.") - verilog_paths = self.get_all_verilog_paths() - verilog_files = self.get_all_verilog_filenames() + + verilog_files = self.get_all_verilog_filenames(abspath=True) + single_src_dir = make_build_dir("rtlsim_" + self.onnx_node.name + "_") + tmp_build_dir = make_build_dir("pyverilator_" + self.onnx_node.name + "_") + target_file = single_src_dir + "/" + self.get_verilog_top_module_name() + ".v" + make_single_source_file(verilog_files, target_file) + # build the Verilator emu library sim = PyVerilator.build( - verilog_files, - build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), - verilog_path=verilog_paths, + self.get_verilog_top_module_name() + ".v", + build_dir=tmp_build_dir, + verilog_path=[single_src_dir], trace_depth=get_rtlsim_trace_depth(), top_module_name=self.get_verilog_top_module_name(), ) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index 69763fbea8a6079c7b0a61e14da37a3af69dfdfb..72128fda4cfe23db4858fe3ffe80a755733954cc 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -576,6 +576,10 @@ class MatrixVectorActivation(HLSCustomOp): def minimize_accumulator_width(self, model): weights = model.get_initializer(self.onnx_node.input[1]) + # since in the calculation the values of the weight matrix are used, + # for the bipolar case they need to be converted to bipolar + if self.get_nodeattr("binaryXnorMode"): + weights = 2 * weights - 1 if len(self.onnx_node.input) > 2: thresholds = model.get_initializer(self.onnx_node.input[2]) else: @@ -702,10 +706,12 @@ class MatrixVectorActivation(HLSCustomOp): of weights. Arguments: + * weights : numpy array with weights to be put into the file * weight_file_mode : one of {hls_header, decoupled_verilog_dat, decoupled_runtime} * weight_file_name : filename for the weight file to be generated + """ # convert weights into hlslib-compatible format weight_tensor = self.get_hls_compatible_weight_tensor(weights) @@ -1227,20 +1233,6 @@ class MatrixVectorActivation(HLSCustomOp): self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() ) - # TODO can we deprecate this entirely? 
this looks like legacy code - # that does not really serve a purpose - FIFO sizes are not typically - # allocated at this point; at best they are set to 2 as the default - in_fifo_depth = 2 - out_fifo_depth = 2 - # insert depth pragmas only if specified - if in_fifo_depth != 0: - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS stream depth=%d variable=in0" % in_fifo_depth - ) - if out_fifo_depth != 0: - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS stream depth=%d variable=out" % out_fifo_depth - ) self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE ap_ctrl_none port=return" ) diff --git a/src/finn/custom_op/fpgadataflow/pool_batch.py b/src/finn/custom_op/fpgadataflow/pool_batch.py index 91cd537baeff0c7666bbf3596b46a7412ec2fe4e..813f13e504eae181f4398eccbe40ad66b6e3bf16 100644 --- a/src/finn/custom_op/fpgadataflow/pool_batch.py +++ b/src/finn/custom_op/fpgadataflow/pool_batch.py @@ -42,12 +42,13 @@ class Pool_Batch(HLSCustomOp): Output shape (BatchSize,OutImgDim,OutImgDim,Channels) Notes: - # The input shape was chosen to be compatible with im2col (only true when there - is not folding). - # The actual data layout produced by the hlslib kernels is different - for depthwise ops. - * depthwise SWG: (1, OFMDim, OFMDim, IFMChannels/PE, K, K, PE) + * The input shape was chosen to be compatible with im2col (only true when there + is not folding). + * The actual data layout produced by the hlslib kernels is different + for depthwise ops. + + * depthwise SWG: (1, OFMDim, OFMDim, IFMChannels/PE, K, K, PE) Channels can be folded using PE (SIMD from the input perspective) """ diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py index 40d016de43820a37e8c7894a3e1f30146c667e59..c71e8ffe323b1f2bb459a0f982e63d881a7ae58d 100644 --- a/src/finn/custom_op/fpgadataflow/streamingfifo.py +++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py @@ -46,32 +46,34 @@ class StreamingFIFO(HLSCustomOp): self.strm_fifo_wrapper = templates.strm_fifo_wrapper def get_nodeattr_types(self): - my_attrs = { - # FIFO depth - "depth": ("i", True, 0), - # folded shape of input/output - "folded_shape": ("ints", True, []), - # FINN DataTypes for inputs/outputs - "dataType": ("s", True, ""), - # Toggle between hls or IPI implementation - # rtl - use the hls generated IP during stitching - # vivado - use the AXI Infrastructure FIFO - "impl_style": ("s", False, "rtl", {"rtl", "vivado"}), - # FPGA resource type for FIFOs when impl_style is vivado - # auto -- let Vivado decide - # block -- use BRAM - # distributed -- use LUTRAM - # ultra -- use URAM (on UltraScale+) - "ram_style": ( - "s", - False, - "auto", - {"auto", "block", "distributed", "ultra"}, - ), - # whether depth monitoring is enabled (impl_style=rtl only) - "depth_monitor": ("i", False, 0), - } - my_attrs.update(super().get_nodeattr_types()) + my_attrs = super().get_nodeattr_types() + my_attrs.update( + { + # FIFO depth + "depth": ("i", True, 0), + # folded shape of input/output + "folded_shape": ("ints", True, []), + # FINN DataTypes for inputs/outputs + "dataType": ("s", True, ""), + # Toggle between hls or IPI implementation + # rtl - use the hls generated IP during stitching + # vivado - use the AXI Infrastructure FIFO + "impl_style": ("s", False, "rtl", {"rtl", "vivado"}), + # FPGA resource type for FIFOs when impl_style is vivado + # auto -- let Vivado decide + # block -- use BRAM + # distributed -- use LUTRAM + # ultra -- use URAM (on UltraScale+) + "ram_style": ( + "s", + False, + 
"auto", + {"auto", "block", "distributed", "ultra"}, + ), + # whether depth monitoring is enabled (impl_style=rtl only) + "depth_monitor": ("i", False, 0), + } + ) return my_attrs @@ -256,6 +258,12 @@ class StreamingFIFO(HLSCustomOp): in_width = folded_shape[-1] * dtype.bitwidth() return in_width + def get_input_datatype(self, ind=0): + return DataType[self.get_nodeattr("dataType")] + + def get_output_datatype(self, ind=0): + return DataType[self.get_nodeattr("dataType")] + def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") node = self.onnx_node diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py index f2cc64668d62ef15446772309577e9b15a378ef5..d9745acf63c4685b3369ac379abde0a6c5a3f157 100644 --- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py @@ -354,10 +354,12 @@ class Thresholding_Batch(HLSCustomOp): run-time reconfig of weights. Arguments: + * weights : numpy array with weights to be put into the file * weight_file_mode : one of {hls_header, decoupled_verilog_dat, decoupled_runtime} * weight_file_name : filename for the weight file to be generated + """ threshold_tensor = self.get_hls_compatible_threshold_tensor(weights) tdt = self.get_weight_datatype() @@ -600,13 +602,17 @@ class Thresholding_Batch(HLSCustomOp): # TODO check and add whatever missing def defines(self, var): + numReps = 1 numInputVectors = list(self.get_nodeattr("numInputVectors")) - numReps = int(np.prod(numInputVectors)) + total_spatial_size = int(np.prod(numInputVectors)) + self.code_gen_dict["$DEFINES$"] = [ - """#define NumChannels1 {}\n #define PE1 {}\n #define numReps {}""".format( + """#define NumChannels1 {}\n #define PE1 {}\n #define numReps {}\n + #define ImgDim1 {}""".format( self.get_nodeattr("NumChannels"), self.get_nodeattr("PE"), numReps, + total_spatial_size, ) ] if self.get_nodeattr("mem_mode") == "decoupled": @@ -647,7 +653,7 @@ class Thresholding_Batch(HLSCustomOp): npy_in = "%s/thresholds.npy" % code_gen_dir self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", weights, false, numReps);' + 'npy2apintstream<%s, %s, %d, %s>("%s", weights, false, ImgDim1);' % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) ) @@ -669,18 +675,13 @@ class Thresholding_Batch(HLSCustomOp): def docompute(self): tmpl_args = self.get_template_param_values() - # TODO: why put some template parameters into defines and not others? - # should ImgDim be defined or just filled in here like we do now? 
node = self.onnx_node - inp_vecs = self.get_nodeattr("numInputVectors") - total_spatial_size = int(np.prod(inp_vecs)) mem_mode = self.get_nodeattr("mem_mode") if mem_mode == "const": self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}<{}, NumChannels1, PE1, {}, {}> + """{}<ImgDim1, NumChannels1, PE1, {}, {}> (in0, out, threshs, numReps);""".format( node.op_type, - total_spatial_size, tmpl_args["TSrcI"], tmpl_args["TDstI"], ) @@ -690,10 +691,9 @@ class Thresholding_Batch(HLSCustomOp): # - for cppsim the repetition comes from the threshold stream reader+input # - for synth the unit runs continuously anyway (ap_ctrl_none) self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}<{}, NumChannels1, PE1, {}, {}, ActVal1, ThresType1, NumSteps1> - (in0, out, weights, 1);""".format( + """{}<ImgDim1, NumChannels1, PE1, {}, {}, ActVal1, ThresType1, NumSteps1> + (in0, out, weights, numReps);""".format( "Thresholding_Stream_Batch", - total_spatial_size, tmpl_args["TSrcI"], tmpl_args["TDstI"], ) diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py index 0375bdea68f6c10eda8a3c5f375bbb14bc9a2be5..d5e29ca22acf89440c3c3a66101bec89d4a66d46 100644 --- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py @@ -410,10 +410,12 @@ class VectorVectorActivation(HLSCustomOp): of weights. Arguments: + * weights : numpy array with weights to be put into the file * weight_file_mode : one of {hls_header, decoupled_verilog_dat, decoupled_runtime} * weight_file_name : filename for the weight file to be generated + """ # convert weights into hlslib-compatible format weight_tensor = self.get_hls_compatible_weight_tensor(weights) @@ -901,20 +903,6 @@ class VectorVectorActivation(HLSCustomOp): self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() ) - # TODO can we deprecate this entirely? this looks like legacy code - # that does not really serve a purpose - FIFO sizes are not typically - # allocated at this point; at best they are set to 2 as the default - in_fifo_depth = 2 - out_fifo_depth = 2 - # insert depth pragmas only if specified - if in_fifo_depth != 0: - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS stream depth=%d variable=in0" % in_fifo_depth - ) - if out_fifo_depth != 0: - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS stream depth=%d variable=out" % out_fifo_depth - ) self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE ap_ctrl_none port=return" ) diff --git a/src/finn/qnn-data/cpp/verilator_fifosim.cpp b/src/finn/qnn-data/cpp/verilator_fifosim.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d0aca9efe77806d31192f35a1d751b32116218f8 --- /dev/null +++ b/src/finn/qnn-data/cpp/verilator_fifosim.cpp @@ -0,0 +1,197 @@ +/* Copyright (C) 2022, Advanced Micro Devices, Inc. +All rights reserved. +# +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +# +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +# +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. 
+# +* Neither the name of FINN nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. +# +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ + +#include <iostream> +#include <fstream> +#include <cstddef> +#include <chrono> +#include "verilated.h" +#include "verilated_vcd_c.h" +#include "Vfinn_design_wrapper.h" + +#ifdef DEBUG +#define TRACE(x) x +#else +#define TRACE(x) ; +#endif + +using namespace std; + +Vfinn_design_wrapper * top; + +// code taken from pyverilator_wrapper.cpp generated by PyVerilator + +// this is required by verilator for verilog designs using $time +// main_time is incremented in eval +double main_time = 0; + +double sc_time_stamp() { +return main_time; +} +// function definitions +// helper functions for basic verilator tasks +extern "C" { //Open an extern C closed below +Vfinn_design_wrapper* construct() { + Verilated::commandArgs(0, (const char**) nullptr); + TRACE(Verilated::traceEverOn(true)); + Vfinn_design_wrapper* top = new Vfinn_design_wrapper(); + return top; +} +int eval(Vfinn_design_wrapper* top) { + top->eval(); + main_time++; + return 0; +} +int destruct(Vfinn_design_wrapper* top) { + if (top != nullptr) { + delete top; + top = nullptr; + } + return 0; +} + +TRACE( +VerilatedVcdC* tfp; +VerilatedVcdC* start_vcd_trace(Vfinn_design_wrapper* top, const char* filename) { + VerilatedVcdC* tfp = new VerilatedVcdC; + top->trace(tfp, 99); + tfp->open(filename); + return tfp; +} +int add_to_vcd_trace(VerilatedVcdC* tfp, int time) { + tfp->dump(time); + return 0; +} +int flush_vcd_trace(VerilatedVcdC* tfp) { + tfp->flush(); + return 0; +} +int stop_vcd_trace(VerilatedVcdC* tfp) { + tfp->close(); + return 0; +} +) + +} + +// end of code taken from pyverilator_wrapper.cpp generated by PyVerilator + +inline void toggle_clk() { + eval(top); + top->ap_clk = 1; + TRACE(add_to_vcd_trace(tfp, main_time)); + eval(top); + top->ap_clk = 0; + TRACE(add_to_vcd_trace(tfp, main_time)); +} + + +void reset() { + top->ap_rst_n = 0; + for(unsigned i = 0; i < 10; i++) { + toggle_clk(); + } + top->ap_rst_n = 1; +} + +int main(int argc, char *argv[]) { + top = construct(); + TRACE(tfp = start_vcd_trace(top, "trace.vcd")); + unsigned n_iters_per_input = @ITERS_PER_INPUT@; + unsigned n_iters_per_output = @ITERS_PER_OUTPUT@; + unsigned n_inputs = @N_INPUTS@; + unsigned max_iters = @MAX_ITERS@; + + reset(); + + top->m_axis_0_tready = 1; + top->s_axis_0_tvalid = 1; + + unsigned n_in_txns = 0, n_out_txns = 0, iters = 0, last_output_at = 0; + unsigned latency = 0; + + bool exit_criterion = false; + + cout << "Simulation starting" << endl; + cout << "Number of inputs to write " << n_iters_per_input * n_inputs << endl; + cout << "Number of outputs to expect " << n_iters_per_output * n_inputs << 
endl; + cout << "No-output timeout clock cycles " << max_iters << endl; + + chrono::steady_clock::time_point begin = chrono::steady_clock::now(); + + while(!exit_criterion) { + toggle_clk(); + iters++; + if(iters % 1000 == 0) { + cout << "Elapsed iters " << iters << " inps " << n_in_txns << " outs " << n_out_txns << endl; + chrono::steady_clock::time_point end = chrono::steady_clock::now(); + cout << "Elapsed since last report = " << chrono::duration_cast<chrono::seconds>(end - begin).count() << "[s]" << endl; + begin = end; + } + if(top->s_axis_0_tready == 1 && top->s_axis_0_tvalid == 1) { + n_in_txns++; + if(n_in_txns == n_iters_per_input * n_inputs) { + top->s_axis_0_tvalid = 0; + cout << "All inputs written at cycle " << iters << endl; + } + } + if(top->m_axis_0_tvalid == 1) { + n_out_txns++; + last_output_at = iters; + if(n_out_txns == n_iters_per_output) { + latency = iters; + } + } + + exit_criterion = ((n_in_txns >= n_iters_per_input * n_inputs) && (n_out_txns >= n_iters_per_output * n_inputs)) || ((iters-last_output_at) > max_iters); + } + + TRACE(flush_vcd_trace(tfp)); + TRACE(stop_vcd_trace(tfp)); + + cout << "Simulation finished" << endl; + cout << "Number of inputs consumed " << n_in_txns << endl; + cout << "Number of outputs produced " << n_out_txns << endl; + cout << "Number of clock cycles " << iters << endl; + + ofstream results_file; + results_file.open("results.txt", ios::out | ios::trunc); + results_file << "N_IN_TXNS" << "\t" << n_in_txns << endl; + results_file << "N_OUT_TXNS" << "\t" << n_out_txns << endl; + results_file << "cycles" << "\t" << iters << endl; + results_file << "N" << "\t" << n_inputs << endl; + results_file << "latency_cycles" << "\t" << latency << endl; +@FIFO_DEPTH_LOGGING@ + results_file.close(); + + + + destruct(top); + + return 0; +} diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py index 7e4ab34af79c52a08e737f57b2fc8f017940bcf5..525af7ea920e1c8809ce9cd53e628dd756cfdad4 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py @@ -1165,10 +1165,16 @@ class InferAddStreamsLayer(Transformation): result = node.output[0] in0_shape = model.get_tensor_shape(in0) in1_shape = model.get_tensor_shape(in1) + in0_static = not (model.get_initializer(in0) is None) + in1_static = not (model.get_initializer(in1) is None) # skip if different shapes on inputs if in0_shape != in1_shape: continue + # skip if any of inputs have initializers + # (this node is meant for adding two dynamic streams) + if in0_static or in1_static: + continue idt0 = model.get_tensor_datatype(in0) idt1 = model.get_tensor_datatype(in1) @@ -1694,6 +1700,10 @@ class InferConcatLayer(Transformation): ) if not dt_coherent: continue + # skip conversion if any inputs are static + all_static = all([model.get_initializer(x) is None for x in node.input]) + if not all_static: + continue # skip conversion if inputs are not integers if not dt0.is_integer(): continue @@ -1739,10 +1749,16 @@ class InferStreamingEltwise(Transformation): result = node.output[0] in0_shape = model.get_tensor_shape(in0) in1_shape = model.get_tensor_shape(in1) + in0_static = not (model.get_initializer(in0) is None) + in1_static = not (model.get_initializer(in1) is None) # skip if different shapes on inputs if in0_shape != in1_shape: continue + # skip if any of inputs have initializers + # (this node is meant for two dynamic streams) + if in0_static or 
in1_static: + continue idt0 = model.get_tensor_datatype(in0) idt1 = model.get_tensor_datatype(in1) diff --git a/src/finn/transformation/fpgadataflow/derive_characteristic.py b/src/finn/transformation/fpgadataflow/derive_characteristic.py index 822679721036c7832241db4642911ff804fb9dff..f783f7ae711739cf4e011315c6714ad95d3c7919 100644 --- a/src/finn/transformation/fpgadataflow/derive_characteristic.py +++ b/src/finn/transformation/fpgadataflow/derive_characteristic.py @@ -127,7 +127,7 @@ class DeriveCharacteristic(NodeLocalTransformation): class DeriveFIFOSizes(NodeLocalTransformation): """Prerequisite: DeriveCharacteristic already called on graph. For each node in the graph, use the accumulated I/O characteristic function - to perform FIFO sizing, setting the in/outFIFODepth attributes of HLSCustomOp + to perform FIFO sizing, setting the in/outFIFODepths attributes of HLSCustomOp nodes. * num_workers (int or None) number of parallel workers, see documentation in @@ -178,7 +178,7 @@ class DeriveFIFOSizes(NodeLocalTransformation): fifo_depth = int((prod_chrc_part - cons_chrc_part).max()) out_fifo_depths.append(fifo_depth) # set output FIFO depth for this (producing) node - # InsertFIFO looks at the max of (outFIFODepth, inFIFODepth) + # InsertFIFO looks at the max of (outFIFODepths, inFIFODepths) # for each tensor prod.set_nodeattr("outFIFODepths", out_fifo_depths) diff --git a/src/finn/transformation/fpgadataflow/floorplan.py b/src/finn/transformation/fpgadataflow/floorplan.py index 67920172231e685a4f5dd72f037f64fe6baf8449..549b94d9f287721aac26afd4d4d832e48adadb84 100644 --- a/src/finn/transformation/fpgadataflow/floorplan.py +++ b/src/finn/transformation/fpgadataflow/floorplan.py @@ -151,6 +151,7 @@ class Floorplan(Transformation): node_inst.set_nodeattr("partition_id", partition_cnt) partition_cnt += 1 continue + elif not ( node.op_type == "MatrixVectorActivation" and node_inst.get_nodeattr("mem_mode") is not None @@ -165,9 +166,17 @@ class Floorplan(Transformation): pre_inst = getCustomOp(pre_node) pre_slr = pre_inst.get_nodeattr("slr") if node_slr == pre_slr: - partition_id = pre_inst.get_nodeattr("partition_id") - node_inst.set_nodeattr("partition_id", partition_id) - break + axilite_intf_name = pre_inst.get_verilog_top_module_intf_names()[ + "axilite" + ] + if len(axilite_intf_name) != 0: + node_inst.set_nodeattr("partition_id", partition_cnt) + partition_cnt += 1 + else: + partition_id = pre_inst.get_nodeattr("partition_id") + node_inst.set_nodeattr("partition_id", partition_id) + break + else: # no matching, new partition node_inst.set_nodeattr("partition_id", partition_cnt) diff --git a/src/finn/transformation/fpgadataflow/hlssynth_ip.py b/src/finn/transformation/fpgadataflow/hlssynth_ip.py index 1fede0667888ee9059cfb2e7f5db00b6bb3f4259..c091dbd5edc675234686b28048c004b26c3fc131 100644 --- a/src/finn/transformation/fpgadataflow/hlssynth_ip.py +++ b/src/finn/transformation/fpgadataflow/hlssynth_ip.py @@ -64,7 +64,11 @@ class HLSSynthIP(NodeLocalTransformation): ), """Node attribute "code_gen_dir_ipgen" is empty. 
Please run transformation PrepareIP first.""" - if not os.path.isdir(inst.get_nodeattr("ipgen_path")): + if not os.path.isdir( + inst.get_nodeattr("ipgen_path") + ) or not inst.get_nodeattr("code_gen_dir_ipgen") in inst.get_nodeattr( + "ipgen_path" + ): # call the compilation function for this node inst.ipgen_singlenode_code() else: diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py index 79bd717a5d96e7a9839740d73254db53e5133e13..0546643d1220603d40651c45a0c4032dcf5cfaaf 100644 --- a/src/finn/transformation/fpgadataflow/insert_fifo.py +++ b/src/finn/transformation/fpgadataflow/insert_fifo.py @@ -67,17 +67,19 @@ class InsertFIFO(Transformation): between fpgadataflow nodes. Takes the setting for the depth from the surrounding nodes by extracting - node attribute 'outFIFODepth' of the previous and node attribute 'inFIFODepth' + node attribute 'outFIFODepths' of the previous and node attribute 'inFIFODepths' of the subsequent node. max() of these two values sets the FIFO depth. Constructor arguments: - - max_qsrl_depth : FIFOs deeper than this will use Vivado IP instead of - Verilog FIFOs (Q_srl.v) - - vivado_ram_style : the StreamingFIFO.ram_style attribute to be used for - large FIFOs implemented by Vivado - - create_shallow_fifos : Normally, shallow-depth (<=2) FIFOs won't be created since - HLS streaming interfaces already have a degree of buffering. - Override with this parameter. + + :parameter max_qsrl_depth: FIFOs deeper than this will use Vivado IP + instead of Verilog FIFOs (Q_srl.v) + :parameter vivado_ram_style: the StreamingFIFO.ram_style attribute + to be used for large FIFOs implemented by Vivado + :parameter create_shallow_fifos: Normally, shallow-depth (<=2) FIFOs + won't be created since HLS streaming interfaces + already have a degree of buffering. + Override with this parameter. The other node attributes necessary to create a FIFO node are taken from the @@ -128,8 +130,8 @@ class InsertFIFO(Transformation): folded output shape of the second node. 
A streaming fifo can't be implemented in between these nodes.""" - # check if outFIFOdepth attribute of first node - # and inFIFOdepth attribute of consumer node is equal + # check if outFIFOdepths attribute of first node + # and inFIFOdepths attribute of consumer node is equal n0_depth = n0.get_nodeattr("outFIFODepths")[idx_out] n1_depth = n1.get_nodeattr("inFIFODepths")[idx_inp] diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py index f715aaeffb6d4d00f2e14c5fb25ec931443d5d97..9ac1000468d72c49a3d6d19556dd8b96fb5fe7a4 100644 --- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py +++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py @@ -29,10 +29,16 @@ import math import numpy as np import warnings +from onnx import TensorProto, helper from pyverilator.util.axi_utils import reset_rtlsim, toggle_clk +from qonnx.core.datatype import DataType from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation -from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames +from qonnx.transformation.general import ( + GiveReadableTensorNames, + GiveUniqueNodeNames, + SortGraph, +) from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles @@ -42,7 +48,7 @@ from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.util.fpgadataflow import is_fpgadataflow_node -from finn.util.pyverilator import pyverilate_stitched_ip +from finn.util.pyverilator import pyverilate_stitched_ip, verilator_fifosim def reset_implementation(node): @@ -72,8 +78,9 @@ def optimize_depth(depth): # Q_srl FIFOs do not benefit from size < 32 # add some slack return 32 - # round to nearest power of two for Vivado IP FIFO implementation - return int(2 ** math.ceil(math.log2(depth))) + # otherwise leave as is + # will be rounded to nearest power of two for Vivado-style FIFO + return int(depth) class RemoveShallowFIFOs(Transformation): @@ -125,14 +132,17 @@ class CapConvolutionFIFODepths(Transformation): constructor flag is set. Constructor arguments: - - max_qsrl_depth : FIFOs deeper than this will use Vivado IP instead of - Verilog FIFOs (Q_srl.v) + + :parameter max_qsrl_depth: FIFOs deeper than this will use Vivado IP + instead of Verilog FIFOs (Q_srl.v) Assumed input graph properties: + - all nodes are fpgadataflow nodes - FIFOs inserted with InsertAndSetFIFODepths Output: + - graph with smaller-depth FIFOs for convolutions Background: @@ -188,22 +198,25 @@ class InsertAndSetFIFODepths(Transformation): throughput in the created accelerator. 
Constructor arguments: - - clk_ns : clock period (used for IP preparation) - - max_qsrl_depth : FIFOs deeper than this will use Vivado IP instead of - Verilog FIFOs (Q_srl.v) - - max_depth : how deep the "max"-sized FIFOs initially inserted will be - if set to None, use the tensor size as the depth - - swg_exception : call CapConvolutionFIFODepths to make convolution FIFOs - smaller where appropriate - - vivado_ram_style : the StreamingFIFO.ram_style attribute to be used for - large FIFOs implemented by Vivado afterwards + + :parameter clk_ns: clock period (used for IP preparation) + :parameter max_qsrl_depth: FIFOs deeper than this will use Vivado IP + instead of Verilog FIFOs (Q_srl.v) + :parameter max_depth: how deep the "max"-sized FIFOs initially inserted + will be. If set to None, use the tensor size as the depth + :parameter swg_exception: call CapConvolutionFIFODepths to make convolution FIFOs + smaller where appropriate + :parameter vivado_ram_style: the StreamingFIFO.ram_style attribute to be used + for large FIFOs implemented by Vivado afterwards Assumed input graph properties: + - all nodes are fpgadataflow nodes - no FIFOs inserted, - - (inFIFODepth/outFIFODepth attrs will be ignored) + - (inFIFODepths/outFIFODepths attrs will be ignored) Output: + - graph with appropriate-depth FIFOs inserted Background: @@ -211,12 +224,14 @@ class InsertAndSetFIFODepths(Transformation): necessary to insert FIFOs between them to prevent stalls due to bursty behavior. The sizes of those FIFOs are hard to predict analytically, so we do the following: + - insert deep (=tensor size) FIFOs between all fpgadataflow nodes - create stitched design - run through rtlsim with stream of multiple random input images (to fill pipeline) - keep track of observed maximum occupancy for each FIFO during rtlsim - when sim finished, update each FIFO depth to maximum observed occupancy - and set inFIFODepth/outFIFODepth attrs to 0 on relevant nodes + and set inFIFODepths/outFIFODepths attrs to 0 on relevant nodes + """ def __init__( @@ -227,6 +242,7 @@ class InsertAndSetFIFODepths(Transformation): max_depth=None, swg_exception=True, vivado_ram_style="auto", + force_python_sim=False, ): super().__init__() self.fpgapart = fpgapart @@ -235,6 +251,7 @@ class InsertAndSetFIFODepths(Transformation): self.max_depth = max_depth self.swg_exception = swg_exception self.vivado_ram_style = vivado_ram_style + self.force_python_sim = force_python_sim def apply(self, model): # these optypes may potentially use external weights @@ -306,57 +323,75 @@ class InsertAndSetFIFODepths(Transformation): model = model.transform(CreateStitchedIP(self.fpgapart, self.clk_ns)) model.set_metadata_prop("exec_mode", "rtlsim") - # calculate input frequency (number of cycles for each input word) - first_node = getCustomOp(model.graph.node[0]) - ncycles_per_input = max( - 1, - int( - math.ceil( - perf["max_cycles"] - / ( - np.prod(first_node.get_folded_input_shape()) - / first_node.get_folded_input_shape()[-1] + if self.force_python_sim: + # do rtlsim in Python for FIFO sizing + # calculate input frequency (number of cycles for each input word) + first_node = getCustomOp(model.graph.node[0]) + ncycles_per_input = max( + 1, + int( + math.ceil( + perf["max_cycles"] + / ( + np.prod(first_node.get_folded_input_shape()) + / first_node.get_folded_input_shape()[-1] + ) ) - ) - ), - ) + ), + ) - # set sufficiently large threshold for 1 image to fully execute and exit - ncycles = int(latency + max_cycles) + # set sufficiently large threshold for 1 image to 
fully execute and exit
+            ncycles = int(latency + max_cycles)
 
-        # prepare pyverilator model
-        sim = pyverilate_stitched_ip(model)
+            # prepare pyverilator model
+            sim = pyverilate_stitched_ip(model)
 
-        reset_rtlsim(sim)
-        toggle_clk(sim)
+            reset_rtlsim(sim)
+            toggle_clk(sim)
 
-        # set all input valids to 0 and output readies to 1
-        # set input data to some constant
-        set_signal(sim, "tvalid", 0)
-        set_signal(sim, "tready", 1)
-        set_signal(sim, "tdata", 0)
+            # set all input valids to 0 and output readies to 1
+            # set input data to some constant
+            set_signal(sim, "tvalid", 0)
+            set_signal(sim, "tready", 1)
+            set_signal(sim, "tdata", 0)
+
+            output_detected = False
+            while ncycles > 0:
+                toggle_clk(sim)
+                # set/unset valids
+                if ncycles % ncycles_per_input == 0:
+                    set_signal(sim, "tvalid", 1)
+                else:
+                    set_signal(sim, "tvalid", 0)
 
-        output_detected = False
-        while ncycles > 0:
-            toggle_clk(sim)
-            # set/unset valids
-            if ncycles % ncycles_per_input == 0:
-                set_signal(sim, "tvalid", 1)
-            else:
-                set_signal(sim, "tvalid", 0)
+                # since latency estimation is very pessimistic, detect first output
+                # and fast-forward the sim
+                if get_signal(sim, "tvalid") != 0 and not output_detected:
+                    ncycles = max_cycles
+                    output_detected = True
+                else:
+                    ncycles = ncycles - 1
 
-        # since latency estimation is very pessimistic, detect first output
-        # and fast-forward the sim
-        if get_signal(sim, "tvalid") != 0 and not output_detected:
-            ncycles = max_cycles
-            output_detected = True
+            if not output_detected:
+                warnings.warn(
+                    "No output detected, calculated FIFO depths may not be correct"
+                )
+        else:
+            # do rtlsim in C++ for FIFO sizing
+            # determine # inputs for FIFO sizing according to topology type
+            swg_nodes = [
+                x for x in model.graph.node if "ConvolutionInputGenerator" in x.op_type
+            ]
+            if len(swg_nodes) == 0:
+                # MLP, no layer overlap
+                # assuming half the nodes are now FIFOs, use half the # of
+                # nodes as # inputs to drive the simulation
+                n_inputs = int(len(model.graph.node) / 2)
         else:
-            ncycles = ncycles - 1
-
-    if not output_detected:
-        warnings.warn(
-            "No output detected, calculated FIFO depths may not be correct"
-        )
+            else:
+                # convnet, single input is typically enough to fill entire
+                # layer pipeline due to overlaps
+                n_inputs = 1
+            sim = verilator_fifosim(model, n_inputs)
 
         for ind, node in enumerate(fifo_nodes):
             maxcount_name = "maxcount_%d" % ind
@@ -365,7 +400,7 @@ class InsertAndSetFIFODepths(Transformation):
             fifos[node.name] = sim[maxcount_name]
 
         # Apply depths back into the model;
-        # also set in/outFIFODepth to zero for non-FIFO
+        # also set in/outFIFODepths to zero for non-FIFO
         # nodes, preventing further FIFO insertion
         for node in model.graph.node:
            # set FIFO depth, reset FIFO implementation,
@@ -414,3 +449,126 @@ class InsertAndSetFIFODepths(Transformation):
         model = model.transform(RemoveShallowFIFOs())
 
         return (model, False)
+
+
+def get_fifo_split_configs(depth, max_qsrl_depth=256, max_vivado_depth=32768):
+    """Break non-power-of-2 sized FIFO depths into several smaller ones"""
+
+    def floor_pow2(x):
+        if (x & (x - 1) == 0) and x != 0:
+            return x
+        else:
+            return 1 << ((x - 1).bit_length() - 1)
+
+    def decompose_pow2(x):
+        if x <= max_qsrl_depth:
+            return [x]
+        else:
+            r = floor_pow2(x)
+            if x == r:
+                return [x]
+            else:
+                return [r, *decompose_pow2(x - r)]
+
+    ret = []
+    # trivial case: for small FIFOs, return as-is with rtl style
+    if depth <= max_qsrl_depth:
+        return [(depth, "rtl")]
+    # first pass: ensure max depth is respected
+    # (restricted by Vivado AXIS infra IP)
+    remainder = depth
+    while 
remainder != 0: + if remainder > max_vivado_depth: + ret.append(max_vivado_depth) + remainder -= max_vivado_depth + else: + ret.append(remainder) + remainder = 0 + # second pass: break non-power-of-2 sized FIFOs + # into several ones + + ret_pass2 = list(map(decompose_pow2, ret)) + # unpack list of lists + ret_pass2 = [x for dec_list in ret_pass2 for x in dec_list] + + # finally, add impl_style to each split FIFO + ret_final = [] + for cand_depth in ret_pass2: + if cand_depth <= max_qsrl_depth: + ret_final.append((cand_depth, "rtl")) + else: + ret_final.append((cand_depth, "vivado")) + + return ret_final + + +class SplitLargeFIFOs(Transformation): + """Split large FIFOs before implementation, for two reasons: + + - impl_style="vivado" supports a max depth of 32k. Any larger + FIFOs must be implemented as a sequence of smaller FIFOs. + - impl_style="vivado" requires power-of-two depths, which is + normally handled by rounding up to the nearest power-of-two. + So a FIFO of size 8196 normally gets rounded-up to a depth of + 16384 and wastes a lot of resources. Here, instead, we split + this up into two FIFOs of depth 8192 + 4. + + """ + + def __init__(self, max_qsrl_depth=256, max_vivado_depth=32768): + super().__init__() + self.max_qsrl_depth = max_qsrl_depth + self.max_vivado_depth = max_vivado_depth + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + node_ind += 1 + if node.op_type == "StreamingFIFO": + n_inst = getCustomOp(node) + depth = n_inst.get_nodeattr("depth") + cfgs = get_fifo_split_configs( + depth, self.max_qsrl_depth, self.max_vivado_depth + ) + if len(cfgs) > 1: + fld_shape = n_inst.get_folded_output_shape() + dtype = n_inst.get_nodeattr("dataType") + ram_style = n_inst.get_nodeattr("ram_style") + shape = model.get_tensor_shape(node.input[0]) + for i, (fifo_depth, impl_style) in enumerate(cfgs): + if i == 0: + inp = node.input[0] + else: + inp = node.name + "_" + str(i - 1) + "_out" + if i == len(cfgs) - 1: + outp = node.output[0] + else: + outp = node.name + "_" + str(i) + "_out" + out_tensor = helper.make_tensor_value_info( + outp, TensorProto.FLOAT, shape + ) + graph.value_info.append(out_tensor) + model.set_tensor_datatype(out_tensor.name, DataType[dtype]) + fifo_node = helper.make_node( + "StreamingFIFO", + [inp], + [outp], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + depth=fifo_depth, + folded_shape=fld_shape, + dataType=dtype, + impl_style=impl_style, + ram_style=ram_style, + name=node.name + "_" + str(i), + ) + graph.node.insert(node_ind + i, fifo_node) + + graph.node.remove(node) + graph_modified = True + if graph_modified: + model = model.transform(SortGraph()) + model = model.transform(GiveReadableTensorNames()) + return (model, False) diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py index e24e24f1f8ebb2873c81617884cd333311d8aea9..2301fccdd4fff6310340ffe1dd8de7732a4f9bd4 100644 --- a/src/finn/transformation/fpgadataflow/set_folding.py +++ b/src/finn/transformation/fpgadataflow/set_folding.py @@ -62,17 +62,20 @@ class SetFolding(Transformation): Notable exceptions and special behavior: - * When folding dense convolution/FC compute engines ("MVAU"/MatrixVectorActivation), + When folding dense convolution/FC compute engines ("MVAU"/MatrixVectorActivation), which have two attributes (PE and SIMD): - * first increases SIMD while weight stream width per PE is <= mvau_wwidth_max - (configurable in the SetFolding 
initializer, defaults to 36) - * then increases PE until the target is met or max PE reached - * When folding depthwise convolutions ("VVAU"/VectorVectorActivation) + * first increases SIMD while weight stream width per PE is <= mvau_wwidth_max + (configurable in the SetFolding initializer, defaults to 36) + * then increases PE until the target is met or max PE reached + + When folding depthwise convolutions ("VVAU"/VectorVectorActivation) or spatial reduction ops (Pool_Batch): - * the producer of the node is expected to be a ConvolutionInputGenerator - with depthwise=1, whose SIMD value will be set equal to the PE value of - its consumer node + + * the producer of the node is expected to be a ConvolutionInputGenerator + with depthwise=1, whose SIMD value will be set equal to the PE value of + its consumer node + """ def __init__( diff --git a/src/finn/transformation/fpgadataflow/vitis_build.py b/src/finn/transformation/fpgadataflow/vitis_build.py index 855b30fe9573c534a13c961277ae4ab84507d619..e0a5666000fc2aa9599bb7475c1b8dd37489afac 100644 --- a/src/finn/transformation/fpgadataflow/vitis_build.py +++ b/src/finn/transformation/fpgadataflow/vitis_build.py @@ -358,16 +358,16 @@ class VitisBuild(Transformation): """Best-effort attempt at building the accelerator with Vitis. It assumes the model has only fpgadataflow nodes - fpga_part: string identifying the target FPGA - period_ns: target clock period - platform: target Alveo platform, one of ["U50", "U200", "U250", "U280"] - strategy: Vitis optimization strategy - enable_debug: add Chipscope to all AXI interfaces - floorplan_file: path to a JSON containing a dictionary with SLR assignments - for each node in the ONNX graph. Must be parse-able by - the ApplyConfig transform. - enable_link: enable linking kernels (.xo files), otherwise just synthesize - them independently. + :parameter fpga_part: string identifying the target FPGA + :parameter period_ns: target clock period + :parameter platform: target Alveo platform, one of ["U50", "U200", "U250", "U280"] + :parameter strategy: Vitis optimization strategy + :parameter enable_debug: add Chipscope to all AXI interfaces + :parameter floorplan_file: path to a JSON containing a dictionary with + SLR assignments for each node in the ONNX graph. + Must be parse-able by the ApplyConfig transform. + :parameter enable_link: enable linking kernels (.xo files), + otherwise just synthesize them independently. 
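To make the `floorplan_file` parameter documented above concrete, here is a hypothetical floorplan JSON in the node-name-to-attribute-dict shape that `ApplyConfig` consumes; the node names and the `slr` assignments are invented for the example, not taken from this patch:

```python
import json

# hypothetical SLR floorplan: top-level keys are ONNX node names
# (plus the conventional "Defaults" entry), values are attribute dicts
floorplan = {
    "Defaults": {},
    "MatrixVectorActivation_0": {"slr": 0},
    "MatrixVectorActivation_1": {"slr": 1},
    "StreamingFIFO_0": {"slr": 0},
}

with open("floorplan.json", "w") as f:
    json.dump(floorplan, f, indent=2)

# per the docstring above, the file is then passed as
# VitisBuild(..., floorplan_file="floorplan.json")
```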
""" def __init__( @@ -411,12 +411,13 @@ class VitisBuild(Transformation): # Build each kernel individually sdp_nodes = model.get_nodes_by_op_type("StreamingDataflowPartition") for sdp_node in sdp_nodes: + prefix = sdp_node.name + "_" sdp_node = getCustomOp(sdp_node) dataflow_model_filename = sdp_node.get_nodeattr("model") kernel_model = ModelWrapper(dataflow_model_filename) kernel_model = kernel_model.transform(InsertFIFO()) kernel_model = kernel_model.transform(RemoveUnusedTensors()) - kernel_model = kernel_model.transform(GiveUniqueNodeNames()) + kernel_model = kernel_model.transform(GiveUniqueNodeNames(prefix)) kernel_model.save(dataflow_model_filename) kernel_model = kernel_model.transform( PrepareIP(self.fpga_part, self.period_ns) diff --git a/src/finn/transformation/qonnx/convert_qonnx_to_finn.py b/src/finn/transformation/qonnx/convert_qonnx_to_finn.py index 967a1276365e4af1a6d617c081b9c04b4710da97..34f11d1e95e6bc3f6a36ce6d878ed493108b3ba6 100644 --- a/src/finn/transformation/qonnx/convert_qonnx_to_finn.py +++ b/src/finn/transformation/qonnx/convert_qonnx_to_finn.py @@ -56,12 +56,12 @@ class ConvertQONNXtoFINN(Transformation): is not converted to a MultiThreshold node. :param filter_function: Each candidate Quant and BinaryQant node is first evaluated - by this function. If the function returns False, - then the node is not converted to a MultiTrheshold node. - The function is given the model and candidate node as parameters. - Per default a filter function is inserted, which disables the conversion of - Quant nodes, which have a bit width of larger than 8. - Defaults to: default_filter_function_generator(max_multithreshold_bit_width=8) + by this function. If the function returns False, + then the node is not converted to a MultiTrheshold node. + The function is given the model and candidate node as parameters. + Per default a filter function is inserted, which disables the conversion of + Quant nodes, which have a bit width of larger than 8. + Defaults to: default_filter_function_generator(max_multithreshold_bit_width=8) """ def __init__( diff --git a/src/finn/transformation/qonnx/qonnx_activation_handlers.py b/src/finn/transformation/qonnx/qonnx_activation_handlers.py index a50a5850779cadf7ab21b9c1c4dfdbb36232af42..9819086d826a51d1df5240d88c4fda8513cc9ba6 100644 --- a/src/finn/transformation/qonnx/qonnx_activation_handlers.py +++ b/src/finn/transformation/qonnx/qonnx_activation_handlers.py @@ -52,9 +52,7 @@ class QuantActBaseHandler(ABC): self._q_node = quant_node self._q_index = quant_node_index - @property @classmethod - @abstractmethod def valid_predecessor_op_types(self): """Defines which op types the preceding node is allowed to have for this type of activation. @@ -284,9 +282,11 @@ class QuantReluHandler(QuantActBaseHandler): """Class for converting a quantized relu operation expressed in the QONNX dialect to the FINN ONNX dialect.""" - valid_predecessor_op_types = [ - "Relu", - ] + @classmethod + def valid_predecessor_op_types(self): + return [ + "Relu", + ] def _check_compatibility(self): if self._q_node.op_type == "Quant": @@ -391,15 +391,17 @@ class QuantIdentityHandler(QuantActBaseHandler): these are equivalent to quantized identity activations. 
""" - valid_predecessor_op_types = [ - "BatchNormalization", - "Sub", - "Add", - "Mul", - "Div", - "DebugMarker", - None, - ] + @classmethod + def valid_predecessor_op_types(self): + return [ + "BatchNormalization", + "Sub", + "Add", + "Mul", + "Div", + "DebugMarker", + None, + ] def _check_compatibility(self): # Gather parameters to check diff --git a/src/finn/transformation/qonnx/quant_act_to_multithreshold.py b/src/finn/transformation/qonnx/quant_act_to_multithreshold.py index 77025ecdf57d5a422992d4163d05c740454986bb..48dda3820deb051bd8a291188f02fe7d1dd2cc0b 100644 --- a/src/finn/transformation/qonnx/quant_act_to_multithreshold.py +++ b/src/finn/transformation/qonnx/quant_act_to_multithreshold.py @@ -30,7 +30,10 @@ import warnings from qonnx.transformation.base import Transformation -from finn.transformation.qonnx.qonnx_activation_handlers import QuantActBaseHandler +from finn.transformation.qonnx.qonnx_activation_handlers import ( + QuantActBaseHandler, + QuantIdentityHandler, +) def default_filter_function_generator(max_multithreshold_bit_width=8): @@ -66,8 +69,7 @@ def default_filter_function_generator(max_multithreshold_bit_width=8): class ConvertQuantActToMultiThreshold(Transformation): - """ - Converts Quant nodes in the activation path to MultiThreshold nodes. + """Converts Quant nodes in the activation path to MultiThreshold nodes. The optional keyword argument `filter_function` presents a way to control which Quant and BipolarQuant nodes in the activation path @@ -75,12 +77,12 @@ class ConvertQuantActToMultiThreshold(Transformation): is not converted to a MultiThreshold node. :param filter_function: Each candidate Quant and BinaryQant node is first evaluated - by this function. If the function returns False, - then the node is not converted to a MultiTrheshold node. - The function is given the model and candidate node as parameters. - Per default a filter function is inserted, which disables the conversion of - Quant nodes, which have a bit width of larger than 8. - Defaults to: default_filter_function_generator(max_multithreshold_bit_width=8) + by this function. If the function returns False, + then the node is not converted to a MultiTrheshold node. + The function is given the model and candidate node as parameters. + Per default a filter function is inserted, which disables the conversion of + Quant nodes, which have a bit width of larger than 8. 
+ Defaults to: default_filter_function_generator(max_multithreshold_bit_width=8) """ def __init__( @@ -127,7 +129,7 @@ class ConvertQuantActToMultiThreshold(Transformation): # Check for possible ambiguity in handler selection valid_predecessors = [] for cls in QuantActBaseHandler.__subclasses__(): - valid_predecessors.extend(cls.valid_predecessor_op_types) + valid_predecessors.extend(cls.valid_predecessor_op_types()) if len(valid_predecessors) != len(set(valid_predecessors)): raise RuntimeError( "Two or more activation handlers declare the same " @@ -138,16 +140,15 @@ class ConvertQuantActToMultiThreshold(Transformation): # Try to find a fitting handler for this Quant activation node for handler_cls in QuantActBaseHandler.__subclasses__(): - if predecessor_op_type in handler_cls.valid_predecessor_op_types: + if predecessor_op_type in handler_cls.valid_predecessor_op_types(): handler = handler_cls(model, n, node_ind) break else: - raise ValueError( - f"Quant nodes in the activation path and with predecessor " - f"nodes of type {predecessor_op_type} are currently not " - f"supported by FINN and can not be converted to " - f"MultiThreshold nodes." - ) + # fall back to QuantIdentityHandler here + # it may still not work due to its particular restrictions, + # but better than just erroring out without trying + handler = QuantIdentityHandler(model, n, node_ind) + model = handler.replace_quant_node() graph_modified = True return (model, graph_modified) diff --git a/src/finn/transformation/streamline/absorb.py b/src/finn/transformation/streamline/absorb.py index 50dcbaa0edfd8235e3b313fcf7ed726a3e92f33e..73df52f890d227137ea076804d161206e66653dc 100644 --- a/src/finn/transformation/streamline/absorb.py +++ b/src/finn/transformation/streamline/absorb.py @@ -582,7 +582,6 @@ class AbsorbTransposeIntoResize(Transformation): trans_input = mt_cand.output[0] trans_output = new_tensor_name # fix tensor shapes for Resize and Transpose - # n, c, h, w = model.get_tensor_shape(mt_cand.input[0]) n, c, hx, wx = model.get_tensor_shape(mt_cand.output[0]) model.set_tensor_shape(trans_input, (n, hx, wx, c)) model.set_tensor_shape(trans_output, (n, c, hx, wx)) @@ -593,13 +592,13 @@ class AbsorbTransposeIntoResize(Transformation): [trans_output], perm=[0, 3, 1, 2], ) - graph.node.insert(node_ind + 1, new_transpose) # rewire nodes final_t_cands = model.find_consumers(mt_cand.output[0]) # rewire next nodes' inputs for final_t_cand in final_t_cands: final_t_cand.input[0] = trans_output mt_cand.output[0] = trans_input + graph.node.insert(node_ind + 1, new_transpose) graph_modified = True if graph_modified: model = model.transform(InferDataTypes()) diff --git a/src/finn/util/data_packing.py b/src/finn/util/data_packing.py index 65478d2540b53443d3f74b44a22fde3defd8ca93..797dad32a2cfeb3e00e224f264d91b5ee0e9247b 100644 --- a/src/finn/util/data_packing.py +++ b/src/finn/util/data_packing.py @@ -265,7 +265,7 @@ def numpy_to_hls_code( # define a function to convert a single element into a C++ init string # a single element can be a hex string if we are using packing def elem2str(x): - if type(x) == str or type(x) == np.str_ or type(x) == np.str: + if type(x) == str or type(x) == np.str_: return '%s("%s", 16)' % (hls_dtype, x) elif type(x) == np.float32: if dtype.is_integer(): diff --git a/src/finn/util/pyverilator.py b/src/finn/util/pyverilator.py index d7ed3e261fe024b7f054382f12184628d3f3e94c..8d188585694c172d97d73fa6b5820edb7b48a948 100644 --- a/src/finn/util/pyverilator.py +++ b/src/finn/util/pyverilator.py @@ -28,33 +28,41 @@ 
import pkg_resources as pk +import numpy as np import os import shutil from pyverilator import PyVerilator +from qonnx.custom_op.registry import getCustomOp -from finn.util.basic import get_rtlsim_trace_depth, make_build_dir +from finn.util.basic import ( + get_rtlsim_trace_depth, + launch_process_helper, + make_build_dir, +) -def pyverilate_stitched_ip( - model, - read_internal_signals=True, - disable_common_warnings=True, - extra_verilator_args=[], -): - """Given a model with stitched IP, return a PyVerilator sim object. - Trace depth is also controllable, see get_rtlsim_trace_depth() +def make_single_source_file(filtered_verilog_files, target_file): + """Dump all Verilog code used by stitched IP into a single file. + This is because large models with many files require a verilator + command line too long for bash on most systems""" - :param read_internal_signals If set, it will be possible to examine the - internal (not only port) signals of the Verilog module, but this may - slow down compilation and emulation. + # concatenate all verilog code into a single file + with open(target_file, "w") as wf: + for vfile in filtered_verilog_files: + with open(vfile) as rf: + wf.write("//Added from " + vfile + "\n\n") + lines = rf.read() + for line in lines.split("\n"): + # break down too-long lines, Verilator complains otherwise + if len(line) > 20000: + line = line.replace("&", "\n&") + wf.write("\n" + line) - :param disable_common_warnings If set, disable the set of warnings that - Vivado-HLS-generated Verilog typically triggers in Verilator - (which can be very verbose otherwise) - """ - if PyVerilator is None: - raise ImportError("Installation of PyVerilator is required.") +def prepare_stitched_ip_for_verilator(model): + """Prepare sources from given stitched IP for verilator simulation, including + generating a single source file and replacing certain Vivado infrastructure + headers with Verilator-compatible ones""" vivado_stitch_proj_dir = model.get_metadata_prop("vivado_stitch_proj") with open(vivado_stitch_proj_dir + "/all_verilog_srcs.txt", "r") as f: @@ -67,8 +75,6 @@ def pyverilate_stitched_ip( return os.path.basename(os.path.realpath(x)) top_module_file_name = file_to_basename(model.get_metadata_prop("wrapper_filename")) - top_module_name = top_module_file_name.strip(".v") - build_dir = make_build_dir("pyverilator_ipstitched_") # dump all Verilog code to a single file # this is because large models with many files require @@ -79,7 +85,7 @@ def pyverilate_stitched_ip( # remove duplicates from list by doing list -> set -> list src_exts = [".v", ".sv"] - all_verilog_src_files = list( + all_verilog_files = list( set( filter( lambda x: any(map(lambda y: x.endswith(y), src_exts)), all_verilog_srcs @@ -87,7 +93,9 @@ def pyverilate_stitched_ip( ) ) - verilog_header_dir = make_build_dir("pyverilator_vh_") + verilog_header_dir = vivado_stitch_proj_dir + "/pyverilator_vh" + os.makedirs(verilog_header_dir, exist_ok=True) + # use custom version of axis infrastructure vh # to enable Verilator to simulate AMD/Xilinx components (e.g DWC) custom_vh = pk.resource_filename( @@ -105,7 +113,7 @@ def pyverilate_stitched_ip( # remove all but one instances of regslice_core.v filtered_verilog_files = [] remove_entry = False - for vfile in all_verilog_src_files: + for vfile in all_verilog_files: if "regslice_core" in vfile: if not remove_entry: filtered_verilog_files.append(vfile) @@ -113,17 +121,176 @@ def pyverilate_stitched_ip( else: filtered_verilog_files.append(vfile) - # concatenate all verilog code into 
a single file - with open(vivado_stitch_proj_dir + "/" + top_module_file_name, "w") as wf: - for vfile in filtered_verilog_files: - with open(vfile) as rf: - wf.write("//Added from " + vfile + "\n\n") - lines = rf.read() - for line in lines.split("\n"): - # break down too-long lines, Verilator complains otherwise - if len(line) > 20000: - line = line.replace("&", "\n&") - wf.write("\n" + line) + target_file = vivado_stitch_proj_dir + "/" + top_module_file_name + make_single_source_file(filtered_verilog_files, target_file) + + return vivado_stitch_proj_dir + + +def verilator_fifosim(model, n_inputs, max_iters=100000000): + """Create a Verilator model of stitched IP and use a simple C++ + driver to drive the input stream. Useful for FIFO sizing, latency + and throughput measurement.""" + + vivado_stitch_proj_dir = prepare_stitched_ip_for_verilator(model) + verilog_header_dir = vivado_stitch_proj_dir + "/pyverilator_vh" + build_dir = make_build_dir("verilator_fifosim_") + fifosim_cpp_fname = pk.resource_filename( + "finn.qnn-data", "cpp/verilator_fifosim.cpp" + ) + with open(fifosim_cpp_fname, "r") as f: + fifosim_cpp_template = f.read() + assert len(model.graph.input) == 1, "Only a single input stream is supported" + assert len(model.graph.output) == 1, "Only a single output stream is supported" + iname = model.graph.input[0].name + first_node = model.find_consumer(iname) + oname = model.graph.output[0].name + last_node = model.find_producer(oname) + assert (first_node is not None) and ( + last_node is not None + ), "Failed to find first/last nodes" + fnode_inst = getCustomOp(first_node) + lnode_inst = getCustomOp(last_node) + ishape_folded = fnode_inst.get_folded_input_shape() + oshape_folded = lnode_inst.get_folded_output_shape() + + fifo_log = [] + fifo_log_templ = ' results_file << "maxcount%s" << "\\t" ' + fifo_log_templ += "<< to_string(top->maxcount%s) << endl;" + fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO") + fifo_ind = 0 + for fifo_node in fifo_nodes: + fifo_node = getCustomOp(fifo_node) + if fifo_node.get_nodeattr("depth_monitor") == 1: + suffix = "" if fifo_ind == 0 else "_%d" % fifo_ind + fifo_log.append(fifo_log_templ % (suffix, suffix)) + fifo_ind += 1 + fifo_log = "\n".join(fifo_log) + + template_dict = { + "ITERS_PER_INPUT": np.prod(ishape_folded[:-1]), + "ITERS_PER_OUTPUT": np.prod(oshape_folded[:-1]), + "N_INPUTS": n_inputs, + "MAX_ITERS": max_iters, + "FIFO_DEPTH_LOGGING": fifo_log, + } + + for (key, val) in template_dict.items(): + fifosim_cpp_template = fifosim_cpp_template.replace(f"@{key}@", str(val)) + + with open(build_dir + "/verilator_fifosim.cpp", "w") as f: + f.write(fifosim_cpp_template) + + which_verilator = shutil.which("verilator") + if which_verilator is None: + raise Exception("'verilator' executable not found") + + # add defines to make certain XPM src files work with Verilator + xpm_args = [] + xpm_args.append("-DDISABLE_XPM_ASSERTIONS") + xpm_args.append("-DOBSOLETE") + xpm_args.append("-DONESPIN") + xpm_args.append("--bbox-unsup") + vivado_path = os.environ["VIVADO_PATH"] + # additional SystemVerilog modules to make XPMs work with Verilator + xpm_memory = f"{vivado_path}/data/ip/xpm/xpm_memory/hdl/xpm_memory.sv" + xpm_cdc = f"{vivado_path}/data/ip/xpm/xpm_cdc/hdl/xpm_cdc.sv" + xpm_fifo = f"{vivado_path}/data/ip/xpm/xpm_fifo/hdl/xpm_fifo.sv" + verilog_file_arg = ["finn_design_wrapper.v", xpm_memory, xpm_cdc, xpm_fifo] + + verilator_args = [ + "perl", + which_verilator, + "-Wno-fatal", + "-Mdir", + build_dir, + "-y", + vivado_stitch_proj_dir, 
+ "-y", + verilog_header_dir, + "--CFLAGS", + "--std=c++11", + "-O3", + "--x-assign", + "fast", + "--x-initial", + "fast", + "--noassert", + "--cc", + *verilog_file_arg, + "--top-module", + "finn_design_wrapper", + "--exe", + "verilator_fifosim.cpp", + "--threads", + "4", + *xpm_args, + ] + + proc_env = os.environ.copy() + gcc_args = "-O3 -march=native" + proc_env["OPT_FAST"] = gcc_args + make_args = [ + "make", + "-j4", + "-C", + build_dir, + "-f", + "Vfinn_design_wrapper.mk", + "Vfinn_design_wrapper", + ] + + with open(build_dir + "/compile.sh", "w") as f: + f.write("#!/bin/bash" + "\n") + f.write("export OPT_FAST='%s'\n" % gcc_args) + f.write(" ".join(verilator_args) + "\n") + f.write(" ".join(make_args) + "\n") + + launch_process_helper(verilator_args, cwd=build_dir) + launch_process_helper(make_args, proc_env=proc_env, cwd=build_dir) + + sim_launch_args = ["./Vfinn_design_wrapper"] + launch_process_helper(sim_launch_args, cwd=build_dir) + + with open(build_dir + "/results.txt", "r") as f: + results = f.read().strip().split("\n") + ret_dict = {} + for result_line in results: + key, val = result_line.split("\t") + ret_dict[key] = int(val) + return ret_dict + + +def pyverilate_stitched_ip( + model, + read_internal_signals=True, + disable_common_warnings=True, + extra_verilator_args=[], +): + """Given a model with stitched IP, return a PyVerilator sim object. + Trace depth is also controllable, see get_rtlsim_trace_depth() + + :param read_internal_signals If set, it will be possible to examine the + internal (not only port) signals of the Verilog module, but this may + slow down compilation and emulation. + + :param disable_common_warnings If set, disable the set of warnings that + Vivado-HLS-generated Verilog typically triggers in Verilator + (which can be very verbose otherwise) + + """ + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + + vivado_stitch_proj_dir = prepare_stitched_ip_for_verilator(model) + verilog_header_dir = vivado_stitch_proj_dir + "/pyverilator_vh" + + def file_to_basename(x): + return os.path.basename(os.path.realpath(x)) + + top_module_file_name = file_to_basename(model.get_metadata_prop("wrapper_filename")) + top_module_name = top_module_file_name.strip(".v") + build_dir = make_build_dir("pyverilator_ipstitched_") verilator_args = [] # disable common verilator warnings that should be harmless but commonly occur diff --git a/src/finn/util/test.py b/src/finn/util/test.py index bfe4aa0bb826c73f6a7c67f025e24764da8c36cc..bd8bde2820fa87ed972d699cae905d7f6cc310ff 100644 --- a/src/finn/util/test.py +++ b/src/finn/util/test.py @@ -91,8 +91,8 @@ def soft_verify_topk(invec, idxvec, k): """Check that the topK indices provided actually point to the topK largest values in the input vector""" np_topk = np.flip(invec.flatten().argsort())[:k] - soft_expected = invec.flatten()[np_topk.astype(np.int).flatten()] - soft_produced = invec.flatten()[idxvec.astype(np.int).flatten()] + soft_expected = invec.flatten()[np_topk.astype(np.int_).flatten()] + soft_produced = invec.flatten()[idxvec.astype(np.int_).flatten()] return (soft_expected == soft_produced).all() diff --git a/src/finn/util/vcd.py b/src/finn/util/vcd.py index aaeb3ab920d1d8fae79c1173582d18cf81d03063..1f77276d5a72e5f886d5f94af8d35121ccadd486 100644 --- a/src/finn/util/vcd.py +++ b/src/finn/util/vcd.py @@ -101,19 +101,21 @@ def get_stream_if_stats(vcd_file, if_base_name): <stream_state>: (<num_samples>, <fraction_of_time>), where <stream_state> is the combination of (V)alid/(R)eady 
values, - <num_samples> is the approximate number of rising clock edges spent in <state> - , and <fraction_of_time> is the fraction of <num_samples> to total + <num_samples> is the approximate number of rising clock edges spent in <state>, + and <fraction_of_time> is the fraction of <num_samples> to total amount of time recorded by the trace. Example: - {"{'V': 0, 'R': 0}": (5, 0.0006060606060606061), - "{'V': 1, 'R': 0}": (0, 0.0), - "{'V': 0, 'R': 1}": (7605, 0.9218181818181819), - "{'V': 1, 'R': 1}": (640, 0.07757575757575758)} - + { + "{'V': 0, 'R': 0}": (5, 0.0006060606060606061), + "{'V': 1, 'R': 0}": (0, 0.0), + "{'V': 0, 'R': 1}": (7605, 0.9218181818181819), + "{'V': 1, 'R': 1}": (640, 0.07757575757575758) + } Here we can see the stream was transmitting values 7.7% of the time, and 9.2% of the time there was no incoming data (valid 0, ready 1) """ + if_valid = if_base_name + vname if_ready = if_base_name + rname v = VCDVCD(vcd_file, signals=[if_valid], store_tvs=True) diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py index 5f787d1f889645d04884aed9b89a0b1c91d1f418..79cfafa22d670f168c3c03a5ef01a51256912a8c 100644 --- a/tests/end2end/test_end2end_bnn_pynq.py +++ b/tests/end2end/test_end2end_bnn_pynq.py @@ -80,7 +80,6 @@ from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths from finn.transformation.move_reshape import RemoveCNVtoFCFlatten @@ -103,7 +102,7 @@ from finn.util.test import ( ) build_dir = os.environ["FINN_BUILD_DIR"] -target_clk_ns = 10 +target_clk_ns = 20 mem_mode = "decoupled" rtlsim_trace = False @@ -597,7 +596,6 @@ class TestEnd2End: model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) - model = model.transform(PrepareRTLSim()) model.set_metadata_prop("exec_mode", "rtlsim") os.environ["LIVENESS_THRESHOLD"] = str(int(latency * 1.1)) if rtlsim_trace: diff --git a/tests/end2end/test_end2end_cybsec_mlp.py b/tests/end2end/test_end2end_cybsec_mlp.py index b6482dc96c4d866618d19d810fa9385b20aa0222..290afc308498490cbee2fc75c30e22bb474eb96a 100644 --- a/tests/end2end/test_end2end_cybsec_mlp.py +++ b/tests/end2end/test_end2end_cybsec_mlp.py @@ -229,6 +229,7 @@ def test_end2end_cybsec_mlp_build(QONNX_export): @pytest.mark.end2end +@pytest.mark.xfail @pytest.mark.parametrize("QONNX_export", [False, True]) def test_end2end_cybsec_mlp_run_on_hw(QONNX_export): build_env = get_build_env(build_kind, target_clk_ns) diff --git a/tests/end2end/test_ext_weights.py b/tests/end2end/test_ext_weights.py index 9483ccf0b27ebc385ed017d0a0b316ab189a1f96..0a92c74a38d64ade37d576f3830f3a5628c94d88 100644 --- a/tests/end2end/test_ext_weights.py +++ b/tests/end2end/test_ext_weights.py @@ -90,6 +90,7 @@ def test_end2end_ext_weights_build(): output_dir = make_build_dir("test_end2end_ext_weights_build") cfg = build.DataflowBuildConfig( output_dir=output_dir, + verbose=True, folding_config_file=folding_config_file, synth_clk_period_ns=target_clk_ns, board=build_env["board"], @@ -113,6 
+114,7 @@ def test_end2end_ext_weights_build(): @pytest.mark.board @pytest.mark.end2end +@pytest.mark.xfail def test_end2end_ext_weights_dataset(): # make sure we have local copies of mnist dataset files subprocess.check_output(["mkdir", "-p", mnist_local]) @@ -129,6 +131,7 @@ def test_end2end_ext_weights_dataset(): @pytest.mark.end2end +@pytest.mark.xfail def test_end2end_ext_weights_run_on_hw(): build_env = get_build_env(build_kind, target_clk_ns) deploy_dir = get_checkpoint_name("build") diff --git a/tests/fpgadataflow/test_fifosizing.py b/tests/fpgadataflow/test_fifosizing.py index 5fd1439bd055782692bac404622137e166ef5e07..6b78d399eb100686277a92f1e35b9a98b433444b 100644 --- a/tests/fpgadataflow/test_fifosizing.py +++ b/tests/fpgadataflow/test_fifosizing.py @@ -50,13 +50,19 @@ def fetch_test_model(topology, wbits=2, abits=2): @pytest.mark.slow @pytest.mark.vivado @pytest.mark.fpgadataflow -def test_fifosizing_linear(): +@pytest.mark.parametrize( + "method", ["largefifo_rtlsim_python", "largefifo_rtlsim_cpp", "characterize"] +) +def test_fifosizing_linear(method): + force_python_rtlsim = "python" in method + method_key = "largefifo_rtlsim" if "largefifo_rtlsim" in method else "characterize" tmp_output_dir = fetch_test_model("tfc") cfg = build_cfg.DataflowBuildConfig( output_dir=tmp_output_dir, auto_fifo_depths=True, - auto_fifo_strategy="characterize", + auto_fifo_strategy=method_key, target_fps=10000, + force_python_rtlsim=force_python_rtlsim, synth_clk_period_ns=10.0, board="Pynq-Z1", rtlsim_batch_size=100, diff --git a/tests/fpgadataflow/test_fpgadataflow_concat.py b/tests/fpgadataflow/test_fpgadataflow_concat.py index 8488a34dff52d39c28fbea25275c9a4b59c37f80..5fff286e54e64b71481a3c2801850a37613fd694 100644 --- a/tests/fpgadataflow/test_fpgadataflow_concat.py +++ b/tests/fpgadataflow/test_fpgadataflow_concat.py @@ -72,6 +72,7 @@ def make_concat_model(i_shapes, idt): @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) @pytest.mark.parametrize("idt", [DataType["INT4"]]) +@pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow def test_fpgadataflow_concat(exec_mode, idt): @@ -107,6 +108,7 @@ def test_fpgadataflow_concat(exec_mode, idt): assert (exp_out == ret_sim[oname]).all() +@pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow def test_fpgadataflow_concat_stitchedip(): diff --git a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py index 80f2d724ad7ccbf563c23076155313bad1ecb336..325470a6d6c6032249ca1dd64317fb288d3e94c9 100644 --- a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py +++ b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py @@ -348,6 +348,7 @@ def test_fpgadataflow_ipstitch_vitis_end2end(board, period_ns, extw): model = load_test_checkpoint_or_skip(sdp_node.get_nodeattr("model")) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(fpga_part, period_ns)) + model = model.transform(HLSSynthIP()) model = model.transform(VitisBuild(fpga_part, period_ns, platform)) model.save(ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_vitis.onnx") assert model.get_metadata_prop("platform") == "alveo" diff --git a/tests/fpgadataflow/test_split_large_fifos.py b/tests/fpgadataflow/test_split_large_fifos.py new file mode 100644 index 0000000000000000000000000000000000000000..85b4a2bfa8dc0de3cbdd0ca34ec5b1ee68f37acf --- /dev/null +++ b/tests/fpgadataflow/test_split_large_fifos.py @@ -0,0 +1,128 @@ +# Copyright (C) 2022, Advanced Micro Devices, Inc. +# All rights reserved. 
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of Xilinx nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import pytest
+
+import json
+import shutil
+from brevitas.export.onnx.generic.manager import BrevitasONNXManager
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.registry import getCustomOp
+
+import finn.builder.build_dataflow as build
+import finn.builder.build_dataflow_config as build_cfg
+from finn.transformation.fpgadataflow.set_fifo_depths import get_fifo_split_configs
+from finn.util.basic import make_build_dir
+from finn.util.test import get_trained_network_and_ishape
+
+
+def fetch_test_model(topology, wbits=2, abits=2):
+    tmp_output_dir = make_build_dir("build_fifosizing_%s_" % topology)
+    (model, ishape) = get_trained_network_and_ishape(topology, wbits, abits)
+    chkpt_name = tmp_output_dir + "/model.onnx"
+    BrevitasONNXManager.export(model, ishape, chkpt_name)
+    return tmp_output_dir
+
+
+def get_folding_cfg(depth=65536):
+    cfg = dict()
+    cfg["Defaults"] = dict()
+    for i in range(3):
+        key = "StreamingFIFO_" + str(i)
+        cfg[key] = {"depth": depth, "ram_style": "auto", "impl_style": "vivado"}
+    return cfg
+
+
+@pytest.mark.slow
+@pytest.mark.vivado
+@pytest.mark.fpgadataflow
+@pytest.mark.parametrize("depth", [16384, 65536, 45000])
+@pytest.mark.parametrize("force_python_rtlsim", [True, False])
+def test_split_large_fifos(depth, force_python_rtlsim):
+    tmp_output_dir = fetch_test_model("tfc")
+    folding_cfg = get_folding_cfg(depth)
+    with open(tmp_output_dir + "/folding_config.json", "w") as f:
+        json.dump(folding_cfg, f, indent=2)
+    cfg = build_cfg.DataflowBuildConfig(
+        output_dir=tmp_output_dir,
+        auto_fifo_depths=False,
+        split_large_fifos=True,
+        folding_config_file=tmp_output_dir + "/folding_config.json",
+        target_fps=10000,
+        force_python_rtlsim=force_python_rtlsim,
+        synth_clk_period_ns=10.0,
+        board="Pynq-Z1",
+        rtlsim_batch_size=100,
+        shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ,
+        generate_outputs=[
+            build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
+            build_cfg.DataflowOutputType.STITCHED_IP,
+            build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE,
+        ],
+        
default_mem_mode=build_cfg.ComputeEngineMemMode.DECOUPLED, + ) + build.build_dataflow_cfg(tmp_output_dir + "/model.onnx", cfg) + with open(tmp_output_dir + "/report/estimate_network_performance.json") as f: + est_data = json.load(f) + with open(tmp_output_dir + "/report/rtlsim_performance.json") as f: + sim_data = json.load(f) + assert ( + float(sim_data["throughput[images/s]"]) + / float(est_data["estimated_throughput_fps"]) + > 0.9 + ) + model = ModelWrapper( + tmp_output_dir + "/intermediate_models/step_set_fifo_depths.onnx" + ) + # exclude final FIFO node (output FIFO, not part of test) + fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO")[:-1] + golden_cfg = get_fifo_split_configs(depth, 256, 32768) + for i, fifo_node in enumerate(fifo_nodes): + inst = getCustomOp(fifo_node) + fifo_depth = inst.get_nodeattr("depth") + assert fifo_depth == golden_cfg[i % len(golden_cfg)][0] + + shutil.rmtree(tmp_output_dir) + + +def test_split_large_fifo_configs(): + ret0 = get_fifo_split_configs(513, 256, 32768) + assert ret0 == [(512, "vivado"), (1, "rtl")] + ret1 = get_fifo_split_configs(1200, 256, 32768) + assert ret1 == [(1024, "vivado"), (176, "rtl")] + ret2 = get_fifo_split_configs(45000, 256, 32768) + assert ret2 == [ + (32768, "vivado"), + (8192, "vivado"), + (2048, "vivado"), + (1024, "vivado"), + (512, "vivado"), + (256, "rtl"), + (200, "rtl"), + ] diff --git a/tests/transformation/streamline/test_move_flatten_past_topk.py b/tests/transformation/streamline/test_move_flatten_past_topk.py index 83d7a28c05fbd95834e5d84ab7537ae82c285d17..d1478088e2e8caaeb33fbec2880e74ea65905073 100644 --- a/tests/transformation/streamline/test_move_flatten_past_topk.py +++ b/tests/transformation/streamline/test_move_flatten_past_topk.py @@ -47,7 +47,7 @@ from finn.transformation.streamline.reorder import MoveFlattenPastTopK @pytest.mark.parametrize("data_layout", [DataLayout.NHWC, DataLayout.NCHW]) # batch size @pytest.mark.parametrize("batch_size", [1, 2]) -def test_move_flatten_past_affine(data_layout, batch_size): +def test_move_flatten_past_topk(data_layout, batch_size): if data_layout == DataLayout.NHWC: ishape = [batch_size, 1, 1, 1024] oshape = [batch_size, 1024]
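As a cross-check on the golden values asserted in `test_split_large_fifo_configs` above, the splitting rule can be re-derived with a compact iterative sketch; this illustrates the same greedy decomposition as `get_fifo_split_configs`, not the shipped recursive implementation:

```python
def split_depth(depth, max_qsrl=256, max_vivado=32768):
    """Iterative re-sketch of get_fifo_split_configs: cap chunks at the
    Vivado AXIS limit, then peel off floor-power-of-two chunks until the
    remainder is shallow enough for a Q_srl (rtl-style) FIFO."""
    parts = []
    while depth > max_vivado:
        parts.append(max_vivado)
        depth -= max_vivado
    while depth > max_qsrl:
        chunk = 1 << (depth.bit_length() - 1)  # floor power of two
        parts.append(chunk)
        depth -= chunk
    if depth > 0:
        parts.append(depth)
    return [(d, "rtl" if d <= max_qsrl else "vivado") for d in parts]


# reproduces the expected configs from the test above
assert split_depth(513) == [(512, "vivado"), (1, "rtl")]
assert split_depth(1200) == [(1024, "vivado"), (176, "rtl")]
assert split_depth(45000)[0] == (32768, "vivado")
assert sum(d for d, _ in split_depth(45000)) == 45000
```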