diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
index 4374111f22a12e586c5c5233a7eee096b848b86e..00c25a4a3150a8368405b449fdce04456ccbe88d 100644
--- a/.github/workflows/docker-image.yml
+++ b/.github/workflows/docker-image.yml
@@ -1,17 +1,18 @@
 name: DockerImage
 
 on:
+  pull_request:
+    branches: [ dev ]
   push:
-    branches:
-      - 'dev'
+    branches: [ dev ]
 
 jobs:
   docker:
-    runs-on: ubuntu-18.04
+    runs-on: ubuntu-20.04
     steps:
       -
         name: checkout
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
       -
         name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v1
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index 2fbb9265beb49644f08a2c6e916ab9c23d4bd339..5f03379bbc37ab913f712571c630035dbad84cce 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -13,10 +13,12 @@ jobs:
 
     steps:
       - name: Checkout
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
 
       - name: Setup Python
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.8'
 
       - name: Run Lint
-        uses: pre-commit/action@v2.0.0
+        uses: pre-commit/action@v3.0.0
diff --git a/.github/workflows/quicktest-dev-pr.yml b/.github/workflows/quicktest-dev-pr.yml
index d188007465cd27662ffadfb3ece0d8bf2e8e28be..e2ba47ec296f73cfd7c0eede98bac3acd066075a 100644
--- a/.github/workflows/quicktest-dev-pr.yml
+++ b/.github/workflows/quicktest-dev-pr.yml
@@ -11,43 +11,15 @@ jobs:
 
   test:
     name: Run quicktest on PR branch
-    runs-on: ubuntu-18.04
+    runs-on: ubuntu-20.04
 
     steps:
       - name: checkout
-        uses: actions/checkout@v2
-
-      - name: set up Docker Buildx
-        uses: docker/setup-buildx-action@v1
-
-      - name: cache Docker layers
-        uses: actions/cache@v2
-        with:
-          path: /tmp/.buildx-cache
-          key: ${{ runner.os }}-buildx-${{ github.sha }}
-          restore-keys: |
-            ${{ runner.os }}-buildx-
-
-      - name: Build and push
-        uses: docker/build-push-action@v2
-        with:
-          file: docker/Dockerfile.finn
-          context: .
-          push: false
-          load: true
-          tags: finn_gha
-          cache-from: type=local,src=/tmp/.buildx-cache
-          cache-to: type=local,dest=/tmp/.buildx-cache-new
-      -
-        # Temp fix
-        # https://github.com/docker/build-push-action/issues/252
-        # https://github.com/moby/buildkit/issues/1896
-        name: Move cache
-        run: |
-          rm -rf /tmp/.buildx-cache
-          mv /tmp/.buildx-cache-new /tmp/.buildx-cache
-
+        uses: actions/checkout@v3
 
       - name: DockerRunQuicktest
         run: |
-          docker run --init --hostname finn_gha -w $(pwd) -v $(pwd):$(pwd) -e FINN_BUILD_DIR=/tmp/finn_gha -e FINN_INST_NAME=finn_gha finn_gha quicktest.sh
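+          # run the quicktest inside the FINN Docker container (built and launched by run-docker.sh)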
+          export FINN_ROOT=$(pwd)
+          export FINN_BUILD_DIR=/tmp/finn_gha
+          export FINN_INST_NAME=finn_gha
+          ./run-docker.sh quicktest
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index dfc83ba618eb905fe5579231542d14d529503ac2..126a4ac4b2bee7f3eaaf610646855b48d07b9e32 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -51,7 +51,7 @@ repos:
     args: ['--fix=no']
 
 - repo: https://github.com/PyCQA/isort
-  rev: 5.10.1
+  rev: 5.12.0
   hooks:
   - id: isort
 
@@ -61,7 +61,7 @@ repos:
   - id: black
     language_version: python3
 
-- repo: https://gitlab.com/pycqa/flake8
+- repo: https://github.com/PyCQA/flake8
   rev: 3.9.2
   hooks:
   - id: flake8
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index 3601fcdccff675e6f850d4636ebbfc0726f7cd4d..478957be113b686c4fabd3d071fdf6203dd37dd3 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -35,7 +35,7 @@ sphinx:
    configuration: docs/finn/conf.py
 
 python:
-   version: 3.7
+   version: 3.8
    install:
     - method: pip
       path: .
diff --git a/AUTHORS.rst b/AUTHORS.rst
index d011ce3d7ad74125b7013b7a7e987eb22e70a9f3..861b81924b187620d77f8cd47d4faff8d7f15bf8 100644
--- a/AUTHORS.rst
+++ b/AUTHORS.rst
@@ -9,7 +9,7 @@ Contributors
 * Hendrik Borras (@HenniOVP)
 * Lucian Petrica (@quetric)
 * Tobias Alonso (@Tobi-Alonso)
-* Felix Paul Jentzsch (@felixpj)
+* Felix Paul Jentzsch (@fpjentzsch)
 * Mirza Mrahorovic (@mmrahorovic)
 * Suranga Mahesh (@surangamh)
 * Peter Lehnhardt (@pete-lennart)
@@ -26,3 +26,5 @@ Contributors
 * Aziz Bahri (@azizb-xlnx)
 * Fionn O'Donohoe (@fionnodonohoe-xlnx)
 * Matthias Gehre (@mgehre-amd)
+* Hugo Le Blevec (@hleblevec)
+* Patrick Geel (@patrickgeel)
diff --git a/README.md b/README.md
index 1b8efc8f19d0b664a17320585f5ea60acbe03eb4..2e1faf8f0c4422c8690506bb5f79611c6661fa9c 100644
--- a/README.md
+++ b/README.md
@@ -28,7 +28,7 @@ Please see the [Getting Started](https://finn.readthedocs.io/en/latest/getting_s
 
 ## Documentation
 
-You can view the documentation on [readthedocs](https://finn.readthedocs.io) or build them locally using `python setup.py doc` from inside the Docker container. Additionally, there is a series of [Jupyter notebook tutorials](https://github.com/Xilinx/finn/tree/master/notebooks), which we recommend running from inside Docker for a better experience.
+You can view the documentation on [readthedocs](https://finn.readthedocs.io) or build them locally using `python setup.py doc` from inside the Docker container. Additionally, there is a series of [Jupyter notebook tutorials](https://github.com/Xilinx/finn/tree/main/notebooks), which we recommend running from inside Docker for a better experience.
 
 ## Community
 
@@ -67,4 +67,4 @@ The current implementation of the framework is based on the following publicatio
 ## Old version
 
 We previously released an early-stage prototype of a toolflow that took in Caffe-HWGQ binarized network descriptions and produced dataflow architectures. You can find it in the [v0.1](https://github.com/Xilinx/finn/tree/v0.1) branch in this repository.
-Please be aware that this version is deprecated and unsupported, and the master branch does not share history with that branch so it should be treated as a separate repository for all purposes.
+Please be aware that this version is deprecated and unsupported, and the main branch does not share history with that branch so it should be treated as a separate repository for all purposes.
diff --git a/custom_hls/lookup.hpp b/custom_hls/lookup.hpp
index 3001f6613ec6ed9a9e5f47d9be356d4b032f7192..037b038a09a10ff2bd066740d20f0b47489e24e4 100644
--- a/custom_hls/lookup.hpp
+++ b/custom_hls/lookup.hpp
@@ -26,14 +26,15 @@
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
- *******************************************************************************/
+*******************************************************************************/
+#ifndef LOOKUP_HPP
+#define LOOKUP_HPP
 
 #include <ap_int.h>
 #include <hls_stream.h>
 
-#ifndef LOOKUP_HPP
-#define LOOKUP_HPP
+#include "utils.hpp"
+
 
 template <
     unsigned NumEmbeddings,
@@ -57,4 +58,50 @@ void StreamingLookup(
     }
 }
 
+/**
+ * Lookup implementation over a table stored in AXI-accessible memory.
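+ *
+ * The table is accessed through the pointer `mem`. Out-of-bounds indices are
+ * mapped to entry 0; `oob_count` reports how many such accesses occurred and
+ * `oob_irq` flags a non-zero count.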
+ */
+template <
+	unsigned  EmbeddingSize,                            // Number of memory words per embedding
+	unsigned  EmbeddingAlign = clog2(EmbeddingSize),    // Alignment of entries = number of word index bits
+	typename  T_SRC,
+	typename  T_DST
+>
+void StreamingLookup_ext(
+	hls::stream<T_SRC> &in0,
+	hls::stream<T_DST> &out,
+	T_DST const *const  mem,
+	unsigned  const     size,
+	unsigned           &oob_count,
+	bool               &oob_irq
+) {
+#pragma HLS pipeline II=EmbeddingSize+9 style=flp
+
+	static unsigned  oob_count_li;
+	static unsigned  oob_count_int;
+#pragma HLS reset variable=oob_count_li
+#pragma HLS reset variable=oob_count_int
+
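+	// Reconcile external writes to oob_count: when the caller rewrites the
+	// counter, subtract the last reported value so the count can be cleared.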
+	if(oob_count != oob_count_li) {
+		oob_count_int -= oob_count_li;
+		oob_count_li   = oob_count;
+	}
+	if(!in0.empty()) {
+		T_SRC const  x = in0.read();
+
+		// Map out-of-bounds inputs to an offset of zero and increment counter
+		bool  const  oob = x >= T_SRC(size);
+		ap_uint<T_SRC::width+EmbeddingAlign> const  ofs =
+			((oob? T_SRC(0) : x), ap_uint<EmbeddingAlign>(0));
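+		// The concatenation shifts the index left by EmbeddingAlign bits,
+		// i.e. ofs = index * 2^EmbeddingAlign, the first word of the entry.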
+		oob_count_int += oob;
+
+		// Stream lookup data (burst inferred)
+		for(unsigned  i = 0; i < EmbeddingSize; i++) {
+#pragma HLS pipeline II=1 style=flp
+			out.write(mem[ofs+i]);
+		}
+	}
+	oob_count =  oob_count_int;
+	oob_irq   = (oob_count_int != 0);
+}
 #endif
diff --git a/docker/Dockerfile.finn b/docker/Dockerfile.finn
index a3f40d52ef6c8a5b79f46c1bb70f83fb61218fc9..dbafba247679895bcbaf385f0d33946c3f810945 100644
--- a/docker/Dockerfile.finn
+++ b/docker/Dockerfile.finn
@@ -46,7 +46,6 @@ RUN apt-get update && \
     libsm6 \
     libxext6 \
     libxrender-dev \
-    verilator \
     nano \
     zsh \
     rsync \
@@ -62,6 +61,16 @@ RUN apt-get update && \
 RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config
 RUN locale-gen "en_US.UTF-8"
 
+# install Verilator from source to get the right version
+RUN apt-get install -y git perl python3 make autoconf g++ flex bison ccache libgoogle-perftools-dev numactl perl-doc libfl2 libfl-dev zlibc zlib1g zlib1g-dev
+RUN git clone https://github.com/verilator/verilator
+RUN cd verilator && \
+    git checkout v4.224 && \
+    autoconf && \
+    ./configure && \
+    make -j4 && \
+    make install
+
 # install XRT
 RUN wget https://www.xilinx.com/bin/public/openDownload?filename=$XRT_DEB_VERSION.deb -O /tmp/$XRT_DEB_VERSION.deb
 RUN apt install -y /tmp/$XRT_DEB_VERSION.deb
@@ -75,7 +84,7 @@ RUN rm requirements.txt
 # extra Python package dependencies (for testing and interaction)
 RUN pip install pygments==2.4.1
 RUN pip install ipykernel==5.5.5
-RUN pip install jupyter==1.0.0
+RUN pip install jupyter==1.0.0 --ignore-installed
 RUN pip install markupsafe==2.0.1
 RUN pip install matplotlib==3.3.1 --ignore-installed
 RUN pip install pytest-dependency==0.5.1
diff --git a/docker/quicktest.sh b/docker/quicktest.sh
index f625f2b1ef722f386180a8409a9eb9e759a2f3b6..b4ad37232fa69754a86e9064d7592d7474e8617e 100755
--- a/docker/quicktest.sh
+++ b/docker/quicktest.sh
@@ -2,7 +2,7 @@
 
 : ${PYTEST_PARALLEL=auto}
 
-cd $FINN_ROOT/finn
+cd $FINN_ROOT
 # check if command line argument is empty or not present
 if [ -z $1 ]; then
   echo "Running quicktest: not (vivado or slow or board) with pytest-xdist"
diff --git a/docs/finn/brevitas_export.rst b/docs/finn/brevitas_export.rst
index 304aa30854118e1ebd3258169ee4698a873e8689..950b601f98d14e99a00841f23894770eb0bb1569 100644
--- a/docs/finn/brevitas_export.rst
+++ b/docs/finn/brevitas_export.rst
@@ -16,6 +16,6 @@ Two of the Brevitas-exported ONNX variants can be ingested by FINN:
 
 To work with either type of ONNX model, it is loaded into a :ref:`modelwrapper` provided by FINN.
 
-At this stage we can already use the functional verification flow to simulate the model using Python, this is marked in the graphic with the dotted arrow. For more details please have look at :ref:`verification`.
+At this stage we can already use the functional verification flow to simulate the model using Python. For more details please have a look at :ref:`verification`.
 
 The model can now be further processed in FINN, the next flow step is :ref:`nw_prep`.
diff --git a/docs/finn/command_line.rst b/docs/finn/command_line.rst
index 12e01db5544e847a775d330929d1eea916cae74e..8c37479a28ea7c2ae76bbcce9cf5bfc53646a2cb 100644
--- a/docs/finn/command_line.rst
+++ b/docs/finn/command_line.rst
@@ -105,7 +105,7 @@ The following outputs will be generated regardless of which particular outputs a
 The other output products are controlled by the `generate_outputs` field in the
 build configuration), and are detailed below.
 
-* :py:mod:`finn.builder.build_dataflow.DataflowOutputType.ESTIMATE_REPORTS` produces a variety of reports to estimate resource usage and performance *without* running any synthesis. This can be useful for setting up the parallelization and other hardware configuration:
+* :py:mod:`finn.builder.build_dataflow_config.DataflowOutputType.ESTIMATE_REPORTS` produces a variety of reports to estimate resource usage and performance *without* running any synthesis. This can be useful for setting up the parallelization and other hardware configuration:
 
   * ``report/estimate_layer_cycles.json`` -- cycles per layer estimation from analytical model
   * ``report/estimate_layer_resources.json`` -- resources per layer estimation from analytical model
@@ -113,31 +113,31 @@ build configuration), and are detailed below.
   * ``report/estimate_network_performance.json`` -- whole-network performance estimation from analytical model
   * ``report/op_and_param_counts.json`` -- per-layer and total number of operations and parameters (independent of parallelization)
 
-* :py:mod:`finn.builder.build_dataflow.DataflowOutputType.STITCHED_IP`: produces a stitched Vivado IP block design that can be integrated with other FPGA designs in Vivado IPI:
+* :py:mod:`finn.builder.build_dataflow_config.DataflowOutputType.STITCHED_IP`: produces a stitched Vivado IP block design that can be integrated with other FPGA designs in Vivado IPI:
 
   * ``stitched_ip/finn_vivado_stitch_proj.xpr`` -- Vivado project (including Vivado IP Integrator block design) to generate the stitched IP
   * ``stitched_ip/ip`` -- exported Vivado IP for the stitched design
 
-* :py:mod:`finn.builder.build_dataflow.DataflowOutputType.RTLSIM_PERFORMANCE`: measure latency and performance for the stitched IP in RTL simulation, using PyVerilator
+* :py:mod:`finn.builder.build_dataflow_config.DataflowOutputType.RTLSIM_PERFORMANCE`: measure latency and performance for the stitched IP in RTL simulation, using PyVerilator
 
   * ``report/rtlsim_performance.json`` -- accelerator throughput and latency from RTL simulation
 
-* :py:mod:`finn.builder.build_dataflow.DataflowOutputType.OOC_SYNTH` runs out-of-context synthesis for the stitched IP. This is useful for getting post-synthesis resource counts and achievable clock frequency without having to produce a full bitfile with DMA engines:
+* :py:mod:`finn.builder.build_dataflow_config.DataflowOutputType.OOC_SYNTH` runs out-of-context synthesis for the stitched IP. This is useful for getting post-synthesis resource counts and achievable clock frequency without having to produce a full bitfile with DMA engines:
 
   * ``report/ooc_synth_and_timing.json`` -- resources and achievable clock frequency from out-of-context synthesis
 
-* :py:mod:`finn.builder.build_dataflow.DataflowOutputType.BITFILE` will run Vivado and/or Vitis to insert the FINN accelerator inside a shell, with DMA engines instantiated to move data to/from main memory:
+* :py:mod:`finn.builder.build_dataflow_config.DataflowOutputType.BITFILE` will run Vivado and/or Vitis to insert the FINN accelerator inside a shell, with DMA engines instantiated to move data to/from main memory:
 
   * ``bitfile/finn-accel.(bit|xclbin)`` -- generated bitfile depending on platform
   * ``report/post_synth_resources.xml`` -- FPGA resource utilization after synthesis
   * ``report/post_route_timing.rpt`` -- post-route timing report
 
 
-* :py:mod:`finn.builder.build_dataflow.DataflowOutputType.PYNQ_DRIVER` will generate a PYNQ Python driver that can be used to interface the generated accelerator:
+* :py:mod:`finn.builder.build_dataflow_config.DataflowOutputType.PYNQ_DRIVER` will generate a PYNQ Python driver that can be used to interface the generated accelerator:
 
   * ``driver/driver.py`` -- Python driver that can be used on PYNQ on Zynq or Alveo platforms to launch the accelerator
 
-* :py:mod:`finn.builder.build_dataflow.DataflowOutputType.DEPLOYMENT_PACKAGE`:
+* :py:mod:`finn.builder.build_dataflow_config.DataflowOutputType.DEPLOYMENT_PACKAGE`:
 
   * ``deploy/`` -- deployment package folder with a bitfile and driver, ready to be copied to target hardware platform
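+
+For example, a build configuration that requests only the estimate reports and
+a stitched IP could be set up as follows (a minimal sketch; the model file name
+and FPGA part are placeholders):
+
+.. code-block:: python
+
+  from finn.builder.build_dataflow import build_dataflow_cfg
+  from finn.builder.build_dataflow_config import (
+      DataflowBuildConfig,
+      DataflowOutputType,
+  )
+
+  cfg = DataflowBuildConfig(
+      output_dir="build_output",
+      synth_clk_period_ns=10.0,
+      fpga_part="xc7z020clg400-1",
+      generate_outputs=[
+          DataflowOutputType.ESTIMATE_REPORTS,
+          DataflowOutputType.STITCHED_IP,
+      ],
+  )
+  build_dataflow_cfg("model.onnx", cfg)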
 
@@ -153,7 +153,7 @@ and compare it against the expected output that you provide.
 
 This is achieved by setting up the following members of the build configuration:
 
-* Set ``verify_steps`` to be a list of :py:mod:`finn.builder.build_dataflow.VerificationStepType`
+* Set ``verify_steps`` to be a list of :py:mod:`finn.builder.build_dataflow_config.VerificationStepType`
   where each element in the list indicates the output of a particular step
   that will be verified. See the documentation of the ``VerificationStepType``
   for more information.
diff --git a/docs/finn/developers.rst b/docs/finn/developers.rst
index b152dfef66d0eb47e086d3c5cd51174c5df52128..f9252f764c3f8297140f81d7ed42ab2da1218dae 100644
--- a/docs/finn/developers.rst
+++ b/docs/finn/developers.rst
@@ -12,7 +12,7 @@ Prerequisites
 
 Before starting to do development on FINN it's a good idea to start
 with understanding the basics as a user. Going through all of the
-:ref:`tutorials` is strongly recommended if you haven' already done so.
+:ref:`tutorials` is strongly recommended if you haven't already done so.
 Additionally, please review the documentation available on :ref:`internals`.
 
 Repository structure
@@ -153,7 +153,7 @@ from the FINN root directory as follows:
 
 ::
 
-  python setup.py test --addopts "-k test_brevitas_debug --pdb"
+  pytest -k test_brevitas_debug --pdb
 
 
 If you want to run tests in parallel (e.g. to take advantage of a multi-core CPU)
diff --git a/docs/finn/end_to_end_flow.rst b/docs/finn/end_to_end_flow.rst
index bc5c5230718bcc8dd50334cc1f20c3c84c012ca4..0a022067c38ec3bb3c793d288e0230013ca8b21c 100644
--- a/docs/finn/end_to_end_flow.rst
+++ b/docs/finn/end_to_end_flow.rst
@@ -9,7 +9,7 @@ As you can see in the picture, FINN has a high modularity and has the property t
    :scale: 50%
    :align: center
 
-The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into five sections, each of it includes several flow steps. The flow starts in top left corner with Brevitas export (green section), followed by the preparation of the network (blue section) for the Vivado HLS and Vivado IPI (orange section). There is also a section for testing and verification in software (red section) and the hardware generation and deployment on the PYNQ board (yellow section).
+The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into five sections, each of which includes several flow steps. The flow starts in the top left corner with the Brevitas export, followed by the preparation of the network for Vitis HLS and Vivado IPI. There is also a section for testing and verification in software (in the cloud on the right) and one for hardware generation and deployment on the PYNQ board.
 
 This example flow is covered in the `end2end_example <https://github.com/Xilinx/finn/tree/main/notebooks/end2end_example>`_ Jupyter notebooks.
 For a more detailed overview about the different flow sections, please have a look at the corresponding pages:
diff --git a/docs/finn/getting_started.rst b/docs/finn/getting_started.rst
index 40425c119fafdcd03292b05c7a7e71310f767239..9b3111b70eae97a3644e1de23c368bd5b09f7927 100644
--- a/docs/finn/getting_started.rst
+++ b/docs/finn/getting_started.rst
@@ -20,7 +20,7 @@ How do I use FINN?
 ==================
 
 We strongly recommend that you first watch one of the pre-recorded `FINN tutorial <https://www.youtube.com/watch?v=zw2aG4PhzmA&amp%3Bindex=2>`_
-videos, then follow the Jupyter notebook tutorials for `training and deploying an MLP for network intrusion detection <https://github.com/Xilinx/finn/tree/master/notebooks/end2end_example/cybersecurity>`_ .
+videos, then follow the Jupyter notebook tutorials for `training and deploying an MLP for network intrusion detection <https://github.com/Xilinx/finn/tree/main/notebooks/end2end_example/cybersecurity>`_ .
 You may also want to check out the other :ref:`tutorials`, and the `FINN examples repository <https://github.com/Xilinx/finn-examples>`_ .
 
 Our aim in FINN is *not* to accelerate common off-the-shelf neural networks, but instead provide you with a set of tools
@@ -28,19 +28,19 @@ to train *customized* networks and create highly-efficient FPGA implementations
 In general, the approach for using the FINN framework is as follows:
 
 1. Train your own quantized neural network (QNN) in `Brevitas <https://github.com/Xilinx/brevitas>`_. We have some `guidelines <https://bit.ly/finn-hls4ml-qat-guidelines>`_ on quantization-aware training (QAT).
-2. Export to FINN-ONNX by following `this tutorial <https://github.com/Xilinx/finn/blob/master/notebooks/basics/1_brevitas_network_import.ipynb>`_ .
-3. Use FINN's ``build_dataflow`` system on the exported model by following this `tutorial <https://github.com/Xilinx/finn/blob/master/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb>`_
+2. Export to FINN-ONNX by following `this tutorial <https://github.com/Xilinx/finn/blob/main/notebooks/basics/1_brevitas_network_import.ipynb>`_ .
+3. Use FINN's ``build_dataflow`` system on the exported model by following this `tutorial <https://github.com/Xilinx/finn/blob/main/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb>`_
 4. Adjust your QNN topology, quantization settings and ``build_dataflow`` configuration to get the desired results.
 
 Please note that the framework is still under development, and how well this works will depend on how similar your custom network is to the examples we provide.
 If there are substantial differences, you will most likely have to write your own
 Python scripts that call the appropriate FINN compiler
 functions that process your design correctly, or adding new functions (including
-Vivado HLS layers)
+Vitis HLS layers)
 as required.
-The `advanced FINN tutorials <https://github.com/Xilinx/finn/tree/master/notebooks/advanced>`_ can be useful here.
+The `advanced FINN tutorials <https://github.com/Xilinx/finn/tree/main/notebooks/advanced>`_ can be useful here.
 For custom networks, we recommend making a copy of the `BNN-PYNQ end-to-end
-Jupyter notebook tutorials <https://github.com/Xilinx/finn/tree/master/notebooks/end2end_example/bnn-pynq>`_ as a starting point, visualizing the model at intermediate
+Jupyter notebook tutorials <https://github.com/Xilinx/finn/tree/main/notebooks/end2end_example/bnn-pynq>`_ as a starting point, visualizing the model at intermediate
 steps and adding calls to new transformations as needed.
 Once you have a working flow, you can implement a command line entry for this
 by using the "advanced mode" described in the :ref:`command_line` section.
@@ -50,7 +50,8 @@ Running FINN in Docker
 FINN runs inside a Docker container, it comes with a script to easily build and launch the container. If you are not familiar with Docker, there are many excellent `online resources <https://docker-curriculum.com/>`_ to get started.
 You may want to review the :ref:`General FINN Docker tips` and :ref:`Environment variables` as well.
 If you want to use prebuilt images, read :ref:`Using a prebuilt image`.
-The ``run-docker.sh`` script that can be launched in the following modes:
+
+The above-mentioned script to build and launch the FINN Docker container is called `run-docker.sh <https://github.com/Xilinx/finn/blob/main/run-docker.sh>`_ . It can be launched in the following modes:
 
 Launch interactive shell
 ************************
@@ -140,10 +141,7 @@ If you are having trouble building the Docker image or need offline access, you
 
 Supported FPGA Hardware
 =======================
-**Shell-integrated accelerator + driver:** For quick deployment, we target boards supported by  `PYNQ <http://www.pynq.io/>`_ . For these platforms, we can build a full bitfile including DMAs to move data into and out of the FINN-generated accelerator, as well as a Python driver to launch the accelerator. We support the Pynq-Z1, Pynq-Z2, Ultra96, ZCU102 and ZCU104 boards.
-
-.. warning::
-  In previous FINN versions (v0.4b - v0.7) we had support for `Xilinx Alveo boards <https://www.xilinx.com/products/boards-and-kits/alveo.html>`_ using PYNQ and Vitis 2020.1, see instructions below for Alveo setup that works with older versions. Please note that with the new release with Vitis 2022.1, we do only have experimental support to automatically deployment for Alveo cards.
+**Shell-integrated accelerator + driver:** For quick deployment, we target boards supported by `PYNQ <http://www.pynq.io/>`_ . For these platforms, we can build a full bitfile including DMAs to move data into and out of the FINN-generated accelerator, as well as a Python driver to launch the accelerator. We support the Pynq-Z1, Pynq-Z2, Ultra96, ZCU102 and ZCU104 boards, as well as Alveo cards.
 
 **Vivado IPI support for any Xilinx FPGA:** FINN generates a Vivado IP Integrator (IPI) design from the neural network with AXI stream (FIFO) in-out interfaces, which can be integrated onto any Xilinx FPGA as part of a larger system. It's up to you to take the FINN-generated accelerator (what we call "stitched IP" in the tutorials), wire it up to your FPGA design and send/receive neural network data to/from the accelerator.
 
@@ -181,12 +179,12 @@ On the target side:
 
 On the host side:
 
-1. Install Vitis 2020.1 and set up the ``VITIS_PATH`` environment variable to point to your installation.
+1. Install Vitis 2022.1 and set up the ``VITIS_PATH`` environment variable to point to your installation.
 2. Install Xilinx XRT. Ensure that the ``XRT_DEB_VERSION`` environment variable reflects which version of XRT you have installed.
 3. Install the Vitis platform files for Alveo and set up the ``PLATFORM_REPO_PATHS`` environment variable to point to your installation. *This must be the same path as the target's platform files (target step 2)*
 4. Set up the ``ALVEO_*`` environment variables accordingly for your target, see description of environment variables above.
 5. `Set up public key authentication <https://www.digitalocean.com/community/tutorials/how-to-configure-ssh-key-based-authentication-on-a-linux-server>`_. Copy your private key to the ``finn/ssh_keys`` folder on the host to get password-less deployment and remote execution.
-6. Done! You can try the ``test_end2end_vitis`` tests in the FINN Docker to verify your setup, although this will take some time.
+6. Done!
 
 Vivado/Vitis license
 *********************
@@ -214,7 +212,7 @@ We also recommend running the FINN compiler on a system with sufficiently
 strong hardware:
 
 * **RAM.** Depending on your target FPGA platform, your system must have sufficient RAM to be
-  able to run Vivado/Vitis synthesis for that part. See `this page <https://www.xilinx.com/products/design-tools/vivado/memory.html>`_
+  able to run Vivado/Vitis synthesis for that part. See `this page <https://www.xilinx.com/products/design-tools/vivado/vivado-ml.html#memory>`_
   for more information. For targeting Zynq and Zynq UltraScale+ parts, at least 8 GB is recommended. Larger parts may require up to 16 GB.
   For targeting Alveo parts with Vitis, at least 64 GB RAM is recommended.
 
diff --git a/docs/finn/hw_build.rst b/docs/finn/hw_build.rst
index 2a64b87943075ff004f79c9d457136e41e27723d..a5c486935d531f7a037f3c49ead5bc7906afa831 100644
--- a/docs/finn/hw_build.rst
+++ b/docs/finn/hw_build.rst
@@ -9,14 +9,14 @@ Hardware Build and Deployment
    :align: center
 
 A model where all layers have been converted to HLS layers can be processed by
-FINN to build a bitfile and driver targeting a Zynq system or to generate a Vivado IP Integrator (IPI)
+FINN to build a bitfile and driver targeting a Zynq or Alveo system or to generate a Vivado IP Integrator (IPI)
 design with AXI stream (FIFO) in-out interfaces, which can be integrated onto any Xilinx FPGA as part of a larger system.
 
 
 Hardware Build
 ==============
 
-Internally, the hardware build for Zynq devices consists of the following steps:
+Internally, the hardware build consists of the following steps:
 
 1. Driver generation
 2. DMA and DWC node insertion
@@ -89,9 +89,4 @@ Deployment
 Deployment and Remote Execution
 -------------------------------
 
-The bitfile and the driver file(s) are copied to the PYNQ board and can be executed there using the *onnx_exec* function with the right *exec_mode* settings. For details please have a look at transformation :py:mod:`finn.transformation.fpgadataflow.make_deployment.DeployToPYNQ` and the execution function :py:mod:`finn.core.onnx_exec`.
-
-Throughput Test
----------------
-
-FINN also offers the possibility to measure the network performance directly on the PYNQ board. This can be done by using :py:mod:`finn.core.throughput_test`. When running this function the metrics of the network are returned as dictionary.
+The bitfile and the driver file(s) are copied to the PYNQ board and can be executed there. For more information see the description in the `end2end_example <https://github.com/Xilinx/finn/tree/main/notebooks/end2end_example>`_ Jupyter notebooks.
diff --git a/docs/finn/internals.rst b/docs/finn/internals.rst
index 0b33affc76484d2175a336b188661550731ca1ab..add70d649c773061c5b9e1d91dcaa852dcc4cbac 100644
--- a/docs/finn/internals.rst
+++ b/docs/finn/internals.rst
@@ -7,7 +7,7 @@ Internals
 Intermediate Representation: QONNX and FINN-ONNX
 ================================================
 
-FINN uses `ONNX <https://github.com/onnx/onnx>`_ as an intermediate representation (IR) for neural networks. As such, almost every component inside FINN uses ONNX and its `Python API <https://github.com/onnx/onnx/blob/master/docs/PythonAPIOverview.md>`_, so you may want to familiarize yourself with how ONNX represents DNNs. Specifically, the `ONNX protobuf description <https://github.com/onnx/onnx/blob/master/onnx/onnx.proto>`_ (or its `human-readable documentation <https://github.com/onnx/onnx/blob/master/docs/IR.md>`_ and the `operator schemas <https://github.com/onnx/onnx/blob/master/docs/Operators.md>`_ are useful as reference documents. We also provide a Jupyter notebook that can help to get familiar with ONNX by showing how to work with a simple ONNX model in FINN, see chapter :ref:`tutorials` for details.
+FINN uses `ONNX <https://github.com/onnx/onnx>`_ as an intermediate representation (IR) for neural networks. As such, almost every component inside FINN uses ONNX and its `Python API <https://github.com/onnx/onnx/blob/main/docs/PythonAPIOverview.md>`_, so you may want to familiarize yourself with how ONNX represents DNNs. Specifically, the `ONNX protobuf description <https://github.com/onnx/onnx/blob/main/onnx/onnx.proto>`_ (or its `human-readable documentation <https://github.com/onnx/onnx/blob/main/docs/IR.md>`_) and the `operator schemas <https://github.com/onnx/onnx/blob/main/docs/Operators.md>`_ are useful as reference documents. We also provide a Jupyter notebook that can help you get familiar with ONNX by showing how to work with a simple ONNX model in FINN, see chapter :ref:`tutorials` for details.
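+
+As a quick illustration, an ONNX model can be loaded into the wrapper around
+the ONNX Python API and inspected as follows (a minimal sketch; the model file
+name is a placeholder):
+
+.. code-block:: python
+
+  from qonnx.core.modelwrapper import ModelWrapper
+
+  model = ModelWrapper("model.onnx")
+  for node in model.graph.node:  # plain ONNX protobuf accessors
+      print(node.op_type)
+  print(model.get_tensor_shape(model.graph.input[0].name))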
 
 .. note:: FINN supports two specialized variants of ONNX called QONNX and FINN-ONNX, and not all ONNX graphs are supported by FINN (and vice versa).
 
@@ -137,14 +137,14 @@ ModelWrapper contains more useful functions, if you are interested please have a
 Analysis Pass
 =============
 
-An analysis pass traverses the graph structure and produces information about certain properties. It gets the model in the ModelWrapper as input and returns a dictionary of the properties the analysis extracts. If you are interested in how to write an analysis pass for FINN, please take a look at the Jupyter notebook about how to write an analysis pass, see chapter :ref:`tutorials` for details. For more information about existing analysis passes in FINN, see module :py:mod:`finn.analysis`.
+An analysis pass traverses the graph structure and produces information about certain properties. It gets the model in the ModelWrapper as input and returns a dictionary of the properties the analysis extracts. If you are interested in how to write an analysis pass for FINN, please take a look at the Jupyter notebook about how to write an analysis pass, see chapter :ref:`tutorials` for details. For more information about existing analysis passes in FINN, see module :py:mod:`finn.analysis` .
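+
+In its simplest form, an analysis pass is a function that takes the
+ModelWrapper and returns a dictionary (a minimal sketch, not an existing pass):
+
+.. code-block:: python
+
+  def count_node_types(model):
+      # map each op_type in the graph to its number of occurrences
+      counts = {}
+      for node in model.graph.node:
+          counts[node.op_type] = counts.get(node.op_type, 0) + 1
+      return counts
+
+  result = model.analysis(count_node_types)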
 
 .. _transformation_pass:
 
 Transformation Pass
 ===================
 
-A transformation passes changes (transforms) the given model, it gets the model in the ModelWrapper as input and returns the changed model (ModelWrapper) to the FINN flow. Additional the flag *model_was_changed* which indicates if a transformation has to be performed more than once, is returned. If you are interested in how to write a transformation pass for FINN, please take a look at the Jupyter notebook about how to write a transformation pass, see chapter :ref:`tutorials` for details. For more information about existing transformation passes in FINN, see module :py:mod:`finn.transformation`.
+A transformation pass changes (transforms) the given model: it gets the model in the ModelWrapper as input and returns the changed model (ModelWrapper) to the FINN flow. Additionally, the flag *model_was_changed*, which indicates if a transformation has to be performed more than once, is returned. If you are interested in how to write a transformation pass for FINN, please take a look at the Jupyter notebook about how to write a transformation pass, see chapter :ref:`tutorials` for details. For more information about existing transformation passes in FINN, see module :py:mod:`finn.transformation` .
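+
+A skeleton transformation pass subclasses the base class and implements
+``apply`` (a minimal sketch; the class name is a placeholder, not an existing
+pass):
+
+.. code-block:: python
+
+  from qonnx.transformation.base import Transformation
+
+  class MyTransformation(Transformation):
+      def apply(self, model):
+          model_was_changed = False
+          # ... inspect and modify model.graph here ...
+          return (model, model_was_changed)
+
+  model = model.transform(MyTransformation())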
 
 .. _mem_mode:
 
@@ -167,7 +167,7 @@ The following picture shows the idea behind the "const" and "decoupled" mode.
 
 Const mode
 ----------
-In *const* mode the weights are "baked in" into the Matrix-Vector-Activate-Unit (MVAU), which means they are part of the HLS code. During the IP block generation the weight values are integrated as *params.h* file in the HLS code and synthesized together with it. For the *const* mode IP block generation the `Matrix_Vector_Activate_Batch function <https://github.com/Xilinx/finn-hlslib/blob/19fa1197c09bca24a0f77a7fa04b8d7cb5cc1c1d/mvau.hpp#L93>`_ from the finn-hls library is used, which implements a standard MVAU. The resulting IP block has an input and an output stream, as shown in the above picture on the left. FIFOs in the form of verilog components are connected to these.
+In *const* mode the weights are "baked in" into the Matrix-Vector-Activate-Unit (MVAU), which means they are part of the HLS code. During the IP block generation the weight values are integrated as *params.h* file in the HLS code and synthesized together with it. For the *const* mode IP block generation the `Matrix_Vector_Activate_Batch function <https://github.com/Xilinx/finn-hlslib/blob/master/mvau.hpp#L92>`_ from the finn-hls library is used, which implements a standard MVAU. The resulting IP block has an input and an output stream, as shown in the above picture on the left. FIFOs in the form of verilog components are connected to these.
 
 Advantages:
 
@@ -185,7 +185,7 @@ Disadvantages:
 
 Decoupled mode
 --------------
-In *decoupled* mode a different variant of the MVAU with three ports is used. Besides the input and output streams, which are fed into the circuit via Verilog FIFOs, there is another input, which is used to stream the weights. For this the `streaming MVAU <https://github.com/Xilinx/finn-hlslib/blob/07a8353f6cdfd8bcdd81e309a5581044c2a93d3b/mvau.hpp#L213>`_ from the finn-hls library is used. To make the streaming possible a Verilog weight streamer component accesses the weight memory and sends the values via another FIFO to the MVAU. This component can be found in the `finn-rtllib <https://github.com/Xilinx/finn/tree/dev/finn-rtllib>`_ under the name *memstream.v*. For the IP block generation this component, the IP block resulting from the synthesis of the HLS code of the streaming MVAU and a FIFO for the weight stream are combined in a verilog wrapper. The weight values are saved in .dat files and stored in the weight memory from which the weight streamer reads. The resulting verilog component, which is named after the name of the node and has the suffix "_memstream.v", exposes only two ports to the outside, the data input and output. It therefore behaves externally in the same way as the MVAU in *const* mode.
+In *decoupled* mode a different variant of the MVAU with three ports is used. Besides the input and output streams, which are fed into the circuit via Verilog FIFOs, there is another input, which is used to stream the weights. For this the `streaming MVAU <https://github.com/Xilinx/finn-hlslib/blob/master/mvau.hpp#L214>`_ from the finn-hls library is used. To make the streaming possible a Verilog weight streamer component accesses the weight memory and sends the values via another FIFO to the MVAU. This component can be found in the `finn-rtllib <https://github.com/Xilinx/finn/tree/dev/finn-rtllib>`_ under the name *memstream.v*. For the IP block generation this component, the IP block resulting from the synthesis of the HLS code of the streaming MVAU and a FIFO for the weight stream are combined in a verilog wrapper. The weight values are saved in .dat files and stored in the weight memory from which the weight streamer reads. The resulting verilog component, which is named after the name of the node and has the suffix "_memstream.v", exposes only two ports to the outside, the data input and output. It therefore behaves externally in the same way as the MVAU in *const* mode.
 
 Advantages:
 
diff --git a/docs/finn/nw_prep.rst b/docs/finn/nw_prep.rst
index 566eda5bac38855e9ed8edfdf53193bb6c025256..6fea992cf70ad2cb29b385133ccdcf34606b2185 100644
--- a/docs/finn/nw_prep.rst
+++ b/docs/finn/nw_prep.rst
@@ -10,7 +10,7 @@ Network Preparation
 
 The main principle of FINN are analysis and transformation passes. If you like to have more information about these please have a look at section :ref:`analysis_pass` and :ref:`transformation_pass` or at chapter :ref:`tutorials` about the provided Jupyter notebooks.
 
-This page is about the network preparation, the flow step that comes after the :ref:`brevitas_export`. Its main idea is to optimize the network and convert the nodes to custom nodes that correspond to `finn-hlslib <https://github.com/Xilinx/finn-hlslib>`_ functions. In this way we get a network that we can bring to hardware with the help of Vivado. For that we have to apply several transformations on the ONNX model, which this flow step receives wrapped in the :ref:`modelwrapper`.
+This page is about the network preparation, the flow step that comes after the :ref:`brevitas_export`. Its main idea is to optimize the network and convert the nodes to custom nodes that correspond to `finn-hlslib <https://github.com/Xilinx/finn-hlslib>`_ functions. In this way we get a network that we can bring to hardware with the help of Vitis and Vivado. For that we have to apply several transformations on the ONNX model, which this flow step receives wrapped in the :ref:`modelwrapper`.
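+
+In code, this step boils down to chaining transformation passes on the wrapped
+model, for example (a minimal sketch; the exact sequence depends on the
+network):
+
+.. code-block:: python
+
+  from finn.transformation.streamline import Streamline
+  import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+
+  model = model.transform(Streamline())
+  model = model.transform(to_hls.InferBinaryMatrixVectorActivation())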
 
 Various transformations are involved in the network preparation. The following is a short overview of these.
 
diff --git a/docs/finn/source_code/finn.analysis.fpgadataflow.rst b/docs/finn/source_code/finn.analysis.fpgadataflow.rst
index b52e994ee6033d4c3c1aae6400e20e103455d7b6..57472cb670b6fa6cb95e6c137458d3a522f82f5a 100644
--- a/docs/finn/source_code/finn.analysis.fpgadataflow.rst
+++ b/docs/finn/source_code/finn.analysis.fpgadataflow.rst
@@ -30,6 +30,7 @@ finn.analysis.fpgadataflow.floorplan\_params
    :undoc-members:
    :show-inheritance:
 
+
 finn.analysis.fpgadataflow.hls\_synth\_res\_estimation
 -------------------------------------------------------------
 
@@ -38,14 +39,15 @@ finn.analysis.fpgadataflow.hls\_synth\_res\_estimation
    :undoc-members:
    :show-inheritance:
 
- finn.analysis.fpgadataflow.op\_and\_param\_counts
- --------------------------------------------------
+finn.analysis.fpgadataflow.op\_and\_param\_counts
+--------------------------------------------------
 
- .. automodule:: finn.analysis.fpgadataflow.op_and_param_counts
+.. automodule:: finn.analysis.fpgadataflow.op_and_param_counts
     :members:
     :undoc-members:
     :show-inheritance:
 
+
 finn.analysis.fpgadataflow.post\_synth\_res
 --------------------------------------------------
 
@@ -54,6 +56,7 @@ finn.analysis.fpgadataflow.post\_synth\_res
    :undoc-members:
    :show-inheritance:
 
+
 finn.analysis.fpgadataflow.res\_estimation
 -------------------------------------------------
 
diff --git a/docs/finn/source_code/finn.builder.rst b/docs/finn/source_code/finn.builder.rst
index 2433cab83d1aa140010f4082ec8323bdaa8c6ff4..caadf3f91f7c9aa06f04be356e9c3594fc208d2d 100644
--- a/docs/finn/source_code/finn.builder.rst
+++ b/docs/finn/source_code/finn.builder.rst
@@ -9,9 +9,9 @@ finn.builder.build\_dataflow
 ----------------------------
 
 .. automodule:: finn.builder.build_dataflow
-   :members:
-   :undoc-members:
-   :show-inheritance:
+ :members:
+ :undoc-members:
+ :show-inheritance:
 
 finn.builder.build\_dataflow\_config
 ------------------------------------
@@ -26,6 +26,6 @@ finn.builder.build\_dataflow\_steps
 ------------------------------------
 
 .. automodule:: finn.builder.build_dataflow_steps
-  :members:
-  :undoc-members:
-  :show-inheritance:
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/docs/finn/source_code/finn.core.rst b/docs/finn/source_code/finn.core.rst
index 4e3de458e153871d1d5969442af5940dc1673ecd..afa1ecffa08213db6a282076c6fdf59694f9e13e 100644
--- a/docs/finn/source_code/finn.core.rst
+++ b/docs/finn/source_code/finn.core.rst
@@ -37,6 +37,15 @@ qonnx.core.modelwrapper
    :undoc-members:
    :show-inheritance:
 
+qonnx.core.onnx\_exec
+---------------------------
+
+.. automodule:: qonnx.core.onnx_exec
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
 finn.core.onnx\_exec
 ---------------------------
 
diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst
index cc56ea603e589d7000fe5b2b2943e67cdb90c884..fdcf44c6d99561658b727dc64c0a1b98b247c7df 100644
--- a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst
+++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst
@@ -8,7 +8,7 @@ HLS Custom Op Nodes
 Base Class
 ----------
 
-.. automodule:: finn.custom_op.fpgadataflow
+.. automodule:: finn.custom_op.fpgadataflow.hlscustomop
    :members:
    :undoc-members:
    :show-inheritance:
@@ -29,9 +29,25 @@ finn.custom\_op.fpgadataflow.channelwise\_op\_batch
    :undoc-members:
    :show-inheritance:
 
+finn.custom\_op.fpgadataflow.checksum
+--------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.checksum
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+finn.custom\_op.fpgadataflow.concat
+-------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.concat
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 
 finn.custom\_op.fpgadataflow.convolutioninputgenerator
--------------------------------------------------------------
+--------------------------------------------------------
 
 .. automodule:: finn.custom_op.fpgadataflow.convolutioninputgenerator
    :members:
@@ -46,6 +62,15 @@ finn.custom\_op.fpgadataflow.convolutioninputgenerator1d
    :undoc-members:
    :show-inheritance:
 
+
+finn.custom\_op.fpgadataflow.convolutioninputgenerator\_rtl
+------------------------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.convolutioninputgenerator_rtl
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 finn.custom\_op.fpgadataflow.downsampler
 -----------------------------------------
 
@@ -62,6 +87,16 @@ finn.custom\_op.fpgadataflow.duplicatestreams\_batch
    :undoc-members:
    :show-inheritance:
 
+
+finn.custom\_op.fpgadataflow.eltwise
+-------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.eltwise
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
 finn.custom\_op.fpgadataflow.fmpadding\_batch
 -----------------------------------------------
 
@@ -79,7 +114,7 @@ finn.custom\_op.fpgadataflow.globalaccpool\_batch
    :show-inheritance:
 
 finn.custom\_op.fpgadataflow.iodma
------------------------------------------------
+------------------------------------
 
 .. automodule:: finn.custom_op.fpgadataflow.iodma
    :members:
@@ -102,6 +137,15 @@ finn.custom\_op.fpgadataflow.lookup
    :undoc-members:
    :show-inheritance:
 
+finn.custom\_op.fpgadataflow.matrixvectoractivation
+-----------------------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.matrixvectoractivation
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
 finn.custom\_op.fpgadataflow.pool\_batch
 -----------------------------------------------
 
@@ -127,14 +171,6 @@ finn.custom\_op.fpgadataflow.streamingdatawidthconverter\_batch
    :undoc-members:
    :show-inheritance:
 
-finn.custom\_op.fpgadataflow.matrixvectoractivation
------------------------------------------------------------
-
-.. automodule:: finn.custom_op.fpgadataflow.matrixvectoractivation
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
 finn.custom\_op.fpgadataflow.streamingfifo
 -------------------------------------------------
 
diff --git a/docs/finn/source_code/finn.custom_op.rst b/docs/finn/source_code/finn.custom_op.rst
index 20d90a7bb596d6ce5638d9b2d9bae8a5c7e5c723..cdbe957c713ef6916e4ed7baabe09135f71fdeef 100644
--- a/docs/finn/source_code/finn.custom_op.rst
+++ b/docs/finn/source_code/finn.custom_op.rst
@@ -9,6 +9,7 @@ Submodules
    :maxdepth: 2
 
    finn.custom_op.fpgadataflow
+   qonnx.custom_op.channels_last
    qonnx.custom_op.general
 
 Custom Op Nodes
diff --git a/docs/finn/source_code/finn.transformation.fpgadataflow.rst b/docs/finn/source_code/finn.transformation.fpgadataflow.rst
index b1e7075bdcfb675a894f3e66b61d59117e4f078d..9f8ec079309f16daa022e14317ebddfd7758d639 100644
--- a/docs/finn/source_code/finn.transformation.fpgadataflow.rst
+++ b/docs/finn/source_code/finn.transformation.fpgadataflow.rst
@@ -62,6 +62,14 @@ finn.transformation.fpgadataflow.create\_stitched\_ip
    :undoc-members:
    :show-inheritance:
 
+finn.transformation.fpgadataflow.derive\_characteristic
+------------------------------------------------------------
+
+.. automodule:: finn.transformation.fpgadataflow.derive_characteristic
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 finn.transformation.fpgadataflow.externalize\_params
 ------------------------------------------------------------
 
@@ -103,6 +111,17 @@ finn.transformation.fpgadataflow.insert\_fifo
    :undoc-members:
    :show-inheritance:
 
+
+finn.transformation.fpgadataflow.insert\_hook
+----------------------------------------------------
+
+.. automodule:: finn.transformation.fpgadataflow.insert_hook
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
+
 finn.transformation.fpgadataflow.insert\_iodma
 ----------------------------------------------------
 
diff --git a/docs/finn/source_code/finn.transformation.rst b/docs/finn/source_code/finn.transformation.rst
index 6a28eeedb2aa547ba80677864ae9fb8c6aa64097..f42b595a50ec90ef055e2818d66f4b2410c25594 100644
--- a/docs/finn/source_code/finn.transformation.rst
+++ b/docs/finn/source_code/finn.transformation.rst
@@ -20,7 +20,7 @@ Transformation Passes
 Base Class
 ----------
 
-.. automodule:: finn.transformation
+.. automodule:: qonnx.transformation.base
    :members:
    :undoc-members:
    :show-inheritance:
@@ -42,7 +42,7 @@ qonnx.transformation.bipolar\_to\_xnor
    :show-inheritance:
 
 qonnx.transformation.change\_3d\_tensors\_to\_4d
-------------------------------------------------
+-------------------------------------------------
 
 .. automodule:: qonnx.transformation.change_3d_tensors_to_4d
   :members:
@@ -57,8 +57,18 @@ qonnx.transformation.change\_datalayout
   :undoc-members:
   :show-inheritance:
 
+
+qonnx.transformation.channels\_last
+--------------------------------------------
+
+.. automodule:: qonnx.transformation.channels_last
+  :members:
+  :undoc-members:
+  :show-inheritance:
+
+
 qonnx.transformation.create\_generic\_partitions
-------------------------------------------------
+-------------------------------------------------
 
 .. automodule:: qonnx.transformation.create_generic_partitions
   :members:
@@ -171,13 +181,22 @@ qonnx.transformation.merge\_onnx\_models
   :show-inheritance:
 
 
-finn.transformation.move\_reshape
+qonnx.transformation.quant\_constant\_folding
+----------------------------------------------
+
+.. automodule:: qonnx.transformation.quant_constant_folding
+  :members:
+  :undoc-members:
+  :show-inheritance:
+
+
+qonnx.transformation.rebalance\_conv
 ----------------------------------------
 
-.. automodule:: finn.transformation.move_reshape
-   :members:
-   :undoc-members:
-   :show-inheritance:
+.. automodule:: qonnx.transformation.rebalance_conv
+  :members:
+  :undoc-members:
+  :show-inheritance:
 
 qonnx.transformation.remove
 -------------------------------------
@@ -186,3 +205,12 @@ qonnx.transformation.remove
   :members:
   :undoc-members:
   :show-inheritance:
+
+
+finn.transformation.move\_reshape
+----------------------------------------
+
+.. automodule:: finn.transformation.move_reshape
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/finn/source_code/finn.util.rst b/docs/finn/source_code/finn.util.rst
index 8dffa016327c3bbe50f21278c859c83556b2b213..7ba3b252abfa0086a8c0281eb9a792fb239d6ec3 100644
--- a/docs/finn/source_code/finn.util.rst
+++ b/docs/finn/source_code/finn.util.rst
@@ -14,6 +14,15 @@ qonnx.util.basic
    :show-inheritance:
 
 
+qonnx.util.cleanup
+----------------------
+
+.. automodule:: qonnx.util.cleanup
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
 qonnx.util.config
 --------------------
 
@@ -22,6 +31,40 @@ qonnx.util.config
   :undoc-members:
   :show-inheritance:
 
+qonnx.util.exec\_qonnx
+----------------------
+
+.. automodule:: qonnx.util.exec_qonnx
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+qonnx.util.inference\_cost
+--------------------------
+
+.. automodule:: qonnx.util.inference_cost
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+qonnx.util.onnx
+-------------------
+
+.. automodule:: qonnx.util.onnx
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
+qonnx.util.to\_channels\_last
+------------------------------
+
+.. automodule:: qonnx.util.to_channels_last
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
 finn.util.basic
 ----------------------
 
@@ -64,6 +107,15 @@ finn.util.gdrive
   :undoc-members:
   :show-inheritance:
 
+finn.util.hls
+---------------
+
+.. automodule:: finn.util.hls
+  :members:
+  :undoc-members:
+  :show-inheritance:
+
+
 finn.util.imagenet
 -----------------------------
 
@@ -72,14 +124,6 @@ finn.util.imagenet
   :undoc-members:
   :show-inheritance:
 
-qonnx.util.onnx
----------------------
-
-.. automodule:: qonnx.util.onnx
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
 finn.util.platforms
 --------------------
 
diff --git a/docs/finn/source_code/modules.rst b/docs/finn/source_code/modules.rst
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/docs/finn/source_code/qonnx.custom_op.channels_last.rst b/docs/finn/source_code/qonnx.custom_op.channels_last.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3ad10d94a6b34a99e2213994a75b0f063fd3d36f
--- /dev/null
+++ b/docs/finn/source_code/qonnx.custom_op.channels_last.rst
@@ -0,0 +1,41 @@
+**************************
+Custom Op - Channels Last
+**************************
+
+Channels Last Custom Ops
+=========================
+
+qonnx.custom\_op.channels\_last.base\_wrapped\_op
+--------------------------------------------------
+
+.. automodule:: qonnx.custom_op.channels_last.base_wrapped_op
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
+qonnx.custom\_op.channels\_last.batch\_normalization
+------------------------------------------------------
+
+.. automodule:: qonnx.custom_op.channels_last.batch_normalization
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
+qonnx.custom\_op.channels\_last.conv
+--------------------------------------
+
+.. automodule:: qonnx.custom_op.channels_last.conv
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
+qonnx.custom\_op.channels\_last.max\_pool
+------------------------------------------
+
+.. automodule:: qonnx.custom_op.channels_last.max_pool
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/finn/tutorials.rst b/docs/finn/tutorials.rst
index 110f77c5b10d2415ac2d2ff7b716567ec5cb76fa..7ac54501cf22a0b123b7b3d156a6a437e8045f22 100644
--- a/docs/finn/tutorials.rst
+++ b/docs/finn/tutorials.rst
@@ -46,3 +46,8 @@ The notebooks in this folder are more developer oriented. They should help you t
 * 2_custom_op
 
   * Explains the basics of FINN custom ops and how to define a new one.
+
+FINN Example FPGA Flow Using MNIST Numerals
+============================================
+
+Next to the Jupyter notebooks above, there is a tutorial about the command-line ``build_dataflow`` `here <https://github.com/Xilinx/finn/tree/main/tutorials/fpga_flow>`_ which shows how to bring a FINN-compiled model into the Vivado FPGA design environment.
diff --git a/fetch-repos.sh b/fetch-repos.sh
index 2dd5e519342c01ebe5c3af72784471a537b216b2..5b060f5bc83b9d9709fc7e34e855543b4690af5a 100755
--- a/fetch-repos.sh
+++ b/fetch-repos.sh
@@ -27,15 +27,16 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-QONNX_COMMIT="398a0ecfcb32407c0a3df39246cf6d2bca02886c"
+QONNX_COMMIT="dd35a8ff49d7225a07ffceeebe25a6361df48349"
 FINN_EXP_COMMIT="9cbd2787b5160e2b44e0e8164a0df1457dbd5366"
 BREVITAS_COMMIT="a5b71d6de1389d3e7db898fef72e014842670f03"
-PYVERILATOR_COMMIT="64b8294ff1afebb47be76fcad6ae87027e0402c2"
+PYVERILATOR_COMMIT="766e457465f5c0dd315490d7b9cc5d74f9a76f4f"
 CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4"
-HLSLIB_COMMIT="e9946e5e56acd85837e8e79224d2bb60764bed69"
+HLSLIB_COMMIT="d27f6b6c5d8f1bb208db395659389603f63ad4be"
 OMX_COMMIT="d1065a788219ca0eb54d5e57600b1f9d7f67d4cc"
 AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b"
 XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e"
+KV260_BDF_COMMIT="98e0d3efc901f0b974006bc4370c2a7ad8856c79"
 EXP_BOARD_FILES_MD5="30eecc497c31050bd46d10ea20eba232"
 
 QONNX_URL="https://github.com/fastmachinelearning/qonnx.git"
@@ -47,6 +48,7 @@ HLSLIB_URL="https://github.com/Xilinx/finn-hlslib.git"
 OMX_URL="https://github.com/maltanar/oh-my-xilinx.git"
 AVNET_BDF_URL="https://github.com/Avnet/bdf.git"
 XIL_BDF_URL="https://github.com/Xilinx/XilinxBoardStore.git"
+KV260_BDF_URL="https://github.com/Xilinx/XilinxBoardStore.git"
 
 QONNX_DIR="qonnx"
 FINN_EXP_DIR="finn-experimental"
@@ -57,6 +59,7 @@ HLSLIB_DIR="finn-hlslib"
 OMX_DIR="oh-my-xilinx"
 AVNET_BDF_DIR="avnet-bdf"
 XIL_BDF_DIR="xil-bdf"
+KV260_SOM_BDF_DIR="kv260-som-bdf"
 
 # absolute path to this script, e.g. /home/user/bin/foo.sh
 SCRIPT=$(readlink -f "$0")
@@ -104,6 +107,7 @@ fetch_board_files() {
     unzip -q pynq-z2.zip
     cp -r $SCRIPTPATH/deps/$AVNET_BDF_DIR/* $SCRIPTPATH/deps/board_files/
     cp -r $SCRIPTPATH/deps/$XIL_BDF_DIR/boards/Xilinx/rfsoc2x2 $SCRIPTPATH/deps/board_files/;
+    cp -r $SCRIPTPATH/deps/$KV260_SOM_BDF_DIR/boards/Xilinx/kv260_som $SCRIPTPATH/deps/board_files/;
     cd $OLD_PWD
 }
 
@@ -116,6 +120,7 @@ fetch_repo $HLSLIB_URL $HLSLIB_COMMIT $HLSLIB_DIR
 fetch_repo $OMX_URL $OMX_COMMIT $OMX_DIR
 fetch_repo $AVNET_BDF_URL $AVNET_BDF_COMMIT $AVNET_BDF_DIR
 fetch_repo $XIL_BDF_URL $XIL_BDF_COMMIT $XIL_BDF_DIR
+fetch_repo $KV260_BDF_URL $KV260_BDF_COMMIT $KV260_SOM_BDF_DIR
 
 # download extra Pynq board files and extract if needed
 if [ ! -d "$SCRIPTPATH/deps/board_files" ]; then
diff --git a/finn-rtllib/fmpadding/hdl/axi2we.sv b/finn-rtllib/fmpadding/hdl/axi2we.sv
new file mode 100644
index 0000000000000000000000000000000000000000..842ba3632c4224d58f87c66e1affc4c028b60ef3
--- /dev/null
+++ b/finn-rtllib/fmpadding/hdl/axi2we.sv
@@ -0,0 +1,122 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	AXI-Lite adapter for a trivial write-enable interface.
+ * @author	Thomas B. Preußer <tpreusse@amd.com>
+ *****************************************************************************/
+
+module axi2we #(
+	int unsigned  ADDR_BITS
+)(
+	//- Global Control ------------------
+	input	logic  ap_clk,
+	input	logic  ap_rst_n,
+
+	//- AXI Lite ------------------------
+	// Writing
+	input	                 s_axilite_AWVALID,
+	output	                 s_axilite_AWREADY,
+	input	[ADDR_BITS-1:0]  s_axilite_AWADDR,
+
+	input	        s_axilite_WVALID,
+	output	        s_axilite_WREADY,
+	input	[31:0]  s_axilite_WDATA,
+	input	[ 3:0]  s_axilite_WSTRB,
+
+	output	       s_axilite_BVALID,
+	input	       s_axilite_BREADY,
+	output	[1:0]  s_axilite_BRESP,
+
+	// Reading tied to all-ones
+	input	       s_axilite_ARVALID,
+	output	       s_axilite_ARREADY,
+	input	[ADDR_BITS-1:0]  s_axilite_ARADDR,
+
+	output	        s_axilite_RVALID,
+	input	        s_axilite_RREADY,
+	output	[31:0]  s_axilite_RDATA,
+	output	[ 1:0]  s_axilite_RRESP,
+
+	// Write Enable Interface
+	output	logic                  we,
+	output	logic [ADDR_BITS-1:0]  wa,
+	output	logic [         31:0]  wd
+);
+
+	uwire  clk = ap_clk;
+	uwire  rst = !ap_rst_n;
+
+
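+	// Operation: AW and W beats are captured independently into Addr/Data.
+	// Once both are buffered and the response is being accepted (BREADY high),
+	// the single-cycle write-enable pulse fires and both buffers are cleared.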
+	logic  WABusy = 0;
+	logic  WDBusy = 0;
+	logic [ADDR_BITS-1:0]  Addr = 'x;
+	logic [         31:0]  Data = 'x;
+
+	assign	we = WABusy && WDBusy && s_axilite_BREADY;
+	assign	wa = Addr;
+	assign	wd = Data;
+
+	uwire  clr_wr = rst || we;
+	always_ff @(posedge clk) begin
+		if(clr_wr) begin
+			WABusy <= 0;
+			Addr <= 'x;
+			WDBusy <= 0;
+			Data <= 'x;
+		end
+		else begin
+			if(!WABusy) begin
+				WABusy <= s_axilite_AWVALID;
+				Addr   <= s_axilite_AWADDR;
+			end
+			if(!WDBusy) begin
+				WDBusy <= s_axilite_WVALID;
+				Data   <= s_axilite_WDATA;
+			end
+		end
+	end
+	assign	s_axilite_AWREADY = !WABusy;
+	assign	s_axilite_WREADY  = !WDBusy;
+	assign	s_axilite_BVALID  = WABusy && WDBusy;
+	assign	s_axilite_BRESP   = '0; // OK
+
+	// Answer all reads with '1
+	logic  RValid =  0;
+	uwire  clr_rd = rst || (RValid && s_axilite_RREADY);
+	always_ff @(posedge clk) begin
+		if(clr_rd)        RValid <=  0;
+		else if(!RValid)  RValid <= s_axilite_ARVALID;
+	end
+	assign	s_axilite_ARREADY = !RValid;
+	assign	s_axilite_RVALID  = RValid;
+	assign	s_axilite_RDATA   = '1;
+	assign	s_axilite_RRESP   = '0; // OK
+
+endmodule : axi2we
diff --git a/finn-rtllib/fmpadding/hdl/fmpadding.sv b/finn-rtllib/fmpadding/hdl/fmpadding.sv
new file mode 100644
index 0000000000000000000000000000000000000000..904c7c381f7b2499fc354ebf798e86edab262866
--- /dev/null
+++ b/finn-rtllib/fmpadding/hdl/fmpadding.sv
@@ -0,0 +1,224 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Feature map padding.
+ * @author	Thomas B. Preußer <tpreusse@amd.com>
+ *****************************************************************************/
+
+module fmpadding #(
+	int unsigned  XCOUNTER_BITS,
+	int unsigned  YCOUNTER_BITS,
+	int unsigned  NUM_CHANNELS,
+	int unsigned  SIMD,
+	int unsigned  ELEM_BITS,
+	int unsigned  INIT_XON,
+	int unsigned  INIT_XOFF,
+	int unsigned  INIT_XEND,
+	int unsigned  INIT_YON,
+	int unsigned  INIT_YOFF,
+	int unsigned  INIT_YEND,
+
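+	// STREAM_BITS rounds the SIMD*ELEM_BITS payload up to the next multiple of 8 bits (AXI-Stream byte alignment).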
+	localparam int unsigned  STREAM_BITS = 8*(1 + (SIMD*ELEM_BITS-1)/8)
+)(
+	//- Global Control ------------------
+	input	logic  ap_clk,
+	input	logic  ap_rst_n,
+
+	// Parameter Configuration ----------
+	input	logic         we,
+	input	logic [ 4:0]  wa,
+	input	logic [31:0]  wd,
+
+	//- AXI Stream - Input --------------
+	output	logic  s_axis_tready,
+	input	logic  s_axis_tvalid,
+	input	logic [STREAM_BITS-1:0]  s_axis_tdata,
+
+	//- AXI Stream - Output -------------
+	input	logic  m_axis_tready,
+	output	logic  m_axis_tvalid,
+	output	logic [STREAM_BITS-1:0]  m_axis_tdata
+);
+
+	uwire  clk = ap_clk;
+	uwire  rst = !ap_rst_n;
+
+	//-----------------------------------------------------------------------
+	// Parameter Sanity Checking
+	initial begin
+		automatic bit  fail = 0;
+
+		if(XCOUNTER_BITS < $clog2(1+INIT_XEND)) begin
+			$error("XCounter size too small to accommodate end count.");
+			fail = 1;
+		end
+		if(XCOUNTER_BITS < $clog2(1+INIT_XON)) begin
+			$error("XCounter size too small to accommodate ON count.");
+			fail = 1;
+		end
+		if(XCOUNTER_BITS < $clog2(1+INIT_XOFF)) begin
+			$error("XCounter size too small to accommodate OFF count.");
+			fail = 1;
+		end
+		if(YCOUNTER_BITS < $clog2(1+INIT_YEND)) begin
+			$error("YCounter size too small to accommodate end count.");
+			fail = 1;
+		end
+		if(YCOUNTER_BITS < $clog2(1+INIT_YON)) begin
+			$error("YCounter size too small to accommodate ON count.");
+			fail = 1;
+		end
+		if(YCOUNTER_BITS < $clog2(1+INIT_YOFF)) begin
+			$error("YCounter size too small to accommodate OFF count.");
+			fail = 1;
+		end
+
+		if((INIT_XEND < INIT_XON) || (INIT_XOFF <= INIT_XON)) begin
+			$warning("Initial empty X output range.");
+		end
+		if((INIT_YEND < INIT_YON) || (INIT_YOFF <= INIT_YON)) begin
+			$warning("Initial empty Y output range.");
+		end
+
+		if(fail)  $finish();
+	end
+
+	//-----------------------------------------------------------------------
+	// Dynamically configurable state
+	typedef logic [XCOUNTER_BITS-1:0]  xcount_t;
+	xcount_t  XEnd = INIT_XEND;
+	xcount_t  XOn  = INIT_XON;
+	xcount_t  XOff = INIT_XOFF;
+
+	typedef logic [YCOUNTER_BITS-1:0]  ycount_t;
+	ycount_t  YEnd = INIT_YEND;
+	ycount_t  YOn  = INIT_YON;
+	ycount_t  YOff = INIT_YOFF;
+
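+	// Byte-addressed register map (4-byte stride):
+	//   0x00 XOn, 0x04 XOff, 0x08 XEnd, 0x0C YOn, 0x10 YOff, 0x14 YEnd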
+	always_ff @(posedge clk) begin
+		if(we) begin
+			unique case(wa)
+			0*4:  XOn  <= wd;
+			1*4:  XOff <= wd;
+			2*4:  XEnd <= wd;
+			3*4:  YOn  <= wd;
+			4*4:  YOff <= wd;
+			5*4:  YEnd <= wd;
+
+			default:  assert(0) else begin
+				$error("Illegal write address.");
+				$stop;
+			end
+			endcase
+		end
+	end
+
+	//-----------------------------------------------------------------------
+	// Cascaded enables for the nested counters: SCount, XCount, YCount
+	uwire  sen;
+	uwire  xen;
+	uwire  yen;
+
+	//- S-Counter: SIMD fold ------------
+	initial begin
+		if((NUM_CHANNELS < 1) || (NUM_CHANNELS % SIMD != 0)) begin
+			$error("Channel count must be SIMD multiple.");
+			$finish;
+		end
+	end
+	// Count SF-2, SF-3, ..., 1, 0, -1
+	localparam int unsigned  SF = NUM_CHANNELS/SIMD;
+	typedef logic [$clog2(SF-1):0]  scount_t;
+	scount_t  SCount = SF-2;
+
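+	// The MSB of SCount acts as an underflow flag: it sets when the count wraps past 0, marking the last SIMD beat of a pixel.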
+	assign	xen = sen && SCount[$left(SCount)];
+	uwire  sclr = rst || xen;
+	always_ff @(posedge clk) begin
+		if(sclr)      SCount <= SF-2;
+		else if(sen)  SCount <= SCount - 1;
+	end
+
+	//- X-Counter: image width ----------
+	xcount_t  XCount = 0;
+
+	assign	yen = xen && (XCount == XEnd);
+	uwire  xclr = rst || yen;
+	always_ff @(posedge clk) begin
+		if(xclr)      XCount <= 0;
+		else if(xen)  XCount <= XCount + 1;
+	end
+	uwire  xfwd = (XOn <= XCount) && (XCount < XOff);
+
+	//- Y-Counter: image height ---------
+	ycount_t  YCount = 0;
+
+	uwire  yclr = rst || (yen && (YCount == YEnd));
+	always_ff @(posedge clk) begin
+		if(yclr)      YCount <= 0;
+		else if(yen)  YCount <= YCount + 1;
+	end
+	uwire  yfwd = (YOn <= YCount) && (YCount < YOff);
+
+	//-----------------------------------------------------------------------
+	// Input forwarding and edge padding
+	typedef struct {
+		logic  vld;
+		logic [STREAM_BITS-1:0]  dat;
+	} buf_t;
+	buf_t  A = '{ vld: 0, dat: 'x };
+	buf_t  B = '{ vld: 0, dat: 'x };
+
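+	// Two-stage buffering: A is a skid buffer toward the input, B the output register.
+	// Outside the XOn/XOff x YOn/YOff window (fwd low), a zero padding beat is emitted without consuming input.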
+	uwire  fwd = xfwd && yfwd;
+	assign	sen = (m_axis_tready || !B.vld) && (s_axis_tvalid || A.vld || !fwd);
+	assign	s_axis_tready = !A.vld;
+	assign	m_axis_tvalid =  B.vld;
+	assign	m_axis_tdata  =  B.dat;
+
+	always_ff @(posedge clk) begin
+		if(rst) begin
+			B <= '{ vld: 0, dat: 'x };
+		end
+		else if(m_axis_tready || !B.vld) begin
+			B.vld <= s_axis_tvalid || A.vld || !fwd;
+			B.dat <= !fwd? '0 : A.vld? A.dat : s_axis_tdata;
+		end
+	end
+
+	always_ff @(posedge clk) begin
+		if(rst) begin
+			A <= '{ vld: 0, dat: 'x };
+		end
+		else begin
+			A.vld <= (A.vld || s_axis_tvalid) && ((B.vld && !m_axis_tready) || !fwd);
+			if(!A.vld)  A.dat <= s_axis_tdata;
+		end
+	end
+
+endmodule : fmpadding
diff --git a/finn-rtllib/fmpadding/hdl/fmpadding_axi.sv b/finn-rtllib/fmpadding/hdl/fmpadding_axi.sv
new file mode 100644
index 0000000000000000000000000000000000000000..5948341d000a1dd82ff363b36557f897d3a064c7
--- /dev/null
+++ b/finn-rtllib/fmpadding/hdl/fmpadding_axi.sv
@@ -0,0 +1,123 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Feature map padding.
+ * @author	Thomas B. Preußer <tpreusse@amd.com>
+ *****************************************************************************/
+
+module fmpadding_axi #(
+	int unsigned  XCOUNTER_BITS,
+	int unsigned  YCOUNTER_BITS,
+	int unsigned  NUM_CHANNELS,
+	int unsigned  SIMD,
+	int unsigned  ELEM_BITS,
+	int unsigned  INIT_XON,
+	int unsigned  INIT_XOFF,
+	int unsigned  INIT_XEND,
+	int unsigned  INIT_YON,
+	int unsigned  INIT_YOFF,
+	int unsigned  INIT_YEND,
+
+	localparam int unsigned  STREAM_BITS = 8*(1 + (SIMD*ELEM_BITS-1)/8)
+)(
+	//- Global Control ------------------
+	input	logic  ap_clk,
+	input	logic  ap_rst_n,
+
+	//- AXI Lite ------------------------
+	// Writing
+	input	       s_axilite_AWVALID,
+	output	       s_axilite_AWREADY,
+	input	[4:0]  s_axilite_AWADDR,
+
+	input	        s_axilite_WVALID,
+	output	        s_axilite_WREADY,
+	input	[31:0]  s_axilite_WDATA,
+	input	[ 3:0]  s_axilite_WSTRB,
+
+	output	       s_axilite_BVALID,
+	input	       s_axilite_BREADY,
+	output	[1:0]  s_axilite_BRESP,
+
+	// Reading
+	input	       s_axilite_ARVALID,
+	output	       s_axilite_ARREADY,
+	input	[4:0]  s_axilite_ARADDR,
+
+	output	        s_axilite_RVALID,
+	input	        s_axilite_RREADY,
+	output	[31:0]  s_axilite_RDATA,
+	output	[ 1:0]  s_axilite_RRESP,
+
+	//- AXI Stream - Input --------------
+	output	logic  s_axis_tready,
+	input	logic  s_axis_tvalid,
+	input	logic [STREAM_BITS-1:0]  s_axis_tdata,
+
+	//- AXI Stream - Output -------------
+	input	logic  m_axis_tready,
+	output	logic  m_axis_tvalid,
+	output	logic [STREAM_BITS-1:0]  m_axis_tdata
+);
+
+	// AXI-Lite Adapter
+	uwire         we;
+	uwire [ 4:0]  wa;
+	uwire [31:0]  wd;
+	axi2we #(.ADDR_BITS(5)) axilight_adapter (
+		.ap_clk, .ap_rst_n,
+
+		.s_axilite_AWVALID, .s_axilite_AWREADY, .s_axilite_AWADDR,
+		.s_axilite_WVALID, .s_axilite_WREADY, .s_axilite_WDATA, .s_axilite_WSTRB,
+		.s_axilite_BVALID, .s_axilite_BREADY, .s_axilite_BRESP,
+
+		.s_axilite_ARVALID, .s_axilite_ARREADY, .s_axilite_ARADDR,
+		.s_axilite_RVALID, .s_axilite_RREADY, .s_axilite_RDATA, .s_axilite_RRESP,
+
+		.we, .wa, .wd
+	);
+
+	// Actual Padding
+	fmpadding #(
+		.XCOUNTER_BITS(XCOUNTER_BITS), .YCOUNTER_BITS(YCOUNTER_BITS),
+		.NUM_CHANNELS(NUM_CHANNELS), .SIMD(SIMD),
+		.INIT_XON(INIT_XON), .INIT_XOFF(INIT_XOFF), .INIT_XEND(INIT_XEND),
+		.INIT_YON(INIT_YON), .INIT_YOFF(INIT_YOFF), .INIT_YEND(INIT_YEND),
+		.ELEM_BITS(ELEM_BITS)
+	) padding (
+		.ap_clk, .ap_rst_n,
+
+		.we, .wa, .wd,
+
+		.s_axis_tready, .s_axis_tvalid, .s_axis_tdata,
+		.m_axis_tready, .m_axis_tvalid, .m_axis_tdata
+	);
+
+endmodule : fmpadding_axi
diff --git a/finn-rtllib/fmpadding/hdl/fmpadding_axi_tb.sv b/finn-rtllib/fmpadding/hdl/fmpadding_axi_tb.sv
new file mode 100644
index 0000000000000000000000000000000000000000..741689b3a7af7ad4d07f2af569f71135c1d35c7b
--- /dev/null
+++ b/finn-rtllib/fmpadding/hdl/fmpadding_axi_tb.sv
@@ -0,0 +1,154 @@
+
+module fmpadding_axi_tb #(
+	int unsigned  XCOUNTER_BITS = 8,
+	int unsigned  YCOUNTER_BITS = 8,
+	int unsigned  NUM_CHANNELS  = 4,
+	int unsigned  SIMD          = 2,
+	int unsigned  ELEM_BITS     = 4
+)();
+	localparam int unsigned  STREAM_BITS = 8*(1 + (SIMD*ELEM_BITS-1)/8);
+
+	//- Global Control ------------------
+	logic  clk = 0;
+	always #5ns clk = !clk;
+	logic  rst;
+
+	// AXI-Lite for Parameter Configuration
+	logic	       s_axilite_AWVALID;
+	uwire	       s_axilite_AWREADY;
+	logic	[4:0]  s_axilite_AWADDR;
+
+	logic	        s_axilite_WVALID;
+	uwire	        s_axilite_WREADY;
+	logic	[31:0]  s_axilite_WDATA;
+
+	//- AXI Stream - Input --------------
+	uwire  s_axis_tready;
+	logic  s_axis_tvalid;
+	logic [STREAM_BITS-1:0]  s_axis_tdata;
+
+	//- AXI Stream - Output -------------
+	logic  m_axis_tready;
+	uwire  m_axis_tvalid;
+	uwire [STREAM_BITS-1:0]  m_axis_tdata;
+
+
+	// DUT
+	fmpadding_axi #(
+		.XCOUNTER_BITS(XCOUNTER_BITS),
+		.YCOUNTER_BITS(YCOUNTER_BITS),
+		.NUM_CHANNELS(NUM_CHANNELS),
+		.SIMD(SIMD),
+		.INIT_XON(0), .INIT_XOFF(0), .INIT_XEND(0),
+		.INIT_YON(0), .INIT_YOFF(0), .INIT_YEND(0),
+		.ELEM_BITS(ELEM_BITS)
+	) dut (
+		.ap_clk(clk), .ap_rst_n(!rst),
+
+		.s_axilite_AWVALID, .s_axilite_AWREADY, .s_axilite_AWADDR,
+		.s_axilite_WVALID, .s_axilite_WREADY, .s_axilite_WDATA, .s_axilite_WSTRB('1),
+		.s_axilite_BVALID(), .s_axilite_BREADY('1),	.s_axilite_BRESP(),
+		.s_axilite_ARVALID('0), .s_axilite_ARREADY(), .s_axilite_ARADDR('x),
+		.s_axilite_RVALID(), .s_axilite_RREADY('0), .s_axilite_RDATA(), .s_axilite_RRESP(),
+
+		.s_axis_tready, .s_axis_tvalid, .s_axis_tdata,
+		.m_axis_tready, .m_axis_tvalid, .m_axis_tdata
+	);
+
+	// Stimuli
+	localparam int unsigned  IMAGES = 2;
+	localparam int unsigned  XSIZE = 10;
+	localparam int unsigned  YSIZE =  7;
+	localparam int unsigned  PAD_LEFT   = 2;
+	localparam int unsigned  PAD_RIGHT  = 3;
+	localparam int unsigned  PAD_TOP    = 1;
+	localparam int unsigned  PAD_BOTTOM = 2;
+
+	task axi_write(input logic [4:0]  wa, input logic [31:0]  wd);
+		s_axilite_AWVALID <= 1;
+		s_axilite_AWADDR <= wa;
+		@(posedge clk iff s_axilite_AWREADY);
+		s_axilite_AWVALID <= 0;
+		s_axilite_AWADDR <= 'x;
+
+		s_axilite_WVALID <= 1;
+		s_axilite_WDATA <= wd;
+		@(posedge clk iff s_axilite_WREADY);
+		s_axilite_WVALID <= 0;
+		s_axilite_WDATA <= 'x;
+	endtask : axi_write
+
+
+	initial begin
+		s_axilite_AWVALID = 0;
+		s_axilite_AWADDR = 'x;
+		s_axilite_WVALID = 0;
+		s_axilite_WDATA = 'x;
+
+		s_axis_tvalid =  0;
+		s_axis_tdata  = 'x;
+
+		// Configure Parameters
+		rst = 0;
+		@(posedge clk);
+		// Register map uses a 4-byte address stride (see the write decode in fmpadding.sv).
+		/* XOn  */	axi_write( 0, PAD_LEFT);
+		/* XOff */	axi_write( 4, XSIZE - PAD_RIGHT);
+		/* XEnd */	axi_write( 8, XSIZE - 1);
+		/* YOn  */	axi_write(12, PAD_TOP);
+		/* YOff */	axi_write(16, YSIZE - PAD_BOTTOM);
+		/* YEnd */	axi_write(20, YSIZE - 1);
+		@(posedge clk);
+		rst <= 1;
+		@(posedge clk);
+		rst <= 0;
+		@(posedge clk);
+
+		// Feed data input
+		s_axis_tvalid <= 1;
+		for(int unsigned  i = 0; i < IMAGES * (XSIZE-PAD_LEFT-PAD_RIGHT) * (YSIZE-PAD_TOP-PAD_BOTTOM) * (NUM_CHANNELS/SIMD); i++) begin
+			s_axis_tdata  <= i;
+			@(posedge clk iff s_axis_tready);
+			if($urandom()%5 == 0) begin
+				s_axis_tvalid <=  0;
+				s_axis_tdata  <= 'x;
+				@(posedge clk);
+				s_axis_tvalid <=  1;
+			end
+		end
+		s_axis_tvalid <=  0;
+		s_axis_tdata  <= 'x;
+	end
+
+	// Output Throttler
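+	// Randomly deasserts m_axis_tready between beats to exercise backpressure handling.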
+	initial begin
+		m_axis_tready =  0;
+		@(posedge clk iff !rst);
+		m_axis_tready <= 1;
+		forever @(posedge clk iff m_axis_tvalid) begin
+			m_axis_tready <= 0;
+			repeat(4-$clog2(1+$urandom()%15)) @(posedge clk);
+			m_axis_tready <= 1;
+		end
+	end
+
+	// Output logger
+	initial begin
+		@(negedge rst);
+		repeat(IMAGES) begin
+			for(int unsigned  y = 0; y < YSIZE; y++) begin
+				for(int unsigned  x = 0; x < XSIZE; x++) begin
+					automatic string  delim = " ";
+					for(int unsigned  s = 0; s < NUM_CHANNELS/SIMD; s++) begin
+						@(posedge clk iff m_axis_tvalid && m_axis_tready);
+						$write("%s%02X", delim, m_axis_tdata);
+						delim = ":";
+					end
+				end
+				$display();
+			end
+			$display("----");
+		end
+		$finish;
+	end
+
+endmodule : fmpadding_axi_tb
diff --git a/finn-rtllib/fmpadding/hdl/fmpadding_template.v b/finn-rtllib/fmpadding/hdl/fmpadding_template.v
new file mode 100644
index 0000000000000000000000000000000000000000..0b0f40f86a44ac1d905c89bed5328d6d1ea48876
--- /dev/null
+++ b/finn-rtllib/fmpadding/hdl/fmpadding_template.v
@@ -0,0 +1,118 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
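+// Code-generation template: the $...$ placeholders (e.g. $TOP_MODULE_NAME$, $STREAM_BITS$) are substituted by the FINN compiler.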
+module $TOP_MODULE_NAME$(
+//- Global Control ------------------
+(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite" *)
+input	ap_clk,
+(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite" *)
+input	ap_rst_n,
+
+//- AXI Lite ------------------------
+// Writing
+input	       s_axilite_AWVALID,
+output	       s_axilite_AWREADY,
+input	[4:0]  s_axilite_AWADDR,
+
+input	        s_axilite_WVALID,
+output	        s_axilite_WREADY,
+input	[31:0]  s_axilite_WDATA,
+input	[ 3:0]  s_axilite_WSTRB,
+
+output	       s_axilite_BVALID,
+input	       s_axilite_BREADY,
+output	[1:0]  s_axilite_BRESP,
+
+// Reading
+input	       s_axilite_ARVALID,
+output	       s_axilite_ARREADY,
+input	[4:0]  s_axilite_ARADDR,
+
+output	        s_axilite_RVALID,
+input	        s_axilite_RREADY,
+output	[31:0]  s_axilite_RDATA,
+output	[ 1:0]  s_axilite_RRESP,
+
+//- AXI Stream - Input --------------
+output	in0_V_TREADY,
+input	in0_V_TVALID,
+input	[$STREAM_BITS$-1:0]  in0_V_TDATA,
+
+//- AXI Stream - Output -------------
+input	out_V_TREADY,
+output	out_V_TVALID,
+output	[$STREAM_BITS$-1:0]  out_V_TDATA
+);
+
+
+fmpadding_axi #(
+.XCOUNTER_BITS($XCOUNTER_BITS$),
+.YCOUNTER_BITS($YCOUNTER_BITS$),
+.NUM_CHANNELS($NUM_CHANNELS$),
+.SIMD($SIMD$),
+.ELEM_BITS($ELEM_BITS$),
+.INIT_XON($INIT_XON$),
+.INIT_XOFF($INIT_XOFF$),
+.INIT_XEND($INIT_XEND$),
+.INIT_YON($INIT_YON$),
+.INIT_YOFF($INIT_YOFF$),
+.INIT_YEND($INIT_YEND$)
+)
+$TOP_MODULE_NAME$_impl
+(
+ .ap_clk(ap_clk),
+ .ap_rst_n(ap_rst_n),
+ .s_axilite_AWVALID(s_axilite_AWVALID),
+ .s_axilite_AWREADY(s_axilite_AWREADY),
+ .s_axilite_AWADDR(s_axilite_AWADDR),
+ .s_axilite_WVALID(s_axilite_WVALID),
+ .s_axilite_WREADY(s_axilite_WREADY),
+ .s_axilite_WDATA(s_axilite_WDATA),
+ .s_axilite_WSTRB(s_axilite_WSTRB),
+ .s_axilite_BVALID(s_axilite_BVALID),
+ .s_axilite_BREADY(s_axilite_BREADY),
+ .s_axilite_BRESP(s_axilite_BRESP),
+ .s_axilite_ARVALID(s_axilite_ARVALID),
+ .s_axilite_ARREADY(s_axilite_ARREADY),
+ .s_axilite_ARADDR(s_axilite_ARADDR),
+ .s_axilite_RVALID(s_axilite_RVALID),
+ .s_axilite_RREADY(s_axilite_RREADY),
+ .s_axilite_RDATA(s_axilite_RDATA),
+ .s_axilite_RRESP(s_axilite_RRESP),
+ .s_axis_tready(in0_V_TREADY),
+ .s_axis_tvalid(in0_V_TVALID),
+ .s_axis_tdata(in0_V_TDATA),
+ .m_axis_tready(out_V_TREADY),
+ .m_axis_tvalid(out_V_TVALID),
+ .m_axis_tdata(out_V_TDATA)
+);
+
+endmodule
diff --git a/finn-rtllib/memstream/hdl/Q_srl.v b/finn-rtllib/memstream/hdl/Q_srl.v
index b4e89628a44bb1f55c3445ee8e6866beada23585..11cef604e0a3d106529a65ae229bc4cb419c4d70 100644
--- a/finn-rtllib/memstream/hdl/Q_srl.v
+++ b/finn-rtllib/memstream/hdl/Q_srl.v
@@ -69,7 +69,7 @@
 `define Q_srl
 
 
-module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count);
+module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count, maxcount);
 
    parameter depth = 16;   // - greatest #items in queue  (2 <= depth <= 256)
    parameter width = 16;   // - width of data (i_d, o_d)
@@ -90,7 +90,9 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count);
    wire               o_b;	// - output stream back-pressure
 
    output [addrwidth:0] count;  // - output number of elems in queue
+   output [addrwidth:0] maxcount;  // - maximum observed count since reset
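+   // - maxcount provides a high-water mark, e.g. for right-sizing FIFO depths after simulation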
 
+   reg [addrwidth:0] maxcount_reg;  // - maximum count seen until now
    reg    [addrwidth-1:0] addr, addr_, a_;		// - SRL16 address
 							//     for data output
    reg 			  shift_en_;			// - SRL16 shift enable
@@ -124,6 +126,7 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count);
    assign o_d = srlo;				// - output data from queue
    assign o_v = o_v_reg;			// - output valid if non-empty
    assign i_b = i_b_reg;			// - input bp if full
+   assign maxcount = maxcount_reg;
 
    assign i_r = !i_b;
    assign o_b = !o_r;
@@ -139,7 +142,10 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count);
 	 addr      <= 0;
          addr_full <= 0;
 	 o_v_reg   <= 0;
-	 i_b_reg   <= 1;
+
+	 i_b_reg   <= 0;
+	 maxcount_reg <= 0;
+
       end
       else begin
 	 state     <= state_;
@@ -147,6 +153,7 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count);
          addr_full <= addr_full_;
 	 o_v_reg   <= o_v_reg_;
 	 i_b_reg   <= i_b_reg_;
+	 maxcount_reg <= (count > maxcount_reg ? count : maxcount_reg);
       end
    end // always @ (posedge clock)
 
diff --git a/finn-rtllib/swg/swg_template_axilite.v b/finn-rtllib/swg/swg_template_axilite.v
new file mode 100644
index 0000000000000000000000000000000000000000..9479c7f80d7d82b27141dbe5abcce442049237bd
--- /dev/null
+++ b/finn-rtllib/swg/swg_template_axilite.v
@@ -0,0 +1,567 @@
+
+`timescale 1 ns / 1 ps
+
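+// Code-generation template based on the standard Xilinx AXI4-Lite slave skeleton; $TOP_MODULE_NAME$ is a placeholder filled in by FINN.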
+module $TOP_MODULE_NAME$_axilite #
+(
+    // Users to add parameters here
+
+    // User parameters ends
+    // Do not modify the parameters beyond this line
+
+    // Width of S_AXI data bus
+    parameter integer C_S_AXI_DATA_WIDTH	= 32,
+    // Width of S_AXI address bus
+    parameter integer C_S_AXI_ADDR_WIDTH	= 6
+)
+(
+    // Users to add ports here
+    output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg0,
+    output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg1,
+    output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg2,
+    output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg3,
+    output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg4,
+    output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg5,
+    output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg6,
+    output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg7,
+    output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg8,
+    output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg9,
+    output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg10,
+    output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg11,
+    output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg12,
+    output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg13,
+    output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg14,
+    output wire [C_S_AXI_DATA_WIDTH-1:0]	cfg_reg15,
+
+    // User ports ends
+    // Do not modify the ports beyond this line
+
+    // Global Clock Signal
+    input wire  S_AXI_ACLK,
+    // Global Reset Signal. This Signal is Active LOW
+    input wire  S_AXI_ARESETN,
+    // Write address (issued by master, accepted by Slave)
+    input wire [C_S_AXI_ADDR_WIDTH-1 : 0] S_AXI_AWADDR,
+    // Write channel Protection type. This signal indicates the
+        // privilege and security level of the transaction, and whether
+        // the transaction is a data access or an instruction access.
+    input wire [2 : 0] S_AXI_AWPROT,
+    // Write address valid. This signal indicates that the master is signaling
+        // a valid write address and control information.
+    input wire  S_AXI_AWVALID,
+    // Write address ready. This signal indicates that the slave is ready
+        // to accept an address and associated control signals.
+    output wire  S_AXI_AWREADY,
+    // Write data (issued by master, accepted by Slave)
+    input wire [C_S_AXI_DATA_WIDTH-1 : 0] S_AXI_WDATA,
+    // Write strobes. This signal indicates which byte lanes hold
+        // valid data. There is one write strobe bit for each eight
+        // bits of the write data bus.
+    input wire [(C_S_AXI_DATA_WIDTH/8)-1 : 0] S_AXI_WSTRB,
+    // Write valid. This signal indicates that valid write
+        // data and strobes are available.
+    input wire  S_AXI_WVALID,
+    // Write ready. This signal indicates that the slave
+        // can accept the write data.
+    output wire  S_AXI_WREADY,
+    // Write response. This signal indicates the status
+        // of the write transaction.
+    output wire [1 : 0] S_AXI_BRESP,
+    // Write response valid. This signal indicates that the channel
+        // is signaling a valid write response.
+    output wire  S_AXI_BVALID,
+    // Response ready. This signal indicates that the master
+        // can accept a write response.
+    input wire  S_AXI_BREADY,
+    // Read address (issued by master, accepted by Slave)
+    input wire [C_S_AXI_ADDR_WIDTH-1 : 0] S_AXI_ARADDR,
+    // Protection type. This signal indicates the privilege
+        // and security level of the transaction, and whether the
+        // transaction is a data access or an instruction access.
+    input wire [2 : 0] S_AXI_ARPROT,
+    // Read address valid. This signal indicates that the channel
+        // is signaling a valid read address and control information.
+    input wire  S_AXI_ARVALID,
+    // Read address ready. This signal indicates that the slave is
+        // ready to accept an address and associated control signals.
+    output wire  S_AXI_ARREADY,
+    // Read data (issued by slave)
+    output wire [C_S_AXI_DATA_WIDTH-1 : 0] S_AXI_RDATA,
+    // Read response. This signal indicates the status of the
+        // read transfer.
+    output wire [1 : 0] S_AXI_RRESP,
+    // Read valid. This signal indicates that the channel is
+        // signaling the required read data.
+    output wire  S_AXI_RVALID,
+    // Read ready. This signal indicates that the master can
+        // accept the read data and response information.
+    input wire  S_AXI_RREADY
+);
+
+// AXI4LITE signals
+reg [C_S_AXI_ADDR_WIDTH-1 : 0] 	axi_awaddr;
+reg  	axi_awready;
+reg  	axi_wready;
+reg [1 : 0] 	axi_bresp;
+reg  	axi_bvalid;
+reg [C_S_AXI_ADDR_WIDTH-1 : 0] 	axi_araddr;
+reg  	axi_arready;
+reg [C_S_AXI_DATA_WIDTH-1 : 0] 	axi_rdata;
+reg [1 : 0] 	axi_rresp;
+reg  	axi_rvalid;
+
+// Example-specific design signals
+// local parameter for addressing 32 bit / 64 bit C_S_AXI_DATA_WIDTH
+// ADDR_LSB is used for addressing 32/64 bit registers/memories
+// ADDR_LSB = 2 for 32 bits (n downto 2)
+// ADDR_LSB = 3 for 64 bits (n downto 3)
+localparam integer ADDR_LSB = (C_S_AXI_DATA_WIDTH/32) + 1;
+localparam integer OPT_MEM_ADDR_BITS = 3;
+//----------------------------------------------
+//-- Signals for user logic register space example
+//------------------------------------------------
+//-- Number of Slave Registers 16
+reg [C_S_AXI_DATA_WIDTH-1:0]	slv_reg0;
+reg [C_S_AXI_DATA_WIDTH-1:0]	slv_reg1;
+reg [C_S_AXI_DATA_WIDTH-1:0]	slv_reg2;
+reg [C_S_AXI_DATA_WIDTH-1:0]	slv_reg3;
+reg [C_S_AXI_DATA_WIDTH-1:0]	slv_reg4;
+reg [C_S_AXI_DATA_WIDTH-1:0]	slv_reg5;
+reg [C_S_AXI_DATA_WIDTH-1:0]	slv_reg6;
+reg [C_S_AXI_DATA_WIDTH-1:0]	slv_reg7;
+reg [C_S_AXI_DATA_WIDTH-1:0]	slv_reg8;
+reg [C_S_AXI_DATA_WIDTH-1:0]	slv_reg9;
+reg [C_S_AXI_DATA_WIDTH-1:0]	slv_reg10;
+reg [C_S_AXI_DATA_WIDTH-1:0]	slv_reg11;
+reg [C_S_AXI_DATA_WIDTH-1:0]	slv_reg12;
+reg [C_S_AXI_DATA_WIDTH-1:0]	slv_reg13;
+reg [C_S_AXI_DATA_WIDTH-1:0]	slv_reg14;
+reg [C_S_AXI_DATA_WIDTH-1:0]	slv_reg15;
+wire	 slv_reg_rden;
+wire	 slv_reg_wren;
+reg [C_S_AXI_DATA_WIDTH-1:0]	 reg_data_out;
+integer	 byte_index;
+reg	 aw_en;
+
+// I/O Connections assignments
+
+assign S_AXI_AWREADY	= axi_awready;
+assign S_AXI_WREADY	= axi_wready;
+assign S_AXI_BRESP	= axi_bresp;
+assign S_AXI_BVALID	= axi_bvalid;
+assign S_AXI_ARREADY	= axi_arready;
+assign S_AXI_RDATA	= axi_rdata;
+assign S_AXI_RRESP	= axi_rresp;
+assign S_AXI_RVALID	= axi_rvalid;
+// Implement axi_awready generation
+// axi_awready is asserted for one S_AXI_ACLK clock cycle when both
+// S_AXI_AWVALID and S_AXI_WVALID are asserted. axi_awready is
+// de-asserted when reset is low.
+
+always @( posedge S_AXI_ACLK )
+begin
+    if ( S_AXI_ARESETN == 1'b0 )
+    begin
+        axi_awready <= 1'b0;
+        aw_en <= 1'b1;
+    end
+    else
+    begin
+        if (~axi_awready && S_AXI_AWVALID && S_AXI_WVALID && aw_en)
+        begin
+            // slave is ready to accept write address when
+            // there is a valid write address and write data
+            // on the write address and data bus. This design
+            // expects no outstanding transactions.
+            axi_awready <= 1'b1;
+            aw_en <= 1'b0;
+        end
+        else if (S_AXI_BREADY && axi_bvalid)
+            begin
+                aw_en <= 1'b1;
+                axi_awready <= 1'b0;
+            end
+        else
+        begin
+            axi_awready <= 1'b0;
+        end
+    end
+end
+
+// Implement axi_awaddr latching
+// This process is used to latch the address when both
+// S_AXI_AWVALID and S_AXI_WVALID are valid.
+
+always @( posedge S_AXI_ACLK )
+begin
+    if ( S_AXI_ARESETN == 1'b0 )
+    begin
+        axi_awaddr <= 0;
+    end
+    else
+    begin
+        if (~axi_awready && S_AXI_AWVALID && S_AXI_WVALID && aw_en)
+        begin
+            // Write Address latching
+            axi_awaddr <= S_AXI_AWADDR;
+        end
+    end
+end
+
+// Implement axi_wready generation
+// axi_wready is asserted for one S_AXI_ACLK clock cycle when both
+// S_AXI_AWVALID and S_AXI_WVALID are asserted. axi_wready is
+// de-asserted when reset is low.
+
+always @( posedge S_AXI_ACLK )
+begin
+    if ( S_AXI_ARESETN == 1'b0 )
+    begin
+        axi_wready <= 1'b0;
+    end
+    else
+    begin
+        if (~axi_wready && S_AXI_WVALID && S_AXI_AWVALID && aw_en )
+        begin
+            // slave is ready to accept write data when
+            // there is a valid write address and write data
+            // on the write address and data bus. This design
+            // expects no outstanding transactions.
+            axi_wready <= 1'b1;
+        end
+        else
+        begin
+            axi_wready <= 1'b0;
+        end
+    end
+end
+
+// Implement memory mapped register select and write logic generation
+// The write data is accepted and written to memory mapped registers when
+// axi_awready, S_AXI_AWVALID, axi_wready and S_AXI_WVALID are asserted. Write strobes are used to
+// select byte enables of slave registers while writing.
+// These registers are cleared when reset (active low) is applied.
+// Slave register write enable is asserted when valid address and data are available
+// and the slave is ready to accept the write address and write data.
+assign slv_reg_wren = axi_wready && S_AXI_WVALID && axi_awready && S_AXI_AWVALID;
+
+always @( posedge S_AXI_ACLK )
+begin
+    if ( S_AXI_ARESETN == 1'b0 )
+    begin
+        slv_reg0 <= 0;
+        slv_reg1 <= 0;
+        slv_reg2 <= 0;
+        slv_reg3 <= 0;
+        slv_reg4 <= 0;
+        slv_reg5 <= 0;
+        slv_reg6 <= 0;
+        slv_reg7 <= 0;
+        slv_reg8 <= 0;
+        slv_reg9 <= 0;
+        slv_reg10 <= 0;
+        slv_reg11 <= 0;
+        slv_reg12 <= 0;
+        slv_reg13 <= 0;
+        slv_reg14 <= 0;
+        slv_reg15 <= 0;
+    end
+    else begin
+    if (slv_reg_wren)
+        begin
+        case ( axi_awaddr[ADDR_LSB+OPT_MEM_ADDR_BITS:ADDR_LSB] )
+            4'h0:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+                if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 0
+                slv_reg0[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+                end
+            4'h1:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+                if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 1
+                slv_reg1[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+                end
+            4'h2:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+                if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 2
+                slv_reg2[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+                end
+            4'h3:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+                if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 3
+                slv_reg3[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+                end
+            4'h4:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+                if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 4
+                slv_reg4[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+                end
+            4'h5:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+                if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 5
+                slv_reg5[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+                end
+            4'h6:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+                if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 6
+                slv_reg6[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+                end
+            4'h7:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+                if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 7
+                slv_reg7[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+                end
+            4'h8:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+                if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 8
+                slv_reg8[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+                end
+            4'h9:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+                if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 9
+                slv_reg9[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+                end
+            4'hA:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+                if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 10
+                slv_reg10[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+                end
+            4'hB:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+                if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 11
+                slv_reg11[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+                end
+            4'hC:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+                if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 12
+                slv_reg12[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+                end
+            4'hD:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+                if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 13
+                slv_reg13[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+                end
+            4'hE:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+                if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 14
+                slv_reg14[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+                end
+            4'hF:
+            for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
+                if ( S_AXI_WSTRB[byte_index] == 1 ) begin
+                // Respective byte enables are asserted as per write strobes
+                // Slave register 15
+                slv_reg15[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
+                end
+            default : begin
+                        slv_reg0 <= slv_reg0;
+                        slv_reg1 <= slv_reg1;
+                        slv_reg2 <= slv_reg2;
+                        slv_reg3 <= slv_reg3;
+                        slv_reg4 <= slv_reg4;
+                        slv_reg5 <= slv_reg5;
+                        slv_reg6 <= slv_reg6;
+                        slv_reg7 <= slv_reg7;
+                        slv_reg8 <= slv_reg8;
+                        slv_reg9 <= slv_reg9;
+                        slv_reg10 <= slv_reg10;
+                        slv_reg11 <= slv_reg11;
+                        slv_reg12 <= slv_reg12;
+                        slv_reg13 <= slv_reg13;
+                        slv_reg14 <= slv_reg14;
+                        slv_reg15 <= slv_reg15;
+                    end
+        endcase
+        end
+    end
+end
+
+// Implement write response logic generation
+// The write response and response valid signals are asserted by the slave
+// when axi_awready, S_AXI_AWVALID, axi_wready and S_AXI_WVALID are asserted.
+// This marks the acceptance of address and indicates the status of
+// write transaction.
+
+always @( posedge S_AXI_ACLK )
+begin
+    if ( S_AXI_ARESETN == 1'b0 )
+    begin
+        axi_bvalid  <= 0;
+        axi_bresp   <= 2'b0;
+    end
+    else
+    begin
+        if (axi_awready && S_AXI_AWVALID && ~axi_bvalid && axi_wready && S_AXI_WVALID)
+        begin
+            // indicates a valid write response is available
+            axi_bvalid <= 1'b1;
+            axi_bresp  <= 2'b0; // 'OKAY' response
+        end                   // error responses could be generated here in the future
+        else
+        begin
+            if (S_AXI_BREADY && axi_bvalid)
+            //check if bready is asserted while bvalid is high)
+            //(there is a possibility that bready is always asserted high)
+            begin
+                axi_bvalid <= 1'b0;
+            end
+        end
+    end
+end
+
+// Implement axi_arready generation
+// axi_arready is asserted for one S_AXI_ACLK clock cycle when
+// S_AXI_ARVALID is asserted. axi_arready is
+// de-asserted when reset (active low) is asserted.
+// The read address is also latched when S_AXI_ARVALID is
+// asserted. axi_araddr is reset to zero on reset assertion.
+
+always @( posedge S_AXI_ACLK )
+begin
+    if ( S_AXI_ARESETN == 1'b0 )
+    begin
+        axi_arready <= 1'b0;
+        axi_araddr  <= 32'b0;
+    end
+    else
+    begin
+        if (~axi_arready && S_AXI_ARVALID)
+        begin
+            // indicates that the slave has accepted the valid read address
+            axi_arready <= 1'b1;
+            // Read address latching
+            axi_araddr  <= S_AXI_ARADDR;
+        end
+        else
+        begin
+            axi_arready <= 1'b0;
+        end
+    end
+end
+
+// Implement axi_arvalid generation
+// axi_rvalid is asserted for one S_AXI_ACLK clock cycle when both
+// S_AXI_ARVALID and axi_arready are asserted. The slave registers
+// data are available on the axi_rdata bus at this instance. The
+// assertion of axi_rvalid marks the validity of read data on the
+// bus and axi_rresp indicates the status of the read transaction. axi_rvalid
+// is deasserted on reset (active low). axi_rresp and axi_rdata are
+// cleared to zero on reset (active low).
+always @( posedge S_AXI_ACLK )
+begin
+    if ( S_AXI_ARESETN == 1'b0 )
+    begin
+        axi_rvalid <= 0;
+        axi_rresp  <= 0;
+    end
+    else
+    begin
+        if (axi_arready && S_AXI_ARVALID && ~axi_rvalid)
+        begin
+            // Valid read data is available at the read data bus
+            axi_rvalid <= 1'b1;
+            axi_rresp  <= 2'b0; // 'OKAY' response
+        end
+        else if (axi_rvalid && S_AXI_RREADY)
+        begin
+            // Read data is accepted by the master
+            axi_rvalid <= 1'b0;
+        end
+    end
+end
+
+// Implement memory mapped register select and read logic generation
+// Slave register read enable is asserted when valid address is available
+// and the slave is ready to accept the read address.
+assign slv_reg_rden = axi_arready & S_AXI_ARVALID & ~axi_rvalid;
+always @(*)
+begin
+        // Address decoding for reading registers
+        case ( axi_araddr[ADDR_LSB+OPT_MEM_ADDR_BITS:ADDR_LSB] )
+        4'h0   : reg_data_out <= slv_reg0;
+        4'h1   : reg_data_out <= slv_reg1;
+        4'h2   : reg_data_out <= slv_reg2;
+        4'h3   : reg_data_out <= slv_reg3;
+        4'h4   : reg_data_out <= slv_reg4;
+        4'h5   : reg_data_out <= slv_reg5;
+        4'h6   : reg_data_out <= slv_reg6;
+        4'h7   : reg_data_out <= slv_reg7;
+        4'h8   : reg_data_out <= slv_reg8;
+        4'h9   : reg_data_out <= slv_reg9;
+        4'hA   : reg_data_out <= slv_reg10;
+        4'hB   : reg_data_out <= slv_reg11;
+        4'hC   : reg_data_out <= slv_reg12;
+        4'hD   : reg_data_out <= slv_reg13;
+        4'hE   : reg_data_out <= slv_reg14;
+        4'hF   : reg_data_out <= slv_reg15;
+        default : reg_data_out <= 0;
+        endcase
+end
+
+// Output register or memory read data
+always @( posedge S_AXI_ACLK )
+begin
+    if ( S_AXI_ARESETN == 1'b0 )
+    begin
+        axi_rdata  <= 0;
+    end
+    else
+    begin
+        // When there is a valid read address (S_AXI_ARVALID) with
+        // acceptance of read address by the slave (axi_arready),
+        // output the read data
+        if (slv_reg_rden)
+        begin
+            axi_rdata <= reg_data_out;     // register read data
+        end
+    end
+end
+
+// Add user logic here
+assign	cfg_reg0 = slv_reg0;
+assign	cfg_reg1 = slv_reg1;
+assign	cfg_reg2 = slv_reg2;
+assign	cfg_reg3 = slv_reg3;
+assign	cfg_reg4 = slv_reg4;
+assign	cfg_reg5 = slv_reg5;
+assign	cfg_reg6 = slv_reg6;
+assign	cfg_reg7 = slv_reg7;
+assign	cfg_reg8 = slv_reg8;
+assign	cfg_reg9 = slv_reg9;
+assign	cfg_reg10 = slv_reg10;
+assign	cfg_reg11 = slv_reg11;
+assign	cfg_reg12 = slv_reg12;
+assign	cfg_reg13 = slv_reg13;
+assign	cfg_reg14 = slv_reg14;
+assign	cfg_reg15 = slv_reg15;
+// User logic ends
+
+endmodule
diff --git a/finn-rtllib/swg/swg_template_default.sv b/finn-rtllib/swg/swg_template_default.sv
new file mode 100644
index 0000000000000000000000000000000000000000..06e65e911100dd7d3d8879b014a6d59713eb9bbd
--- /dev/null
+++ b/finn-rtllib/swg/swg_template_default.sv
@@ -0,0 +1,353 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+module $TOP_MODULE_NAME$_controller #(
+    int unsigned  LOOP_H_ITERATIONS    = $LOOP_H_ITERATIONS$,
+    int unsigned  LOOP_W_ITERATIONS    = $LOOP_W_ITERATIONS$,
+    int unsigned  LOOP_KH_ITERATIONS   = $LOOP_KH_ITERATIONS$,
+    int unsigned  LOOP_KW_ITERATIONS   = $LOOP_KW_ITERATIONS$,
+    int unsigned  LOOP_SIMD_ITERATIONS = $LOOP_SIMD_ITERATIONS$,
+
+    int unsigned  INCR_BITWIDTH = $INCR_BITWIDTH$,
+
+    bit IS_DEPTHWISE = $IS_DEPTHWISE$
+)(
+    input   logic  clk,
+    input   logic  rst_n,
+
+    input   logic  advance,
+    output  logic [INCR_BITWIDTH-1:0]  addr_incr,
+    output  logic [INCR_BITWIDTH-1:0]  tail_incr
+);
+
+    // state and counters
+    typedef enum logic [2:0] {
+        STATE_START,
+        STATE_LOOP_SIMD,
+        STATE_LOOP_KW,
+        STATE_LOOP_KH,
+        STATE_LOOP_W,
+        STATE_LOOP_H
+    }  state_e;
+    state_e  State = $INNERMOST_STATE$;
+    state_e  state_next;
+
+    logic signed [$clog2(LOOP_H_ITERATIONS   +2)+1-1:0]  Counter_loop_h    = LOOP_H_ITERATIONS;
+    logic signed [$clog2(LOOP_W_ITERATIONS   +2)+1-1:0]  Counter_loop_w    = LOOP_W_ITERATIONS;
+    logic signed [$clog2(LOOP_KH_ITERATIONS  +2)+1-1:0]  Counter_loop_kh   = LOOP_KH_ITERATIONS;
+    logic signed [$clog2(LOOP_KW_ITERATIONS  +2)+1-1:0]  Counter_loop_kw   = LOOP_KW_ITERATIONS;
+    logic signed [$clog2(LOOP_SIMD_ITERATIONS+2)+1-1:0]  Counter_loop_simd = LOOP_SIMD_ITERATIONS;
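+    // Counters run from their iteration limit down to -1; the extra sign bit lets the ">= 0" tests below detect loop completion.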
+
+    // combinational logic for addr_incr generation
+    always_comb begin : blkHead
+        unique case (State)
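+            // Labels follow the state_e encoding: 0=START, 1=LOOP_SIMD, 2=LOOP_KW, 3=LOOP_KH, 4=LOOP_W, 5=LOOP_H.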
+            0 : addr_incr = 0;
+            1 : addr_incr = $HEAD_INCR_SIMD$;
+            2 : addr_incr = $HEAD_INCR_KW$;
+            3 : addr_incr = $HEAD_INCR_KH$;
+            4 : addr_incr = $HEAD_INCR_W$;
+            5 : addr_incr = $HEAD_INCR_H$;
+        endcase
+    end
+
+    // combinational logic for tail_incr generation
+    uwire  tail_incr_inner_condition = IS_DEPTHWISE? (Counter_loop_kh >= 0) : 0;
+    assign tail_incr =
+        tail_incr_inner_condition? 1 :
+        Counter_loop_w >= 0?       $TAIL_INCR_W$ :
+        Counter_loop_h >= 0?       $TAIL_INCR_H$ :
+        /* else */                 $TAIL_INCR_LAST$;
+
+    // combinational next state logic
+    always_comb begin : blkState
+        state_next = State;
+        if(State != $INNERMOST_STATE$)  state_next = $INNERMOST_STATE$;
+        else begin
+            if(Counter_loop_simd < 0) begin
+                state_next =
+                    (Counter_loop_kw >= 0)? STATE_LOOP_KW :
+                    (Counter_loop_kh >= 0)? STATE_LOOP_KH :
+                    (Counter_loop_w  >= 0)? STATE_LOOP_W :
+                    (Counter_loop_h  >= 0)? STATE_LOOP_H :
+                    /* else */              STATE_START;
+            end
+        end
+    end : blkState
+
+    // sequential logic
+    always_ff @ (posedge clk) begin
+        if(!rst_n) begin
+            State <= $INNERMOST_STATE$;
+            Counter_loop_h    <= LOOP_H_ITERATIONS;
+            Counter_loop_w    <= LOOP_W_ITERATIONS;
+            Counter_loop_kh   <= LOOP_KH_ITERATIONS;
+            Counter_loop_kw   <= LOOP_KW_ITERATIONS;
+            Counter_loop_simd <= LOOP_SIMD_ITERATIONS;
+        end
+        else if(advance) begin
+            State <= state_next;
+            if (State == $INNERMOST_STATE$) begin
+                if(Counter_loop_simd >= 0)  Counter_loop_simd <= Counter_loop_simd-1;
+                else begin
+                    Counter_loop_simd <= LOOP_SIMD_ITERATIONS;
+                    if(Counter_loop_kw >= 0)  Counter_loop_kw <= Counter_loop_kw-1;
+                    else begin
+                        Counter_loop_kw <= LOOP_KW_ITERATIONS;
+                        if(Counter_loop_kh >= 0)  Counter_loop_kh <= Counter_loop_kh-1;
+                        else begin
+                            Counter_loop_kh <= LOOP_KH_ITERATIONS;
+                            if(Counter_loop_w >= 0)  Counter_loop_w <= Counter_loop_w-1;
+                            else begin
+                                Counter_loop_w <= LOOP_W_ITERATIONS;
+                                if(Counter_loop_h >= 0)  Counter_loop_h <= Counter_loop_h-1;
+                                else  Counter_loop_h <= LOOP_H_ITERATIONS;
+                            end
+                        end
+                    end
+                end
+            end
+        end
+    end
+
+endmodule :  $TOP_MODULE_NAME$_controller
+
+module $TOP_MODULE_NAME$_cyclic_buffer_addressable #(
+    int unsigned  WIDTH,
+    int unsigned  DEPTH
+)(
+    input   logic  clk,
+
+    input   logic  write_enable,
+    input   logic [$clog2(DEPTH)-1:0] write_addr,
+    input   logic [WIDTH-1:0]  data_in,
+
+    input   logic  read_enable,
+    input   logic [$clog2(DEPTH)-1:0]  read_addr, // absolute (!) read address of cyclic buffer
+    output  logic [WIDTH-1:0]  data_out
+);
+
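+    // $RAM_STYLE$ is a template placeholder for a RAM implementation attribute, e.g. (* ram_style = "block" *).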
+    $RAM_STYLE$ logic [WIDTH-1:0] Ram[DEPTH];
+    logic [WIDTH-1:0]  Out = 'x;
+    always_ff @(posedge clk) begin
+        if (read_enable)  Out <= Ram[read_addr];
+        if (write_enable) Ram[write_addr] <= data_in;
+    end
+    assign  data_out = Out;
+
+endmodule : $TOP_MODULE_NAME$_cyclic_buffer_addressable
+
+module $TOP_MODULE_NAME$_impl #(
+    int  BIT_WIDTH,
+    int  SIMD,
+    int  MMV_IN,
+    int  MMV_OUT,
+    int  LAST_READ_ELEM = $LAST_READ_ELEM$,
+    int  LAST_WRITE_ELEM = $LAST_WRITE_ELEM$,
+    int  BUF_ELEM_TOTAL = $BUF_ELEM_TOTAL$,
+    int  ELEM_PER_WINDOW = $ELEM_PER_WINDOW$,
+    int  INCR_BITWIDTH = $INCR_BITWIDTH$
+)(
+    input   logic  ap_clk,
+    input   logic  ap_rst_n,
+
+    input   logic  in0_V_V_TVALID,
+    output  logic  in0_V_V_TREADY,
+    input   logic [BIT_WIDTH * SIMD * MMV_IN-1:0]  in0_V_V_TDATA,
+
+    output  logic  out_V_V_TVALID,
+    input   logic  out_V_V_TREADY,
+    output  logic [BIT_WIDTH * SIMD * MMV_OUT-1:0]  out_V_V_TDATA
+);
+    // derived constants
+    localparam int unsigned  BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN;
+    localparam int unsigned  BUF_OUT_ELEM_WIDTH = BIT_WIDTH * SIMD;
+    localparam int unsigned  BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT;
+
+    // main buffer instantiation
+    uwire [BUF_IN_WIDTH -1:0]  window_buffer_in;
+    uwire [BUF_OUT_WIDTH-1:0]  window_buffer_out;
+    uwire  window_buffer_write_enable;
+    uwire  window_buffer_read_enable;
+    uwire [$clog2(BUF_ELEM_TOTAL)-1:0]  window_buffer_write_addr;
+    uwire [$clog2(BUF_ELEM_TOTAL)-1:0]  window_buffer_read_addr;
+    $TOP_MODULE_NAME$_cyclic_buffer_addressable #(
+        .WIDTH(BUF_IN_WIDTH),
+        .DEPTH(BUF_ELEM_TOTAL)
+    ) window_buffer_inst (
+        .clk(ap_clk),
+
+        .write_enable(window_buffer_write_enable),
+        .write_addr(window_buffer_write_addr),
+        .data_in(window_buffer_in),
+
+        .read_enable(window_buffer_read_enable),
+        .read_addr(window_buffer_read_addr),
+        .data_out(window_buffer_out)
+    );
+
+    //controller instantiation
+    uwire  advance_controller;
+    uwire signed [INCR_BITWIDTH-1:0]  addr_incr;
+    uwire        [INCR_BITWIDTH-1:0]  tail_incr;
+    $TOP_MODULE_NAME$_controller controller_inst (
+        .clk(ap_clk),
+        .rst_n(ap_rst_n),
+        .advance(advance_controller),
+        .addr_incr(addr_incr),
+        .tail_incr(tail_incr)
+    );
+
+    // Counters/address registers
+    // Add a sign bit even to (most) unsigned counters and Window_buffer_read_addr_reg,
+    // so we can use automatic sign extension and simplify calculations w/ signed increment.
+    // Alternatively, we could manually sign-extend and shave off a bit here or there.
+    logic signed [$clog2(LAST_READ_ELEM+1)+1-1:0]  Newest_buffered_elem = -1;
+    logic        [$clog2(LAST_READ_ELEM+1)+1-1:0]  Current_elem = 0;
+    logic        [$clog2(LAST_READ_ELEM+1)+1-1:0]  First_elem_next_window = 0;
+    logic        [$clog2(ELEM_PER_WINDOW)   -1:0]  Position_in_window = 0;
+    logic        [$clog2(BUF_ELEM_TOTAL)+1  -1:0]  Window_buffer_read_addr_reg = 0;
+    logic        [$clog2(BUF_ELEM_TOTAL)-1:0]      Window_buffer_write_addr_reg = 0;
+
+    // Control signals/registers
+    logic  Write_cmd    = 0;
+    logic  Writing_done = 0;
+    uwire  write_ok      = Write_cmd &&  out_V_V_TREADY;
+    uwire  write_blocked = Write_cmd && !out_V_V_TREADY;
+
+    logic  Fetching_done = 0;
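+    // fetch only if the requested element is already buffered, the output
+    // stream is not back-pressured, and the current feature map is not
+    // fully fetched yet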
+    uwire  fetch_cmd = !($signed(Current_elem) > Newest_buffered_elem) && !write_blocked && !Fetching_done;
+
+    uwire  reading_done = Newest_buffered_elem == LAST_READ_ELEM;
+    uwire  read_cmd =
+        !reading_done && ( // if there is still an input element left to read
+            Fetching_done || ( // if fetching is done (e.g. for skipped rows at FM end due to stride)
+                $signed(((Newest_buffered_elem - (BUF_ELEM_TOTAL - 1)))) < $signed(First_elem_next_window) &&
+                $signed(((Newest_buffered_elem - (BUF_ELEM_TOTAL - 1)))) < $signed(Current_elem)
+            ) // (over-)write to buffer if oldest buffered element will no longer be needed
+        );
+    uwire  read_ok      = read_cmd && in0_V_V_TVALID;
+
+    //assign buffer control
+    assign  window_buffer_write_addr = Window_buffer_write_addr_reg;
+    assign  window_buffer_read_addr = Window_buffer_read_addr_reg;
+    assign  window_buffer_write_enable = read_ok;
+    assign  window_buffer_read_enable = fetch_cmd;
+    assign  advance_controller = fetch_cmd;
+
+    //assign I/O ports
+    assign  window_buffer_in = in0_V_V_TDATA;
+    assign  out_V_V_TDATA = window_buffer_out;
+    assign  in0_V_V_TREADY = ap_rst_n && read_ok; //only asserted if data is available and we can store it (allowed)
+    assign  out_V_V_TVALID = ap_rst_n && Write_cmd; //only asserted if we have data available and it has not been read yet (don't wait for READY from sink)
+
+    //main process for advancing counters
+    always_ff @(posedge ap_clk) begin
+        if(!ap_rst_n) begin
+            Newest_buffered_elem <= -1;
+            Current_elem <= 0;
+            First_elem_next_window <= 0;
+            Position_in_window <= 0;
+            Window_buffer_read_addr_reg <= 0;
+            Window_buffer_write_addr_reg <= 0;
+            Fetching_done <= 0;
+            Write_cmd <= 0;
+            Writing_done <= 0;
+        end
+        else begin
+            if (read_ok) begin
+                Window_buffer_write_addr_reg <= (Window_buffer_write_addr_reg == BUF_ELEM_TOTAL-1)? 0 : Window_buffer_write_addr_reg + 1;
+                Newest_buffered_elem <= Newest_buffered_elem+1;
+
+                if (Newest_buffered_elem == LAST_READ_ELEM-1) begin
+                    Window_buffer_write_addr_reg <= 0;
+                end
+                //check if this is the last read cycle (reading_done will be true afterwards)
+                if ((Newest_buffered_elem == LAST_READ_ELEM-1) && Writing_done) begin
+                    //start processing of next FM if writing is done already (possible due to unused input elements at the tail end)
+                    //todo: allow for read overlapping between feature maps (i.e., reading first elements from next FM while still writing last window of current FM)
+                    Newest_buffered_elem <= -1;
+                    Current_elem <= 0;
+                    Window_buffer_read_addr_reg <= 0;
+                    First_elem_next_window <= 0;
+                    Writing_done <= 0;
+                    Fetching_done <= 0;
+                end
+            end
+
+            if (fetch_cmd) begin
+                //count up to track which element index is about to be read from the buffer, and where it is located within the buffer
+                //use increment value calculated by controller
+
+                // absolute buffer address wrap-around
+                automatic logic signed [$clog2(BUF_ELEM_TOTAL)+1:0]  ra = $signed(Window_buffer_read_addr_reg) + $signed(addr_incr);
+                automatic logic signed [$clog2(BUF_ELEM_TOTAL+1):0]  ra_correct =
+                    (ra >= BUF_ELEM_TOTAL)? -BUF_ELEM_TOTAL :
+                    (ra <               0)?  BUF_ELEM_TOTAL : 0;
+                Window_buffer_read_addr_reg <= ra + ra_correct;
+
+                //keep track where we are within a window
+                Position_in_window <= (Position_in_window != ELEM_PER_WINDOW - 1)? Position_in_window+1 : 0;
+
+                //update first element of next window to allow buffer overwrite up until that point
+                if (Position_in_window == 0)
+                    First_elem_next_window <= First_elem_next_window + tail_incr;
+
+                //check if this is the last write cycle (Writing_done will be true afterwards)
+                if (Current_elem == LAST_WRITE_ELEM)
+                    Fetching_done <= 1;
+                else
+                    Current_elem <= $signed(Current_elem) + addr_incr;
+
+                // determine if prefetched data will be outstanding in the next cycle
+                // if we fetch in this cycle -> yes
+                // if we do not fetch nor write -> do not change
+                // if we do not fetch but write successfully-> clear outstanding data
+                Write_cmd <= fetch_cmd;
+            end
+
+            if (write_ok)
+                Write_cmd <= fetch_cmd;
+
+            if (write_ok && Fetching_done) begin
+                //check if this is the last write cycle (Writing_done will be true afterwards)
+                if (reading_done || (read_ok && (Newest_buffered_elem == LAST_READ_ELEM - 1))) begin
+                    //start processing of next FM if reading is done already, or completes in the same cycle
+                    Newest_buffered_elem <= -1;
+                    Current_elem <= 0;
+                    Window_buffer_read_addr_reg <= 0;
+                    First_elem_next_window <= 0;
+                    Fetching_done <= 0;
+                end else
+                    Writing_done <= 1;
+            end
+        end
+    end
+
+endmodule : $TOP_MODULE_NAME$_impl
diff --git a/finn-rtllib/swg/swg_template_default_dynamic.sv b/finn-rtllib/swg/swg_template_default_dynamic.sv
new file mode 100644
index 0000000000000000000000000000000000000000..eb53978b580a4753bbea6c8478f35912deb812b4
--- /dev/null
+++ b/finn-rtllib/swg/swg_template_default_dynamic.sv
@@ -0,0 +1,416 @@
+module $TOP_MODULE_NAME$_controller #(
+    int unsigned  CNTR_BITWIDTH,
+    int unsigned  INCR_BITWIDTH,
+
+    bit IS_DEPTHWISE = $IS_DEPTHWISE$
+)(
+    input   logic  clk,
+    input   logic  rst_n,
+
+    input   logic  advance,
+    output  logic [INCR_BITWIDTH-1:0]  addr_incr,
+    output  logic [INCR_BITWIDTH-1:0]  tail_incr,
+
+    input logic                     cfg_valid,
+    input logic [CNTR_BITWIDTH-1:0] cfg_cntr_simd,
+    input logic [CNTR_BITWIDTH-1:0] cfg_cntr_kw,
+    input logic [CNTR_BITWIDTH-1:0] cfg_cntr_kh,
+    input logic [CNTR_BITWIDTH-1:0] cfg_cntr_w,
+    input logic [CNTR_BITWIDTH-1:0] cfg_cntr_h,
+    input logic [INCR_BITWIDTH-1:0] cfg_incr_head_simd,
+    input logic [INCR_BITWIDTH-1:0] cfg_incr_head_kw,
+    input logic [INCR_BITWIDTH-1:0] cfg_incr_head_kh,
+    input logic [INCR_BITWIDTH-1:0] cfg_incr_head_w,
+    input logic [INCR_BITWIDTH-1:0] cfg_incr_head_h,
+    input logic [INCR_BITWIDTH-1:0] cfg_incr_tail_w,
+    input logic [INCR_BITWIDTH-1:0] cfg_incr_tail_h,
+    input logic [INCR_BITWIDTH-1:0] cfg_incr_tail_last
+);
+
+    // (dynamic) configuration registers, initialized to the generated defaults
+    // so the core is functional before any runtime configuration
+    logic [CNTR_BITWIDTH-1:0] Cfg_cntr_simd      = $LOOP_SIMD_ITERATIONS$;
+    logic [CNTR_BITWIDTH-1:0] Cfg_cntr_kw        = $LOOP_KW_ITERATIONS$;
+    logic [CNTR_BITWIDTH-1:0] Cfg_cntr_kh        = $LOOP_KH_ITERATIONS$;
+    logic [CNTR_BITWIDTH-1:0] Cfg_cntr_w         = $LOOP_W_ITERATIONS$;
+    logic [CNTR_BITWIDTH-1:0] Cfg_cntr_h         = $LOOP_H_ITERATIONS$;
+    logic [INCR_BITWIDTH-1:0] Cfg_incr_head_simd = $HEAD_INCR_SIMD$;
+    logic [INCR_BITWIDTH-1:0] Cfg_incr_head_kw   = $HEAD_INCR_KW$;
+    logic [INCR_BITWIDTH-1:0] Cfg_incr_head_kh   = $HEAD_INCR_KH$;
+    logic [INCR_BITWIDTH-1:0] Cfg_incr_head_w    = $HEAD_INCR_W$;
+    logic [INCR_BITWIDTH-1:0] Cfg_incr_head_h    = $HEAD_INCR_H$;
+    logic [INCR_BITWIDTH-1:0] Cfg_incr_tail_w    = $TAIL_INCR_W$;
+    logic [INCR_BITWIDTH-1:0] Cfg_incr_tail_h    = $TAIL_INCR_H$;
+    logic [INCR_BITWIDTH-1:0] Cfg_incr_tail_last = $TAIL_INCR_LAST$;
+
+    // configuration reset/set logic
+    always_ff @ (posedge clk) begin
+        if(cfg_valid) begin
+            Cfg_cntr_simd      <= cfg_cntr_simd;
+            Cfg_cntr_kw        <= cfg_cntr_kw;
+            Cfg_cntr_kh        <= cfg_cntr_kh;
+            Cfg_cntr_w         <= cfg_cntr_w;
+            Cfg_cntr_h         <= cfg_cntr_h;
+            Cfg_incr_head_simd <= cfg_incr_head_simd;
+            Cfg_incr_head_kw   <= cfg_incr_head_kw;
+            Cfg_incr_head_kh   <= cfg_incr_head_kh;
+            Cfg_incr_head_w    <= cfg_incr_head_w;
+            Cfg_incr_head_h    <= cfg_incr_head_h;
+            Cfg_incr_tail_w    <= cfg_incr_tail_w;
+            Cfg_incr_tail_h    <= cfg_incr_tail_h;
+            Cfg_incr_tail_last <= cfg_incr_tail_last;
+        end
+    end
+
+    // state and counters
+    typedef enum logic [2:0] {
+        STATE_START,
+        STATE_LOOP_SIMD,
+        STATE_LOOP_KW,
+        STATE_LOOP_KH,
+        STATE_LOOP_W,
+        STATE_LOOP_H
+    }  state_e;
+    state_e  State = $INNERMOST_STATE$;
+    state_e  state_next;
+
+    logic signed [$clog2($LOOP_H_ITERATIONS$   +2)+1-1:0]  Counter_loop_h    = $LOOP_H_ITERATIONS$;
+    logic signed [$clog2($LOOP_W_ITERATIONS$   +2)+1-1:0]  Counter_loop_w    = $LOOP_W_ITERATIONS$;
+    logic signed [$clog2($LOOP_KH_ITERATIONS$  +2)+1-1:0]  Counter_loop_kh   = $LOOP_KH_ITERATIONS$;
+    logic signed [$clog2($LOOP_KW_ITERATIONS$  +2)+1-1:0]  Counter_loop_kw   = $LOOP_KW_ITERATIONS$;
+    logic signed [$clog2($LOOP_SIMD_ITERATIONS$+2)+1-1:0]  Counter_loop_simd = $LOOP_SIMD_ITERATIONS$;
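+    // note: counter widths are fixed at generation time from the default
+    // iteration counts; runtime configurations are assumed not to exceed them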
+
+    // combinational logic for addr_incr generation
+    always_comb begin : blkHead
+        unique case (State)
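+            // numeric labels follow state_e declaration order (STATE_START = 0 .. STATE_LOOP_H = 5)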
+            0 : addr_incr = 0;
+            1 : addr_incr = Cfg_incr_head_simd;
+            2 : addr_incr = Cfg_incr_head_kw;
+            3 : addr_incr = Cfg_incr_head_kh;
+            4 : addr_incr = Cfg_incr_head_w;
+            5 : addr_incr = Cfg_incr_head_h;
+        endcase
+    end
+
+    // combinational logic for tail_incr generation
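+    // tail_incr selects how far First_elem_next_window may advance: by 1 while
+    // a depthwise kernel row is still pending, otherwise by the configured
+    // within-row, end-of-row, or end-of-frame increment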
+    uwire  tail_incr_inner_condition = IS_DEPTHWISE? (Counter_loop_kh >= 0) : 0;
+    assign tail_incr =
+        tail_incr_inner_condition? 1 :
+        Counter_loop_w >= 0?       Cfg_incr_tail_w :
+        Counter_loop_h >= 0?       Cfg_incr_tail_h :
+        /* else */                 Cfg_incr_tail_last;
+
+    // combinational next state logic
+    always_comb begin : blkState
+        state_next = State;
+        if(State != $INNERMOST_STATE$)  state_next = $INNERMOST_STATE$;
+        else begin
+            if(Counter_loop_simd < 0) begin
+                state_next =
+                    (Counter_loop_kw >= 0)? STATE_LOOP_KW :
+                    (Counter_loop_kh >= 0)? STATE_LOOP_KH :
+                    (Counter_loop_w  >= 0)? STATE_LOOP_W :
+                    (Counter_loop_h  >= 0)? STATE_LOOP_H :
+                    /* else */              STATE_START;
+            end
+        end
+    end : blkState
+
+    // sequential logic
+    always_ff @ (posedge clk) begin
+        if(!rst_n) begin
+            State <= $INNERMOST_STATE$;
+            Counter_loop_h    <= Cfg_cntr_h;
+            Counter_loop_w    <= Cfg_cntr_w;
+            Counter_loop_kh   <= Cfg_cntr_kh;
+            Counter_loop_kw   <= Cfg_cntr_kw;
+            Counter_loop_simd <= Cfg_cntr_simd;
+        end
+        else if(advance) begin
+            State <= state_next;
+            if (State == $INNERMOST_STATE$) begin
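+                // odometer-style rollover: each counter reloads its configured
+                // count and carries into the next-outer loop counter once it
+                // has run below zero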
+                if(Counter_loop_simd >= 0)  Counter_loop_simd <= Counter_loop_simd-1;
+                else begin
+                    Counter_loop_simd <= Cfg_cntr_simd;
+                    if(Counter_loop_kw >= 0)  Counter_loop_kw <= Counter_loop_kw-1;
+                    else begin
+                        Counter_loop_kw <= Cfg_cntr_kw;
+                        if(Counter_loop_kh >= 0)  Counter_loop_kh <= Counter_loop_kh-1;
+                        else begin
+                            Counter_loop_kh <= Cfg_cntr_kh;
+                            if(Counter_loop_w >= 0)  Counter_loop_w <= Counter_loop_w-1;
+                            else begin
+                                Counter_loop_w <= Cfg_cntr_w;
+                                if(Counter_loop_h >= 0)  Counter_loop_h <= Counter_loop_h-1;
+                                else  Counter_loop_h <= Cfg_cntr_h;
+                            end
+                        end
+                    end
+                end
+            end
+        end
+    end
+
+endmodule : $TOP_MODULE_NAME$_controller
+
+module $TOP_MODULE_NAME$_cyclic_buffer_addressable #(
+    int unsigned  WIDTH,
+    int unsigned  DEPTH
+)(
+    input   logic  clk,
+
+    input   logic  write_enable,
+    input   logic [$clog2(DEPTH)-1:0] write_addr,
+    input   logic [WIDTH-1:0]  data_in,
+
+    input   logic  read_enable,
+    input   logic [$clog2(DEPTH)-1:0]  read_addr, // absolute (!) read address of cyclic buffer
+    output  logic [WIDTH-1:0]  data_out
+);
+
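+    // simple dual-port RAM; with nonblocking assignments, a same-cycle read of
+    // a just-written address returns the old contents (read-first behavior)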
+    $RAM_STYLE$ logic [WIDTH-1:0] Ram[DEPTH];
+    logic [WIDTH-1:0]  Out = 'x;
+    always_ff @(posedge clk) begin
+        if (read_enable)  Out <= Ram[read_addr];
+        if (write_enable) Ram[write_addr] <= data_in;
+    end
+    assign  data_out = Out;
+
+endmodule : $TOP_MODULE_NAME$_cyclic_buffer_addressable
+
+module $TOP_MODULE_NAME$_impl #(
+    int  BIT_WIDTH,
+    int  SIMD,
+    int  MMV_IN,
+    int  MMV_OUT,
+    int unsigned  CNTR_BITWIDTH,
+    int unsigned  INCR_BITWIDTH,
+
+    int  LAST_READ_ELEM = $LAST_READ_ELEM$,
+    int  LAST_WRITE_ELEM = $LAST_WRITE_ELEM$,
+    int  BUF_ELEM_TOTAL = $BUF_ELEM_TOTAL$,
+    int  ELEM_PER_WINDOW = $ELEM_PER_WINDOW$
+)(
+    input   logic  ap_clk,
+    input   logic  ap_rst_n,
+
+    input   logic  in0_V_V_TVALID,
+    output  logic  in0_V_V_TREADY,
+    input   logic [BIT_WIDTH * SIMD * MMV_IN-1:0]  in0_V_V_TDATA,
+
+    output  logic  out_V_V_TVALID,
+    input   logic  out_V_V_TREADY,
+    output  logic [BIT_WIDTH * SIMD * MMV_OUT-1:0]  out_V_V_TDATA,
+
+    input logic                     cfg_valid,
+    input logic [CNTR_BITWIDTH-1:0] cfg_cntr_simd,
+    input logic [CNTR_BITWIDTH-1:0] cfg_cntr_kw,
+    input logic [CNTR_BITWIDTH-1:0] cfg_cntr_kh,
+    input logic [CNTR_BITWIDTH-1:0] cfg_cntr_w,
+    input logic [CNTR_BITWIDTH-1:0] cfg_cntr_h,
+    input logic [INCR_BITWIDTH-1:0] cfg_incr_head_simd,
+    input logic [INCR_BITWIDTH-1:0] cfg_incr_head_kw,
+    input logic [INCR_BITWIDTH-1:0] cfg_incr_head_kh,
+    input logic [INCR_BITWIDTH-1:0] cfg_incr_head_w,
+    input logic [INCR_BITWIDTH-1:0] cfg_incr_head_h,
+    input logic [INCR_BITWIDTH-1:0] cfg_incr_tail_w,
+    input logic [INCR_BITWIDTH-1:0] cfg_incr_tail_h,
+    input logic [INCR_BITWIDTH-1:0] cfg_incr_tail_last,
+    input logic [31:0]              cfg_last_read,
+    input logic [31:0]              cfg_last_write
+);
+    // derived constants
+    localparam int unsigned  BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN;
+    localparam int unsigned  BUF_OUT_ELEM_WIDTH = BIT_WIDTH * SIMD;
+    localparam int unsigned  BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT;
+
+    // (dynamic) configuration registers
+    logic [31:0] Cfg_last_read  = LAST_READ_ELEM;
+    logic [31:0] Cfg_last_write = LAST_WRITE_ELEM;
+
+    // configuration reset/set logic
+    always_ff @ (posedge ap_clk) begin
+        if(cfg_valid) begin
+            Cfg_last_read  <= cfg_last_read;
+            Cfg_last_write <= cfg_last_write;
+        end
+    end
+
+    // main buffer instantiation
+    uwire [BUF_IN_WIDTH -1:0]  window_buffer_in;
+    uwire [BUF_OUT_WIDTH-1:0]  window_buffer_out;
+    uwire  window_buffer_write_enable;
+    uwire  window_buffer_read_enable;
+    uwire [$clog2(BUF_ELEM_TOTAL)-1:0]  window_buffer_write_addr;
+    uwire [$clog2(BUF_ELEM_TOTAL)-1:0]  window_buffer_read_addr;
+    $TOP_MODULE_NAME$_cyclic_buffer_addressable #(
+        .WIDTH(BUF_IN_WIDTH),
+        .DEPTH(BUF_ELEM_TOTAL)
+    ) window_buffer_inst (
+        .clk(ap_clk),
+
+        .write_enable(window_buffer_write_enable),
+        .write_addr(window_buffer_write_addr),
+        .data_in(window_buffer_in),
+
+        .read_enable(window_buffer_read_enable),
+        .read_addr(window_buffer_read_addr),
+        .data_out(window_buffer_out)
+    );
+
+    //controller instantiation
+    uwire  advance_controller;
+    uwire signed [INCR_BITWIDTH-1:0]  addr_incr;
+    uwire        [INCR_BITWIDTH-1:0]  tail_incr;
+    $TOP_MODULE_NAME$_controller #(
+        .CNTR_BITWIDTH(CNTR_BITWIDTH),
+        .INCR_BITWIDTH(INCR_BITWIDTH)
+    ) controller_inst (
+        .clk(ap_clk),
+        .rst_n(ap_rst_n),
+        .advance(advance_controller),
+        .addr_incr(addr_incr),
+        .tail_incr(tail_incr),
+
+        .cfg_valid(cfg_valid),
+        .cfg_cntr_simd(cfg_cntr_simd),
+        .cfg_cntr_kw(cfg_cntr_kw),
+        .cfg_cntr_kh(cfg_cntr_kh),
+        .cfg_cntr_w(cfg_cntr_w),
+        .cfg_cntr_h(cfg_cntr_h),
+        .cfg_incr_head_simd(cfg_incr_head_simd),
+        .cfg_incr_head_kw(cfg_incr_head_kw),
+        .cfg_incr_head_kh(cfg_incr_head_kh),
+        .cfg_incr_head_w(cfg_incr_head_w),
+        .cfg_incr_head_h(cfg_incr_head_h),
+        .cfg_incr_tail_w(cfg_incr_tail_w),
+        .cfg_incr_tail_h(cfg_incr_tail_h),
+        .cfg_incr_tail_last(cfg_incr_tail_last)
+    );
+
+    // Counters/address registers
+    // Add a sign bit even to (most) unsigned counters and Window_buffer_read_addr_reg,
+    // so we can use automatic sign extension and simplify calculations w/ signed increment.
+    // Alternatively, we could manually sign-extend and shave off a bit here or there.
+    logic signed [$clog2(LAST_READ_ELEM+1)+1-1:0]  Newest_buffered_elem = -1;
+    logic        [$clog2(LAST_READ_ELEM+1)+1-1:0]  Current_elem = 0;
+    logic        [$clog2(LAST_READ_ELEM+1)+1-1:0]  First_elem_next_window = 0;
+    logic        [$clog2(ELEM_PER_WINDOW)   -1:0]  Position_in_window = 0;
+    logic        [$clog2(BUF_ELEM_TOTAL)+1  -1:0]  Window_buffer_read_addr_reg = 0;
+    logic        [$clog2(BUF_ELEM_TOTAL)-1:0]      Window_buffer_write_addr_reg = 0;
+
+    // Control signals/registers
+    logic  Write_cmd    = 0;
+    logic  Writing_done = 0;
+    uwire  write_ok      = Write_cmd &&  out_V_V_TREADY;
+    uwire  write_blocked = Write_cmd && !out_V_V_TREADY;
+
+    logic  Fetching_done = 0;
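+    // fetch only if the requested element is already buffered, the output
+    // stream is not back-pressured, and the current feature map is not
+    // fully fetched yet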
+    uwire  fetch_cmd = !($signed(Current_elem) > Newest_buffered_elem) && !write_blocked && !Fetching_done;
+
+    uwire  reading_done = Newest_buffered_elem == Cfg_last_read;
+    uwire  read_cmd =
+        !reading_done && ( // if there is still an input element left to read
+            Fetching_done || ( // if fetching is done (e.g. for skipped rows at FM end due to stride)
+                $signed(((Newest_buffered_elem - (BUF_ELEM_TOTAL - 1)))) < $signed(First_elem_next_window) &&
+                $signed(((Newest_buffered_elem - (BUF_ELEM_TOTAL - 1)))) < $signed(Current_elem)
+            ) // (over-)write to buffer if oldest buffered element will no longer be needed
+        );
+    uwire  read_ok      = read_cmd && in0_V_V_TVALID;
+
+    //assign buffer control
+    assign  window_buffer_write_addr = Window_buffer_write_addr_reg;
+    assign  window_buffer_read_addr = Window_buffer_read_addr_reg;
+    assign  window_buffer_write_enable = read_ok;
+    assign  window_buffer_read_enable = fetch_cmd;
+    assign  advance_controller = fetch_cmd;
+
+    //assign I/O ports
+    assign  window_buffer_in = in0_V_V_TDATA;
+    assign  out_V_V_TDATA = window_buffer_out;
+    assign  in0_V_V_TREADY = ap_rst_n && read_ok; //only asserted if data is available and we can store it (allowed)
+    assign  out_V_V_TVALID = ap_rst_n && Write_cmd; //only asserted if we have data available and it has not been read yet (don't wait for READY from sink)
+
+    //main process for advancing counters
+    always_ff @(posedge ap_clk) begin
+        if(!ap_rst_n) begin
+            Newest_buffered_elem <= -1;
+            Current_elem <= 0;
+            First_elem_next_window <= 0;
+            Position_in_window <= 0;
+            Window_buffer_read_addr_reg <= 0;
+            Window_buffer_write_addr_reg <= 0;
+            Fetching_done <= 0;
+            Write_cmd <= 0;
+            Writing_done <= 0;
+        end
+        else begin
+            if (read_ok) begin
+                Window_buffer_write_addr_reg <= (Window_buffer_write_addr_reg == BUF_ELEM_TOTAL-1)? 0 : Window_buffer_write_addr_reg + 1;
+                Newest_buffered_elem <= Newest_buffered_elem+1;
+
+                if (Newest_buffered_elem == Cfg_last_read-1) begin
+                    Window_buffer_write_addr_reg <= 0;
+                end
+                //check if this is the last read cycle (reading_done will be true afterwards)
+                if ((Newest_buffered_elem == Cfg_last_read-1) && Writing_done) begin
+                    //start processing of next FM if writing is done already (possible due to unused input elements at the tail end)
+                    //todo: allow for read overlapping between feature maps (i.e., reading first elements from next FM while still writing last window of current FM)
+                    Newest_buffered_elem <= -1;
+                    Current_elem <= 0;
+                    Window_buffer_read_addr_reg <= 0;
+                    First_elem_next_window <= 0;
+                    Writing_done <= 0;
+                    Fetching_done <= 0;
+                end
+            end
+
+            if (fetch_cmd) begin
+                //count up to track which element index is about to be read from the buffer, and where it is located within the buffer
+                //use increment value calculated by controller
+
+                // absolute buffer address wrap-around
+                automatic logic signed [$clog2(BUF_ELEM_TOTAL)+1:0]  ra = $signed(Window_buffer_read_addr_reg) + $signed(addr_incr);
+                automatic logic signed [$clog2(BUF_ELEM_TOTAL+1):0]  ra_correct =
+                    (ra >= BUF_ELEM_TOTAL)? -BUF_ELEM_TOTAL :
+                    (ra <               0)?  BUF_ELEM_TOTAL : 0;
+                Window_buffer_read_addr_reg <= ra + ra_correct;
+
+                //keep track where we are within a window
+                Position_in_window <= (Position_in_window != ELEM_PER_WINDOW - 1)? Position_in_window+1 : 0;
+
+                //update first element of next window to allow buffer overwrite up until that point
+                if (Position_in_window == 0)
+                    First_elem_next_window <= First_elem_next_window + tail_incr;
+
+                //check if this is the last write cycle (Writing_done will be true afterwards)
+                if (Current_elem == Cfg_last_write)
+                    Fetching_done <= 1;
+                else
+                    Current_elem <= $signed(Current_elem) + addr_incr;
+
+                // determine if prefetched data will be outstanding in the next cycle
+                // if we fetch in this cycle -> yes
+                // if we do not fetch nor write -> do not change
+                // if we do not fetch but write successfully-> clear outstanding data
+                Write_cmd <= fetch_cmd;
+            end
+
+            if (write_ok)
+                Write_cmd <= fetch_cmd;
+
+            if (write_ok && Fetching_done) begin
+                //check if this is the last write cycle (Writing_done will be true afterwards)
+                if (reading_done || (read_ok && (Newest_buffered_elem == Cfg_last_read - 1))) begin
+                    //start processing of next FM if reading is done already, or completes in the same cycle
+                    Newest_buffered_elem <= -1;
+                    Current_elem <= 0;
+                    Window_buffer_read_addr_reg <= 0;
+                    First_elem_next_window <= 0;
+                    Fetching_done <= 0;
+                end else
+                    Writing_done <= 1;
+            end
+        end
+    end
+
+endmodule : $TOP_MODULE_NAME$_impl
diff --git a/finn-rtllib/swg/swg_template_wrapper.v b/finn-rtllib/swg/swg_template_wrapper.v
new file mode 100644
index 0000000000000000000000000000000000000000..0cc3579a255fddaf1a470d440b9e8ac245abe486
--- /dev/null
+++ b/finn-rtllib/swg/swg_template_wrapper.v
@@ -0,0 +1,75 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+`timescale 1 ns / 1 ps
+
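+// the ASSOCIATED_BUSIF attribute ties clock and reset to the AXI-Stream
+// interfaces when Vivado packages this module as an IP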
+module $TOP_MODULE_NAME$ (
+(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *)
+input  ap_clk,
+(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *)
+input  ap_rst_n,
+input  [BUF_IN_WIDTH-1:0] in0_V_TDATA,
+input  in0_V_TVALID,
+output in0_V_TREADY,
+output [BUF_OUT_WIDTH-1:0] out_V_TDATA,
+output out_V_TVALID,
+input  out_V_TREADY
+);
+
+// top-level parameters (set via code-generation)
+parameter BIT_WIDTH = $BIT_WIDTH$;
+parameter SIMD = $SIMD$;
+parameter MMV_IN = $MMV_IN$;
+parameter MMV_OUT = $MMV_OUT$;
+
+// derived constants
+parameter BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN;
+parameter BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT;
+
+$TOP_MODULE_NAME$_impl
+#(
+    .BIT_WIDTH(BIT_WIDTH),
+    .SIMD(SIMD),
+    .MMV_IN(MMV_IN),
+    .MMV_OUT(MMV_OUT)
+)
+impl
+(
+    .ap_clk(ap_clk),
+    .ap_rst_n(ap_rst_n),
+    .in0_V_V_TDATA(in0_V_TDATA),
+    .in0_V_V_TVALID(in0_V_TVALID),
+    .in0_V_V_TREADY(in0_V_TREADY),
+    .out_V_V_TDATA(out_V_TDATA),
+    .out_V_V_TVALID(out_V_TVALID),
+    .out_V_V_TREADY(out_V_TREADY)
+);
+
+endmodule // $TOP_MODULE_NAME$
diff --git a/finn-rtllib/swg/swg_template_wrapper_dynamic.v b/finn-rtllib/swg/swg_template_wrapper_dynamic.v
new file mode 100644
index 0000000000000000000000000000000000000000..ca870ace11edcf097645bc12b0486ffbb83b0ea4
--- /dev/null
+++ b/finn-rtllib/swg/swg_template_wrapper_dynamic.v
@@ -0,0 +1,154 @@
+`timescale 1 ns / 1 ps
+
+module $TOP_MODULE_NAME$ #(
+    // top-level parameters (set via code-generation)
+    parameter BIT_WIDTH = $BIT_WIDTH$,
+    parameter SIMD = $SIMD$,
+    parameter MMV_IN = $MMV_IN$,
+    parameter MMV_OUT = $MMV_OUT$,
+
+    parameter CNTR_BITWIDTH = $CNTR_BITWIDTH$,
+    parameter INCR_BITWIDTH = $INCR_BITWIDTH$,
+
+    // derived constants
+    parameter BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN,
+    parameter BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT,
+
+    parameter integer C_s_axilite_DATA_WIDTH = 32,
+    parameter integer C_s_axilite_ADDR_WIDTH = 6
+)
+(
+    (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite" *)
+    input  ap_clk,
+    (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite" *)
+    input  ap_rst_n,
+    input  [BUF_IN_WIDTH-1:0] in0_V_TDATA,
+    input  in0_V_TVALID,
+    output in0_V_TREADY,
+    output [BUF_OUT_WIDTH-1:0] out_V_TDATA,
+    output out_V_TVALID,
+    input  out_V_TREADY,
+
+    // Ports of Axi Slave Bus Interface s_axilite
+    input  [C_s_axilite_ADDR_WIDTH-1 : 0] s_axilite_awaddr,
+    input  [2 : 0] s_axilite_awprot,
+    input  s_axilite_awvalid,
+    output s_axilite_awready,
+    input  [C_s_axilite_DATA_WIDTH-1 : 0] s_axilite_wdata,
+    input  [(C_s_axilite_DATA_WIDTH/8)-1 : 0] s_axilite_wstrb,
+    input  s_axilite_wvalid,
+    output s_axilite_wready,
+    output [1 : 0] s_axilite_bresp,
+    output s_axilite_bvalid,
+    input  s_axilite_bready,
+    input  [C_s_axilite_ADDR_WIDTH-1 : 0] s_axilite_araddr,
+    input  [2 : 0] s_axilite_arprot,
+    input  s_axilite_arvalid,
+    output s_axilite_arready,
+    output [C_s_axilite_DATA_WIDTH-1 : 0] s_axilite_rdata,
+    output [1 : 0] s_axilite_rresp,
+    output s_axilite_rvalid,
+    input  s_axilite_rready
+);
+
+wire                     cfg_valid;
+wire [CNTR_BITWIDTH-1:0] cfg_cntr_simd;
+wire [CNTR_BITWIDTH-1:0] cfg_cntr_kw;
+wire [CNTR_BITWIDTH-1:0] cfg_cntr_kh;
+wire [CNTR_BITWIDTH-1:0] cfg_cntr_w;
+wire [CNTR_BITWIDTH-1:0] cfg_cntr_h;
+wire [INCR_BITWIDTH-1:0] cfg_incr_head_simd;
+wire [INCR_BITWIDTH-1:0] cfg_incr_head_kw;
+wire [INCR_BITWIDTH-1:0] cfg_incr_head_kh;
+wire [INCR_BITWIDTH-1:0] cfg_incr_head_w;
+wire [INCR_BITWIDTH-1:0] cfg_incr_head_h;
+wire [INCR_BITWIDTH-1:0] cfg_incr_tail_w;
+wire [INCR_BITWIDTH-1:0] cfg_incr_tail_h;
+wire [INCR_BITWIDTH-1:0] cfg_incr_tail_last;
+wire [31:0]              cfg_last_read;
+wire [31:0]              cfg_last_write;
+
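+// assumed layout of the code-generated AXI-lite register file: one 32-bit
+// register per cfg signal, cfg_reg<n> presumed at byte offset 4*n
+// (0x00 = cfg_valid .. 0x3C = cfg_last_write)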
+// Instantiation of Axi Bus Interface s_axilite
+$TOP_MODULE_NAME$_axilite # (
+    .C_S_AXI_DATA_WIDTH(C_s_axilite_DATA_WIDTH),
+    .C_S_AXI_ADDR_WIDTH(C_s_axilite_ADDR_WIDTH)
+) axilite_cfg_inst (
+    .S_AXI_ACLK(ap_clk),
+    .S_AXI_ARESETN(ap_rst_n),
+    .S_AXI_AWADDR(s_axilite_awaddr),
+    .S_AXI_AWPROT(s_axilite_awprot),
+    .S_AXI_AWVALID(s_axilite_awvalid),
+    .S_AXI_AWREADY(s_axilite_awready),
+    .S_AXI_WDATA(s_axilite_wdata),
+    .S_AXI_WSTRB(s_axilite_wstrb),
+    .S_AXI_WVALID(s_axilite_wvalid),
+    .S_AXI_WREADY(s_axilite_wready),
+    .S_AXI_BRESP(s_axilite_bresp),
+    .S_AXI_BVALID(s_axilite_bvalid),
+    .S_AXI_BREADY(s_axilite_bready),
+    .S_AXI_ARADDR(s_axilite_araddr),
+    .S_AXI_ARPROT(s_axilite_arprot),
+    .S_AXI_ARVALID(s_axilite_arvalid),
+    .S_AXI_ARREADY(s_axilite_arready),
+    .S_AXI_RDATA(s_axilite_rdata),
+    .S_AXI_RRESP(s_axilite_rresp),
+    .S_AXI_RVALID(s_axilite_rvalid),
+    .S_AXI_RREADY(s_axilite_rready),
+
+    .cfg_reg0(cfg_valid),
+    .cfg_reg1(cfg_cntr_simd),
+    .cfg_reg2(cfg_cntr_kw),
+    .cfg_reg3(cfg_cntr_kh),
+    .cfg_reg4(cfg_cntr_w),
+    .cfg_reg5(cfg_cntr_h),
+    .cfg_reg6(cfg_incr_head_simd),
+    .cfg_reg7(cfg_incr_head_kw),
+    .cfg_reg8(cfg_incr_head_kh),
+    .cfg_reg9(cfg_incr_head_w),
+    .cfg_reg10(cfg_incr_head_h),
+    .cfg_reg11(cfg_incr_tail_w),
+    .cfg_reg12(cfg_incr_tail_h),
+    .cfg_reg13(cfg_incr_tail_last),
+    .cfg_reg14(cfg_last_read),
+    .cfg_reg15(cfg_last_write)
+);
+
+$TOP_MODULE_NAME$_impl
+#(
+    .BIT_WIDTH(BIT_WIDTH),
+    .SIMD(SIMD),
+    .MMV_IN(MMV_IN),
+    .MMV_OUT(MMV_OUT),
+    .CNTR_BITWIDTH(CNTR_BITWIDTH),
+    .INCR_BITWIDTH(INCR_BITWIDTH)
+)
+impl
+(
+    .ap_clk(ap_clk),
+    .ap_rst_n(ap_rst_n),
+    .in0_V_V_TDATA(in0_V_TDATA),
+    .in0_V_V_TVALID(in0_V_TVALID),
+    .in0_V_V_TREADY(in0_V_TREADY),
+    .out_V_V_TDATA(out_V_TDATA),
+    .out_V_V_TVALID(out_V_TVALID),
+    .out_V_V_TREADY(out_V_TREADY),
+
+    .cfg_valid(cfg_valid),
+    .cfg_cntr_simd(cfg_cntr_simd),
+    .cfg_cntr_kw(cfg_cntr_kw),
+    .cfg_cntr_kh(cfg_cntr_kh),
+    .cfg_cntr_w(cfg_cntr_w),
+    .cfg_cntr_h(cfg_cntr_h),
+    .cfg_incr_head_simd(cfg_incr_head_simd),
+    .cfg_incr_head_kw(cfg_incr_head_kw),
+    .cfg_incr_head_kh(cfg_incr_head_kh),
+    .cfg_incr_head_w(cfg_incr_head_w),
+    .cfg_incr_head_h(cfg_incr_head_h),
+    .cfg_incr_tail_w(cfg_incr_tail_w),
+    .cfg_incr_tail_h(cfg_incr_tail_h),
+    .cfg_incr_tail_last(cfg_incr_tail_last),
+    .cfg_last_read(cfg_last_read),
+    .cfg_last_write(cfg_last_write)
+);
+
+endmodule // $TOP_MODULE_NAME$
diff --git a/notebooks/advanced/0_custom_analysis_pass.ipynb b/notebooks/advanced/0_custom_analysis_pass.ipynb
index a4ad32ed7f547a4c035b5cbe4da11ebe2565883a..f8444520c3ded795702420d7f86335d0048ef043 100644
--- a/notebooks/advanced/0_custom_analysis_pass.ipynb
+++ b/notebooks/advanced/0_custom_analysis_pass.ipynb
@@ -137,7 +137,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
diff --git a/notebooks/advanced/1_custom_transformation_pass.ipynb b/notebooks/advanced/1_custom_transformation_pass.ipynb
index e40a534af56352712f20bfb250112aeacfee278f..391e852a71e1109b376abd7bb5d5f9d264d06498 100644
--- a/notebooks/advanced/1_custom_transformation_pass.ipynb
+++ b/notebooks/advanced/1_custom_transformation_pass.ipynb
@@ -233,7 +233,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
diff --git a/notebooks/advanced/2_custom_op.ipynb b/notebooks/advanced/2_custom_op.ipynb
index c27f8bdca788e6404fbc01e226b06e8cfaaba066..636da64dd52fab81f8d6a763d199e8e13e9e3cc0 100644
--- a/notebooks/advanced/2_custom_op.ipynb
+++ b/notebooks/advanced/2_custom_op.ipynb
@@ -8,14 +8,14 @@
     "\n",
     "Suppose that you want to introduce a new (custom) operation type into the FINN compiler. Custom operations in FINN are useful for a variety of things ranging from code generation to functional verification. This is achieved by creating a new Python module for your custom operation that fulfills certain interface specifications.\n",
     "\n",
-    "One thing to point out before we start is that **these custom operations are generic** and not really tied to e.g. Vivado HLS or few-bit quantization. As you will see in this notebook, it's possible to provide arbitrary Python/C/C++/... execution and code generation paths for custom nodes.\n",
+    "One thing to point out before we start is that **these custom operations are generic** and not really tied to e.g. Vitis HLS or few-bit quantization. As you will see in this notebook, it's possible to provide arbitrary Python/C/C++/... execution and code generation paths for custom nodes.\n",
     "\n",
     "## The CustomOp base class\n",
     "\n",
     "Subclasses of `CustomOp` provide a way of providing custom functionality for ONNX nodes in FINN.\n",
     "This is the base class for every custom op node used in the framework, so you must create subclasses of `CustomOp` to provide execution, code generation and other functionalities in FINN.\n",
     "\n",
-    "Let's start by looking at the `CustomOp` base class itself, which lives in the `finn-base` repository. You can view it [here](https://github.com/Xilinx/finn-base/blob/dev/src/finn/custom_op/base.py). Note that the `finn` Docker container already has `finn-base` set up as a dependency.\n",
+    "Let's start by looking at the `CustomOp` base class itself, which lives in the `qonnx` repository. You can view it [here](https://github.com/fastmachinelearning/qonnx/blob/main/src/qonnx/custom_op/base.py). Note that the `finn` Docker container already has `qonnx` set up as a dependency.\n",
     "\n",
     "Some points of importance:\n",
     "\n",
@@ -23,7 +23,7 @@
     "\n",
     "2. `CustomOp` subclasses need to implement the methods below (those not starting with underscore).\n",
     "\n",
-    "3. To be discoverable in the custom op register, `CustomOp` subclasses must set the `domain` field to the name of the Python module they appear in. For instance, to use the custom `Im2Col` op type from [here](https://github.com/Xilinx/finn-base/blob/dev/src/finn/custom_op/general/im2col.py), the ONNX node must use `domain=qonnx.custom_op.general` since its module is located at `finn/custom_op/general/im2col.py`."
+    "3. To be discoverable in the custom op register, `CustomOp` subclasses must set the `domain` field to the name of the Python module they appear in. For instance, to use the custom `Im2Col` op type from [here](https://github.com/fastmachinelearning/qonnx/blob/main/src/qonnx/custom_op/general/im2col.py), the ONNX node must use `domain=qonnx.custom_op.general` since its module is located at `qonnx/custom_op/general/im2col.py`."
    ]
   },
   {
@@ -130,7 +130,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "To make sure our custom op is available, it needs to be registered. The best practice for this is to create a submodule under `finn.custom_op` which includes a `custom_op` dictionary that maps strings (op names) to classes (op implementations). Since we're in a Jupyter notebook we'll just hijack it at runtime like this:"
+    "To make sure our custom op is available, it needs to be registered. The best practice for this is to create a submodule under `qonnx.custom_op` which includes a `custom_op` dictionary that maps strings (op names) to classes (op implementations). Since we're in a Jupyter notebook we'll just hijack it at runtime like this:"
    ]
   },
   {
@@ -178,6 +178,7 @@
    "source": [
     "from qonnx.core.modelwrapper import ModelWrapper\n",
     "from onnx import TensorProto\n",
+    "from qonnx.util.basic import qonnx_make_model\n",
     "\n",
     "def make_graph(ishape, exp, op_type = \"MyPythonPowerOp\"):\n",
     "    inp = helper.make_tensor_value_info(\n",
@@ -204,7 +205,7 @@
     "    graph = helper.make_graph(\n",
     "        nodes=[custom_node], name=\"custom_graph\", inputs=[inp], outputs=[outp]\n",
     "    )\n",
-    "    model = helper.make_model(graph, producer_name=\"custom-model\")\n",
+    "    model = qonnx_make_model(graph, producer_name=\"custom-model\")\n",
     "    return ModelWrapper(model)"
    ]
   },
@@ -657,7 +658,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
diff --git a/notebooks/basics/0_how_to_work_with_onnx.ipynb b/notebooks/basics/0_how_to_work_with_onnx.ipynb
index 514efd1693d667af896e89902a264ea7e6e01da7..35a83ea97b87bbe78ae1ff58a5ee50a0b0420a8f 100644
--- a/notebooks/basics/0_how_to_work_with_onnx.ipynb
+++ b/notebooks/basics/0_how_to_work_with_onnx.ipynb
@@ -24,7 +24,7 @@
    "source": [
     "### How to create a simple ONNX model\n",
     "\n",
-    "To explain how to create an ONNX model a simple example with mathematical operations is used. All nodes are from the [standard operations library of ONNX](https://github.com/onnx/onnx/blob/master/docs/Operators.md).\n",
+    "To explain how to create an ONNX model a simple example with mathematical operations is used. All nodes are from the [standard operations library of ONNX](https://github.com/onnx/onnx/blob/main/docs/Operators.md).\n",
     "\n",
     "First ONNX is imported, then the helper function can be used to make a node."
    ]
@@ -36,6 +36,7 @@
    "outputs": [],
    "source": [
     "import onnx\n",
+    "from qonnx.util.basic import qonnx_make_model\n",
     "\n",
     "Add1_node = onnx.helper.make_node(\n",
     "    'Add',\n",
@@ -158,7 +159,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "onnx_model = onnx.helper.make_model(graph, producer_name=\"simple-model\")\n",
+    "onnx_model = qonnx_make_model(graph, producer_name=\"simple-model\")\n",
     "onnx.save(onnx_model, '/tmp/simple_model.onnx')"
    ]
   },
@@ -304,7 +305,7 @@
    "source": [
     "### How to manipulate an ONNX model\n",
     "\n",
-    "In the model there are two successive adder nodes. An adder node in ONNX can only add two inputs, but there is also the [**sum**](https://github.com/onnx/onnx/blob/master/docs/Operators.md#Sum) node, which can process more than two inputs. So it would be a reasonable change of the graph to combine the two successive adder nodes to one sum node."
+    "In the model there are two successive adder nodes. An adder node in ONNX can only add two inputs, but there is also the [**sum**](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Sum) node, which can process more than two inputs. So it would be a reasonable change of the graph to combine the two successive adder nodes to one sum node."
    ]
   },
   {
@@ -550,7 +551,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "onnx_model1 = onnx.helper.make_model(graph, producer_name=\"simple-model1\")\n",
+    "onnx_model1 = qonnx_make_model(graph, producer_name=\"simple-model1\")\n",
     "onnx.save(onnx_model1, '/tmp/simple_model1.onnx')"
    ]
   },
@@ -598,7 +599,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
diff --git a/notebooks/basics/1_brevitas_network_import.ipynb b/notebooks/basics/1_brevitas_network_import.ipynb
index 5fb29754dc0ad56c2d07c783cf43102975b1621b..a884e90d7572789fc64cf9b953b5730590d4e8f1 100644
--- a/notebooks/basics/1_brevitas_network_import.ipynb
+++ b/notebooks/basics/1_brevitas_network_import.ipynb
@@ -297,7 +297,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
diff --git a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb
index a2747e3921dc8e5a8427b4d5d9b7f143a57b018f..8ea6a3500955736ea3aaa803684b0e403b06a866 100644
--- a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb
+++ b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb
@@ -46,7 +46,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into 5 sections represented by a different color, each of it includes several flow steps. The flow starts in top left corner with Brevitas export (green section), followed by the preparation of the network (blue section) for the Vivado HLS synthesis and Vivado IPI stitching (orange section), and finally building a PYNQ overlay bitfile and testing it on a PYNQ board (yellow section).\n",
+    "The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into 5 sections represented by a different color, each of it includes several flow steps. The flow starts in top left corner with Brevitas export (green section), followed by the preparation of the network (blue section) for the Vitis HLS synthesis and Vivado IPI stitching (orange section), and finally building a PYNQ overlay bitfile and testing it on a PYNQ board (yellow section).\n",
     "There is an additional section for functional verification (red section) on the left side of the diagram, which we will not cover in this notebook. For details please take a look in the verification notebook which you can find [here](tfc_end2end_verification.ipynb)\n",
     "\n",
     "\n",
@@ -199,7 +199,7 @@
     "\n",
     "![](cnv-mp-fc.png)\n",
     "\n",
-    "Note how the convolution layer looks very similar to the fully connected one in terms of the matrix-vector-threshold unit (MVTU), but now the MVTU is preceded by a sliding window unit that produces the matrix from the input image. All of these building blocks, including the `MaxPool` layer you see in this figure, exist as templated Vivado HLS C++ functions in [finn-hlslib](https://github.com/Xilinx/finn-hlslib).\n",
+    "Note how the convolution layer looks very similar to the fully connected one in terms of the matrix-vector-threshold unit (MVTU), but now the MVTU is preceded by a sliding window unit that produces the matrix from the input image. All of these building blocks, including the `MaxPool` layer you see in this figure, exist as templated Vitis HLS C++ functions in [finn-hlslib](https://github.com/Xilinx/finn-hlslib).\n",
     "\n",
     "\n",
     "To target this kind of hardware architecture with our network we'll apply a convolution lowering transformation, in addition to streamlining. You may recall the *streamlining transformation* that we applied to the TFC-w1a1 network, which is a series of mathematical simplifications that allow us to get rid of floating point scaling operations by implementing few-bit activations as thresholding operations. \n",
@@ -359,21 +359,21 @@
     "fc_layers = model.get_nodes_by_op_type(\"MatrixVectorActivation\")\n",
     "# each tuple is (PE, SIMD, in_fifo_depth) for a layer\n",
     "folding = [\n",
-    "    (16, 3, 128),\n",
-    "    (32, 32, 128),\n",
-    "    (16, 32, 128),\n",
-    "    (16, 32, 128),\n",
-    "    (4, 32, 81),\n",
-    "    (1, 32, 2),\n",
-    "    (1, 4, 2),\n",
-    "    (1, 8, 128),\n",
-    "    (5, 1, 3),\n",
+    "    (16, 3, [128]),\n",
+    "    (32, 32, [128]),\n",
+    "    (16, 32, [128]),\n",
+    "    (16, 32, [128]),\n",
+    "    (4, 32, [81]),\n",
+    "    (1, 32, [2]),\n",
+    "    (1, 4, [2]),\n",
+    "    (1, 8, [128]),\n",
+    "    (5, 1, [3]),\n",
     "]\n",
     "for fcl, (pe, simd, ififodepth) in zip(fc_layers, folding):\n",
     "    fcl_inst = getCustomOp(fcl)\n",
     "    fcl_inst.set_nodeattr(\"PE\", pe)\n",
     "    fcl_inst.set_nodeattr(\"SIMD\", simd)\n",
-    "    fcl_inst.set_nodeattr(\"inFIFODepth\", ififodepth)\n",
+    "    fcl_inst.set_nodeattr(\"inFIFODepths\", ififodepth)\n",
     "\n",
     "# use same SIMD values for the sliding window operators\n",
     "swg_layers = model.get_nodes_by_op_type(\"ConvolutionInputGenerator\")\n",
@@ -462,11 +462,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## 5. Deployment and Remote Execution\n",
+    "## 5. Deployment and Execution\n",
     "\n",
-    "Now that we're done with the hardware generation, we can copy the necessary files onto our PYNQ board.\n",
-    "\n",
-    "**Make sure you've [set up the SSH keys for your PYNQ board](https://finn-dev.readthedocs.io/en/latest/getting_started.html#pynq-board-first-time-setup) before executing this step.**"
+    "The bitfile and generated driver files(s) will be copied into a deployment folder which then can be used to run the network on the PYNQ board."
    ]
   },
   {
@@ -475,33 +473,33 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import os\n",
+    "from shutil import copy\n",
+    "from distutils.dir_util import copy_tree\n",
+    "\n",
+    "# create directory for deployment files\n",
+    "deployment_dir = make_build_dir(prefix=\"pynq_deployment_\")\n",
+    "model.set_metadata_prop(\"pynq_deployment_dir\", deployment_dir)\n",
     "\n",
-    "# set up the following values according to your own environment\n",
-    "# FINN will use ssh to deploy and run the generated accelerator\n",
-    "ip = \"192.168.2.99\"\n",
-    "username = os.getenv(\"PYNQ_USERNAME\", \"xilinx\")\n",
-    "password = os.getenv(\"PYNQ_PASSWORD\", \"xilinx\")\n",
-    "port = os.getenv(\"PYNQ_PORT\", 22)\n",
-    "target_dir = os.getenv(\"PYNQ_TARGET_DIR\", \"/home/xilinx/finn_cnv_end2end_example\")\n",
-    "# set up ssh options to only allow publickey authentication\n",
-    "options = \"-o PreferredAuthentications=publickey -o PasswordAuthentication=no\"\n",
+    "# get and copy necessary files\n",
+    "# .bit and .hwh file\n",
+    "bitfile = model.get_metadata_prop(\"bitfile\")\n",
+    "hwh_file = model.get_metadata_prop(\"hw_handoff\")\n",
+    "deploy_files = [bitfile, hwh_file]\n",
     "\n",
-    "# test access to PYNQ board\n",
-    "! ssh {options} {username}@{ip} -p {port} cat /var/run/motd.dynamic"
+    "for dfile in deploy_files:\n",
+    "    if dfile is not None:\n",
+    "        copy(dfile, deployment_dir)\n",
+    "\n",
+    "# driver.py and python libraries\n",
+    "pynq_driver_dir = model.get_metadata_prop(\"pynq_driver_dir\")\n",
+    "copy_tree(pynq_driver_dir, deployment_dir)"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ\n",
-    "\n",
-    "model = ModelWrapper(build_dir + \"/end2end_cnv_w1a1_synth.onnx\")\n",
-    "model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))\n",
-    "model.save(build_dir + \"/end2end_cnv_w1a1_pynq_deploy.onnx\")"
+    "Next to these files, we will also need an example numpy array to test the network on the PYNQ board. (*and before you ask, that's supposed to be a cat (CIFAR-10 class number 3)*) Recall that we partitioned our original network into a parent graph that contained the non-synthesizable nodes and a child graph that contained the bulk of the network, which we turned into a bitfile. The only operator left outside the FPGA partition was a `Transpose` to convert NCHW images into NHWC ones. Thus, we can skip the execution in the parent as long as we ensure our image has the expected data layout. The example numpy array can then be saved as .npy file."
    ]
   },
   {
@@ -510,8 +508,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "target_dir_pynq = target_dir + \"/\" + model.get_metadata_prop(\"pynq_deployment_dir\").split(\"/\")[-1]\n",
-    "target_dir_pynq"
+    "import pkg_resources as pk\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "\n",
+    "fn = pk.resource_filename(\"finn.qnn-data\", \"cifar10/cifar10-test-data-class3.npz\")\n",
+    "x = np.load(fn)[\"arr_0\"]\n",
+    "x = x.reshape(3, 32,32).transpose(1, 2, 0)\n",
+    "plt.imshow(x)"
    ]
   },
   {
@@ -520,14 +524,19 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "! ssh {options} {username}@{ip} -p {port} 'ls -l {target_dir_pynq}'"
+    "model = ModelWrapper(build_dir + \"/end2end_cnv_w1a1_pynq_deploy.onnx\")\n",
+    "iname = model.graph.input[0].name\n",
+    "ishape = model.get_tensor_shape(iname)\n",
+    "np.save(deployment_dir + \"/input.npy\", x.reshape(ishape))"
    ]
   },
   {
-   "cell_type": "markdown",
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
-    "We only have two more steps to be able to remotely execute the deployed bitfile with some test data from the CIFAR-10 dataset. Let's load up some test data that comes bundled with FINN -- *and before you ask, that's supposed to be a cat (CIFAR-10 class number 3)*."
+    "! ls {deployment_dir}"
    ]
   },
   {
@@ -536,54 +545,34 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import pkg_resources as pk\n",
-    "import matplotlib.pyplot as plt\n",
-    "import numpy as np\n",
-    "\n",
-    "fn = pk.resource_filename(\"finn.qnn-data\", \"cifar10/cifar10-test-data-class3.npz\")\n",
-    "x = np.load(fn)[\"arr_0\"]\n",
-    "x = x.reshape(3, 32,32).transpose(1, 2, 0)\n",
-    "plt.imshow(x)"
+    "from shutil import make_archive\n",
+    "make_archive('deploy-on-pynq-cnv', 'zip', deployment_dir)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Recall that we partitioned our original network into a parent graph that contained the non-synthesizable nodes and a child graph that contained the bulk of the network, which we turned into a bitfile. The only operator left outside the FPGA partition was a `Transpose` to convert NCHW images into NHWC ones. Thus, we can skip the execution in the parent as long as we ensure our image has the expected data layout, which we have done above."
+    "You can now download the created zipfile (File -> Open, mark the checkbox next to the deploy-on-pynq-tfc.zip and select Download from the toolbar), then copy it to your PYNQ board (for instance via scp or rsync). Then, run the following commands on the PYNQ board to extract the archive and run the execution:"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import numpy as np\n",
-    "from finn.core.onnx_exec import execute_onnx\n",
-    "\n",
-    "model = ModelWrapper(build_dir + \"/end2end_cnv_w1a1_pynq_deploy.onnx\")\n",
-    "iname = model.graph.input[0].name\n",
-    "oname = model.graph.output[0].name\n",
-    "ishape = model.get_tensor_shape(iname)\n",
-    "input_dict = {iname: x.astype(np.float32).reshape(ishape)}\n",
-    "ret = execute_onnx(model, input_dict, True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "ret[oname]"
+    "```shell\n",
+    "unzip deploy-on-pynq-cnv.zip -d finn-cnv-demo\n",
+    "cd finn-cnv-demo\n",
+    "sudo python3 -m pip install bitstring\n",
+    "sudo python3 driver.py --exec_mode=execute --batchsize=1 --bitfile=resizer.bit --inputfile=input.npy\n",
+    "```"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We see that the network correctly predicts this as a class 3 (\"cat\"). "
+    "The output will be saved on the PYNQ board as `output.npy` and can be copied to the host and opened with `np.load()`."
    ]
   },
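+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a minimal sketch (assuming `output.npy` has been copied back next to this notebook), the result can be inspected on the host like this:\n",
+    "\n",
+    "```python\n",
+    "import numpy as np\n",
+    "\n",
+    "out = np.load(\"output.npy\")\n",
+    "print(out)  # for the bundled test image, this should indicate class 3 (\"cat\")\n",
+    "```"
+   ]
+  },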
   {
@@ -592,7 +581,7 @@
    "source": [
     "### Validating the Accuracy on a PYNQ Board <a id='validation'></a>\n",
     "\n",
-    "All the command line prompts here are meant to be executed with `sudo` on the PYNQ board, so we'll use a workaround (`echo password | sudo -S command`) to get that working from this notebook running on the host computer.\n",
+    "All the command line prompts here are meant to be executed with `sudo` on the PYNQ board.\n",
     "\n",
     "**Ensure that your PYNQ board has a working internet connecting for the next steps, since some there is some downloading involved.**\n",
     "\n",
@@ -601,16 +590,9 @@
     "\n",
     "Command to execute on PYNQ:\n",
     "\n",
-    "```pip3 install git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading```"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "! ssh {options} -t {username}@{ip} -p {port} 'echo {password} | sudo -S pip3 install git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading'"
+    "```shell\n",
+    "sudo pip3 install git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading\n",
+    "```"
    ]
   },
   {
@@ -621,16 +603,9 @@
     "\n",
     "Command to execute on PYNQ:\n",
     "\n",
-    "`python3.6 validate.py --dataset cifar10 --batchsize 1000`"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "! ssh {options} -t {username}@{ip} -p {port} 'cd {target_dir_pynq}; echo {password} | sudo -S python3.6 validate.py --dataset cifar10 --batchsize 1000'"
+    "```shell\n",
+    "sudo python3 validate.py --dataset cifar10 --batchsize 1000\n",
+    "```"
    ]
   },
   {
@@ -643,7 +618,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
diff --git a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb
index a6f05df30925250df1704afb6f9ff9dc7dc17dc0..7e9980cf2abf40219847eab2c0cd381cc32f6682 100644
--- a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb
+++ b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb
@@ -33,7 +33,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into 5 sections represented by a different color, each of it includes several flow steps. The flow starts in top left corner with Brevitas export (green section), followed by the preparation of the network (blue section) for the Vivado HLS synthesis and Vivado IPI stitching (orange section), and finally building a PYNQ overlay bitfile and testing it on a PYNQ board (yellow section).\n",
+    "The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into 5 sections represented by a different color, each of it includes several flow steps. The flow starts in top left corner with Brevitas export (green section), followed by the preparation of the network (blue section) for the Vitis HLS synthesis and Vivado IPI stitching (orange section), and finally building a PYNQ overlay bitfile and testing it on a PYNQ board (yellow section).\n",
     "There is an additional section for functional verification (red section) on the right side of the diagram, which we will not cover in this notebook. For details please take a look in the verification notebook which you can find [here](tfc_end2end_verification.ipynb)\n",
     "\n",
     "\n",
@@ -161,7 +161,7 @@
     "\n",
     "![](finn-hw-arch.png)\n",
     "\n",
-    "In practice, the compute arrays are instantiated by function calls to optimized Vivado HLS building blocks from the [finn-hlslib](https://github.com/Xilinx/finn-hlslib) library. As these function calls can only handle certain patterns/cases, we need to transform the network into an appropriate form so that we can replace network layers with these function calls, which is the goal of the network preparation process."
+    "In practice, the compute arrays are instantiated by function calls to optimized Vitis HLS building blocks from the [finn-hlslib](https://github.com/Xilinx/finn-hlslib) library. As these function calls can only handle certain patterns/cases, we need to transform the network into an appropriate form so that we can replace network layers with these function calls, which is the goal of the network preparation process."
    ]
   },
   {
@@ -248,7 +248,7 @@
     "\n",
     "In FINN, we can bake some of these pre/postprocessing operatings into the graph, and in some cases these can be highly beneficial for performance by allowing our accelerator to directly consume raw data instead of going through CPU preprocessing. \n",
     "\n",
-    "We'll demonstrate this for our small image classification network as follows. Brevitas preprocesses BNN-PYNQ network inputs with `torchvision.transforms.ToTensor()` [prior to training](https://github.com/Xilinx/brevitas/blob/master/src/brevitas_examples/bnn_pynq/trainer.py#L104), which converts 8-bit RGB values into floats between 0 and 1 by dividing the input by 255. We can achieve the same effect in FINN by exporting a single-node ONNX graph for division by 255 (which already exists as `finn.util.pytorch.ToTensor` and merging this with our original model. Finally, we're going to mark our input tensor as 8-bit to let FINN know which level of precision to use."
+    "We'll demonstrate this for our small image classification network as follows. Brevitas preprocesses BNN-PYNQ network inputs with `torchvision.transforms.ToTensor()` [prior to training](https://github.com/Xilinx/brevitas/blob/master/src/brevitas_examples/bnn_pynq/trainer.py#L86), which converts 8-bit RGB values into floats between 0 and 1 by dividing the input by 255. We can achieve the same effect in FINN by exporting a single-node ONNX graph for division by 255 (which already exists as `finn.util.pytorch.ToTensor` and merging this with our original model. Finally, we're going to mark our input tensor as 8-bit to let FINN know which level of precision to use."
    ]
   },
   {
@@ -343,7 +343,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "As can be seen, several transformations are involved in the streamlining transformation. There are move and collapse transformations. In the last step the operations are transformed into multithresholds. The involved transformations can be viewed in detail [here](https://github.com/Xilinx/finn/tree/master/src/finn/transformation/streamline). After each transformation, three of the tidy-up transformations (`GiveUniqueNodeNames`, `GiveReadableTensorNames` and `InferDataTypes`) are applied to the model.\n",
+    "As can be seen, several transformations are involved in the streamlining transformation. There are move and collapse transformations. In the last step the operations are transformed into multithresholds. The involved transformations can be viewed in detail [here](https://github.com/Xilinx/finn/tree/main/src/finn/transformation/streamline). After each transformation, three of the tidy-up transformations (`GiveUniqueNodeNames`, `GiveReadableTensorNames` and `InferDataTypes`) are applied to the model.\n",
     "\n",
     "After streamlining the network looks as follows:"
    ]
@@ -525,7 +525,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We can use the higher-level [HLSCustomOp](https://github.com/Xilinx/finn/blob/main/src/finn/custom_op/fpgadataflow/__init__.py) wrappers for this node. These wrappers provide easy access to specific properties of these nodes, such as the folding factors (PE and SIMD). Let's have a look at which node attributes are defined by the CustomOp wrapper, and adjust the SIMD and PE attributes."
+    "We can use the higher-level [HLSCustomOp](https://github.com/Xilinx/finn/blob/main/src/finn/custom_op/fpgadataflow/hlscustomop.py) wrappers for this node. These wrappers provide easy access to specific properties of these nodes, such as the folding factors (PE and SIMD). Let's have a look at which node attributes are defined by the CustomOp wrapper, and adjust the SIMD and PE attributes."
    ]
   },
   {
@@ -547,7 +547,7 @@
    "metadata": {},
    "source": [
     "We can see that the PE and SIMD are listed as node attributes, as well as the depths of the FIFOs that will be inserted between consecutive layers, and all can be adjusted using `set_nodeattr` subject to certain constraints. There are also a lot of additional attributes that can be set for this node type.\n",
-    "**In this notebook we are setting the folding factors and FIFO depths manually, but in a future version we will support determining the folding factors given an FPGA resource budget according to the analytical model from the [FINN-R paper](https://arxiv.org/pdf/1809.04570).**"
+    "**In this notebook we are setting the folding factors and FIFO depths manually but it is possible to use FINN transformations for this ([SetFolding](https://github.com/Xilinx/finn/blob/main/src/finn/transformation/fpgadataflow/set_folding.py) and [InsertAndSetFIFODepths](https://github.com/Xilinx/finn/blob/main/src/finn/transformation/fpgadataflow/set_fifo_depths.py)).**"
    ]
   },
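+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For reference, a minimal sketch of the automated route (not run in this notebook; the `target_cycles_per_frame` value below is just an example):\n",
+    "\n",
+    "```python\n",
+    "# from finn.transformation.fpgadataflow.set_folding import SetFolding\n",
+    "# model = model.transform(SetFolding(target_cycles_per_frame=1000))\n",
+    "```"
+   ]
+  },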
   {
@@ -559,17 +559,17 @@
     "fc_layers = model.get_nodes_by_op_type(\"MatrixVectorActivation\")\n",
     "# (PE, SIMD, in_fifo_depth, out_fifo_depth, ramstyle) for each layer\n",
     "config = [\n",
-    "    (16, 49, 16, 64, \"block\"),\n",
-    "    (8, 8, 64, 64, \"auto\"),\n",
-    "    (8, 8, 64, 64, \"auto\"),\n",
-    "    (10, 8, 64, 10, \"distributed\"),\n",
+    "    (16, 49, [16], [64], \"block\"),\n",
+    "    (8, 8, [64], [64], \"auto\"),\n",
+    "    (8, 8, [64], [64], \"auto\"),\n",
+    "    (10, 8, [64], [10], \"distributed\"),\n",
     "]\n",
     "for fcl, (pe, simd, ififo, ofifo, ramstyle) in zip(fc_layers, config):\n",
     "    fcl_inst = getCustomOp(fcl)\n",
     "    fcl_inst.set_nodeattr(\"PE\", pe)\n",
     "    fcl_inst.set_nodeattr(\"SIMD\", simd)\n",
-    "    fcl_inst.set_nodeattr(\"inFIFODepth\", ififo)\n",
-    "    fcl_inst.set_nodeattr(\"outFIFODepth\", ofifo)\n",
+    "    fcl_inst.set_nodeattr(\"inFIFODepths\", ififo)\n",
+    "    fcl_inst.set_nodeattr(\"outFIFODepths\", ofifo)\n",
     "    fcl_inst.set_nodeattr(\"ram_style\", ramstyle)\n",
     "    \n",
     "# set parallelism for input quantizer to be same as first layer's SIMD\n",
@@ -590,7 +590,7 @@
    "metadata": {},
    "source": [
     "Besides PE and SIMD three other node attributes are set. `ram_style` specifies how the weights are to be stored (BRAM, LUTRAM, and so on). It can be selected explicitly or with the option `auto` you can let Vivado decide.\n",
-    "`inFIFODepth` and `outFIFODepth` specifies the FIFO depths that is needed by the node from the surrounding FIFOs. These attributes are used in the transformation 'InsertFIFO' to insert the appropriate FIFOs between the nodes, which will be automatically called as part of the hardware build process.\n",
+    "`inFIFODepths` and `outFIFODepths` specifies the FIFO depths that is needed by the node from the surrounding FIFOs. These attributes are used in the transformation 'InsertFIFO' to insert the appropriate FIFOs between the nodes, which will be automatically called as part of the hardware build process.\n",
     "\n",
     "In previous versions of FINN we had to call transformations to insert data width converters, FIFOs and `TLastMarker` manually at this step. This is no longer needed, as all this is taken care of by the `ZynqBuild` or `VitisBuild` transformations."
    ]
@@ -609,7 +609,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "This completes the network preparation and the network can be passed on to the next block *Vivado HLS and IPI*, which is described below."
+    "This completes the network preparation and the network can be passed on to the next block *Vitis HLS and IPI*, which is described below."
    ]
   },
   {
@@ -798,23 +798,21 @@
    "source": [
     "## 4.  PYNQ deployment <a id='hw_test'></a>\n",
     "\n",
-    "* [Deployment and Remote Execution](#deploy)\n",
+    "* [Deployment](#deploy)\n",
     "* [Validation on PYNQ Board](#validation)\n",
     "* [Throughput Test on PYNQ Board](#throughput)\n",
     "\n",
     "\n",
-    "We are almost done preparing our hardware design. We'll now put it in a form suitable for use as a PYNQ overlay, synthesize and deploy it."
+    "The bitfile and generated driver will be copied together with some necessary files for execution into a deployment folder which then can be used to run the network on the PYNQ board."
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Deployment and Remote Execution <a id='deploy'></a>\n",
+    "### Deployment <a id='deploy'></a>\n",
     "\n",
-    "We'll now use the `DeployToPYNQ` transformation to create a deployment folder with the bitfile and driver file(s), and copy that to the PYNQ board. You can change the default IP address, username, password and target folder for the PYNQ below.\n",
-    "\n",
-    "**Make sure you've [set up the SSH keys for your PYNQ board](https://finn-dev.readthedocs.io/en/latest/getting_started.html#pynq-board-first-time-setup) before executing this step.**"
+    "We'll now create a deployment folder with the bitfile and driver file(s), we zip it and afterwards it can be copied to the PYNQ board for execution and validation."
    ]
   },
   {
@@ -823,74 +821,33 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import os\n",
+    "from shutil import copy\n",
+    "from distutils.dir_util import copy_tree\n",
     "\n",
-    "# set up the following values according to your own environment\n",
-    "# FINN will use ssh to deploy and run the generated accelerator\n",
-    "ip = \"192.168.2.99\"\n",
-    "username = os.getenv(\"PYNQ_USERNAME\", \"xilinx\")\n",
-    "password = os.getenv(\"PYNQ_PASSWORD\", \"xilinx\")\n",
-    "port = os.getenv(\"PYNQ_PORT\", 22)\n",
-    "target_dir = os.getenv(\"PYNQ_TARGET_DIR\", \"/home/xilinx/finn_tfc_end2end_example\")\n",
-    "# set up ssh options to only allow publickey authentication\n",
-    "options = \"-o PreferredAuthentications=publickey -o PasswordAuthentication=no\"\n",
+    "# create directory for deployment files\n",
+    "deployment_dir = make_build_dir(prefix=\"pynq_deployment_\")\n",
+    "model.set_metadata_prop(\"pynq_deployment_dir\", deployment_dir)\n",
     "\n",
-    "# test access to PYNQ board\n",
-    "! ssh {options} {username}@{ip} -p {port} cat /var/run/motd.dynamic"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ\n",
+    "# get and copy necessary files\n",
+    "# .bit and .hwh file\n",
+    "bitfile = model.get_metadata_prop(\"bitfile\")\n",
+    "hwh_file = model.get_metadata_prop(\"hw_handoff\")\n",
+    "deploy_files = [bitfile, hwh_file]\n",
     "\n",
-    "model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))\n",
-    "model.save(build_dir + \"/tfc_w1_a1_pynq_deploy.onnx\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Let's verify that the remote access credentials is saved in the model metadata, and that the deployment folder has been successfully copied to the board:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "model.model.metadata_props"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "target_dir_pynq = target_dir + \"/\" + model.get_metadata_prop(\"pynq_deployment_dir\").split(\"/\")[-1]\n",
-    "target_dir_pynq"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "! ssh {options} {username}@{ip} -p {port} 'ls -l {target_dir_pynq}'"
+    "for dfile in deploy_files:\n",
+    "    if dfile is not None:\n",
+    "        copy(dfile, deployment_dir)\n",
+    "\n",
+    "# driver.py and python libraries\n",
+    "pynq_driver_dir = model.get_metadata_prop(\"pynq_driver_dir\")\n",
+    "copy_tree(pynq_driver_dir, deployment_dir)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We only have two more steps to be able to remotely execute the deployed bitfile with some test data from the MNIST dataset. Let's load up some test data that comes bundled with FINN."
+    "Next to these files, we will also need an example numpy array to test the network on the PYNQ board. You may recall that one \"reshape\" node was left out of the StreamingDataflowPartition. We'll do that manually with a numpy function call when passing in the input, but everything else in the network ended up inside the StreamingDataflowPartition so that's all we need to do. The example numpy array can then be saved as .npy file. "
    ]
   },
   {
@@ -914,18 +871,23 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "import numpy as np\n",
+    "\n",
     "model = ModelWrapper(build_dir + \"/tfc_w1_a1_pynq_deploy.onnx\")\n",
     "iname = model.graph.input[0].name\n",
     "oname = parent_model.graph.output[0].name\n",
     "ishape = model.get_tensor_shape(iname)\n",
-    "print(\"Expected network input shape is \" + str(ishape))"
+    "print(\"Expected network input shape is \" + str(ishape))\n",
+    "np.save(deployment_dir + \"/input.npy\", x.reshape(ishape))"
    ]
   },
   {
-   "cell_type": "markdown",
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
-    "Finally, we can call `execute_onnx` on the graph, which will internally call remote execution with the bitfile, grab the results and return a numpy array. You may recall that one \"reshape\" node was left out of the StreamingDataflowPartition. We'll do that manually with a numpy function call when passing in the input, but everything else in the network ended up inside the StreamingDataflowPartition so that's all we need to do."
+    "! ls {deployment_dir}"
    ]
   },
   {
@@ -934,27 +896,34 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import numpy as np\n",
-    "from finn.core.onnx_exec import execute_onnx\n",
-    "\n",
-    "input_dict = {iname: x.reshape(ishape)}\n",
-    "ret = execute_onnx(model, input_dict)"
+    "from shutil import make_archive\n",
+    "make_archive('deploy-on-pynq-tfc', 'zip', deployment_dir)"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You can now download the created zipfile (**File -> Open**, mark the checkbox next to the `deploy-on-pynq-tfc.zip` and select Download from the toolbar), then copy it to your PYNQ board (for instance via `scp` or `rsync`). Then, run the following commands **on the PYNQ board** to extract the archive and run the execution:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "ret[oname]"
+    "```shell\n",
+    "unzip deploy-on-pynq-tfc.zip -d finn-tfc-demo\n",
+    "cd finn-tfc-demo\n",
+    "sudo python3 -m pip install bitstring\n",
+    "sudo python3 driver.py --exec_mode=execute --batchsize=1 --bitfile=resizer.bit --inputfile=input.npy\n",
+    "```"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We see that the network correctly predicts this as a digit 2."
+    "The output will be saved on the PYNQ board as `output.npy` and can be copied to the host and opened with `np.load()`."
    ]
   },
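+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a minimal sketch (assuming `output.npy` has been copied back next to this notebook), the result can be inspected on the host like this:\n",
+    "\n",
+    "```python\n",
+    "import numpy as np\n",
+    "\n",
+    "out = np.load(\"output.npy\")\n",
+    "print(out)  # for the bundled test image, this should indicate digit 2\n",
+    "```"
+   ]
+  },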
   {
@@ -963,25 +932,16 @@
    "source": [
     "### Validating the Accuracy on a PYNQ Board <a id='validation'></a>\n",
     "\n",
-    "All the command line prompts here are meant to be executed with `sudo` on the PYNQ board, so we'll use a workaround (`echo password | sudo -S command`) to get that working from this notebook running on the host computer.\n",
-    "\n",
     "**Ensure that your PYNQ board has a working internet connecting for the next steps, since there is some downloading involved.**\n",
     "\n",
     "To validate the accuracy, we first need to install the [`dataset-loading`](https://github.com/fbcotter/dataset_loading) Python package to the PYNQ board. This will give us a convenient way of downloading and accessing the MNIST dataset.\n",
     "\n",
     "\n",
-    "Command to execute on PYNQ:\n",
+    "Command to execute on PYNQ board:\n",
     "\n",
-    "```sudo pip3 install git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading```"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "! ssh {options} -t {username}@{ip} -p {port} 'echo {password} | sudo -S pip3 install git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading'"
+    "```shell\n",
+    "sudo pip3 install git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading\n",
+    "```"
    ]
   },
   {
@@ -990,18 +950,11 @@
    "source": [
     "We can now use the `validate.py` script that was generated together with the driver to measure top-1 accuracy on the MNIST dataset.\n",
     "\n",
-    "Command to execute on PYNQ:\n",
+    "Command to execute on PYNQ board:\n",
     "\n",
-    "`python3.6 validate.py --dataset mnist --batchsize 1000`"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "! ssh {options} -t {username}@{ip} -p {port} 'cd {target_dir_pynq}; echo {password} | sudo -S python3.6 validate.py --dataset mnist --batchsize 1000'"
+    "```shell\n",
+    "sudo python3 validate.py --dataset mnist --batchsize 1000\n",
+    "```"
    ]
   },
   {
@@ -1016,60 +969,30 @@
    "metadata": {},
    "source": [
     "### Throughput Test on PYNQ Board <a id='throughput'></a>\n",
-    "In addition to the functional verification, FINN also offers the possibility to measure the network performance directly on the PYNQ board. This can be done using the core function `throughput_test`. In the next section we import the function and execute it.\n",
-    "First we extract the `remote_exec_model` again and pass it to the function. The function returns the metrics of the network as dictionary. "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from finn.core.throughput_test import throughput_test_remote\n",
-    "\n",
-    "model = ModelWrapper(build_dir + \"/tfc_w1_a1_pynq_deploy.onnx\")\n",
-    "res = throughput_test_remote(model, 10000)\n",
-    "print(\"Network metrics:\")\n",
-    "for key in res:\n",
-    "    print(str(key) + \": \" + str(res[key]))"
+    "In addition to the functional verification, FINN also offers the possibility to measure the network performance directly on the PYNQ board. This can be done setting the `exec_mode` to `throughput_test`. \n",
+    "Command to execute on PYNQ board:"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Together with the values for folding we can evaluate the performance of our accelerator. Each layer has a total folding factor of 64 and because the network is fully pipelined, it follows: `II = 64`. II is the initiation interval and indicates how many cycles are needed for one input to be processed. "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "II = 64\n",
-    "# frequency in MHz\n",
-    "f_MHz = 100\n",
-    "# expected throughput in MFPS\n",
-    "expected_throughput = f_MHz / II\n",
-    "# measured throughput (FPS) from throughput test, converted to MFPS\n",
-    "measured_throughput = res[\"throughput[images/s]\"] * 0.000001\n",
-    "# peformance\n",
-    "print(\"We reach approximately \" + str(round((measured_throughput / expected_throughput)*100)) + \"% of the ideal performance.\")"
+    "```shell\n",
+    "sudo python3 driver.py --exec_mode=throughput_test --batchsize=1000 --bitfile=resizer.bit\n",
+    "```"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The measured values were recorded with a batch size of 10000 and at a frequency of 100 MHz. We will be improving the efficiency of the generated accelerator examples in the coming FINN releases."
+    "The network metrics from the throughput test are saved in a file called `nw_metrics.txt` on the PYNQ board. Which can be investigated after running the command above."
    ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
diff --git a/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb
index 813127197e07e4ddb5ec5ff39aed0278e117babc..6c3b7965098e013fa35ac5f5b2b481e678d68f5d 100644
--- a/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb
+++ b/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb
@@ -61,7 +61,7 @@
     "fc = get_test_model_trained(\"TFC\", 1, 1)\n",
     "raw_i = get_data(\"qonnx.data\", \"onnx/mnist-conv/test_data_set_0/input_0.pb\")\n",
     "input_tensor = onnx.load_tensor_from_string(raw_i)\n",
-    "input_brevitas = torch.from_numpy(nph.to_array(input_tensor)).float()\n",
+    "input_brevitas = torch.from_numpy(nph.to_array(input_tensor).copy()).float()\n",
     "output_golden = fc.forward(input_brevitas).detach().numpy()\n",
     "output_golden"
    ]
@@ -72,7 +72,7 @@
    "source": [
     "## Simulation using Python <a id='simpy'></a>\n",
     "\n",
-    "If an ONNX model consists of [standard ONNX](https://github.com/onnx/onnx/blob/master/docs/Operators.md) nodes and/or FINN custom operations that do not belong to the fpgadataflow (`backend` $\\neq$ `fpgadataflow`) this model can be checked for functionality using Python.\n",
+    "If an ONNX model consists of [standard ONNX](https://github.com/onnx/onnx/blob/main/docs/Operators.md) nodes and/or FINN custom operations that do not belong to the fpgadataflow (`backend` $\\neq$ `fpgadataflow`) this model can be checked for functionality using Python.\n",
     "\n",
     "To simulate a standard ONNX node [onnxruntime](https://github.com/microsoft/onnxruntime) is used. onnxruntime is an open source tool developed by Microsoft to run standard ONNX nodes. For the FINN custom op nodes execution, functions are defined. The following is an example of the execution function of a XNOR popcount node.\n"
    ]
@@ -383,7 +383,15 @@
     "\n",
     "child_model = ModelWrapper(build_dir + \"/tfc_w1_a1_dataflow_child.onnx\")\n",
     "child_model = child_model.transform(InsertDWC())\n",
-    "child_model = child_model.transform(InsertFIFO())\n",
+    "\n",
+    "# set all impl_styles of the DWCs to hls to enable emulation\n",
+    "dwc_nodes = child_model.get_nodes_by_op_type(\"StreamingDataWidthConverter_Batch\")\n",
+    "for dwc in dwc_nodes:\n",
+    "    dwc_inst = getCustomOp(dwc)\n",
+    "    dwc_inst.set_nodeattr(\"impl_style\", \"hls\")\n",
+    "    \n",
+    "child_model = child_model.transform(InsertFIFO(create_shallow_fifos=True))\n",
+    "child_model.save(build_dir + \"/test.onnx\");\n",
     "child_model = child_model.transform(GiveUniqueNodeNames())\n",
     "child_model = child_model.transform(PrepareIP(test_fpga_part, target_clk_ns))\n",
     "child_model = child_model.transform(HLSSynthIP())\n",
@@ -431,7 +439,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
diff --git a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb
index 5625a6f1c20ee5e4a66df28931a6a891f699a738..3d77586258b9ddb64985e7f7b7a2215565839c50 100644
--- a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb
+++ b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb
@@ -741,7 +741,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
diff --git a/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb b/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb
index 370312c77e90c67a3095e0800ad0c6046bfd75f4..e4848a1f40bed5865eccc1d831a634ac5f54e965 100644
--- a/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb
+++ b/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb
@@ -381,7 +381,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
diff --git a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb
index 33adb68dc8ddfff1b427d82e4666a70e883bf2c8..a18cafd6044328d53139acafb2be2cf73a4ec9b6 100644
--- a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb
+++ b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb
@@ -624,7 +624,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
diff --git a/requirements.txt b/requirements.txt
index 9038a5e8170301421529e0b570482316e4fff20a..83aad07d729e30cbbbaf565b4332fb1f7ae6f014 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,14 +2,14 @@ bitstring==3.1.7
 clize==4.1.1
 dataclasses-json==0.5.7
 docrep==0.2.7
-future==0.18.2
 gspread==3.6.0
 numpy==1.22.0
-onnx==1.11.0
+onnx==1.13.0
 onnxoptimizer
 onnxruntime==1.11.1
 pre-commit==2.9.2
-protobuf==3.20.2
+protobuf==3.20.3
+psutil==5.9.4
 pyscaffold==3.2.1
 scipy==1.5.2
 setupext-janitor>=1.1.2
diff --git a/setup.cfg b/setup.cfg
index a1d0fef6cb08994ae8666fd2ea37166bf1cd3752..1893aa42316dad341fcedbd527f5abcf482e5cfb 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -72,18 +72,20 @@ exclude =
 # Add here additional requirements for extra features, to install with:
 # `pip install FINN[PDF]` like:
 # PDF = ReportLab; RXP
-# finn-base is needed to build the full set of docs
+# qonnx is needed to build the full set of docs
 docs =
-    finn-base==0.0.3
     docutils==0.17.1
     dataclasses-json==0.5.7
     gspread==3.6.0
+    IPython
     pytest
     netron
     vcdvcd
     torchvision
     torch
     qonnx@git+https://github.com/fastmachinelearning/qonnx@main#egg=qonnx
+    pyverilator@git+https://github.com/maltanar/pyverilator@master#egg=pyverilator
+    brevitas@git+https://github.com/Xilinx/brevitas@master#egg=brevitas_examples
 
 # Add here test requirements (semicolon/line-separated)
 testing =
diff --git a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py
index 238083f653d410772a81115ff12dd987835d1f32..d6864994a70a0ea4c24567155ff7c0599bc0fb6f 100644
--- a/src/finn/builder/build_dataflow.py
+++ b/src/finn/builder/build_dataflow.py
@@ -155,12 +155,14 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig):
                 % (step_name, step_num, len(build_dataflow_steps))
             )
             # redirect output to logfile
-            sys.stdout = stdout_logger
-            sys.stderr = stderr_logger
-            print(
-                "Running step: %s [%d/%d]"
-                % (step_name, step_num, len(build_dataflow_steps))
-            )
+            if not cfg.verbose:
+                sys.stdout = stdout_logger
+                sys.stderr = stderr_logger
+                # also log current step name to logfile
+                print(
+                    "Running step: %s [%d/%d]"
+                    % (step_name, step_num, len(build_dataflow_steps))
+                )
             # run the step
             step_start = time.time()
             model = transform_step(model, cfg)
diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py
index 09e9ec3a564dc2b459cd1ea3205e541f922b1af0..a38cb6e572d683871a924330742a1859b6fbe75d 100644
--- a/src/finn/builder/build_dataflow_config.py
+++ b/src/finn/builder/build_dataflow_config.py
@@ -37,6 +37,13 @@ from finn.transformation.fpgadataflow.vitis_build import VitisOptStrategy
 from finn.util.basic import alveo_default_platform, alveo_part_map, pynq_part_map
 
 
+class AutoFIFOSizingMethod(str, Enum):
+    "Select the type of automatic FIFO sizing strategy."
+
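+    # CHARACTERIZE derives FIFO sizes from per-node I/O characteristics
+    # obtained via rtlsim; LARGEFIFO_RTLSIM inserts oversized FIFOs, simulates
+    # the stitched design and shrinks them (see step_set_fifo_depths)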
+    CHARACTERIZE = "characterize"
+    LARGEFIFO_RTLSIM = "largefifo_rtlsim"
+
+
 class ShellFlowType(str, Enum):
     """For builds that produce a bitfile, select the shell flow that will integrate
     the FINN-generated accelerator."""
@@ -246,6 +253,20 @@ class DataflowBuildConfig:
     #: for each FIFO.
     auto_fifo_depths: Optional[bool] = True
 
+    #: Whether FIFO nodes with depth larger than 32768 will be split.
+    #: This allows configuring very large FIFOs in the folding_config_file.
+    split_large_fifos: Optional[bool] = False
+
+    #: When `auto_fifo_depths = True`, select which method will be used for
+    #: setting the FIFO sizes.
+    auto_fifo_strategy: Optional[
+        AutoFIFOSizingMethod
+    ] = AutoFIFOSizingMethod.LARGEFIFO_RTLSIM
+
+    #: If set to True, avoid using C++ rtlsim for auto FIFO sizing and the
+    #: rtlsim throughput test, always using the Python rtlsim instead
+    force_python_rtlsim: Optional[bool] = False
+
     #: Memory resource type for large FIFOs
     #: Only relevant when `auto_fifo_depths = True`
     large_fifo_mem_style: Optional[LargeFIFOMemStyle] = LargeFIFOMemStyle.AUTO
@@ -258,6 +279,10 @@ class DataflowBuildConfig:
     #: Which memory mode will be used for compute layers
     default_mem_mode: Optional[ComputeEngineMemMode] = ComputeEngineMemMode.DECOUPLED
 
+    #: Force inference of RTL ConvolutionInputGenerator over HLS implementation
+    #: If set to False, falls back to the default behavior of InferConvInpGen()
+    force_rtl_conv_inp_gen: Optional[bool] = False
+
     #: Which Vitis platform will be used.
     #: Only relevant when `shell_flow_type = ShellFlowType.VITIS_ALVEO`
     #: e.g. "xilinx_u250_xdma_201830_2"
@@ -285,6 +310,10 @@ class DataflowBuildConfig:
     #: Whether pdb postmortem debugging will be launched when the build fails
     enable_build_pdb_debug: Optional[bool] = True
 
+    #: When True, all warnings and compiler output will be printed in stdout.
+    #: Otherwise, these will be suppressed and only appear in the build log.
+    verbose: Optional[bool] = False
+
     #: If given, only run the steps in the list. If not, run default steps.
     #: See `default_build_dataflow_steps` for the default list of steps.
     #: When specified:
@@ -312,6 +341,10 @@ class DataflowBuildConfig:
     #: Override the number of inputs for rtlsim performance measurement.
     rtlsim_batch_size: Optional[int] = 1
 
+    #: If set to True, FIFOs and DWCs with impl_style=vivado will be kept during
+    #: rtlsim, otherwise they will be replaced by rtl/hls implementations.
+    rtlsim_use_vivado_comps: Optional[bool] = True
+
     def _resolve_hls_clk_period(self):
         if self.hls_clk_period_ns is None:
             # use same clk for synth and hls if not explicitly specified
diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index 59f77650da5c3c3f9db0ea65e2288544b376bec3..2ee898bc7d50822f962b6a70cf86b2893e0937b7 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -29,6 +29,8 @@
 import json
 import numpy as np
 import os
+import shutil
+import warnings
 from copy import deepcopy
 from distutils.dir_util import copy_tree
 from qonnx.core.modelwrapper import ModelWrapper
@@ -78,6 +80,10 @@ from finn.transformation.fpgadataflow.create_dataflow_partition import (
     CreateDataflowPartition,
 )
 from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.derive_characteristic import (
+    DeriveCharacteristic,
+    DeriveFIFOSizes,
+)
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
 from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
@@ -85,6 +91,7 @@ from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
 from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
     ReplaceVerilogRelPaths,
 )
@@ -92,6 +99,7 @@ from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.fpgadataflow.set_fifo_depths import (
     InsertAndSetFIFODepths,
     RemoveShallowFIFOs,
+    SplitLargeFIFOs,
 )
 from finn.transformation.fpgadataflow.set_folding import SetFolding
 from finn.transformation.fpgadataflow.synth_ooc import SynthOutOfContext
@@ -107,6 +115,7 @@ from finn.util.basic import (
     get_rtlsim_trace_depth,
     pyverilate_get_liveness_threshold_cycles,
 )
+from finn.util.pyverilator import verilator_fifosim
 from finn.util.test import execute_parent
 
 
@@ -121,81 +130,126 @@ def verify_step(
     verify_out_dir = cfg.output_dir + "/verification_output"
     intermediate_models_dir = cfg.output_dir + "/intermediate_models"
     os.makedirs(verify_out_dir, exist_ok=True)
-    (in_npy, exp_out_npy) = cfg._resolve_verification_io_pair()
-    if need_parent:
-        assert (
-            cfg.save_intermediate_models
-        ), "Enable save_intermediate_models for verification"
-        parent_model_fn = intermediate_models_dir + "/dataflow_parent.onnx"
-        child_model_fn = intermediate_models_dir + "/verify_%s.onnx" % step_name
-        model.save(child_model_fn)
-        out_tensor_name = ModelWrapper(parent_model_fn).graph.output[0].name
-        out_dict = execute_parent(
-            parent_model_fn, child_model_fn, in_npy, return_full_ctx=True
-        )
-        out_npy = out_dict[out_tensor_name]
-    else:
-        inp_tensor_name = model.graph.input[0].name
-        out_tensor_name = model.graph.output[0].name
-        inp_dict = {inp_tensor_name: in_npy}
-        if rtlsim_pre_hook is not None:
-            out_dict = rtlsim_exec(model, inp_dict, pre_hook=rtlsim_pre_hook)
+    (in_npy_all, exp_out_npy_all) = cfg._resolve_verification_io_pair()
+    bsize_in = in_npy_all.shape[0]
+    bsize_out = exp_out_npy_all.shape[0]
+    assert bsize_in == bsize_out, "Batch sizes don't match for verification IO pair"
+    all_res = True
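+    # verify each sample in the batch individually; the overall result is the
+    # logical AND of all per-sample results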
+    for b in range(bsize_in):
+        in_npy = np.expand_dims(in_npy_all[b], axis=0)
+        exp_out_npy = np.expand_dims(exp_out_npy_all[b], axis=0)
+        if need_parent:
+            assert (
+                cfg.save_intermediate_models
+            ), "Enable save_intermediate_models for verification"
+            parent_model_fn = intermediate_models_dir + "/dataflow_parent.onnx"
+            child_model_fn = intermediate_models_dir + "/verify_%s.onnx" % step_name
+            model.save(child_model_fn)
+            parent_model = ModelWrapper(parent_model_fn)
+            out_tensor_name = parent_model.graph.output[0].name
+            exp_ishape = parent_model.get_tensor_shape(parent_model.graph.input[0].name)
+            if in_npy.shape != exp_ishape:
+                print(
+                    "Verification input has shape %s while model expects %s"
+                    % (str(in_npy.shape), str(exp_ishape))
+                )
+                print("Attempting to force model shape on verification input")
+                in_npy = in_npy.reshape(exp_ishape)
+            out_dict = execute_parent(
+                parent_model_fn, child_model_fn, in_npy, return_full_ctx=True
+            )
+            out_npy = out_dict[out_tensor_name]
         else:
-            out_dict = execute_onnx(model, inp_dict, True)
-        out_npy = out_dict[out_tensor_name]
-    res = np.isclose(exp_out_npy, out_npy, atol=1e-3).all()
-    res_to_str = {True: "SUCCESS", False: "FAIL"}
-    res_str = res_to_str[res]
-    if cfg.verify_save_full_context:
-        verification_output_fn = verify_out_dir + "/verify_%s_%s.npz" % (
-            step_name,
-            res_str,
-        )
-        np.savez(verification_output_fn, **out_dict)
-    else:
-        verification_output_fn = verify_out_dir + "/verify_%s_%s.npy" % (
-            step_name,
-            res_str,
-        )
-        np.save(verification_output_fn, out_npy)
-    print("Verification for %s : %s" % (step_name, res_str))
+            inp_tensor_name = model.graph.input[0].name
+            out_tensor_name = model.graph.output[0].name
+            exp_ishape = model.get_tensor_shape(inp_tensor_name)
+            if in_npy.shape != exp_ishape:
+                print(
+                    "Verification input has shape %s while model expects %s"
+                    % (str(in_npy.shape), str(exp_ishape))
+                )
+                print("Attempting to force model shape on verification input")
+                in_npy = in_npy.reshape(exp_ishape)
+            inp_dict = {inp_tensor_name: in_npy}
+            if rtlsim_pre_hook is not None:
+                out_dict = rtlsim_exec(model, inp_dict, pre_hook=rtlsim_pre_hook)
+            else:
+                out_dict = execute_onnx(model, inp_dict, True)
+            out_npy = out_dict[out_tensor_name]
+        exp_oshape = exp_out_npy.shape
+        if out_npy.shape != exp_oshape:
+            print(
+                "Verification output has shape %s while model produces %s"
+                % (str(exp_oshape), str(out_npy.shape))
+            )
+            print("Attempting to force model shape on verification output")
+            out_npy = out_npy.reshape(exp_oshape)
+
+        res = np.isclose(exp_out_npy, out_npy, atol=1e-3).all()
+        all_res = all_res and res
+        res_to_str = {True: "SUCCESS", False: "FAIL"}
+        res_str = res_to_str[res]
+        if cfg.verify_save_full_context:
+            verification_output_fn = verify_out_dir + "/verify_%s_%d_%s.npz" % (
+                step_name,
+                b,
+                res_str,
+            )
+            np.savez(verification_output_fn, **out_dict)
+        else:
+            verification_output_fn = verify_out_dir + "/verify_%s_%d_%s.npy" % (
+                step_name,
+                b,
+                res_str,
+            )
+            np.save(verification_output_fn, out_npy)
+        if cfg.verify_save_rtlsim_waveforms:
+            vcd_path = model.get_metadata_prop("rtlsim_trace")
+            if vcd_path is not None and os.path.isfile(vcd_path):
+                new_vcd_path = vcd_path.replace(".vcd", "_%d.vcd" % b)
+                shutil.move(vcd_path, new_vcd_path)
+    print("Verification for %s : %s" % (step_name, res_to_str[all_res]))
 
 
 def prepare_for_stitched_ip_rtlsim(verify_model, cfg):
-    need_restitch = False
-    # rtlsim only supports certain impl_style for some nodes
-    # StreamingFIFO must have impl_style=rtl
-    for fifo_layer in verify_model.get_nodes_by_op_type("StreamingFIFO"):
-        inst = getCustomOp(fifo_layer)
-        if inst.get_nodeattr("impl_style") != "rtl":
-            inst.set_nodeattr("impl_style", "rtl")
-            inst.set_nodeattr("code_gen_dir_ipgen", "")
-            inst.set_nodeattr("ipgen_path", "")
-            need_restitch = True
-    # StreamingDataWidthConverter must have impl_style=hls
-    for dwc_layer in verify_model.get_nodes_by_op_type(
-        "StreamingDataWidthConverter_Batch"
-    ):
-        inst = getCustomOp(dwc_layer)
-        if inst.get_nodeattr("impl_style") != "hls":
-            inst.set_nodeattr("impl_style", "hls")
-            inst.set_nodeattr("code_gen_dir_ipgen", "")
-            inst.set_nodeattr("ipgen_path", "")
-            need_restitch = True
-    # if we've made alterations to the model, need to do some re-prep
-    if need_restitch:
-        print("Need to regen/re-stitch some IP for STITCHED_IP_RTLSIM")
-        verify_model = verify_model.transform(
-            PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())
-        )
-        verify_model = verify_model.transform(HLSSynthIP())
-        verify_model = verify_model.transform(
-            CreateStitchedIP(
-                cfg._resolve_fpga_part(),
-                cfg.synth_clk_period_ns,
-                vitis=False,
+    if not cfg.rtlsim_use_vivado_comps:
+        need_restitch = False
+        # switch impl_style=vivado components to rtl/hls
+        # StreamingFIFO must have impl_style=rtl
+        for fifo_layer in verify_model.get_nodes_by_op_type("StreamingFIFO"):
+            inst = getCustomOp(fifo_layer)
+            if inst.get_nodeattr("impl_style") != "rtl":
+                inst.set_nodeattr("impl_style", "rtl")
+                inst.set_nodeattr("code_gen_dir_ipgen", "")
+                inst.set_nodeattr("ipgen_path", "")
+                need_restitch = True
+        # StreamingDataWidthConverter must have impl_style=hls
+        for dwc_layer in verify_model.get_nodes_by_op_type(
+            "StreamingDataWidthConverter_Batch"
+        ):
+            inst = getCustomOp(dwc_layer)
+            if inst.get_nodeattr("impl_style") != "hls":
+                inst.set_nodeattr("impl_style", "hls")
+                inst.set_nodeattr("code_gen_dir_ipgen", "")
+                inst.set_nodeattr("ipgen_path", "")
+                need_restitch = True
+        # if we've made alterations to the model, need to do some re-prep
+        if need_restitch:
+            print("Need to regen/re-stitch some IP for STITCHED_IP_RTLSIM")
+            verify_model = verify_model.transform(
+                PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())
             )
-        )
+            verify_model = verify_model.transform(HLSSynthIP())
+            verify_model = verify_model.transform(
+                CreateStitchedIP(
+                    cfg._resolve_fpga_part(),
+                    cfg.synth_clk_period_ns,
+                    vitis=False,
+                )
+            )
+    else:
+        print("rtlsim_use_vivado_comps is enabled, may yield incorrect results")
+
     # set top-level prop for stitched-ip rtlsim and launch
     verify_model.set_metadata_prop("exec_mode", "rtlsim")
     # TODO make configurable
@@ -302,7 +356,10 @@ def step_convert_to_hls(model: ModelWrapper, cfg: DataflowBuildConfig):
     # needed for convolutions -- TODO always exec?
     need_conv = len(model.get_nodes_by_op_type("Im2Col")) > 0
     if need_conv:
-        model = model.transform(to_hls.InferConvInpGen())
+        if cfg.force_rtl_conv_inp_gen:
+            model = model.transform(to_hls.InferConvInpGen(use_rtl_variant=True))
+        else:
+            model = model.transform(to_hls.InferConvInpGen())
         model = model.transform(to_hls.InferStreamingMaxPool())
         model = model.transform(RemoveCNVtoFCFlatten())
     # get rid of Tranpose -> Tranpose identity seq
@@ -446,9 +503,9 @@ def step_hls_ipgen(model: ModelWrapper, cfg: DataflowBuildConfig):
 def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
     """
     Depending on the auto_fifo_depths setting, do one of the following:
-    * if auto_fifo_depths=True:  Run the `InsertAndSetFIFODepths` transformation
-    to attempt to determine the FIFO sizes that provide full throughput. Involves
-    running stitched-IP rtlsim and may take a long time.
+    * if auto_fifo_depths=True:  Run the appropriate auto-sizing transformation
+    to attempt to determine the FIFO sizes that provide full throughput.
+    May take a long time.
     * if auto_fifo_depths=False:  Assume the folding config file contains FIFO
     sizes as well. Runs the `InsertFIFO` transformation, then
     `ApplyConfig(cfg.folding_config_file)`, and finally `RemoveShallowFIFOs`.
@@ -457,13 +514,48 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
     """
 
     if cfg.auto_fifo_depths:
-        model = model.transform(
-            InsertAndSetFIFODepths(
-                cfg._resolve_fpga_part(),
-                cfg._resolve_hls_clk_period(),
-                vivado_ram_style=cfg.large_fifo_mem_style,
+        if cfg.auto_fifo_strategy == "characterize":
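+            # analytical FIFO sizing: prepare IP and rtlsim models for each node,
+            # derive per-node I/O characteristics over one period, then compute
+            # FIFO depths from them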
+            model = model.transform(InsertDWC())
+            model = model.transform(GiveUniqueNodeNames())
+            model = model.transform(
+                PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())
             )
-        )
+            model = model.transform(HLSSynthIP())
+            model = model.transform(PrepareRTLSim())
+            model = model.transform(AnnotateCycles())
+            period = model.analysis(dataflow_performance)["max_cycles"] + 10
+            model = model.transform(DeriveCharacteristic(period))
+            model = model.transform(DeriveFIFOSizes())
+            model = model.transform(
+                InsertFIFO(
+                    vivado_ram_style=cfg.large_fifo_mem_style,
+                    max_qsrl_depth=256,
+                    create_shallow_fifos=True,
+                )
+            )
+            model = model.transform(GiveUniqueNodeNames())
+            model = model.transform(GiveReadableTensorNames())
+        elif cfg.auto_fifo_strategy == "largefifo_rtlsim":
+            # multi-in/out streams currently not supported in our C++ verilator driver
+            model_multi_io = len(model.graph.input) > 1 or len(model.graph.output) > 1
+            force_python_sim = model_multi_io or cfg.force_python_rtlsim
+            if model_multi_io:
+                warnings.warn(
+                    "Multi-in/out streams currently not supported "
+                    + "in FINN C++ verilator driver, falling back to Python"
+                )
+            model = model.transform(
+                InsertAndSetFIFODepths(
+                    cfg._resolve_fpga_part(),
+                    cfg._resolve_hls_clk_period(),
+                    vivado_ram_style=cfg.large_fifo_mem_style,
+                    force_python_sim=force_python_sim,
+                )
+            )
+            # InsertAndSetFIFODepths internally removes any shallow FIFOs
+            # so no need to call RemoveShallowFIFOs here
+        else:
+            assert "Unsupported auto_fifo_strategy: " + cfg.auto_fifo_strategy
     else:
         # assume folding cfg json contains FIFO sizes too
         # insert DWCs, FIFOs and run ApplyConfig once more
@@ -475,8 +567,6 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
         model = model.transform(GiveReadableTensorNames())
         if cfg.folding_config_file is not None:
             model = model.transform(ApplyConfig(cfg.folding_config_file))
-        # remove any shallow FIFOs
-        model = model.transform(RemoveShallowFIFOs())
 
     # extract the final configuration and save it as json
     hw_attrs = [
@@ -488,11 +578,20 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
         "resType",
         "mem_mode",
         "runtime_writeable_weights",
+        "inFIFODepths",
+        "outFIFODepths",
     ]
     extract_model_config_to_json(
         model, cfg.output_dir + "/final_hw_config.json", hw_attrs
     )
 
+    # perform FIFO splitting and shallow FIFO removal only after the final config
+    # json file has been written. otherwise, since these transforms may add/remove
+    # FIFOs, we get name mismatch problems when trying to reuse the final config.
+    if cfg.split_large_fifos:
+        model = model.transform(SplitLargeFIFOs())
+    model = model.transform(RemoveShallowFIFOs())
+
     # after FIFOs are ready to go, call PrepareIP and HLSSynthIP again
     # this will only run for the new nodes (e.g. FIFOs and DWCs)
     model = model.transform(
@@ -556,20 +655,48 @@ def step_measure_rtlsim_performance(model: ModelWrapper, cfg: DataflowBuildConfi
         # prepare ip-stitched rtlsim
         rtlsim_model = deepcopy(model)
         rtlsim_model = prepare_for_stitched_ip_rtlsim(rtlsim_model, cfg)
-        # run with single input to get latency
-        orig_rtlsim_trace_depth = get_rtlsim_trace_depth()
+        # multi-in/out streams currently not supported in our C++ verilator driver
+        model_multi_io = (
+            len(rtlsim_model.graph.input) > 1 or len(rtlsim_model.graph.output) > 1
+        )
+        force_python_rtlsim = cfg.force_python_rtlsim or model_multi_io
+        if model_multi_io:
+            warnings.warn(
+                "Multi-in/out streams currently not supported "
+                + "in FINN C++ verilator driver, falling back to Python"
+            )
         rtlsim_bs = int(cfg.rtlsim_batch_size)
-        assert rtlsim_bs > 0, "rtlsim batch size must be >0"
-        if cfg.verify_save_rtlsim_waveforms:
-            # set depth to 3 for layer-by-layer visibility
-            os.environ["RTLSIM_TRACE_DEPTH"] = "3"
+        orig_rtlsim_trace_depth = get_rtlsim_trace_depth()
+        if force_python_rtlsim:
+            # run with single input to get latency
+            assert rtlsim_bs > 0, "rtlsim batch size must be >0"
+            if cfg.verify_save_rtlsim_waveforms:
+                # set depth to 3 for layer-by-layer visibility
+                os.environ["RTLSIM_TRACE_DEPTH"] = "3"
+                rtlsim_model.set_metadata_prop(
+                    "rtlsim_trace",
+                    "%s/rtlsim_perf_batch_%d.vcd" % (report_dir, rtlsim_bs),
+                )
             rtlsim_model.set_metadata_prop(
-                "rtlsim_trace", "%s/rtlsim_perf_batch_%d.vcd" % (report_dir, rtlsim_bs)
+                "extra_verilator_args", str(["-CFLAGS", "-O3"])
             )
-        rtlsim_model.set_metadata_prop("extra_verilator_args", str(["-CFLAGS", "-O3"]))
-        rtlsim_perf_dict = throughput_test_rtlsim(rtlsim_model, rtlsim_bs)
-        rtlsim_latency = rtlsim_perf_dict["cycles"]
-        rtlsim_perf_dict["latency_cycles"] = rtlsim_latency
+            rtlsim_perf_dict = throughput_test_rtlsim(rtlsim_model, rtlsim_bs)
+            rtlsim_latency = rtlsim_perf_dict["cycles"]
+            rtlsim_perf_dict["latency_cycles"] = rtlsim_latency
+        else:
+            rtlsim_perf_dict = verilator_fifosim(model, rtlsim_bs)
+            # keep keys consistent between the Python and C++-styles
+            cycles = rtlsim_perf_dict["cycles"]
+            clk_ns = float(model.get_metadata_prop("clk_ns"))
+            fclk_mhz = 1 / (clk_ns * 0.001)
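+            # cycles * clock period (in ns) gives the runtime; scale to seconds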
+            runtime_s = (cycles * clk_ns) * (10**-9)
+            rtlsim_perf_dict["runtime[ms]"] = runtime_s * 1000
+            rtlsim_perf_dict["throughput[images/s]"] = rtlsim_bs / runtime_s
+            rtlsim_perf_dict["fclk[mhz]"] = fclk_mhz
+            # iterate over a copy of the keys: deleting from a dict while
+            # iterating it directly raises a RuntimeError
+            for key in list(rtlsim_perf_dict.keys()):
+                if "max_count" in key:
+                    del rtlsim_perf_dict[key]
+
         with open(report_dir + "/rtlsim_performance.json", "w") as f:
             json.dump(rtlsim_perf_dict, f, indent=2)
         if cfg.verify_save_rtlsim_waveforms:
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index 2c7c86c64ea1279cb18cf8342aa20fb2792bdaf5..56d4230a3af3057daaa5c47140fcde1590dee686 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -36,9 +36,14 @@ from finn.custom_op.fpgadataflow.convolutioninputgenerator import (
 from finn.custom_op.fpgadataflow.convolutioninputgenerator1d import (
     ConvolutionInputGenerator1D,
 )
+from finn.custom_op.fpgadataflow.convolutioninputgenerator_rtl import (
+    ConvolutionInputGenerator_rtl,
+)
 from finn.custom_op.fpgadataflow.downsampler import DownSampler
 from finn.custom_op.fpgadataflow.duplicatestreams_batch import DuplicateStreams_Batch
+from finn.custom_op.fpgadataflow.eltwise import StreamingEltwise
 from finn.custom_op.fpgadataflow.fmpadding_batch import FMPadding_Batch
+from finn.custom_op.fpgadataflow.fmpadding_rtl import FMPadding_rtl
 from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch
 from finn.custom_op.fpgadataflow.iodma import IODMA
 from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch
@@ -67,6 +72,7 @@ custom_op["StreamingMaxPool_Batch"] = StreamingMaxPool_Batch
 custom_op["MatrixVectorActivation"] = MatrixVectorActivation
 custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator
 custom_op["ConvolutionInputGenerator1D"] = ConvolutionInputGenerator1D
+custom_op["ConvolutionInputGenerator_rtl"] = ConvolutionInputGenerator_rtl
 custom_op["TLastMarker"] = TLastMarker
 custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch
 custom_op["StreamingFIFO"] = StreamingFIFO
@@ -85,3 +91,5 @@ custom_op["UpsampleNearestNeighbour_Batch"] = UpsampleNearestNeighbour_Batch
 custom_op["Lookup"] = Lookup
 custom_op["StreamingConcat"] = StreamingConcat
 custom_op["CheckSum"] = CheckSum
+custom_op["StreamingEltwise"] = StreamingEltwise
+custom_op["FMPadding_rtl"] = FMPadding_rtl
diff --git a/src/finn/custom_op/fpgadataflow/addstreams_batch.py b/src/finn/custom_op/fpgadataflow/addstreams_batch.py
index 13a4c5892c8f82c37e1794057a06217981a6a580..cd0af6b3ab3d8250abbf7d48e004622e55f09f04 100644
--- a/src/finn/custom_op/fpgadataflow/addstreams_batch.py
+++ b/src/finn/custom_op/fpgadataflow/addstreams_batch.py
@@ -42,18 +42,21 @@ class AddStreams_Batch(HLSCustomOp):
         super().__init__(onnx_node)
 
     def get_nodeattr_types(self):
-        my_attrs = {
-            "NumChannels": ("i", True, ""),
-            "PE": ("i", True, ""),
-            # FINN DataTypes for inputs; output datatype inferred from input
-            "inputDataType": ("s", True, ""),
-            # number of input vectors, examples:
-            # [1] is a single vector (like a FC layer with batch=1)
-            # [4] is four vectors (like a FC layer with batch=4)
-            # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
-            "numInputVectors": ("ints", False, [1]),
-        }
-        my_attrs.update(super().get_nodeattr_types())
+        my_attrs = super().get_nodeattr_types()
+        my_attrs.update(
+            {
+                "NumChannels": ("i", True, ""),
+                "PE": ("i", True, ""),
+                # FINN DataTypes for inputs; output datatype inferred from input
+                "inputDataType": ("s", True, ""),
+                # number of input vectors, examples:
+                # [1] is a single vector (like a FC layer with batch=1)
+                # [4] is four vectors (like a FC layer with batch=4)
+                # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
+                "numInputVectors": ("ints", False, [1]),
+                "inFIFODepths": ("ints", False, [2, 2]),
+            }
+        )
         return my_attrs
 
     def get_normal_input_shape(self, ind=0):
@@ -70,10 +73,10 @@ class AddStreams_Batch(HLSCustomOp):
         ishape = tuple(vecs + [ich // pe, pe])
         return ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         return self.get_normal_input_shape()
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         return self.get_folded_input_shape()
 
     def make_shape_compatible_op(self, model):
@@ -124,11 +127,11 @@ class AddStreams_Batch(HLSCustomOp):
 
         return info_messages
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         # we need to set output datatype to the next larger int or uint
         # enhancement: consider specifying w/ explicit outputDataType attribute
@@ -139,14 +142,14 @@ class AddStreams_Batch(HLSCustomOp):
         else:
             return DataType.get_smallest_possible(2 * idt.max())
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         """Returns input stream width."""
         ibits = self.get_input_datatype().bitwidth()
         pe = self.get_nodeattr("PE")
         in_width = pe * ibits
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         """Returns output stream width."""
         obits = self.get_output_datatype().bitwidth()
         pe = self.get_nodeattr("PE")
@@ -357,3 +360,14 @@ class AddStreams_Batch(HLSCustomOp):
         swidth = self.get_instream_width_padded()
         intf_names["s_axis"] = [(x + "_" + sname, swidth) for x in ["in0", "in1"]]
         return intf_names
+
+    def derive_characteristic_fxns(self, period):
+        n_inps = np.prod(self.get_folded_input_shape()[:-1])
+        io_dict = {
+            "inputs": {
+                "in0": [0 for i in range(n_inps)],
+                "in1": [0 for i in range(n_inps)],
+            },
+            "outputs": {"out": []},
+        }
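+        # all-zero dummy inputs suffice here: deriving the characteristic
+        # function only needs the stream timing behavior, not the data values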
+        super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
diff --git a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
index 3ed76db2982e411b711be5bd78e39dd866332714..46adca680d3c96695eeb5a91be53ea158fc78f1f 100644
--- a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
+++ b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
@@ -102,9 +102,6 @@ class ChannelwiseOp_Batch(HLSCustomOp):
             "inputDataType": ("s", True, ""),
             "paramDataType": ("s", True, ""),
             "outputDataType": ("s", True, ""),
-            # input and output FIFO depths
-            "inFIFODepth": ("i", False, 0),
-            "outFIFODepth": ("i", False, 0),
             # number of input vectors, examples:
             # [1] is a single vector (like a FC layer with batch=1)
             # [4] is four vectors (like a FC layer with batch=4)
@@ -221,23 +218,23 @@ class ChannelwiseOp_Batch(HLSCustomOp):
         # total cost
         return comparator_cost + lutram_cost
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("outputDataType")]
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         i_bits = self.get_input_datatype().bitwidth()
         return i_bits * self.get_nodeattr("PE")
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         o_bits = self.get_output_datatype().bitwidth()
         return o_bits * self.get_nodeattr("PE")
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         ich = self.get_nodeattr("NumChannels")
         pe = self.get_nodeattr("PE")
         fold = ich // pe
@@ -245,17 +242,17 @@ class ChannelwiseOp_Batch(HLSCustomOp):
         folded_input_shape = tuple(vecs + [fold, pe])
         return folded_input_shape
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         # same shape as input
         return self.get_folded_input_shape()
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         ich = self.get_nodeattr("NumChannels")
         vecs = list(self.get_nodeattr("numInputVectors"))
         normal_input_shape = tuple(vecs + [ich])
         return normal_input_shape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         # same shape as input
         return self.get_normal_input_shape()
 
diff --git a/src/finn/custom_op/fpgadataflow/checksum.py b/src/finn/custom_op/fpgadataflow/checksum.py
index bde285eb0dd1b3818926c1feb7ac8d5de69a4be6..c927c07df21faf40ccbf9ddbe47e3f2f2ca61c89 100644
--- a/src/finn/custom_op/fpgadataflow/checksum.py
+++ b/src/finn/custom_op/fpgadataflow/checksum.py
@@ -77,31 +77,31 @@ class CheckSum(HLSCustomOp):
     def verify_node(self):
         pass
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         # here same as input data type
         return DataType[self.get_nodeattr("inputDataType")]
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         dtype = DataType[self.get_nodeattr("inputDataType")]
         folded_shape = self.get_nodeattr("folded_shape")
         in_width = folded_shape[-1] * dtype.bitwidth()
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         return self.get_instream_width()
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         return self.get_nodeattr("folded_shape")
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         return self.get_nodeattr("folded_shape")
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         # derive normal shape from folded shape
         # checksum nodes are inserted in between fpgadataflow nodes
         # the folded shape could be for example (1, nf, pe)
@@ -127,7 +127,7 @@ class CheckSum(HLSCustomOp):
     def get_ap_int_max_w(self):
         return max(super().get_ap_int_max_w(), 32)
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         # same shape as input
         return self.get_normal_input_shape()
 
diff --git a/src/finn/custom_op/fpgadataflow/concat.py b/src/finn/custom_op/fpgadataflow/concat.py
index 5fcf9cf96cbacd4e444af0b90618a19eefb9bfe2..4437bcd1984c5194b0a19b43d692babb7e3cd158 100644
--- a/src/finn/custom_op/fpgadataflow/concat.py
+++ b/src/finn/custom_op/fpgadataflow/concat.py
@@ -74,12 +74,12 @@ class StreamingConcat(HLSCustomOp):
     def get_folded_input_shape(self, ind=0):
         return self.get_normal_input_shape(ind)
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         total_elems = self.get_total_elems()
         vecs = list(self.get_nodeattr("numInputVectors"))
         return tuple(vecs + [total_elems])
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         return self.get_normal_output_shape()
 
     def make_shape_compatible_op(self, model):
@@ -106,7 +106,7 @@ class StreamingConcat(HLSCustomOp):
         # input dt identical for all inputs
         return DataType[self.get_nodeattr("inputDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         return self.get_input_datatype()
 
     def get_instream_width(self, ind=0):
@@ -115,7 +115,7 @@ class StreamingConcat(HLSCustomOp):
         ibits = self.get_input_datatype().bitwidth()
         return elems * ibits
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         obits = self.get_output_datatype().bitwidth()
         total_elems = self.get_total_elems()
         out_width = total_elems * obits
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
index 251a9882c58a3cf94449701795b72c8a6adab318..1566445999a2c568b5c5a112d436bf05fd89aca5 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
@@ -99,13 +99,13 @@ class ConvolutionInputGenerator(HLSCustomOp):
             assert ret[0] == ret[1] == 1, "Only dilation=1 supported"
         return ret
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
         ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch)
         return ishape
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
         simd = self.get_nodeattr("SIMD")
@@ -114,7 +114,7 @@ class ConvolutionInputGenerator(HLSCustomOp):
         folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd)
         return folded_ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         k_h, k_w = self.get_nodeattr("ConvKernelDim")
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
@@ -126,7 +126,7 @@ class ConvolutionInputGenerator(HLSCustomOp):
         oshape = (1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch)
         return oshape
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         k_h, k_w = self.get_nodeattr("ConvKernelDim")
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
@@ -158,15 +158,15 @@ class ConvolutionInputGenerator(HLSCustomOp):
     def verify_node(self):
         pass
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("outputDataType")]
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         """Returns stream width, input and output stream width are equal for
         the sliding window function"""
         ibits = self.get_input_datatype().bitwidth()
@@ -176,7 +176,7 @@ class ConvolutionInputGenerator(HLSCustomOp):
         in_width = simd * ibits
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         """Returns stream width, input and output stream width are equal for
         the sliding window function, so the function to determine the input
         stream width can be reused."""
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
index aba74baecc0f40571fa288459a04ad42e167ccf6..f1c84662cc06e89df5bd7c0762ac47b8c5723502 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
@@ -91,13 +91,13 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
         ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch)
         return ishape
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
         simd = self.get_nodeattr("SIMD")
@@ -106,7 +106,7 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd)
         return folded_ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         k_h, k_w = self.get_nodeattr("ConvKernelDim")
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
@@ -118,7 +118,7 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         oshape = (1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch)
         return oshape
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         k_h, k_w = self.get_nodeattr("ConvKernelDim")
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
         ifm_ch = self.get_nodeattr("IFMChannels")
@@ -153,15 +153,15 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
     def verify_node(self):
         pass
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("outputDataType")]
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         ibits = self.get_input_datatype().bitwidth()
         simd = self.get_nodeattr("SIMD")
         ifm_ch = self.get_nodeattr("IFMChannels")
@@ -169,7 +169,7 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         in_width = simd * ibits
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         if self.use_parallel_window_output():
             # feed all window pixels in parallel
             k_h, k_w = self.get_nodeattr("ConvKernelDim")
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py
new file mode 100755
index 0000000000000000000000000000000000000000..1afd23d3a1709a8929a03c21a6eba0a5a8cd6ba6
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py
@@ -0,0 +1,930 @@
+# Copyright (C) 2022, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import math
+import numpy as np
+import os
+from qonnx.core.datatype import DataType
+from qonnx.custom_op.general import im2col
+from qonnx.custom_op.general.im2col import compute_conv_output_dim
+
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+try:
+    from pyverilator import PyVerilator
+except ModuleNotFoundError:
+    PyVerilator = None
+
+# RTL Convolution Input Generator / Sliding Window Generator (SWG)
+# Matches and extends the functionality of all ConvolutionInputGenerator_* functions
+# in finn-hlslib by generating HDL code for two different implementation styles:
+# - Addressable cyclic buffer: to be used when out_width <= in_width
+# - Parallel registers + line buffers: to be used when out_width > in_width
+# Supports non-square, 1D, strided, dilated, and depthwise convolutions.
+# Note: the actual data layout produced is different for depthwise and non-depthwise:
+# * non-depthwise SWG: (1, OFMDim_H, OFMDim_W, K_H, K_W, IFMChannels/SIMD, SIMD)
+# * depthwise SWG: (1, OFMDim_H, OFMDim_W, IFMChannels/SIMD, K_H, K_W, SIMD)
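+# Illustrative example (values chosen for this comment only): with K = [3, 3],
+# IFMChannels = 4, SIMD = 2 and an 8x8 output FM, the folded output shape is
+# (1, 8, 8, 3*3*4//2, 2) = (1, 8, 8, 18, 2); the depthwise variant emits the
+# same number of elements but traverses channels before kernel positions.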
+
+# NOTE: "Parallel" implementation style not yet implemented in this version!
+
+
+class ConvolutionInputGenerator_rtl(HLSCustomOp):
+    """Class that does not correspond to one of the finn-hlslib ConvolutionInputGenerator
+    (sliding window) function variants. Generates an RTL ConvolutionInputGenerator
+    implementation based on (System-)Verilog templates, defined in finn-rtllib/swg."""
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            "ConvKernelDim": ("ints", True, []),  # [H, W] = [Y, X]
+            "IFMChannels": ("i", True, 0),
+            "IFMDim": ("ints", True, []),  # [H, W] = [Y, X]
+            "OFMDim": ("ints", True, []),  # [H, W] = [Y, X]
+            "SIMD": ("i", True, 0),
+            # additional parallelization parameter - not yet implemented
+            "M": ("i", False, 1),
+            # alternative implementation style - not yet implemented
+            "parallel_window": ("i", False, 0, {0}),
+            "Stride": ("ints", True, []),  # [H, W] = [Y, X]
+            "Dilation": ("ints", True, []),  # [H, W] = [Y, X]
+            # FINN DataTypes for inputs and outputs
+            "inputDataType": ("s", True, ""),
+            "outputDataType": ("s", True, ""),
+            "depthwise": ("i", False, 0, {0, 1}),
+            # Enable reprogrammable implementation to change FM dimensions,
+            # stride, or dilation during runtime
+            "dynamic_mode": ("i", False, 0, {0, 1}),
+            # FPGA resource type for ConvolutionInputGenerator input buffer
+            # auto -- let Vivado decide
+            # block -- use BRAM
+            # distributed -- use LUTRAM
+            # ultra -- use URAM
+            "ram_style": (
+                "s",
+                False,
+                "auto",
+                {"auto", "block", "distributed", "ultra"},
+            ),
+            # attribute to save top module name - not user configurable
+            "gen_top_module": ("s", False, ""),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_normal_input_shape(self, ind=0):
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch)
+        return ishape
+
+    def get_folded_input_shape(self, ind=0):
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        simd = self.get_nodeattr("SIMD")
+        assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
+        wf = int(ifm_ch / simd)
+        folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd)
+        return folded_ishape
+
+    def get_normal_output_shape(self, ind=0):
+        k_h, k_w = self.get_nodeattr("ConvKernelDim")
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        stride_h, stride_w = self.get_nodeattr("Stride")
+        dilation_h, dilation_w = self.get_nodeattr("Dilation")
+        pad = 0
+        ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h)
+        ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w)
+        oshape = (1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch)
+        return oshape
+
+    def get_folded_output_shape(self, ind=0):
+        k_h, k_w = self.get_nodeattr("ConvKernelDim")
+        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        stride_h, stride_w = self.get_nodeattr("Stride")
+        dilation_h, dilation_w = self.get_nodeattr("Dilation")
+        simd = self.get_nodeattr("SIMD")
+        pad = 0
+        ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h)
+        ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w)
+        assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
+        if self.get_nodeattr("parallel_window"):
+            wf = int(ifm_ch // simd)
+            folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd)
+        else:
+            wf = int((k_h * k_w * ifm_ch) // simd)
+            folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd)
+        return folded_oshape
+
+    def make_shape_compatible_op(self, model):
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == exp_ishape, "Unexpect input shape for ConvInpGen."
+        return super().make_const_shape_op(oshape)
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        # data type stays the same
+        dtype = model.get_tensor_datatype(node.input[0])
+        model.set_tensor_datatype(node.output[0], dtype)
+
+    def verify_node(self):
+        pass
+
+    def get_input_datatype(self, ind=0):
+        """Returns FINN DataType of input."""
+        return DataType[self.get_nodeattr("inputDataType")]
+
+    def get_output_datatype(self, ind=0):
+        """Returns FINN DataType of output."""
+        return DataType[self.get_nodeattr("outputDataType")]
+
+    def get_instream_width(self, ind=0):
+        ibits = self.get_input_datatype().bitwidth()
+        simd = self.get_nodeattr("SIMD")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
+        in_width = simd * ibits
+        return in_width
+
+    def get_outstream_width(self, ind=0):
+        if self.get_nodeattr("parallel_window"):
+            # feed all window pixels in parallel
+            k_h, k_w = self.get_nodeattr("ConvKernelDim")
+            return self.get_instream_width() * k_h * k_w
+        else:
+            # if parallel variant not in use: same width for output and input stream
+            return self.get_instream_width()
+
+    def get_number_input_values(self):
+        folded_ishape = self.get_folded_input_shape()
+        num_input_elems = np.prod(folded_ishape[:-1])
+        return num_input_elems
+
+    def get_number_output_values(self):
+        folded_oshape = self.get_folded_output_shape()
+        num_output_elems = np.prod(folded_oshape[:-1])
+        return num_output_elems
+
+    def get_1d_conv_attrs_normalized(self):
+        # normalize FM dimensions so that:
+        # [H, W] = [Y, X] = [1, D] or [D, 1] are always mapped to [1, D].
+        # The dummy ('1') dimension is the Y-dimension.
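+        # e.g. (illustrative): IFMDim=[128, 1] with k=[16, 1] is flipped to
+        # IFMDim=[1, 128] and k=[1, 16]; [1, D] configurations pass unchanged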
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        k = self.get_nodeattr("ConvKernelDim")
+        ifm_dim = self.get_nodeattr("IFMDim")
+        ofm_dim = self.get_nodeattr("OFMDim")
+        stride = self.get_nodeattr("Stride")
+        dilation = self.get_nodeattr("Dilation")
+
+        if ifm_dim[1] == 1:
+            ifm_dim = ifm_dim[::-1]
+            ofm_dim = ofm_dim[::-1]
+            k = k[::-1]
+            stride = stride[::-1]
+            dilation = dilation[::-1]
+
+        return (ifm_ch, ifm_dim, ofm_dim, k, stride, dilation)
+
+    def get_buffer_depth(self):
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        k = self.get_nodeattr("ConvKernelDim")
+        ifm_dim = self.get_nodeattr("IFMDim")
+        stride = self.get_nodeattr("Stride")
+        dilation = self.get_nodeattr("Dilation")
+        simd = self.get_nodeattr("SIMD")
+
+        k_h, k_w = k
+        h, w = ifm_dim
+        stride_h, stride_w = stride
+        dilation_h, dilation_w = dilation
+        mmv_in = 1
+        mmv_out = 1
+        channel_factor = int(ifm_ch / simd)
+
+        impl_style = self.select_impl_style()
+        if impl_style == "default":
+            # compute minimal buffer length (assuming it holds 1 complete window)
+            buffer_min_size = (
+                (k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + 1
+            ) * channel_factor
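+            # e.g. (illustrative): k=3x3, dilation=1, w=32 and channel_factor=2
+            # give buffer_min_size = (2*32 + 2 + 1) * 2 = 134 elements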
+
+            # add additional buffer space in case of stride > 1
+            # this minimizes cycle count as it allows an earlier pre-load of inputs
+            buffer_depth = (
+                buffer_min_size
+                + max(
+                    0,
+                    ((stride_w - 1) - (int(mmv_out * k_h * k_w / mmv_in)))
+                    * channel_factor,
+                )
+                + max(
+                    0,
+                    ((stride_h - 1) * w - (int(mmv_out * k_h * k_w / mmv_in)))
+                    * channel_factor,
+                )
+            )
+        else:
+            buffer_depth = 0
+            raise Exception("Requested impl. style not implemented")
+        return buffer_depth
+
+    def get_exp_cycles(self):
+        simd = self.get_nodeattr("SIMD")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        k = self.get_nodeattr("ConvKernelDim")
+        ifm_dim = self.get_nodeattr("IFMDim")
+        ofm_dim = self.get_nodeattr("OFMDim")
+        stride = self.get_nodeattr("Stride")
+        dilation = self.get_nodeattr("Dilation")
+        depthwise = self.get_nodeattr("depthwise")
+        ifm_dim_h, ifm_dim_w = ifm_dim
+        ofm_dim_h, ofm_dim_w = ofm_dim
+        k_h, k_w = k
+        stride_h, stride_w = stride
+        dilation_h, dilation_w = dilation
+
+        channel_factor = int(ifm_ch / simd)
+
+        if ifm_dim_h == 1 or ifm_dim_w == 1:
+            # 1D case
+            (
+                ifm_ch,
+                [ifm_dim_h, ifm_dim_w],
+                [ofm_dim_h, ofm_dim_w],
+                [k_h, k_w],
+                [stride_h, stride_w],
+                [dilation_h, dilation_w],
+            ) = self.get_1d_conv_attrs_normalized()
+
+            if depthwise:
+                exp_cycles = (
+                    +ofm_dim_w * k_w * channel_factor
+                    + channel_factor * (k_w - 1) * (stride_w - 1)
+                    - (k_w - 1)
+                    + 2
+                )
+            else:
+                exp_cycles = ofm_dim_w * k_w * channel_factor + 2
+        else:
+            # 2D case
+            buffer_min_size = (
+                (k_h - 1) * dilation_h * ifm_dim_w + (k_w - 1) * dilation_w + 1
+            ) * channel_factor
+            cycles_write_block = ofm_dim_w * k_w * k_h * channel_factor
+            cycles_read_block = stride_w * ifm_dim_w * channel_factor
+            max_cycles = max(cycles_write_block, cycles_read_block)
+            if depthwise:
+                max_cycles += ofm_dim_w * (stride_w - 1) * (channel_factor - 1)
+            exp_cycles = buffer_min_size + ofm_dim_h * max_cycles  # initial buffering
+            if depthwise:
+                exp_cycles += (stride_h - 1) * ifm_dim_w * channel_factor
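+            # e.g. (illustrative, non-depthwise): k=3x3, stride=1, w=32,
+            # ofm_dim=30x30, channel_factor=1 gives buffer_min_size = 67,
+            # cycles_write_block = 30*3*3 = 270 > cycles_read_block = 32, so
+            # exp_cycles = 67 + 30 * 270 = 8167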
+
+        return int(exp_cycles)
+
+    def bram_estimation(self):
+        simd = self.get_nodeattr("SIMD")
+        ram_style = self.get_nodeattr("ram_style")
+
+        # NOTE: Actual BRAM usage might be lower in some cases.
+        # This does not account for the exact Vivado behavior yet.
+        buffer_width = simd * self.get_input_datatype().bitwidth()
+        buffer_depth = self.get_buffer_depth()
+        if ram_style == "block" or ram_style == "auto":
+            if buffer_depth <= 512:
+                ram_width = 36
+            elif buffer_depth <= 1024:
+                ram_width = 18
+            elif buffer_depth <= 2048:
+                ram_width = 9
+            elif buffer_depth <= 4096:
+                ram_width = 4
+            elif buffer_depth <= 8192:
+                ram_width = 2
+            else:
+                ram_width = 1
+
+            ram_cascade_depth = math.ceil(buffer_depth / 16384)
+            ram_cascade_width = math.ceil(buffer_width / ram_width)
+            cascade_savings = 0
+            if buffer_depth > 16384:
+                remainder_depth = buffer_depth % 16384
+                if remainder_depth <= 512:
+                    remainder_width = 36
+                elif remainder_depth <= 1024:
+                    remainder_width = 18
+                elif remainder_depth <= 2048:
+                    remainder_width = 9
+                elif remainder_depth <= 4096:
+                    remainder_width = 4
+                elif remainder_depth <= 8192:
+                    remainder_width = 2
+                else:
+                    remainder_width = 1
+
+                remainder_cascade_width = math.ceil(buffer_width / remainder_width)
+                cascade_savings = ram_cascade_width - remainder_cascade_width
+
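+            # e.g. (illustrative): buffer_depth=600, buffer_width=16 selects
+            # ram_width=18, so ceil(600/16384) * ceil(16/18) = 1 BRAM is estimated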
+            return int(ram_cascade_depth * ram_cascade_width - cascade_savings)
+        else:
+            return 0
+
+    def lut_estimation(self):
+        simd = self.get_nodeattr("SIMD")
+        ram_style = self.get_nodeattr("ram_style")
+        buffer_width = simd * self.get_input_datatype().bitwidth()
+        buffer_depth = self.get_buffer_depth()
+        if ram_style == "distributed":
+            ram_luts = int(buffer_width * math.ceil(buffer_depth / 38))
+        else:
+            ram_luts = 0
+        return 300 + ram_luts
+
+    def uram_estimation(self):
+        simd = self.get_nodeattr("SIMD")
+        ram_style = self.get_nodeattr("ram_style")
+        buffer_width = simd * self.get_input_datatype().bitwidth()
+        buffer_depth = self.get_buffer_depth()
+
+        if ram_style == "ultra":
+            ram_depth = 4096
+            ram_width = 72
+            ram_cascade_depth = math.ceil(buffer_depth / ram_depth)
+            ram_cascade_width = math.ceil(buffer_width / ram_width)
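+            # e.g. (illustrative): buffer_depth=5000, buffer_width=80 yields
+            # ceil(5000/4096) * ceil(80/72) = 2 * 2 = 4 URAMs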
+            return int(ram_cascade_depth * ram_cascade_width)
+        else:
+            return 0
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        exp_ishape = self.get_normal_input_shape()
+        exp_oshape = self.get_normal_output_shape()
+        folded_ishape = self.get_folded_input_shape()
+
+        if mode == "cppsim":
+            raise Exception(
+                "cppsim not possible for RTL SWG, please set exec_mode to rtlsim"
+            )
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        inp = context[node.input[0]]
+        assert str(inp.dtype) == "float32", "Input datatype is not float32"
+        assert (
+            inp.shape == exp_ishape
+        ), """Input shape doesn't match expected shape (1, ifm_dim, ifm_dim, ifm_ch)."""
+        if self.get_input_datatype() == DataType["BIPOLAR"]:
+            # store bipolar activations as binary
+            inp = (inp + 1) / 2
+            export_idt = DataType["BINARY"]
+        else:
+            export_idt = self.get_input_datatype()
+
+        # reshape input into folded form
+        inp = inp.reshape(folded_ishape)
+        # make copy before saving array
+        reshaped_input = inp.copy()
+        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+
+        sim = self.get_rtlsim()
+        nbits = self.get_instream_width()
+        rtlsim_inp = npy_to_rtlsim_input(
+            "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+        )
+        super().reset_rtlsim(sim)
+        super().toggle_clk(sim)
+        rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+        odt = export_idt
+        target_bits = odt.bitwidth()
+        packed_bits = self.get_outstream_width()
+        out_npy_path = "{}/output.npy".format(code_gen_dir)
+        out_shape = self.get_folded_output_shape()
+        rtlsim_output_to_npy(
+            rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+        )
+        # load and reshape output
+        output = np.load(out_npy_path)
+        output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+        context[node.output[0]] = output
+
+        # binary -> bipolar if needed
+        if self.get_output_datatype() == DataType["BIPOLAR"]:
+            out = context[node.output[0]]
+            out = 2 * out - 1
+            context[node.output[0]] = out
+        assert (
+            context[node.output[0]].shape == exp_oshape
+        ), """Output
+        shape doesn't match expected shape (1, ofm_dim_h, ofm_dim_w, k_h*k_w*ifm_ch)."""
+
+    def prepare_codegen_default(self):
+        # Default implementation style for MMV_out = 1: addressable cyclic buffer
+        # Computes the incremental addressing scheme directly.
+        if self.get_nodeattr("dynamic_mode"):
+            template_select = "/finn-rtllib/swg/swg_template_default_dynamic.sv"
+        else:
+            template_select = "/finn-rtllib/swg/swg_template_default.sv"
+        template_path = os.environ["FINN_ROOT"] + template_select
+        code_gen_dict = {}
+
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        k = self.get_nodeattr("ConvKernelDim")
+        ifm_dim = self.get_nodeattr("IFMDim")
+        stride = self.get_nodeattr("Stride")
+        dilation = self.get_nodeattr("Dilation")
+        depthwise = self.get_nodeattr("depthwise")
+        simd = self.get_nodeattr("SIMD")
+
+        k_h, k_w = k
+        h, w = ifm_dim
+        pad = [0, 0, 0, 0]  # padding happens in separate padding node for now
+        stride_h, stride_w = stride
+        dilation_h, dilation_w = dilation
+        pad_h = pad[0] + pad[2]
+        pad_w = pad[1] + pad[3]
+        out_dim_h = im2col.compute_conv_output_dim(h, k_h, stride_h, pad_h, dilation_h)
+        out_dim_w = im2col.compute_conv_output_dim(w, k_w, stride_w, pad_w, dilation_w)
+        mmv_in = 1
+        mmv_out = 1
+        channel_factor = int(ifm_ch / simd)
+
+        # compute minimal buffer length (assuming it holds 1 complete window)
+        buffer_min_size = (
+            (k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + 1
+        ) * channel_factor
+
+        buffer_actual_size = self.get_buffer_depth()
+        code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_actual_size)]
+
+        # compute some intermediate values, e.g., kernel "width" = k_w incl. dilation
+        # or cols/rows that are skipped due to imperfect stride<->dim combination
+        kernel_width = (k_w - 1) * dilation_w + 1
+        kernel_height = (k_h - 1) * dilation_h + 1
+        skip_columns = w % (kernel_width + (out_dim_w - 1) * stride_w)
+        skip_rows = h % (kernel_height + (out_dim_h - 1) * stride_h)
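+        # e.g. (illustrative): w=7, kernel_width=3, stride_w=2 gives out_dim_w=3
+        # and skip_columns = 7 % (3 + 2*2) = 0, while w=8 would skip one column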
+
+        # compute address increment values for 5-loop nest
+        addr_incr_end_simd = 1
+        addr_incr_end_window_elem = (dilation_w - 1) * channel_factor + 1
+        addr_incr_end_window_row = (
+            ((w - kernel_width) * channel_factor)  # remaining line
+            + ((dilation_h - 1) * w * channel_factor)  # skip lines
+            + 1  # wrap-around of minimally sized buffer
+        )
+        addr_incr_end_window = -buffer_min_size + stride_w * channel_factor + 1
+        addr_incr_end_row = (
+            -buffer_min_size
+            + ((skip_columns + kernel_width) * channel_factor)  # remaining line
+            + ((stride_h - 1) * w * channel_factor)  # skip lines
+            + 1
+        )
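+        # e.g. (illustrative): k=3x3, stride=1, dilation=1, w=8, channel_factor=1
+        # gives buffer_min_size = 2*8 + 2 + 1 = 19, addr_incr_end_window_row =
+        # (8-3) + 0 + 1 = 6 and addr_incr_end_window = -19 + 1 + 1 = -17
+        # (a negative jump back within the cyclic buffer)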
+
+        # re-use same controller structure -> re-assign address increments
+        if depthwise:
+            addr_incr_end_window_elem = dilation_w * channel_factor
+            addr_incr_end_window_row = (
+                channel_factor
+                + (w - kernel_width) * channel_factor
+                + (dilation_h - 1) * w * channel_factor
+            )
+            addr_incr_end_simd = -buffer_min_size + (channel_factor + 1)
+
+        # sanity check
+        assert not (
+            abs(addr_incr_end_window) > buffer_actual_size
+        ), "ERROR: W increment > buffer size, wrap logic doesn't account for this"
+        assert not (
+            abs(addr_incr_end_row) > buffer_actual_size
+        ), "ERROR: H increment > buffer size, wrap logic doesn't account for this"
+
+        # set certain threshold indices to detect when reading/writing finishes
+        code_gen_dict["$LAST_READ_ELEM$"] = [str(h * w * channel_factor - 1)]
+        code_gen_dict["$LAST_WRITE_ELEM$"] = [
+            str(((h - skip_rows - 1) * w + (w - skip_columns)) * channel_factor - 1)
+        ]
+
+        # default controller loop structure: # iterations (counters) map directly
+        loop_h_iterations = out_dim_h
+        loop_w_iterations = out_dim_w
+        loop_kh_iterations = k_h
+        loop_kw_iterations = k_w
+        loop_simd_iterations = channel_factor
+
+        if depthwise and channel_factor > 1:
+            # re-arrange existing controller loop structure for depthwise convolutions
+            loop_kh_iterations = channel_factor
+            loop_kw_iterations = k_h
+            loop_simd_iterations = k_w
+            addr_incr_end_simd_ = addr_incr_end_simd
+            addr_incr_end_simd = addr_incr_end_window_elem
+            addr_incr_end_window_elem = addr_incr_end_window_row
+            addr_incr_end_window_row = addr_incr_end_simd_
+            elem_per_window = k_h * k_w
+
+            tail_incr_w = addr_incr_end_window + buffer_min_size - channel_factor
+            tail_incr_h = addr_incr_end_row + buffer_min_size - channel_factor
+            tail_incr_last_window = buffer_min_size - 1
+            code_gen_dict["$IS_DEPTHWISE$"] = ["1"]
+        else:
+            # depthwise output format is equivalent to non-depthwise if SIMD=C
+            elem_per_window = k_h * k_w * channel_factor
+
+            tail_incr_w = addr_incr_end_window + buffer_min_size - 1
+            tail_incr_h = addr_incr_end_row + buffer_min_size - 1
+            tail_incr_last_window = buffer_min_size - 1
+            code_gen_dict["$IS_DEPTHWISE$"] = ["0"]
+
+        # support SIMD = IFMChannels and k_w = 1 cases
+        # for k = [k_h, k_w] = [1, k_w], no adjustment is needed
+        # for k = [k_h, k_w] = [1, 1], do not use this impl. style (mmv_out=K=1)
+        # innermost loop is executed at least once -> adjust if needed
+        if loop_simd_iterations == 1:
+            # skip innermost SIMD loop completely
+            if loop_kw_iterations == 1:
+                # skip innermost KW loop completely
+                code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_KH"]
+                loop_kh_iterations -= 1  # -1 because state is initial state
+            else:
+                code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_KW"]
+                loop_kw_iterations -= 1  # -1 because state is initial state
+        else:
+            code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_SIMD"]
+            loop_simd_iterations -= 1  # -1 because state is initial state
+
+        cntr_bitwidth = math.ceil(
+            math.log2(
+                max(
+                    loop_h_iterations - 2 + 1,
+                    loop_w_iterations - 2 + 1,
+                    loop_kh_iterations - 2 + 1,
+                    loop_kw_iterations - 2 + 1,
+                    loop_simd_iterations - 2 + 1,
+                )
+            )
+        )
+        code_gen_dict["$CNTR_BITWIDTH$"] = [str(cntr_bitwidth)]
+        code_gen_dict["$LOOP_H_ITERATIONS$"] = [str(loop_h_iterations - 2)]
+        code_gen_dict["$LOOP_W_ITERATIONS$"] = [str(loop_w_iterations - 2)]
+        code_gen_dict["$LOOP_KH_ITERATIONS$"] = [str(loop_kh_iterations - 2)]
+        code_gen_dict["$LOOP_KW_ITERATIONS$"] = [str(loop_kw_iterations - 2)]
+        code_gen_dict["$LOOP_SIMD_ITERATIONS$"] = [str(loop_simd_iterations - 2)]
+
+        incr_bitwidth = 1 + math.ceil(
+            math.log2(
+                max(
+                    abs(addr_incr_end_simd) + 1,
+                    abs(addr_incr_end_window_elem) + 1,
+                    abs(addr_incr_end_window_row) + 1,
+                    abs(addr_incr_end_window) + 1,
+                    abs(addr_incr_end_row) + 1,
+                    abs(tail_incr_w) + 1,
+                    abs(tail_incr_h) + 1,
+                    abs(tail_incr_last_window) + 1,
+                )
+            )
+        )
+        code_gen_dict["$INCR_BITWIDTH$"] = [str(incr_bitwidth)]
+        code_gen_dict["$HEAD_INCR_SIMD$"] = [str(addr_incr_end_simd)]
+        code_gen_dict["$HEAD_INCR_KW$"] = [str(addr_incr_end_window_elem)]
+        code_gen_dict["$HEAD_INCR_KH$"] = [str(addr_incr_end_window_row)]
+        code_gen_dict["$HEAD_INCR_W$"] = [str(addr_incr_end_window)]
+        code_gen_dict["$HEAD_INCR_H$"] = [str(addr_incr_end_row)]
+        code_gen_dict["$TAIL_INCR_W$"] = [str(tail_incr_w)]
+        code_gen_dict["$TAIL_INCR_H$"] = [str(tail_incr_h)]
+        code_gen_dict["$TAIL_INCR_LAST$"] = [str(tail_incr_last_window)]
+
+        code_gen_dict["$ELEM_PER_WINDOW$"] = [str(elem_per_window)]
+        code_gen_dict["$SIMD$"] = [str(simd)]
+        code_gen_dict["$MMV_IN$"] = [str(mmv_in)]
+        code_gen_dict["$MMV_OUT$"] = [str(mmv_out)]
+
+        return template_path, code_gen_dict
+
+    def select_impl_style(self):
+        simd = self.get_nodeattr("SIMD")
+        M = self.get_nodeattr("M")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        ifm_dim = self.get_nodeattr("IFMDim")
+        stride = self.get_nodeattr("Stride")
+        dilation = self.get_nodeattr("Dilation")
+        k = self.get_nodeattr("ConvKernelDim")
+        ifm_dim_h, ifm_dim_w = ifm_dim
+        stride_h, stride_w = stride
+        dilation_h, dilation_w = dilation
+        k_h, k_w = k
+        kernel_width = (k_w - 1) * dilation_w + 1  # incl. dilation
+        kernel_height = (k_h - 1) * dilation_h + 1  # incl. dilation
+
+        # check for valid configuration
+        assert (
+            kernel_height <= ifm_dim_h
+            and kernel_width <= ifm_dim_w
+            and stride_h <= ifm_dim_h
+            and stride_w <= ifm_dim_w
+        ), "Illegal conv configuration: kernel or stride > FM dimension"
+
+        # init folding config
+        if self.get_nodeattr("parallel_window"):
+            # mmv_in = M * 1
+            mmv_out = M * k_h * k_w
+            assert (
+                ifm_ch == simd
+            ), "Constraint violated: SIMD must be equal to IFMChannels"
+        else:
+            # mmv_in = 1
+            mmv_out = 1
+            assert (
+                ifm_ch % simd == 0
+            ), "Constraint violated: SIMD must divide IFMChannels"
+
+        # choose implementation style
+        if mmv_out > 1 or (k_h == 1 and k_w == 1):
+            impl_style = "parallel"
+            assert (
+                ifm_ch == simd
+            ), "Constraint violated: SIMD must be equal to IFMChannels"
+        else:
+            impl_style = "default"
+
+        assert (
+            impl_style == "default"
+        ), "ERROR: Parallel window mode not yet implemented"
+        return impl_style
+
+    def generate_hdl(self):
+        impl_style = self.select_impl_style()
+
+        # prepare code generation by filling out dictionaries
+        if impl_style == "default":
+            template_path, code_gen_dict = self.prepare_codegen_default()
+        else:
+            raise Exception("Requested impl. style not implemented")
+
+        # add general parameters to dictionary
+        code_gen_dict["$TOP_MODULE_NAME$"] = [self.get_verilog_top_module_name()]
+        # save top module name so we can refer to it after this node has been renamed
+        # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject)
+        self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name())
+        code_gen_dict["$BIT_WIDTH$"] = [str(self.get_input_datatype().bitwidth())]
+        ram_style = self.get_nodeattr("ram_style")
+        if ram_style == "auto":
+            code_gen_dict["$RAM_STYLE$"] = [""]
+        else:
+            code_gen_dict["$RAM_STYLE$"] = ['(* ram_style = "{}" *)'.format(ram_style)]
+
+        # apply code generation to templates
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        with open(template_path, "r") as f:
+            template = f.read()
+        if self.get_nodeattr("dynamic_mode"):
+            template_select = "/finn-rtllib/swg/swg_template_wrapper_dynamic.v"
+        else:
+            template_select = "/finn-rtllib/swg/swg_template_wrapper.v"
+        with open(os.environ["FINN_ROOT"] + template_select, "r") as f:
+            template_wrapper = f.read()
+        with open(
+            os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_template_axilite.v", "r"
+        ) as f:
+            template_axilite = f.read()
+        for key in code_gen_dict:
+            # transform list into long string separated by '\n'
+            code_gen_line = "\n".join(code_gen_dict[key])
+            template = template.replace(key, code_gen_line)
+            template_wrapper = template_wrapper.replace(key, code_gen_line)
+            template_axilite = template_axilite.replace(key, code_gen_line)
+        with open(
+            os.path.join(
+                code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv"
+            ),
+            "w",
+        ) as f:
+            f.write(template)
+        with open(
+            os.path.join(
+                code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"
+            ),
+            "w",
+        ) as f:
+            f.write(template_wrapper)
+
+        # AXI-Lite reg. file component is only needed for dynamic mode
+        if self.get_nodeattr("dynamic_mode"):
+            with open(
+                os.path.join(
+                    code_gen_dir, self.get_nodeattr("gen_top_module") + "_axilite.v"
+                ),
+                "w",
+            ) as f:
+                f.write(template_axilite)
+
+        # set ipgen_path and ip_path so that the HLSSynthIP and
+        # CreateStitchedIP transformations do not complain
+        self.set_nodeattr("ipgen_path", code_gen_dir)
+        self.set_nodeattr("ip_path", code_gen_dir)
+
+    def prepare_rtlsim(self):
+        """Creates a Verilator emulation library for the RTL code generated
+        for this node, sets the rtlsim_so attribute to its path and returns
+        a PyVerilator wrapper around it."""
+        # Modified to use generated (System-)Verilog instead of HLS output products
+
+        if PyVerilator is None:
+            raise ImportError("Installation of PyVerilator is required.")
+
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        verilog_paths = [code_gen_dir]
+        verilog_files = [
+            self.get_nodeattr("gen_top_module") + "_wrapper.v",
+            self.get_nodeattr("gen_top_module") + "_impl.sv",
+        ]
+        if self.get_nodeattr("dynamic_mode"):
+            verilog_files.append(self.get_nodeattr("gen_top_module") + "_axilite.v")
+
+        # build the Verilator emu library
+        sim = PyVerilator.build(
+            verilog_files,
+            build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"),
+            verilog_path=verilog_paths,
+            trace_depth=get_rtlsim_trace_depth(),
+            top_module_name=self.get_verilog_top_module_name(),
+        )
+        # save generated lib filename in attribute
+        self.set_nodeattr("rtlsim_so", sim.lib._name)
+        return sim
+
+    def code_generation_ipi(self):
+        """Constructs and returns the TCL for node instantiation in Vivado IPI."""
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+
+        sourcefiles = [
+            self.get_nodeattr("gen_top_module") + "_wrapper.v",
+            self.get_nodeattr("gen_top_module") + "_impl.sv",
+        ]
+
+        if self.get_nodeattr("dynamic_mode"):
+            sourcefiles += [self.get_nodeattr("gen_top_module") + "_axilite.v"]
+
+        sourcefiles = [os.path.join(code_gen_dir, f) for f in sourcefiles]
+
+        cmd = []
+        for f in sourcefiles:
+            cmd += ["add_files -norecurse %s" % (f)]
+        cmd += [
+            "create_bd_cell -type module -reference %s %s"
+            % (self.get_nodeattr("gen_top_module"), self.onnx_node.name)
+        ]
+        return cmd
+
+    def get_verilog_top_module_intf_names(self):
+        # Overload default HLSCustomOp implementation to add axilite control IF
+        """Return a dict of names of input and output interfaces.
+        The keys reflect the protocols each interface implements:
+        'clk', 'rst', 'm_axis', 's_axis', 'aximm', 'axilite'.
+        Values are lists of tuples (axis, aximm) or names (axilite):
+        'axis' tuples correspond to the list of node inputs in order,
+        each tuple is (interface_name, interface_width_bits).
+        axilite is always assumed to be 32 bits and is given as a name only.
+        Each block must have at most one aximm and one axilite."""
+        intf_names = super().get_verilog_top_module_intf_names()
+        if self.get_nodeattr("dynamic_mode"):
+            intf_names["axilite"] = ["s_axilite"]
+        return intf_names
+
+    def get_dynamic_config(self, ifm_dim=None, stride=None, dilation=None):
+        """Returns a configuration dict to re-configure FM dimension during
+        runtime. Stride and dilation can also be changed. Certain restrictions
+        apply (e.g. component must be synthesized for largest buffer size)."""
+        # NOTE: For better driver integration, this functionality could be packaged
+        # as a standalone function in the future
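+        # hypothetical driver-side sketch (axilite_write is an assumed helper,
+        # not a FINN API): each entry maps a register name to
+        # (byte_offset, value), so a driver could apply the returned config via
+        #     for name, (addr, val) in config.items():
+        #         axilite_write(addr, val)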
+
+        if ifm_dim is None:
+            ifm_dim = self.get_nodeattr("IFMDim")
+        k = self.get_nodeattr("ConvKernelDim")
+        if stride is None:
+            stride = self.get_nodeattr("Stride")
+        if dilation is None:
+            dilation = self.get_nodeattr("Dilation")
+
+        k_h, k_w = k
+        stride_h, stride_w = stride
+        dilation_h, dilation_w = dilation
+        ifm_dim_h, ifm_dim_w = ifm_dim
+        ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h)
+        ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w)
+        ofm_dim = [ofm_dim_h, ofm_dim_w]
+
+        # update attributes and perform sanity check
+        original_buffer_depth = self.get_buffer_depth()
+        self.set_nodeattr("IFMDim", ifm_dim)
+        self.set_nodeattr("OFMDim", ofm_dim)
+        self.set_nodeattr("Stride", stride)
+        self.set_nodeattr("Dilation", dilation)
+        assert (
+            self.get_buffer_depth() <= original_buffer_depth
+        ), """Error: requested
+            dynamic configuration does not fit in generated buffer implementation."""
+
+        # (re-)call codegen and extract new values
+        # each setting is mapped to an axi-lite register address
+        template_path, code_gen_dict = self.prepare_codegen_default()
+        config = {
+            "cfg_wren": (0 * 4, 1),
+            "cfg_cntr_simd": (1 * 4, int(code_gen_dict["$LOOP_SIMD_ITERATIONS$"][0])),
+            "cfg_cntr_kw": (2 * 4, int(code_gen_dict["$LOOP_KW_ITERATIONS$"][0])),
+            "cfg_cntr_kh": (3 * 4, int(code_gen_dict["$LOOP_KH_ITERATIONS$"][0])),
+            "cfg_cntr_w": (4 * 4, int(code_gen_dict["$LOOP_W_ITERATIONS$"][0])),
+            "cfg_cntr_h": (5 * 4, int(code_gen_dict["$LOOP_H_ITERATIONS$"][0])),
+            "cfg_incr_head_simd": (6 * 4, int(code_gen_dict["$HEAD_INCR_SIMD$"][0])),
+            "cfg_incr_head_kw": (7 * 4, int(code_gen_dict["$HEAD_INCR_KW$"][0])),
+            "cfg_incr_head_kh": (8 * 4, int(code_gen_dict["$HEAD_INCR_KH$"][0])),
+            "cfg_incr_head_w": (9 * 4, int(code_gen_dict["$HEAD_INCR_W$"][0])),
+            "cfg_incr_head_h": (10 * 4, int(code_gen_dict["$HEAD_INCR_H$"][0])),
+            "cfg_incr_tail_w": (11 * 4, int(code_gen_dict["$TAIL_INCR_W$"][0])),
+            "cfg_incr_tail_h": (12 * 4, int(code_gen_dict["$TAIL_INCR_H$"][0])),
+            "cfg_incr_tail_last": (13 * 4, int(code_gen_dict["$TAIL_INCR_LAST$"][0])),
+            "cfg_last_read": (14 * 4, int(code_gen_dict["$LAST_READ_ELEM$"][0])),
+            "cfg_last_write": (15 * 4, int(code_gen_dict["$LAST_WRITE_ELEM$"][0])),
+        }
+        return config
+
+    def code_generation_ipgen(self, model, fpgapart, clk):
+        """Normally: Generates C++ code and tcl script for IP generation.
+        Here: Generates (System-)Verilog code for IP generation."""
+        self.generate_hdl()
+
+    def ipgen_singlenode_code(self):
+        """Normally: Builds the bash script for IP generation."""
+        pass
+
+    def code_generation_cppsim(self, model):
+        """Normally: Generates C++ code for simulation (cppsim)."""
+        pass
+
+    def compile_singlenode_code(self):
+        pass
+
+    def global_includes(self):
+        pass
+
+    def defines(self, var):
+        pass
+
+    def read_npy_data(self):
+        pass
+
+    def strm_decl(self):
+        pass
+
+    def docompute(self):
+        pass
+
+    def dataoutstrm(self):
+        pass
+
+    def save_as_npy(self):
+        pass
+
+    def blackboxfunction(self):
+        pass
+
+    def pragmas(self):
+        pass
diff --git a/src/finn/custom_op/fpgadataflow/downsampler.py b/src/finn/custom_op/fpgadataflow/downsampler.py
index da29a524b6bba7ce0c7a71bc64a44ae128d91709..b7efaff440dd5cc2160fbfb8050b30924460ffe6 100644
--- a/src/finn/custom_op/fpgadataflow/downsampler.py
+++ b/src/finn/custom_op/fpgadataflow/downsampler.py
@@ -36,7 +36,7 @@ from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
 
 class DownSampler(HLSCustomOp):
-    """Corresponds to finn-hlslib ConvolutionInputGenerator_kernel1 function.
+    """Corresponds to finn-hlslib ConvolutionInputGenerator_*_kernel1 function.
     Basically performs a down sampling of the image removing rows and columns."""
 
     def __init__(self, onnx_node):
@@ -55,6 +55,10 @@ class DownSampler(HLSCustomOp):
             "inputDataType": ("s", True, ""),
             # Batch size
             "numInputVectors": ("i", False, 1),
+            # 1D (True) or 2D (False) spatial data
+            "is1D": ("i", False, 0),
+            # 1D only: whether dims are (D, 1) (True) or (1, D) (False)
+            "is1D_unitx": ("i", False, 1),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -66,28 +70,46 @@ class DownSampler(HLSCustomOp):
         return int(np.floor((idim - 1) / stride) + 1)
 
     def get_exp_cycles(self):
+        is_1D = self.get_nodeattr("is1D")
         idim = self.get_nodeattr("ImgDim")
+        idim_total = idim if is_1D else idim * idim
         channels = self.get_nodeattr("NumChannels")
         simd = self.get_nodeattr("SIMD")
         batch_size = self.get_nodeattr("numInputVectors")
-        exp_cycles = channels / simd * batch_size * idim * idim
+        exp_cycles = channels / simd * batch_size * idim_total
         return int(exp_cycles)
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
+        is_1D = self.get_nodeattr("is1D")
+        is_1D_unitx = self.get_nodeattr("is1D_unitx")
         idim = self.get_nodeattr("ImgDim")
         num_ch = self.get_nodeattr("NumChannels")
         batch = self.get_nodeattr("numInputVectors")
-        ishape = (batch, idim, idim, num_ch)
+        if is_1D:
+            if is_1D_unitx:
+                ishape = (batch, idim, 1, num_ch)
+            else:
+                ishape = (batch, 1, idim, num_ch)
+        else:
+            ishape = (batch, idim, idim, num_ch)
         return ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
+        is_1D = self.get_nodeattr("is1D")
+        is_1D_unitx = self.get_nodeattr("is1D_unitx")
         odim = self.get_downsampled_odim()
         num_ch = self.get_nodeattr("NumChannels")
         batch = self.get_nodeattr("numInputVectors")
-        oshape = (batch, odim, odim, num_ch)
+        if is_1D:
+            if is_1D_unitx:
+                oshape = (batch, odim, 1, num_ch)
+            else:
+                oshape = (batch, 1, odim, num_ch)
+        else:
+            oshape = (batch, odim, odim, num_ch)
         return oshape
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         normal_ishape = list(self.get_normal_input_shape())
         ifm_ch = self.get_nodeattr("NumChannels")
         simd = self.get_nodeattr("SIMD")
@@ -96,7 +118,7 @@ class DownSampler(HLSCustomOp):
         folded_ishape = normal_ishape[:-1] + [fold, simd]
         return tuple(folded_ishape)
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         normal_oshape = list(self.get_normal_output_shape())
         ifm_ch = self.get_nodeattr("NumChannels")
         simd = self.get_nodeattr("SIMD")
@@ -129,21 +151,21 @@ class DownSampler(HLSCustomOp):
     def verify_node(self):
         pass
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         ret = DataType[self.get_nodeattr("inputDataType")]
         return ret
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output. (Same as input datatype)"""
         return self.get_input_datatype()
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         ibits = self.get_input_datatype().bitwidth()
         simd = self.get_nodeattr("SIMD")
         return ibits * simd
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         obits = self.get_output_datatype().bitwidth()
         simd = self.get_nodeattr("SIMD")
         return obits * simd
@@ -204,8 +226,9 @@ class DownSampler(HLSCustomOp):
         )
 
     def docompute(self):
+        dim_var = "1D" if (self.get_nodeattr("is1D") == 1) else "2D"
         self.code_gen_dict["$DOCOMPUTE$"] = [
-            """ConvolutionInputGenerator_kernel1<IFMChannels, Input_precision,
+            f"""ConvolutionInputGenerator_{dim_var}_kernel1<IFMChannels, Input_precision,
             IFMDim, SIMD,Stride> (in0, out, numReps);"""
         ]
 
diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
index 04ca45e7f1c1844a9976d46392be46f6cffc2167..93cde15ca7d42dbed12417837916359fdcc71b67 100644
--- a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
+++ b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
@@ -61,13 +61,13 @@ class DuplicateStreams_Batch(HLSCustomOp):
     def get_num_output_streams(self):
         return self.get_nodeattr("NumOutputStreams")
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         ch = self.get_nodeattr("NumChannels")
         vecs = list(self.get_nodeattr("numInputVectors"))
         ishape = tuple(vecs + [ch])
         return ishape
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         ch = self.get_nodeattr("NumChannels")
         pe = self.get_nodeattr("PE")
         vecs = list(self.get_nodeattr("numInputVectors"))
@@ -138,22 +138,22 @@ class DuplicateStreams_Batch(HLSCustomOp):
 
         return info_messages
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("inputDataType")]
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         """Returns input stream width."""
         ibits = self.get_input_datatype().bitwidth()
         pe = self.get_nodeattr("PE")
         in_width = pe * ibits
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         """Returns output stream width."""
         obits = self.get_output_datatype().bitwidth()
         pe = self.get_nodeattr("PE")
@@ -408,3 +408,13 @@ class DuplicateStreams_Batch(HLSCustomOp):
                 ("out%d_%s" % (i, sname), self.get_outstream_width_padded())
             )
         return intf_names
+
+    def derive_characteristic_fxns(self, period):
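+        # Overridden to supply an rtlsim I/O dict with two output streams.
+        # Zero-valued dummy inputs are sufficient here: the characterization
+        # records stream handshake behavior over `period` cycles, and the
+        # data values do not affect the timing of this op.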
+        n_inps = np.prod(self.get_folded_input_shape()[:-1])
+        io_dict = {
+            "inputs": {
+                "in0": [0 for i in range(n_inps)],
+            },
+            "outputs": {"out0": [], "out1": []},
+        }
+        super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
diff --git a/src/finn/custom_op/fpgadataflow/eltwise.py b/src/finn/custom_op/fpgadataflow/eltwise.py
new file mode 100644
index 0000000000000000000000000000000000000000..68ed6546c741277bd8e962b6e80eda083cedba9c
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/eltwise.py
@@ -0,0 +1,466 @@
+# Copyright (c) 2022, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import os
+import warnings
+from qonnx.core.datatype import DataType
+
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+
+class StreamingEltwise(HLSCustomOp):
+    """Class that corresponds to finn-hlslib StreamingEltwise function."""
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+
+    def get_nodeattr_types(self):
+
+        my_attrs = super().get_nodeattr_types()
+        my_attrs.update(
+            {
+                "NumChannels": ("i", True, ""),
+                "PE": ("i", True, ""),
+                # FINN DataTypes for inputs; output datatype inferred from input
+                "inputDataType0": ("s", True, ""),
+                "inputDataType1": ("s", True, ""),
+                # type of EltwiseFunction for the operation
+                "eltwiseOp": ("s", True, "", ["Add", "Sub", "AbsDiff"]),
+                # number of input vectors, examples:
+                # [1] is a single vector (like a FC layer with batch=1)
+                # [4] is four vectors (like a FC layer with batch=4)
+                # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
+                "numInputVectors": ("ints", False, [1]),
+                "inFIFODepths": ("ints", False, [2, 2]),
+            }
+        )
+        return my_attrs
+
+    def get_eltwise_op_lambda(self):
+        eltwise_op = self.get_nodeattr("eltwiseOp")
+        idt0 = self.get_input_datatype(0)
+        idt1 = self.get_input_datatype(1)
+        odt = self.get_output_datatype()
+        tin0 = idt0.get_hls_datatype_str()
+        tin1 = idt1.get_hls_datatype_str()
+        tout = odt.get_hls_datatype_str()
+        eltwise_ops = {
+            # "Add": "[](auto a, auto b) { return  a + b; }",
+            # "Sub": "[](auto a, auto b) { return  a - b; }",
+            # "AbsDiff": "[](auto a, auto b) { return  a>b? a-b : b-a; }",
+            "Add": f"add<{tin0}, {tin1}, {tout}>()",
+            "Sub": f"sub<{tin0}, {tin1}, {tout}>()",
+            "AbsDiff": f"absdiff<{tin0}, {tin1}, {tout}>()",
+        }
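+        # e.g. for "Add" with two INT4 inputs this returns
+        # "add<ap_int<4>, ap_int<4>, ap_int<5>>()", with the output type
+        # inferred from the input ranges by get_output_datatype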
+        return eltwise_ops[eltwise_op]
+
+    def get_normal_input_shape(self, ind=0):
+        ich = self.get_nodeattr("NumChannels")
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        ishape = tuple(vecs + [ich])
+        return ishape
+
+    def get_folded_input_shape(self, ind=0):
+        ich = self.get_nodeattr("NumChannels")
+        pe = self.get_nodeattr("PE")
+        assert ich % pe == 0, "PE must divide NumChannels"
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        ishape = tuple(vecs + [ich // pe, pe])
+        return ishape
+
+    def get_normal_output_shape(self, ind=0):
+        return self.get_normal_input_shape()
+
+    def get_folded_output_shape(self, ind=0):
+        return self.get_folded_input_shape()
+
+    def make_shape_compatible_op(self, model):
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == exp_ishape, "Unexpected input1 shape."
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[1]))
+        assert ishape == exp_ishape, "Unexpected input2 shape."
+        return super().make_const_shape_op(oshape)
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        idt0 = model.get_tensor_datatype(node.input[0])
+        if idt0 != self.get_input_datatype(0):
+            warn_str = "inputDataType0 changing for %s: %s -> %s " % (
+                node.name,
+                str(self.get_input_datatype(0)),
+                str(idt0),
+            )
+            warnings.warn(warn_str)
+        self.set_nodeattr("inputDataType0", idt0.name)
+        idt1 = model.get_tensor_datatype(node.input[1])
+        if idt1 != self.get_input_datatype(1):
+            warn_str = "inputDataType1 changing for %s: %s -> %s " % (
+                node.name,
+                str(self.get_input_datatype(1)),
+                str(idt1),
+            )
+            warnings.warn(warn_str)
+        self.set_nodeattr("inputDataType1", idt1.name)
+        # enforce output data type (calculated based on idt)
+        odt = self.get_output_datatype()
+        model.set_tensor_datatype(self.onnx_node.output[0], odt)
+
+    def verify_node(self):
+        info_messages = []
+        # verify that "backend" is set to "fpgadataflow"
+        backend_value = self.get_nodeattr("backend")
+        if backend_value == "fpgadataflow":
+            info_messages.append("Attribute backend is set correctly")
+        else:
+            info_messages.append('Attribute backend should be set to "fpgadataflow"')
+
+        # verify that all necessary attributes exist
+        try:
+            self.get_nodeattr("code_gen_dir_cppsim")
+            self.get_nodeattr("executable_path")
+            self.get_nodeattr("NumChannels")
+            self.get_nodeattr("PE")
+            self.get_nodeattr("inputDataType0")
+            self.get_nodeattr("inputDataType1")
+            self.get_nodeattr("eltwiseOp")
+            info_messages.append("All necessary attributes exist")
+        except Exception:
+            info_messages.append(
+                """The required StreamingEltwise attributes do not exist."""
+            )
+
+        return info_messages
+
+    def get_input_datatype(self, ind=0):
+        """Returns FINN DataType of input."""
+        return DataType[self.get_nodeattr("inputDataType" + str(ind))]
+
+    def get_output_datatype(self, ind=0):
+        """Returns FINN DataType of output."""
+        op = self.get_nodeattr("eltwiseOp")
+        idt0 = self.get_input_datatype(0)
+        idt1 = self.get_input_datatype(1)
+        assert idt0.signed() == idt1.signed(), (
+            "%s: Inputs must have same signedness" % self.onnx_node.name
+        )
+        idt0_min, idt0_max = idt0.min(), idt0.max()
+        idt1_min, idt1_max = idt1.min(), idt1.max()
+        cands = [
+            idt0_min - idt1_min,
+            idt0_min - idt1_max,
+            idt0_max - idt1_min,
+            idt0_max - idt1_max,
+        ]
+        largest_magnitude = max(map(abs, cands))
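+        # worked example: INT4 (-8..7) and INT5 (-16..15) inputs give
+        # candidates {8, -23, 23, -8}, so largest_magnitude = 23; "Sub" then
+        # needs a type covering -23 (INT6), "AbsDiff" one covering 23 (UINT5)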
+        if op == "Add":
+            if idt0.signed():
+                return DataType.get_smallest_possible(idt0.min() + idt1.min())
+            else:
+                return DataType.get_smallest_possible(idt0.max() + idt1.max())
+        elif op == "Sub":
+            return DataType.get_smallest_possible(-largest_magnitude)
+        elif op == "AbsDiff":
+            return DataType.get_smallest_possible(largest_magnitude)
+        else:
+            raise Exception("%s: Unknown eltWiseOp = %s" % (self.onnx_node.name, op))
+
+    def get_instream_width(self, ind=0):
+        """Returns input stream width."""
+        ibits = self.get_input_datatype(ind).bitwidth()
+        pe = self.get_nodeattr("PE")
+        in_width = pe * ibits
+        return in_width
+
+    def get_outstream_width(self, ind=0):
+        """Returns output stream width."""
+        obits = self.get_output_datatype().bitwidth()
+        pe = self.get_nodeattr("PE")
+        out_width = pe * obits
+        return out_width
+
+    def get_number_output_values(self):
+        return np.prod(self.get_folded_output_shape()[:-1])
+
+    def get_exp_cycles(self):
+        # Channels/PE * batch size * fmdim * fmdim
+        return np.prod(self.get_folded_output_shape()[:-1])
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        exp_ishape = self.get_normal_input_shape()
+        exp_oshape = self.get_normal_output_shape()
+        folded_ishape = self.get_folded_input_shape()
+
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        inp = context[node.input[0]]
+        assert str(inp.dtype) == "float32", "Input datatype is not float32"
+        assert (
+            inp.shape == exp_ishape
+        ), """Input0 shape doesn't match expected shape ."""
+        export_idt0 = self.get_input_datatype(0)
+        # reshape input into folded form
+        inp = inp.reshape(folded_ishape)
+        # make copy before saving array
+        reshaped_input = inp.copy()
+        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+
+        # exact same thing for input1
+        inp = context[node.input[1]]
+        assert str(inp.dtype) == "float32", "Input datatype is not float32"
+        assert (
+            inp.shape == exp_ishape
+        ), """Input1 shape doesn't match expected shape ."""
+        export_idt1 = self.get_input_datatype(1)
+        # reshape input into folded form
+        inp = inp.reshape(folded_ishape)
+        # make copy before saving array
+        reshaped_input = inp.copy()
+        np.save(os.path.join(code_gen_dir, "input_1.npy"), reshaped_input)
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            assert (
+                context[node.output[0]].shape == exp_oshape
+            ), "cppsim did not produce expected output shape"
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits0 = self.get_instream_width(0)
+            nbits1 = self.get_instream_width(1)
+            rtlsim_inp0 = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt0, nbits0
+            )
+            rtlsim_inp1 = npy_to_rtlsim_input(
+                "{}/input_1.npy".format(code_gen_dir), export_idt1, nbits1
+            )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            rtlsim_output = self.rtlsim(sim, rtlsim_inp0, rtlsim_inp1)
+            odt = self.get_output_datatype()
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+            # load and reshape output
+            output = np.load(out_npy_path)
+            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        assert (
+            context[node.output[0]].shape == exp_oshape
+        ), """Output shape doesn't match expected shape."""
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = [
+            '#include "eltwise.hpp"',
+            '#include "interpret.hpp"',
+        ]
+
+        self.code_gen_dict["$GLOBALS$"].extend(
+            [
+                "template<typename TI1, typename TI2, typename TO>",
+                "struct absdiff {",
+                "TO operator()(TI1 const &a, TI2 const &b) const {",
+                "#pragma HLS inline",
+                "return  a>b? a-b : b-a;",
+                "}",
+                "};",
+                "template<typename TI1, typename TI2, typename TO>",
+                "struct sub {",
+                "TO operator()(TI1 const &a, TI2 const &b) const {",
+                "#pragma HLS inline",
+                "return  a-b;",
+                "}",
+                "};",
+                "template<typename TI1, typename TI2, typename TO>",
+                "struct add {",
+                "TO operator()(TI1 const &a, TI2 const &b) const {",
+                "#pragma HLS inline",
+                "return  a+b;",
+                "}",
+                "};",
+            ]
+        )
+
+    def defines(self, var):
+        self.code_gen_dict["$DEFINES$"] = []
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        idt0 = self.get_input_datatype(0)
+        idt1 = self.get_input_datatype(1)
+        elem_bits_0 = idt0.bitwidth()
+        elem_bits_1 = idt1.bitwidth()
+        packed_bits_0 = self.get_instream_width(0)
+        packed_hls_type_0 = "ap_uint<%d>" % packed_bits_0
+        packed_bits_1 = self.get_instream_width(1)
+        packed_hls_type_1 = "ap_uint<%d>" % packed_bits_1
+        elem_hls_type_0 = idt0.get_hls_datatype_str()
+        elem_hls_type_1 = idt1.get_hls_datatype_str()
+        npy_type = "float"
+        self.code_gen_dict["$READNPYDATA$"] = []
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0);'
+            % (packed_hls_type_0, elem_hls_type_0, elem_bits_0, npy_type, npy_in)
+        )
+        npy_in = "%s/input_1.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in1);'
+            % (packed_hls_type_1, elem_hls_type_1, elem_bits_1, npy_type, npy_in)
+        )
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width(0))
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in1 ("in1");'.format(self.get_instream_width(1))
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+        )
+
+    def docompute(self):
+        idt0 = self.get_input_datatype(0)
+        idt1 = self.get_input_datatype(1)
+        odt = self.get_output_datatype()
+        elem_hls_type_0 = idt0.get_hls_datatype_str()
+        elem_hls_type_1 = idt1.get_hls_datatype_str()
+        out_hls_type = odt.get_hls_datatype_str()
+        slice_in0 = "Slice<%s>" % elem_hls_type_0
+        slice_in1 = "Slice<%s>" % elem_hls_type_1
+        slice_out = "Slice<%s>" % out_hls_type
+        eltwise_op_str = self.get_eltwise_op_lambda()
+        "%sEltwiseFunction<%s, %s, %s>()" % (
+            op,
+            elem_hls_type_0,
+            elem_hls_type_1,
+            out_hls_type,
+        )
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            """{}<{}, {}, {}, {}, {}, {}>(in0, in1, out, {});""".format(
+                "StreamingEltwise",
+                self.get_nodeattr("NumChannels"),
+                self.get_nodeattr("PE"),
+                int(np.prod(self.get_folded_output_shape()[:-2])),
+                slice_in0,
+                slice_in1,
+                slice_out,
+                eltwise_op_str,
+            )
+        ]
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        oshape = self.get_folded_output_shape()
+        oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
+
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                oshape_cpp_str,
+                npy_out,
+            )
+        ]
+
+    def save_as_npy(self):
+        self.code_gen_dict["$SAVEASCNPY$"] = []
+
+    def blackboxfunction(self):
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            """void {}(hls::stream<ap_uint<{}>> &in0, hls::stream<ap_uint<{}>> &in1,
+                hls::stream<ap_uint<{}>> &out)""".format(
+                self.onnx_node.name,
+                self.get_nodeattr("PE") * self.get_input_datatype(0).bitwidth(),
+                self.get_nodeattr("PE") * self.get_input_datatype(1).bitwidth(),
+                self.get_nodeattr("PE") * self.get_output_datatype().bitwidth(),
+            )
+        ]
+
+    def pragmas(self):
+        self.code_gen_dict["$PRAGMAS$"] = [
+            "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname()
+        ]
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE axis port=in1 name=in1_" + self.hls_sname()
+        )
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname()
+        )
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE ap_ctrl_none port=return"
+        )
+
+    def get_verilog_top_module_intf_names(self):
+        intf_names = super().get_verilog_top_module_intf_names()
+        sname = self.hls_sname()
+        intf_names["s_axis"] = [
+            ("in%d_%s" % (i, sname), self.get_instream_width_padded(i))
+            for i in range(2)
+        ]
+        return intf_names
diff --git a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
index d69ea471ea8ae1d58f97d056936b505cc2a2806b..dfc55d283fa664e3b60fc7c4d5a056f53a119292 100644
--- a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
@@ -47,10 +47,6 @@ class FMPadding_Batch(HLSCustomOp):
             # spatial size of input images
             "ImgDim": ("ints", True, []),  # [H, W] = [Y, X]
             # total padding (per dimension) to apply
-            # NOTE: Current padding scheme that is applied tries to pad the same
-            # amount of zeros in front and behind the image for each dimension.
-            # As an example, a padding scheme such as [1, x, 3, x] is equal
-            # to [2, x, 2, x]
             "Padding": (
                 "ints",
                 True,
@@ -62,10 +58,6 @@ class FMPadding_Batch(HLSCustomOp):
             "SIMD": ("i", False, 1),
             # FINN input datatype
             "inputDataType": ("s", True, ""),
-            # controls distribution of padded pixels
-            # in case of uneven padding -- see FMPadding fxn
-            # in hlslib
-            "PaddingStyle": ("i", False, 2, {2, 1}),
             # shape describing input vecs per execution
             "numInputVectors": ("i", False, 1),
         }
@@ -90,20 +82,20 @@ class FMPadding_Batch(HLSCustomOp):
         exp_cycles = (channels / simd) * batch_size * odim_h * odim_w
         return int(exp_cycles)
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         idim_h, idim_w = self.get_nodeattr("ImgDim")
         num_ch = self.get_nodeattr("NumChannels")
         ishape = (1, idim_h, idim_w, num_ch)
         return ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         odim_h, odim_w = self.get_padded_odim()
         num_ch = self.get_nodeattr("NumChannels")
 
         oshape = (1, odim_h, odim_w, num_ch)
         return oshape
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         normal_ishape = list(self.get_normal_input_shape())
         ifm_ch = self.get_nodeattr("NumChannels")
         simd = self.get_nodeattr("SIMD")
@@ -112,7 +104,7 @@ class FMPadding_Batch(HLSCustomOp):
         folded_ishape = normal_ishape[:-1] + [fold, simd]
         return tuple(folded_ishape)
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         normal_oshape = list(self.get_normal_output_shape())
         ifm_ch = self.get_nodeattr("NumChannels")
         simd = self.get_nodeattr("SIMD")
@@ -144,7 +136,7 @@ class FMPadding_Batch(HLSCustomOp):
     def verify_node(self):
         pass
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         ret = DataType[self.get_nodeattr("inputDataType")]
         # the hlslib op always pads with zeros, so ensure that the DataType
@@ -152,16 +144,16 @@ class FMPadding_Batch(HLSCustomOp):
         assert ret.allowed(0), "FMPadding_Batch DataType must support zero"
         return ret
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output. (Same as input datatype)"""
         return self.get_input_datatype()
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         ibits = self.get_input_datatype().bitwidth()
         simd = self.get_nodeattr("SIMD")
         return ibits * simd
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         obits = self.get_output_datatype().bitwidth()
         simd = self.get_nodeattr("SIMD")
         return obits * simd
@@ -179,23 +171,21 @@ class FMPadding_Batch(HLSCustomOp):
         pad = self.get_nodeattr("Padding")
         pad_h = pad[0] + pad[2]
         pad_w = pad[1] + pad[3]
-        is_square = idim_h == idim_w
+        is_square_img = idim_h == idim_w
+        is_square_pad = pad_h == pad_w
 
-        if is_square:
-            assert (
-                pad_h == pad_w
-            ), "Only equal padding along the dimensions for square images is supported"
+        if is_square_img and is_square_pad:
             self.code_gen_dict["$DEFINES$"] = [
                 """#define ImgDim1 {}\n#define OutputDim1 {}\n
-                #define Padding1 {}\n#define NumChannels1 {}\n
-                #define SIMD1 {}\n#define PaddingStyle1 {}\n
+                #define PaddingBefore1 {}\n#define PaddingBehind1 {}\n
+                #define NumChannels1 {}\n#define SIMD1 {}\n
                 #define numReps {}\n""".format(
                     idim_h,
                     odim_h,
-                    pad_h,
+                    pad[0],
+                    pad[2],
                     self.get_nodeattr("NumChannels"),
                     self.get_nodeattr("SIMD"),
-                    self.get_nodeattr("PaddingStyle"),
                     self.get_nodeattr("numInputVectors"),
                 )
             ]
@@ -204,20 +194,22 @@ class FMPadding_Batch(HLSCustomOp):
                 """
                 #define OutputDim1_x {}\n
                 #define OutputDim1_y {}\n
-                #define Padding1_x {}\n
-                #define Padding1_y {}\n
+                #define PaddingLeft1 {}\n
+                #define PaddingRight1 {}\n
+                #define PaddingTop1 {}\n
+                #define PaddingBottom1 {}\n
                 #define NumChannels1 {}\n
                 #define SIMD1 {}\n
-                #define PaddingStyle1 {}\n
                 #define numReps {}\n
                 """.format(
                     odim_w,
                     odim_h,
-                    pad_w,
-                    pad_h,
+                    pad[1],
+                    pad[3],
+                    pad[0],
+                    pad[2],
                     self.get_nodeattr("NumChannels"),
                     self.get_nodeattr("SIMD"),
-                    self.get_nodeattr("PaddingStyle"),
                     self.get_nodeattr("numInputVectors"),
                 )
             ]
@@ -254,21 +246,26 @@ class FMPadding_Batch(HLSCustomOp):
         node = self.onnx_node
 
         idim_h, idim_w = self.get_nodeattr("ImgDim")
-        is_square = idim_h == idim_w
+        pad = self.get_nodeattr("Padding")
+        pad_h = pad[0] + pad[2]
+        pad_w = pad[1] + pad[3]
+        is_square_img = idim_h == idim_w
+        is_square_pad = pad_h == pad_w
 
-        if is_square:
+        if is_square_img and is_square_pad:
             hls_call = node.op_type
             self.code_gen_dict["$DOCOMPUTE$"] = [
-                """{}<ImgDim1, OutputDim1, Padding1, NumChannels1,SIMD1,
-                {}, PaddingStyle1> (in0, out, numReps);""".format(
+                """{}<ImgDim1, OutputDim1, PaddingBefore1, PaddingBehind1, NumChannels1, SIMD1,
+                {}> (in0, out, numReps);""".format(
                     hls_call, in_t
                 )
             ]
         else:
             hls_call = "FMPadding_nonsquare_Batch"
             self.code_gen_dict["$DOCOMPUTE$"] = [
-                """{}<OutputDim1_x, OutputDim1_y, Padding1_x, Padding1_y, NumChannels1,
-                SIMD1, {}, PaddingStyle1> (in0, out, numReps);""".format(
+                """{}<OutputDim1_x, OutputDim1_y, PaddingLeft1, PaddingRight1,
+                PaddingTop1, PaddingBottom1, NumChannels1,
+                SIMD1, {}> (in0, out, numReps);""".format(
                     hls_call, in_t
                 )
             ]
diff --git a/src/finn/custom_op/fpgadataflow/fmpadding_rtl.py b/src/finn/custom_op/fpgadataflow/fmpadding_rtl.py
new file mode 100644
index 0000000000000000000000000000000000000000..5650d218857a7c7ff86c15ac057c4ebbc18df5ca
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/fmpadding_rtl.py
@@ -0,0 +1,420 @@
+# Copyright (C) 2022, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import math
+import numpy as np
+import os
+import shutil
+import warnings
+from qonnx.core.datatype import DataType
+from qonnx.util.basic import roundup_to_integer_multiple
+
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+try:
+    from pyverilator import PyVerilator
+except ModuleNotFoundError:
+    PyVerilator = None
+
+
+class FMPadding_rtl(HLSCustomOp):
+    """CustomOp wrapper for the finn-rtllib fmpadding_axi component
+    Supports adjusting the padding amount and spatial feature sizes at
+    runtime."""
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            # spatial size of input images
+            "ImgDim": ("ints", True, []),  # [H, W] = [Y, X]
+            # total padding (per dimension) to apply
+            "Padding": (
+                "ints",
+                True,
+                [1, 1, 1, 1],
+            ),  # [H_begin, W_begin, H_end, W_end] = [Y_begin, X_begin, Y_end, X_end]
+            # number of channels in input image
+            "NumChannels": ("i", True, 0),
+            # SIMD Input parallelism
+            "SIMD": ("i", False, 1),
+            # FINN input datatype
+            "inputDataType": ("s", True, ""),
+            # shape describing input vecs per execution
+            "numInputVectors": ("i", False, 1),
+            # Enable reprogrammable implementation to change FM dimensions,
+            # stride, or dilation during runtime
+            "dynamic_mode": ("i", False, 0, {0, 1}),
+            # attribute to save top module name - not user configurable
+            "gen_top_module": ("s", False, ""),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_padded_odim(self):
+        "Return the padded spatial size of the output."
+        idim_h, idim_w = self.get_nodeattr("ImgDim")
+        pad = self.get_nodeattr("Padding")
+        pad_h = pad[0] + pad[2]
+        pad_w = pad[1] + pad[3]
+        odim_h = idim_h + pad_h
+        odim_w = idim_w + pad_w
+        return [odim_h, odim_w]
+
+    def get_exp_cycles(self):
+        odim_h, odim_w = self.get_padded_odim()
+        channels = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        batch_size = self.get_nodeattr("numInputVectors")
+        exp_cycles = (channels / simd) * batch_size * odim_h * odim_w
+        return int(exp_cycles)
+
+    def get_normal_input_shape(self, ind=0):
+        idim_h, idim_w = self.get_nodeattr("ImgDim")
+        num_ch = self.get_nodeattr("NumChannels")
+        ishape = (1, idim_h, idim_w, num_ch)
+        return ishape
+
+    def get_normal_output_shape(self, ind=0):
+        odim_h, odim_w = self.get_padded_odim()
+        num_ch = self.get_nodeattr("NumChannels")
+
+        oshape = (1, odim_h, odim_w, num_ch)
+        return oshape
+
+    def get_folded_input_shape(self, ind=0):
+        normal_ishape = list(self.get_normal_input_shape())
+        ifm_ch = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        assert ifm_ch % simd == 0, "SIMD must divide input channels"
+        fold = int(normal_ishape[-1] / simd)
+        folded_ishape = normal_ishape[:-1] + [fold, simd]
+        return tuple(folded_ishape)
+
+    def get_folded_output_shape(self, ind=0):
+        normal_oshape = list(self.get_normal_output_shape())
+        ifm_ch = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        assert ifm_ch % simd == 0, "SIMD must divide input channels"
+        fold = int(normal_oshape[-1] / simd)
+        folded_oshape = normal_oshape[:-1] + [fold, simd]
+        return tuple(folded_oshape)
+
+    def make_shape_compatible_op(self, model):
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == exp_ishape, "Unexpected input shape for FMPadding_rtl."
+        return super().make_const_shape_op(oshape)
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        idt = model.get_tensor_datatype(node.input[0])
+        if idt != self.get_input_datatype():
+            warn_str = "inputDataType changing for %s: %s -> %s " % (
+                node.name,
+                str(self.get_input_datatype()),
+                str(idt),
+            )
+            warnings.warn(warn_str)
+        self.set_nodeattr("inputDataType", idt.name)
+        model.set_tensor_datatype(node.output[0], idt)
+
+    def verify_node(self):
+        pass
+
+    def get_input_datatype(self, ind=0):
+        """Returns FINN DataType of input."""
+        ret = DataType[self.get_nodeattr("inputDataType")]
+        # the padding op always pads with zeros, so ensure that the DataType
+        # is able to represent zeros
+        assert ret.allowed(0), "FMPadding_rtl DataType must support zero"
+        return ret
+
+    def get_output_datatype(self, ind=0):
+        """Returns FINN DataType of output. (Same as input datatype)"""
+        return self.get_input_datatype()
+
+    def get_instream_width(self, ind=0):
+        ibits = self.get_input_datatype().bitwidth()
+        simd = self.get_nodeattr("SIMD")
+        return ibits * simd
+
+    def get_outstream_width(self, ind=0):
+        obits = self.get_output_datatype().bitwidth()
+        simd = self.get_nodeattr("SIMD")
+        return obits * simd
+
+    def get_number_output_values(self):
+        folded_oshape = self.get_folded_output_shape()
+        return np.prod(folded_oshape[:-1])
+
+    def get_verilog_top_module_intf_names(self):
+        # Overload default HLSCustomOp implementation to add axilite control IF
+        intf_names = super().get_verilog_top_module_intf_names()
+        if self.get_nodeattr("dynamic_mode"):
+            intf_names["axilite"] = ["s_axilite"]
+        return intf_names
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        exp_ishape = self.get_normal_input_shape()
+        exp_oshape = self.get_normal_output_shape()
+        folded_ishape = self.get_folded_input_shape()
+
+        if mode == "cppsim":
+            raise Exception(
+                "cppsim not possible for FMPadding_rtl, please set exec_mode to rtlsim"
+            )
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        inp = context[node.input[0]]
+        assert str(inp.dtype) == "float32", "Input datatype is not float32"
+        assert (
+            inp.shape == exp_ishape
+        ), """Input shape doesn't
+        match expected shape (1, ImgDim_h, ImgDim_w, NumChannels)."""
+        export_idt = self.get_input_datatype()
+
+        reshaped_input = inp.reshape(folded_ishape)
+        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+
+        sim = self.get_rtlsim()
+        nbits = self.get_instream_width()
+        rtlsim_inp = npy_to_rtlsim_input(
+            "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+        )
+        super().reset_rtlsim(sim)
+        super().toggle_clk(sim)
+        rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+        odt = export_idt
+        target_bits = odt.bitwidth()
+        packed_bits = self.get_outstream_width()
+        out_npy_path = "{}/output.npy".format(code_gen_dir)
+        out_shape = self.get_folded_output_shape()
+        rtlsim_output_to_npy(
+            rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+        )
+        # load and reshape output
+        output = np.load(out_npy_path)
+        output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+        context[node.output[0]] = output
+
+        assert (
+            context[node.output[0]].shape == exp_oshape
+        ), """Output shape doesn't match expected shape
+            (1, OutputDim_H, OutputDim_W, NumChannels)."""
+
+    def get_template_values(self, ifm_dims, pads, chans, simd, idt):
+        dimY, dimX = ifm_dims
+        padT, padL, padB, padR = pads
+        y_counter_bits = int(math.ceil(math.log2(padT + dimY + padB + 1)))
+        x_counter_bits = int(math.ceil(math.log2(padL + dimX + padR + 1)))
+        topname = self.get_verilog_top_module_name()
+        stream_bits = idt.bitwidth() * simd
+        stream_bits = int(roundup_to_integer_multiple(stream_bits, 8))
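+        # Illustrative sizing: dimY=32 with padT=padB=1 gives YCOUNTER_BITS =
+        # ceil(log2(35)) = 6; an INT4 input with SIMD=2 gives 8 stream bits
+        # (4*2 rounded up to the next multiple of 8).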
+        code_gen_dict = {
+            "XCOUNTER_BITS": int(x_counter_bits),
+            "YCOUNTER_BITS": int(y_counter_bits),
+            "NUM_CHANNELS": int(chans),
+            "SIMD": int(simd),
+            "ELEM_BITS": idt.bitwidth(),
+            "TOP_MODULE_NAME": topname,
+            "INIT_XON": int(padL),
+            "INIT_XOFF": int(padL + dimX),
+            "INIT_XEND": int(padL + dimX + padR - 1),
+            "INIT_YON": int(padT),
+            "INIT_YOFF": int(padT + dimY),
+            "INIT_YEND": int(padT + dimY + padB - 1),
+            "STREAM_BITS": int(stream_bits),
+        }
+        return code_gen_dict
+
+    def get_dynamic_config(self, ifm_dims=None, pads=None):
+        """Returns a configuration dict to re-configure FM dimension and
+        padding amounts during runtime."""
+
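+        # Usage sketch (axilite_write is a hypothetical driver helper); each
+        # config entry maps a register name to an (address, value) pair:
+        #   cfg = op.get_dynamic_config(ifm_dims=[32, 32], pads=[1, 1, 1, 1])
+        #   for name, (addr, val) in cfg.items():
+        #       axilite_write(addr, val)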
+        if ifm_dims is None:
+            ifm_dims = self.get_nodeattr("ImgDim")
+        if pads is None:
+            pads = self.get_nodeattr("Padding")
+        chans = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        idt = self.get_input_datatype()
+        code_gen_dict = self.get_template_values(ifm_dims, pads, chans, simd, idt)
+        config = {
+            "XON": (0 * 4, (code_gen_dict["INIT_XON"])),
+            "XOFF": (1 * 4, (code_gen_dict["INIT_XOFF"])),
+            "XEND": (2 * 4, (code_gen_dict["INIT_XEND"])),
+            "YON": (3 * 4, (code_gen_dict["INIT_YON"])),
+            "YOFF": (4 * 4, (code_gen_dict["INIT_YOFF"])),
+            "YEND": (5 * 4, (code_gen_dict["INIT_YEND"])),
+        }
+        return config
+
+    def generate_hdl(self):
+        rtlsrc = os.environ["FINN_ROOT"] + "/finn-rtllib/fmpadding/hdl"
+        template_path = rtlsrc + "/fmpadding_template.v"
+        dims = self.get_nodeattr("ImgDim")
+        pads = self.get_nodeattr("Padding")
+        chans = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        idt = self.get_input_datatype()
+        code_gen_dict = self.get_template_values(dims, pads, chans, simd, idt)
+        # save top module name so we can refer to it after this node has been renamed
+        # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject)
+        self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name())
+
+        # apply code generation to templates
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        with open(template_path, "r") as f:
+            template = f.read()
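+        # substitute the $KEY$ placeholders, e.g. "$SIMD$" in the template
+        # becomes str(code_gen_dict["SIMD"])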
+        for key_name in code_gen_dict:
+            key = "$%s$" % key_name
+            template = template.replace(key, str(code_gen_dict[key_name]))
+
+        with open(
+            os.path.join(code_gen_dir, self.get_verilog_top_module_name() + ".v"),
+            "w",
+        ) as f:
+            f.write(template)
+
+        sv_files = ["fmpadding_axi.sv", "fmpadding.sv", "axi2we.sv"]
+        for sv_file in sv_files:
+            shutil.copy(rtlsrc + "/" + sv_file, code_gen_dir)
+        # set ipgen_path and ip_path so that the HLS synthesis and
+        # stitch_ip transformations do not complain
+        self.set_nodeattr("ipgen_path", code_gen_dir)
+        self.set_nodeattr("ip_path", code_gen_dir)
+
+    def prepare_rtlsim(self):
+        """Creates a Verilator emulation library for the RTL code generated
+        for this node, sets the rtlsim_so attribute to its path and returns
+        a PyVerilator wrapper around it."""
+        # Modified to use generated (System-)Verilog instead of HLS output products
+
+        if PyVerilator is None:
+            raise ImportError("Installation of PyVerilator is required.")
+
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        verilog_paths = [code_gen_dir]
+        verilog_files = [
+            "fmpadding_axi.sv",
+            "fmpadding.sv",
+            "axi2we.sv",
+            self.get_nodeattr("gen_top_module") + ".v",
+        ]
+
+        # build the Verilator emu library
+        sim = PyVerilator.build(
+            verilog_files,
+            build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"),
+            verilog_path=verilog_paths,
+            trace_depth=get_rtlsim_trace_depth(),
+            top_module_name=self.get_verilog_top_module_name(),
+        )
+        # save generated lib filename in attribute
+        self.set_nodeattr("rtlsim_so", sim.lib._name)
+        return sim
+
+    def code_generation_ipi(self):
+        """Constructs and returns the TCL for node instantiation in Vivado IPI."""
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+
+        sourcefiles = [
+            "fmpadding_axi.sv",
+            "fmpadding.sv",
+            "axi2we.sv",
+            self.get_nodeattr("gen_top_module") + ".v",
+        ]
+
+        sourcefiles = [os.path.join(code_gen_dir, f) for f in sourcefiles]
+
+        cmd = []
+        for f in sourcefiles:
+            cmd += ["add_files -norecurse %s" % (f)]
+        cmd += [
+            "create_bd_cell -type module -reference %s %s"
+            % (self.get_nodeattr("gen_top_module"), self.onnx_node.name)
+        ]
+        return cmd
+
+    def code_generation_ipgen(self, model, fpgapart, clk):
+        """Normally: Generates C++ code and tcl script for IP generation.
+        Here: Generates (System-)Verilog code for IP generation."""
+        self.generate_hdl()
+
+    def ipgen_singlenode_code(self):
+        """Normally: Builds the bash script for IP generation."""
+        pass
+
+    def code_generation_cppsim(self, model):
+        """Normally: Generates C++ code for simulation (cppsim)."""
+        pass
+
+    def compile_singlenode_code(self):
+        pass
+
+    def global_includes(self):
+        pass
+
+    def defines(self, var):
+        pass
+
+    def read_npy_data(self):
+        pass
+
+    def strm_decl(self):
+        pass
+
+    def docompute(self):
+        pass
+
+    def dataoutstrm(self):
+        pass
+
+    def save_as_npy(self):
+        pass
+
+    def blackboxfunction(self):
+        pass
+
+    def pragmas(self):
+        pass
diff --git a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
index adafa7dcf36111e63fa49e0d184594fff54be99d..e7fa5bc0048b54a32ebc61482b96009fa019809e 100644
--- a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
@@ -56,13 +56,13 @@ class GlobalAccPool_Batch(HLSCustomOp):
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         ch = self.get_nodeattr("NumChannels")
         vecs = list(self.get_nodeattr("numInputVectors"))
         ishape = tuple(vecs + [ch])
         return ishape
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         ch = self.get_nodeattr("NumChannels")
         pe = self.get_nodeattr("PE")
         vecs = list(self.get_nodeattr("numInputVectors"))
@@ -71,7 +71,7 @@ class GlobalAccPool_Batch(HLSCustomOp):
         folded_ishape = tuple(vecs + [folds, pe])
         return folded_ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         ch = self.get_nodeattr("NumChannels")
         vecs = list(self.get_nodeattr("numInputVectors"))
         if len(vecs) == 1:
@@ -80,7 +80,7 @@ class GlobalAccPool_Batch(HLSCustomOp):
             oshape = tuple([vecs[0]] + [1, 1, ch])
         return oshape
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         ch = self.get_nodeattr("NumChannels")
         pe = self.get_nodeattr("PE")
         unfolded_shape = list(self.get_normal_output_shape())
@@ -139,11 +139,11 @@ class GlobalAccPool_Batch(HLSCustomOp):
 
         return info_messages
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         # determine data type from image size and input type
         idt = DataType[self.get_nodeattr("inputDataType")]
@@ -155,14 +155,14 @@ class GlobalAccPool_Batch(HLSCustomOp):
             extreme_value = npixels * idt.max()
         return DataType.get_smallest_possible(extreme_value)
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         """Returns input stream width."""
         ibits = self.get_input_datatype().bitwidth()
         pe = self.get_nodeattr("PE")
         in_width = pe * ibits
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         """Returns output stream width."""
         obits = self.get_output_datatype().bitwidth()
         pe = self.get_nodeattr("PE")
diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py
index 9978ab0c7138aa6846a1427cd346c5257e4f8728..d1326607aa0dc5c34eef105b2ceb8ed86c1a0458 100644
--- a/src/finn/custom_op/fpgadataflow/hlscustomop.py
+++ b/src/finn/custom_op/fpgadataflow/hlscustomop.py
@@ -29,8 +29,9 @@
 import numpy as np
 import os
 import subprocess
+import warnings
 from abc import abstractmethod
-from pyverilator.util.axi_utils import rtlsim_multi_io
+from pyverilator.util.axi_utils import _read_signal, reset_rtlsim, rtlsim_multi_io
 from qonnx.core.datatype import DataType
 from qonnx.custom_op.base import CustomOp
 from qonnx.util.basic import roundup_to_integer_multiple
@@ -42,6 +43,7 @@ from finn.util.basic import (
     pyverilate_get_liveness_threshold_cycles,
 )
 from finn.util.hls import CallHLS
+from finn.util.pyverilator import make_single_source_file
 
 from . import templates
 
@@ -107,10 +109,18 @@ class HLSCustomOp(CustomOp):
             # ID of FPGA device to which this Op is allocated, in
             # a multi-FPGA setting
             "device_id": ("i", False, 0),
-            # input and output FIFO depths
-            "inFIFODepth": ("i", False, 2),
-            "outFIFODepth": ("i", False, 2),
+            # input and output FIFO depths for multi-I/O nodes
+            "inFIFODepths": ("ints", False, [2]),
+            "outFIFODepths": ("ints", False, [2]),
             "output_hook": ("s", False, ""),
+            # accumulated characteristic function over two periods
+            "io_chrc_in": ("t", False, np.asarray([], dtype=np.int32)),
+            "io_chrc_out": ("t", False, np.asarray([], dtype=np.int32)),
+            # the period for which the characterization was run
+            "io_chrc_period": ("i", False, 0),
+            # amount of zero padding inserted during characterization
+            "io_chrc_pads_in": ("ints", False, []),
+            "io_chrc_pads_out": ("ints", False, []),
         }
 
     def get_verilog_top_module_name(self):
@@ -138,6 +148,7 @@ class HLSCustomOp(CustomOp):
         intf_names["m_axis"] = [("out_" + sname, self.get_outstream_width_padded())]
         intf_names["aximm"] = []
         intf_names["axilite"] = []
+        intf_names["ap_none"] = []
         return intf_names
 
     def get_verilog_top_filename(self):
@@ -164,7 +175,7 @@ class HLSCustomOp(CustomOp):
         # default impl only returns the HLS verilog codegen dir
         return [verilog_path]
 
-    def get_all_verilog_filenames(self):
+    def get_all_verilog_filenames(self, abspath=False):
         "Return list of all Verilog files used for this node."
 
         verilog_files = []
@@ -172,7 +183,10 @@ class HLSCustomOp(CustomOp):
         for verilog_path in verilog_paths:
             for f in os.listdir(verilog_path):
                 if f.endswith(".v"):
-                    verilog_files += [f]
+                    if abspath:
+                        verilog_files += [verilog_path + "/" + f]
+                    else:
+                        verilog_files += [f]
         return verilog_files
 
     def prepare_rtlsim(self):
@@ -182,13 +196,18 @@ class HLSCustomOp(CustomOp):
 
         if PyVerilator is None:
             raise ImportError("Installation of PyVerilator is required.")
-        verilog_paths = self.get_all_verilog_paths()
-        verilog_files = self.get_all_verilog_filenames()
+
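+        # collapse all Verilog sources into a single file before building the
+        # Verilator model; this keeps the build self-contained and avoids
+        # clashes between identically named files from different codegen dirs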
+        verilog_files = self.get_all_verilog_filenames(abspath=True)
+        single_src_dir = make_build_dir("rtlsim_" + self.onnx_node.name + "_")
+        tmp_build_dir = make_build_dir("pyverilator_" + self.onnx_node.name + "_")
+        target_file = single_src_dir + "/" + self.get_verilog_top_module_name() + ".v"
+        make_single_source_file(verilog_files, target_file)
+
         # build the Verilator emu library
         sim = PyVerilator.build(
-            verilog_files,
-            build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"),
-            verilog_path=verilog_paths,
+            self.get_verilog_top_module_name() + ".v",
+            build_dir=tmp_build_dir,
+            verilog_path=[single_src_dir],
             trace_depth=get_rtlsim_trace_depth(),
             top_module_name=self.get_verilog_top_module_name(),
         )
@@ -397,18 +416,20 @@ class HLSCustomOp(CustomOp):
         builder.build(code_gen_dir)
         self.set_nodeattr("executable_path", builder.executable_path)
 
-    def dynamic_input_to_npy(self, context, count):
+    def dynamic_input_to_npy(self, context, count, target_dir=""):
         """Saves input (given context) into .npy files.
 
         Count indicates the number of inputs that have to be saved."""
         node = self.onnx_node
-        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        if code_gen_dir == "":
-            raise Exception(
+        if target_dir == "":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+            if code_gen_dir == "":
+                raise Exception(
+                    """
+    Found no codegen dir for this node, did you run the prepare_cppsim transformation?
                 """
-Found no codegen dir for this node, did you run the prepare_cppsim transformation?
-            """
-            )
+                )
+            target_dir = code_gen_dir
         # create a npy file for each input of the node (in_ind is input index)
         # assuming dynamic inputs start from 0
         for in_ind in range(count):
@@ -427,7 +448,7 @@ Found no codegen dir for this node, did you run the prepare_cppsim transformatio
             # make copy before saving the array
             reshaped_input = reshaped_input.copy()
             np.save(
-                os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)),
+                os.path.join(target_dir, "input_{}.npy".format(in_ind)),
                 reshaped_input,
             )
 
@@ -685,40 +706,48 @@ compilation transformations?
         HLSCustomOp class but has to be filled by every node."""
         pass
 
-    def get_normal_input_shape(self):
+    def get_input_datatype(self, ind=0):
+        """Returns FINN DataType of input stream ind."""
+        raise Exception("get_input_datatype not implemented for this op")
+
+    def get_output_datatype(self, ind=0):
+        """Returns FINN DataType of output stream ind."""
+        raise Exception("get_output_datatype not implemented for this op")
+
+    def get_normal_input_shape(self, ind=0):
         """Returns normal input shape if implemented."""
         raise Exception("get_normal_input_shape not implemented for this op")
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         """Returns folded output shape if implemented."""
         raise Exception("get_normal_output_shape not implemented for this op")
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         """Returns folded input shape (according to synapse folding), if implemented."""
         raise Exception("get_folded_input_shape not implemented for this op")
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         """Returns folded output shape (according to neuron folding), if implemented."""
         raise Exception("get_folded_output_shape not implemented for this op")
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         """Returns input stream width, if implemented."""
         raise Exception("get_instream_width not implemented for this op")
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         """Returns output stream width, if implemented."""
         raise Exception("get_outstream_width not implemented for this op")
 
-    def get_instream_width_padded(self):
+    def get_instream_width_padded(self, ind=0):
         """Returns input stream width padded to a multiple of 8. This is required
         by the AXI Stream spec."""
-        in_width = self.get_instream_width()
+        in_width = self.get_instream_width(ind=ind)
         return roundup_to_integer_multiple(in_width, 8)
 
-    def get_outstream_width_padded(self):
+    def get_outstream_width_padded(self, ind=0):
         """Returns output stream width padded to a multiple of 8. This is required
         by the AXI Stream spec."""
-        out_width = self.get_outstream_width()
+        out_width = self.get_outstream_width(ind=ind)
         return roundup_to_integer_multiple(out_width, 8)
 
     def get_ap_int_max_w(self):
@@ -731,3 +760,119 @@ compilation transformations?
             "AP_INT_MAX_W=%d is larger than allowed maximum of 32768" % ret
         )
         return ret
+
+    def derive_characteristic_fxns(self, period, override_rtlsim_dict=None):
+        """Return the unconstrained characteristic functions for this node."""
+        # ensure rtlsim is ready
+        assert self.get_nodeattr("rtlsim_so") != "", (
+            "rtlsim not ready for " + self.onnx_node.name
+        )
+        if self.get_nodeattr("io_chrc_period") > 0:
+            warnings.warn(
+                "Skipping node %s: already has FIFO characteristic"
+                % self.onnx_node.name
+            )
+            return
+        exp_cycles = self.get_exp_cycles()
+        n_inps = np.prod(self.get_folded_input_shape()[:-1])
+        n_outs = np.prod(self.get_folded_output_shape()[:-1])
+        if exp_cycles == 0:
+            # try to come up with an optimistic estimate
+            exp_cycles = min(n_inps, n_outs)
+        assert (
+            exp_cycles <= period
+        ), "Period %d too short to characterize %s: at least %d cycles expected" % (
+            period,
+            self.onnx_node.name,
+            exp_cycles,
+        )
+        sim = self.get_rtlsim()
+        # signal name
+        sname = "_" + self.hls_sname() + "_"
+        if override_rtlsim_dict is not None:
+            io_dict = override_rtlsim_dict
+        else:
+            io_dict = {
+                "inputs": {
+                    "in0": [0 for i in range(n_inps)],
+                },
+                "outputs": {"out": []},
+            }
+
+        # extra dicts to keep track of cycle-by-cycle transaction behavior
+        # note that we restrict key names to filter out weight streams etc
+        txns_in = {key: [] for (key, value) in io_dict["inputs"].items() if "in" in key}
+        txns_out = {
+            key: [] for (key, value) in io_dict["outputs"].items() if "out" in key
+        }
+
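+        # sampled just before each rising clock edge: a transaction occurs in
+        # a cycle exactly when TVALID and TREADY are both high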
+        def monitor_txns(sim_obj):
+            for inp in txns_in:
+                in_ready = _read_signal(sim, inp + sname + "TREADY") == 1
+                in_valid = _read_signal(sim, inp + sname + "TVALID") == 1
+                if in_ready and in_valid:
+                    txns_in[inp].append(1)
+                else:
+                    txns_in[inp].append(0)
+            for outp in txns_out:
+                if (
+                    _read_signal(sim, outp + sname + "TREADY") == 1
+                    and _read_signal(sim, outp + sname + "TVALID") == 1
+                ):
+                    txns_out[outp].append(1)
+                else:
+                    txns_out[outp].append(0)
+
+        reset_rtlsim(sim)
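+        # replay the full I/O schedule once, with monitor_txns invoked before
+        # every clock edge to record the per-cycle transaction trace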
+        total_cycle_count = rtlsim_multi_io(
+            sim,
+            io_dict,
+            n_outs,
+            sname=sname,
+            liveness_threshold=period,
+            hook_preclk=monitor_txns,
+        )
+        assert (
+            total_cycle_count <= period
+        ), """Total cycle count from rtl simulation is higher than
+            the specified period, please set the period higher than {}""".format(
+            total_cycle_count
+        )
+        self.set_nodeattr("io_chrc_period", period)
+
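+        # integrate the 0/1 transaction trace into a running count, unrolled
+        # over two periods so that steady-state behavior across the period
+        # boundary is captured for later FIFO sizing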
+        def accumulate_char_fxn(chrc):
+            p = len(chrc)
+            ret = []
+            for t in range(2 * p):
+                if t == 0:
+                    ret.append(chrc[0])
+                else:
+                    ret.append(ret[-1] + chrc[t % p])
+            return np.asarray(ret, dtype=np.int32)
+
+        all_txns_in = np.empty((len(txns_in.keys()), 2 * period), dtype=np.int32)
+        all_txns_out = np.empty((len(txns_out.keys()), 2 * period), dtype=np.int32)
+        all_pad_in = []
+        all_pad_out = []
+        for in_idx, in_strm_nm in enumerate(txns_in.keys()):
+            txn_in = txns_in[in_strm_nm]
+            # pad the trace with idle cycles up to the full period
+            # (pad_in must default to 0 for traces that need no padding)
+            pad_in = 0
+            if len(txn_in) < period:
+                pad_in = period - len(txn_in)
+                txn_in += [0 for x in range(pad_in)]
+            txn_in = accumulate_char_fxn(txn_in)
+            all_txns_in[in_idx, :] = txn_in
+            all_pad_in.append(pad_in)
+
+        for out_idx, out_strm_nm in enumerate(txns_out.keys()):
+            txn_out = txns_out[out_strm_nm]
+            pad_out = 0
+            if len(txn_out) < period:
+                pad_out = period - len(txn_out)
+                txn_out += [0 for x in range(pad_out)]
+            txn_out = accumulate_char_fxn(txn_out)
+            all_txns_out[out_idx, :] = txn_out
+            all_pad_out.append(pad_out)
+
+        self.set_nodeattr("io_chrc_in", all_txns_in)
+        self.set_nodeattr("io_chrc_out", all_txns_out)
+        self.set_nodeattr("io_chrc_pads_in", all_pad_in)
+        self.set_nodeattr("io_chrc_pads_out", all_pad_out)
diff --git a/src/finn/custom_op/fpgadataflow/iodma.py b/src/finn/custom_op/fpgadataflow/iodma.py
index 33ee1d359c7b82494e1b5ce1b83aa5d0199f8153..65683079fc6a648de31148e398ea498f38b8d3d9 100644
--- a/src/finn/custom_op/fpgadataflow/iodma.py
+++ b/src/finn/custom_op/fpgadataflow/iodma.py
@@ -100,16 +100,16 @@ class IODMA(HLSCustomOp):
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         vecs = list(self.get_nodeattr("numInputVectors"))
         num_ch = self.get_nodeattr("NumChannels")
         ishape = tuple(vecs + [num_ch])
         return ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         return self.get_normal_input_shape()
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         if self.get_nodeattr("direction") == "in":
             raise ValueError("Folded input shape not defined for input IODMA")
         else:
@@ -126,7 +126,7 @@ class IODMA(HLSCustomOp):
             shape.append(elems_per_word)
             return tuple(shape)
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         if self.get_nodeattr("direction") == "out":
             raise ValueError("Folded output shape not defined for output IODMA")
         else:
@@ -166,15 +166,15 @@ class IODMA(HLSCustomOp):
     def verify_node(self):
         pass
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("dataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output. (Same as input datatype)"""
         return self.get_input_datatype()
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         if self.get_nodeattr("direction") == "in":
             return self.get_nodeattr("intfWidth")
         elif self.get_nodeattr("direction") == "out":
@@ -182,7 +182,7 @@ class IODMA(HLSCustomOp):
         else:
             raise ValueError("Invalid IODMA direction, please set to in or out")
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         if self.get_nodeattr("direction") == "out":
             return self.get_nodeattr("intfWidth")
         elif self.get_nodeattr("direction") == "in":
diff --git a/src/finn/custom_op/fpgadataflow/labelselect_batch.py b/src/finn/custom_op/fpgadataflow/labelselect_batch.py
index 3e27ee01113392174c1206fc10e1c9abe82fdfe7..03f89bd7ecac69a9097f4f35c42bd528be709515 100644
--- a/src/finn/custom_op/fpgadataflow/labelselect_batch.py
+++ b/src/finn/custom_op/fpgadataflow/labelselect_batch.py
@@ -70,13 +70,13 @@ class LabelSelect_Batch(HLSCustomOp):
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         nlabels = self.get_nodeattr("Labels")
         vecs = list(self.get_nodeattr("numInputVectors"))
         ishape = tuple(vecs + [nlabels])
         return ishape
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         nlabels = self.get_nodeattr("Labels")
         pe = self.get_nodeattr("PE")
         vecs = list(self.get_nodeattr("numInputVectors"))
@@ -85,13 +85,13 @@ class LabelSelect_Batch(HLSCustomOp):
         folded_ishape = tuple(vecs + [folds, pe])
         return folded_ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         k = self.get_nodeattr("K")
         vecs = list(self.get_nodeattr("numInputVectors"))
         oshape = tuple(vecs + [k])
         return oshape
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         k = self.get_nodeattr("K")
         vecs = list(self.get_nodeattr("numInputVectors"))
         oshape = tuple(vecs + [k, 1])
@@ -152,24 +152,24 @@ class LabelSelect_Batch(HLSCustomOp):
 
         return info_messages
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         ret = DataType[self.get_nodeattr("inputDataType")]
         return ret
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         ret = DataType[self.get_nodeattr("outputDataType")]
         return ret
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         """Returns input stream width."""
         ibits = self.get_input_datatype().bitwidth()
         pe = self.get_nodeattr("PE")
         in_width = pe * ibits
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         """Returns output stream width."""
         return self.get_output_datatype().bitwidth()
 
diff --git a/src/finn/custom_op/fpgadataflow/lookup.py b/src/finn/custom_op/fpgadataflow/lookup.py
index d90fa0f05ab2a92391f610ae1c4516a95a881ce4..fd3e2b5b1cfa74eb4f957df4b568e6c46da47617 100644
--- a/src/finn/custom_op/fpgadataflow/lookup.py
+++ b/src/finn/custom_op/fpgadataflow/lookup.py
@@ -75,21 +75,21 @@ class Lookup(HLSCustomOp):
         exp_cycles = int(n_inputs)
         return exp_cycles
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         return self.get_nodeattr("InputShape")
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         ishape = self.get_normal_input_shape()
         emb_dim = self.get_nodeattr("EmbeddingDim")
         oshape = list(ishape) + [emb_dim]
         return tuple(oshape)
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         ishape = self.get_normal_input_shape()
         folded_ishape = list(ishape) + [1]
         return tuple(folded_ishape)
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         ishape = self.get_normal_input_shape()
         mem_mode = self.get_nodeattr("mem_mode")
         emb_dim = self.get_nodeattr("EmbeddingDim")
@@ -135,19 +135,19 @@ class Lookup(HLSCustomOp):
     def verify_node(self):
         pass
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         ret = DataType[self.get_nodeattr("InputType")]
         return ret
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         ret = DataType[self.get_nodeattr("EmbeddingType")]
         return ret
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         ibits = self.get_input_datatype().bitwidth()
         return ibits
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         folded_oshape = self.get_folded_output_shape()
         obits = self.get_output_datatype().bitwidth()
         return obits * folded_oshape[-1]
@@ -159,8 +159,8 @@ class Lookup(HLSCustomOp):
     def global_includes(self):
         mem_mode = self.get_nodeattr("mem_mode")
         global_incls = []
+        global_incls.append('#include "lookup.hpp"')
         if mem_mode == "const":
-            global_incls.append('#include "lookup.hpp"')
             global_incls.append('#include "embeddings.hpp"')
         self.code_gen_dict["$GLOBALS$"] = global_incls
 
@@ -258,17 +258,10 @@ class Lookup(HLSCustomOp):
                 InputType, EmbeddingType >(in0, out, embeddings);"""
             ]
         elif mem_mode == "external":
-            hls_impl = """
-    if(!in0.empty()) {
-        ap_uint<T_SRC::width+EmbeddingAlign> const  base =
-            (in0.read(), ap_uint<EmbeddingAlign>(0));
-        for(unsigned  j = 0; j < EmbeddingSize; j++) {
-#pragma HLS PIPELINE II=1
-            out.write(mem[base+j]);
-        }
-    }
-            """
-            self.code_gen_dict["$DOCOMPUTE$"] = [hls_impl]
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """StreamingLookup_ext<EmbeddingSize>(in0, out, mem, size, oob_count,
+                oob_irq);"""
+            ]
 
     def blackboxfunction(self):
         mem_mode = self.get_nodeattr("mem_mode")
@@ -286,7 +279,8 @@ class Lookup(HLSCustomOp):
                 "void "
                 + self.onnx_node.name
                 + "(hls::stream<T_SRC> &in0, hls::stream<T_DST> &out, "
-                + "T_DST const *const  mem)"
+                + "T_DST const *const  mem, unsigned const size, "
+                + "unsigned &oob_count, bool &oob_irq)"
             ]
 
     def pragmas(self):
@@ -305,6 +299,13 @@ class Lookup(HLSCustomOp):
         elif mem_mode == "external":
             my_pragmas.append("#pragma HLS INTERFACE m_axi offset=slave port=mem")
             my_pragmas.append("#pragma HLS INTERFACE s_axilite port=mem bundle=control")
+            my_pragmas.append(
+                "#pragma HLS INTERFACE s_axilite port=size bundle=control"
+            )
+            my_pragmas.append(
+                "#pragma HLS INTERFACE s_axilite port=oob_count bundle=control"
+            )
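+            # oob_irq is a bare wire without handshake, intended to flag
+            # out-of-bounds lookups as an interrupt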
+            my_pragmas.append("#pragma HLS INTERFACE ap_none port=oob_irq")
         else:
             raise Exception("Unrecognized mem_mode: " + mem_mode)
         self.code_gen_dict["$PRAGMAS$"] = my_pragmas
@@ -475,4 +476,5 @@ class Lookup(HLSCustomOp):
         if mem_mode == "external":
             intf_names["axilite"] = ["s_axi_control"]
             intf_names["aximm"] = [("m_axi_gmem", self.get_nodeattr("ext_mem_width"))]
+            intf_names["ap_none"] = ["oob_irq"]
         return intf_names
diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
index 9d2717dc8c65ddb5329816880067b81b10db2c02..72128fda4cfe23db4858fe3ffe80a755733954cc 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
@@ -409,16 +409,16 @@ class MatrixVectorActivation(HLSCustomOp):
         """Returns FINN DataType of weights."""
         return DataType[self.get_nodeattr("weightDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("outputDataType")]
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         i_bits = self.get_input_datatype().bitwidth()
         in_width = i_bits * self.get_nodeattr("SIMD")
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         o_bits = self.get_output_datatype().bitwidth()
         out_width = o_bits * self.get_nodeattr("PE")
         return out_width
@@ -474,7 +474,7 @@ class MatrixVectorActivation(HLSCustomOp):
 
         return folded_input_shape
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         mh = self.get_nodeattr("MH")
         pe = self.get_nodeattr("PE")
         nf = mh // pe
@@ -482,13 +482,13 @@ class MatrixVectorActivation(HLSCustomOp):
         folded_output_shape = tuple(vecs + [nf, pe])
         return folded_output_shape
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         mw = self.get_nodeattr("MW")
         vecs = list(self.get_nodeattr("numInputVectors"))
         normal_input_shape = tuple(vecs + [mw])
         return normal_input_shape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         mh = self.get_nodeattr("MH")
         vecs = list(self.get_nodeattr("numInputVectors"))
         normal_output_shape = tuple(vecs + [mh])
@@ -576,6 +576,10 @@ class MatrixVectorActivation(HLSCustomOp):
 
     def minimize_accumulator_width(self, model):
         weights = model.get_initializer(self.onnx_node.input[1])
+        # the range calculation below inspects the actual weight values; in
+        # binaryXnorMode bipolar weights are stored as binary {0, 1}, so map
+        # them back to {-1, +1} first
+        if self.get_nodeattr("binaryXnorMode"):
+            weights = 2 * weights - 1
         if len(self.onnx_node.input) > 2:
             thresholds = model.get_initializer(self.onnx_node.input[2])
         else:
@@ -702,10 +706,12 @@ class MatrixVectorActivation(HLSCustomOp):
         of weights.
 
         Arguments:
+
         * weights : numpy array with weights to be put into the file
         * weight_file_mode : one of {hls_header, decoupled_verilog_dat,
           decoupled_runtime}
         * weight_file_name : filename for the weight file to be generated
+
         """
         # convert weights into hlslib-compatible format
         weight_tensor = self.get_hls_compatible_weight_tensor(weights)
@@ -1227,17 +1233,6 @@ class MatrixVectorActivation(HLSCustomOp):
         self.code_gen_dict["$PRAGMAS$"].append(
             "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname()
         )
-        in_fifo_depth = self.get_nodeattr("inFIFODepth")
-        out_fifo_depth = self.get_nodeattr("outFIFODepth")
-        # insert depth pragmas only if specified
-        if in_fifo_depth != 0:
-            self.code_gen_dict["$PRAGMAS$"].append(
-                "#pragma HLS stream depth=%d variable=in0" % in_fifo_depth
-            )
-        if out_fifo_depth != 0:
-            self.code_gen_dict["$PRAGMAS$"].append(
-                "#pragma HLS stream depth=%d variable=out" % out_fifo_depth
-            )
         self.code_gen_dict["$PRAGMAS$"].append(
             "#pragma HLS INTERFACE ap_ctrl_none port=return"
         )
@@ -1462,3 +1457,20 @@ class MatrixVectorActivation(HLSCustomOp):
             thres_count = out_features
             ret_dict[thres_param_type] = thres_count
         return ret_dict
+
+    def derive_characteristic_fxns(self, period):
+        n_inps = np.prod(self.get_folded_input_shape()[:-1])
+        io_dict = {
+            "inputs": {
+                "in0": [0 for i in range(n_inps)],
+            },
+            "outputs": {"out": []},
+        }
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode in ["decoupled", "external"]:
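+            # decoupled/external weights arrive on a separate stream input;
+            # feed one dummy word per weight-memory entry, repeated for
+            # every input vector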
+            n_weight_inps = self.calc_wmem()
+            num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
+            io_dict["inputs"]["weights"] = [
+                0 for i in range(num_w_reps * n_weight_inps)
+            ]
+        super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
diff --git a/src/finn/custom_op/fpgadataflow/pool_batch.py b/src/finn/custom_op/fpgadataflow/pool_batch.py
index 3bf187fa9a78ed2c812f042a29079ee1e3163d74..813f13e504eae181f4398eccbe40ad66b6e3bf16 100644
--- a/src/finn/custom_op/fpgadataflow/pool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/pool_batch.py
@@ -42,12 +42,13 @@ class Pool_Batch(HLSCustomOp):
     Output shape (BatchSize,OutImgDim,OutImgDim,Channels)
 
     Notes:
-    # The input shape was chosen to be compatible with im2col (only true when there
-    is not folding).
 
-    # The actual data layout produced by the hlslib kernels is different
-    for depthwise ops.
-     * depthwise SWG: (1, OFMDim, OFMDim, IFMChannels/PE, K, K, PE)
+    * The input shape was chosen to be compatible with im2col (only true when
+      there is no folding).
+    * The actual data layout produced by the hlslib kernels is different
+      for depthwise ops.
+
+        * depthwise SWG: (1, OFMDim, OFMDim, IFMChannels/PE, K, K, PE)
 
     Channels can be folded using PE (SIMD from the input perspective)
     """
@@ -74,11 +75,11 @@ class Pool_Batch(HLSCustomOp):
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("InputDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         fxn = self.get_nodeattr("Function")
         odt = DataType[self.get_nodeattr("OutputDataType")]
@@ -98,7 +99,7 @@ class Pool_Batch(HLSCustomOp):
 
         return odt
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         ifm_ch = self.get_nodeattr("Channels")
         odims = self.get_nodeattr("OutImgDims")
         batch_size = self.get_nodeattr("BatchSize")
@@ -107,7 +108,7 @@ class Pool_Batch(HLSCustomOp):
         ishape = (batch_size, *odims, k_prod * ifm_ch)
         return ishape
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         normal_ishape = list(self.get_normal_input_shape())
         ifm_ch = self.get_nodeattr("Channels")
         pe = self.get_nodeattr("PE")
@@ -116,14 +117,14 @@ class Pool_Batch(HLSCustomOp):
         folded_ishape = normal_ishape[:-1] + [fold, pe]
         return tuple(folded_ishape)
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         ofm_ch = self.get_nodeattr("Channels")
         odims = self.get_nodeattr("OutImgDims")
         batch_size = self.get_nodeattr("BatchSize")
         oshape = (batch_size, *odims, ofm_ch)
         return oshape
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         normal_oshape = list(self.get_normal_output_shape())
         ifm_ch = self.get_nodeattr("Channels")
         pe = self.get_nodeattr("PE")
@@ -147,13 +148,13 @@ class Pool_Batch(HLSCustomOp):
         exp_cycles = ((ifm_ch * k_prod) / pe) * np.prod(odims) * batch_size
         return int(exp_cycles)
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         dt_bits = self.get_input_datatype().bitwidth()
         pe = self.get_nodeattr("PE")
         in_width = int(dt_bits * pe)
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         dt_bits = self.get_output_datatype().bitwidth()
         pe = self.get_nodeattr("PE")
         out_width = int(dt_bits * pe)
diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
index 1e6b72e4d54ede639e797f32f51fb7705ec8ce4b..a80d2bbefac96e8ec2a48e04179d3d285e78cef7 100644
--- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
@@ -60,44 +60,53 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("dataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("dataType")]
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         ishape = self.get_nodeattr("shape")
         return ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         oshape = self.get_nodeattr("shape")
         return oshape
 
     def check_divisible_iowidths(self):
         impl_style = self.get_nodeattr("impl_style")
-        if impl_style == "hls":
-            # when using impl_style = hls must have the following
-            # if inWidth > outWidth: inWidth % outWidth = 0
-            # if inWidth < outWidth: outWidth % inWidth = 0
-            iwidth = self.get_nodeattr("inWidth")
-            owidth = self.get_nodeattr("outWidth")
-            if iwidth > owidth:
-                assert (
-                    iwidth % owidth == 0
-                ), """DWC InWidth is bigger than OutWidth and is not divisible by it.
-                Please adjust PE and SIMD values so that InWidth % OutWidth = 0
-                or alternatively use impl_style = vivado"""
-            else:
-                assert (
-                    owidth % iwidth == 0
-                ), """DWC OutWidth is bigger than InWidth and is not divisible by it.
-                Please adjust PE and SIMD values so that OutWidth % InWidth = 0
-                or alternatively use impl_style = vivado"""
-
-    def get_folded_input_shape(self):
+        iwidth = self.get_nodeattr("inWidth")
+        owidth = self.get_nodeattr("outWidth")
+        if impl_style == "vivado":
+            # the AXIS IP we use in vivado mode only supports
+            # stream widths that are divisible by 8
+            iwidth_d8 = iwidth % 8 == 0
+            owidth_d8 = owidth % 8 == 0
+            assert (
+                iwidth_d8 and owidth_d8
+            ), """DWC impl_style=vivado requires
+            stream widths that are divisible by 8: (%d, %d)""" % (
+                iwidth,
+                owidth,
+            )
+
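+    # when neither stream width divides the other, the HLS variant chains two
+    # converters through an intermediate stream; that stream's width must be
+    # a common multiple of both widths, and the LCM is the smallest choice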
+    def get_iowidth_lcm(self):
+        iwidth = self.get_nodeattr("inWidth")
+        owidth = self.get_nodeattr("outWidth")
+        return int(np.lcm(iwidth, owidth))
+
+    def needs_lcm(self):
+        iwidth = self.get_nodeattr("inWidth")
+        owidth = self.get_nodeattr("outWidth")
+        maxwidth = max(iwidth, owidth)
+        minwidth = min(iwidth, owidth)
+        impl_style = self.get_nodeattr("impl_style")
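+        # e.g. inWidth=24, outWidth=16: neither divides the other, so the
+        # stream must pass 24 -> lcm(24, 16) = 48 -> 16 in two stages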
+        return (impl_style == "hls") and (maxwidth % minwidth != 0)
+
+    def get_folded_input_shape(self, ind=0):
         self.check_divisible_iowidths()
         iwidth = self.get_nodeattr("inWidth")
         ishape = self.get_normal_input_shape()
@@ -117,7 +126,7 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
         dummy_t = dummy_t.reshape(new_shape)
         return dummy_t.shape
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         self.check_divisible_iowidths()
         owidth = self.get_nodeattr("outWidth")
         oshape = self.get_normal_output_shape()
@@ -142,11 +151,11 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
         folded_oshape = self.get_folded_output_shape()
         return np.prod(folded_oshape[:-1])
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         in_width = self.get_nodeattr("inWidth")
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         out_width = self.get_nodeattr("outWidth")
         return out_width
 
@@ -202,6 +211,16 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
             "#define NumInWords %d " % numInWords,
             "#define numReps %d" % numReps,
         ]
+        if self.needs_lcm():
+            lcmWidth = self.get_iowidth_lcm()
+            assert (
+                numInWords % (lcmWidth // inWidth) == 0
+            ), "Error in DWC LCM calculation"
+            numLCMToOut = numInWords // (lcmWidth // inWidth)
+            self.code_gen_dict["$DEFINES$"].append("#define LCMWidth %d" % lcmWidth)
+            self.code_gen_dict["$DEFINES$"].append(
+                "#define NumLCMToOut %d" % (numLCMToOut)
+            )
 
     def read_npy_data(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
@@ -226,6 +245,12 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
             'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
         )
+        if self.needs_lcm():
+            self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+                'hls::stream<ap_uint<{}>> intermediate ("intermediate");'.format(
+                    self.get_iowidth_lcm()
+                )
+            )
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
             'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
         )
@@ -233,9 +258,19 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
     def docompute(self):
         # TODO continue with fxns below, they are copy-pasted
         op = "StreamingDataWidthConverter_Batch"
-        self.code_gen_dict["$DOCOMPUTE$"] = [
-            "%s<InWidth, OutWidth, NumInWords>(in0, out, numReps);" % (op)
-        ]
+        if self.needs_lcm():
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                'hls::stream<ap_uint<{}>> intermediate ("intermediate");'.format(
+                    self.get_iowidth_lcm()
+                ),
+                "%s<InWidth, LCMWidth, NumInWords>(in0, intermediate, numReps);" % (op),
+                "%s<LCMWidth, OutWidth, NumLCMToOut>(intermediate, out, numReps);"
+                % (op),
+            ]
+        else:
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                "%s<InWidth, OutWidth, NumInWords>(in0, out, numReps);" % (op)
+            ]
 
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
@@ -287,6 +322,10 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
         self.code_gen_dict["$PRAGMAS$"].append(
             "#pragma HLS INTERFACE ap_ctrl_none port=return"
         )
+        if self.needs_lcm():
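+            # the two chained converters run concurrently, so the generated
+            # function body is wrapped in an HLS DATAFLOW region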
+            self.code_gen_dict["$PRAGMAS$"].append(
+                "#pragma HLS DATAFLOW disable_start_propagation"
+            )
 
     def execute_node(self, context, graph):
         mode = self.get_nodeattr("exec_mode")
@@ -466,3 +505,28 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
             cset_luts += outw
 
         return int(cnt_luts + cset_luts)
+
+    def prepare_rtlsim(self):
+        assert self.get_nodeattr("impl_style") != "vivado", (
+            "StreamingDataWidthConverter impl_style "
+            "cannot be vivado for rtlsim. Only impl_style=rtl supported."
+        )
+        super().prepare_rtlsim()
+
+    def code_generation_ipgen(self, model, fpgapart, clk):
+        # no codegen required for impl_style=vivado since
+        # that uses premade, configurable AXIS IP
+        if self.get_nodeattr("impl_style") == "hls":
+            super().code_generation_ipgen(model, fpgapart, clk)
+
+    def ipgen_singlenode_code(self):
+        # no IP generation required for impl_style=vivado since
+        # that uses premade, configurable AXIS IP
+        if self.get_nodeattr("impl_style") == "hls":
+            super().ipgen_singlenode_code()
+        else:
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+            # set ipgen_path and ip_path so that HLSSynthIP
+            # and CreatedStitchedIP transformations do not complain
+            self.set_nodeattr("ipgen_path", code_gen_dir)
+            self.set_nodeattr("ip_path", code_gen_dir)
diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py
index a7c3cd0be59db4ba8665f8fba5be72282339b8c8..522305327ff7c5f1356aad4fdf6b9e0a942eca72 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfifo.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py
@@ -46,33 +46,56 @@ class StreamingFIFO(HLSCustomOp):
         self.strm_fifo_wrapper = templates.strm_fifo_wrapper
 
     def get_nodeattr_types(self):
-        my_attrs = {
-            # FIFO depth
-            "depth": ("i", True, 0),
-            # folded shape of input/output
-            "folded_shape": ("ints", True, []),
-            # FINN DataTypes for inputs/outputs
-            "dataType": ("s", True, ""),
-            # Toggle between hls or IPI implementation
-            # rtl - use the hls generated IP during stitching
-            # vivado - use the AXI Infrastructure FIFO
-            "impl_style": ("s", False, "rtl", {"rtl", "vivado"}),
-            # FPGA resource type for FIFOs when impl_style is vivado
-            # auto -- let Vivado decide
-            # block -- use BRAM
-            # distributed -- use LUTRAM
-            # ultra -- use URAM (on UltraScale+)
-            "ram_style": (
-                "s",
-                False,
-                "auto",
-                {"auto", "block", "distributed", "ultra"},
-            ),
-        }
-        my_attrs.update(super().get_nodeattr_types())
+        my_attrs = super().get_nodeattr_types()
+        my_attrs.update(
+            {
+                # FIFO depth
+                "depth": ("i", True, 0),
+                # folded shape of input/output
+                "folded_shape": ("ints", True, []),
+                # FINN DataTypes for inputs/outputs
+                "dataType": ("s", True, ""),
+                # Toggle between hls or IPI implementation
+                # rtl - use the hls generated IP during stitching
+                # vivado - use the AXI Infrastructure FIFO
+                "impl_style": ("s", False, "rtl", {"rtl", "vivado"}),
+                # FPGA resource type for FIFOs when impl_style is vivado
+                # auto -- let Vivado decide
+                # block -- use BRAM
+                # distributed -- use LUTRAM
+                # ultra -- use URAM (on UltraScale+)
+                "ram_style": (
+                    "s",
+                    False,
+                    "auto",
+                    {"auto", "block", "distributed", "ultra"},
+                ),
+                # whether depth monitoring is enabled (impl_style=rtl only)
+                "depth_monitor": ("i", False, 0),
+                # a FIFO node does not require additional FIFOs at its own I/Os
+                "inFIFODepths": ("ints", False, [0]),
+                "outFIFODepths": ("ints", False, [0]),
+            }
+        )
 
         return my_attrs
 
+    def get_adjusted_depth(self):
+        impl = self.get_nodeattr("impl_style")
+        depth = self.get_nodeattr("depth")
+        if impl == "vivado":
+            old_depth = depth
+            # round up depth to nearest power-of-2
+            # Vivado FIFO impl may fail otherwise
+            depth = (1 << (depth - 1).bit_length()) if impl == "vivado" else depth
+            if old_depth != depth:
+                warnings.warn(
+                    "%s: rounding-up FIFO depth from %d to %d for impl_style=vivado"
+                    % (self.onnx_node.name, old_depth, depth)
+                )
+
+        return depth
+
     def make_shape_compatible_op(self, model):
         exp_ishape = self.get_normal_input_shape()
         oshape = self.get_normal_output_shape()
@@ -97,6 +120,14 @@ class StreamingFIFO(HLSCustomOp):
     def verify_node(self):
         pass
 
+    def get_verilog_top_module_intf_names(self):
+        ret = super().get_verilog_top_module_intf_names()
+        is_rtl = self.get_nodeattr("impl_style") == "rtl"
+        is_depth_monitor = self.get_nodeattr("depth_monitor") == 1
+        if is_rtl and is_depth_monitor:
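+            # the maxcount pin reports the highest fill level the FIFO has
+            # reached, which helps right-size FIFO depths after rtlsim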
+            ret["ap_none"] = ["maxcount"]
+        return ret
+
     def get_verilog_top_module_name(self):
         "Return the Verilog top module name for this node."
 
@@ -180,10 +211,8 @@ class StreamingFIFO(HLSCustomOp):
         self.set_nodeattr("ip_vlnv", vlnv)
         self.code_gen_dict.clear()
 
-    def get_normal_input_shape(self):
-        depth = self.get_nodeattr("depth")
-        # depth has to be between 2 and 256 with the current
-        # StreamingFIFO implementation
+    def get_normal_input_shape(self, ind=0):
+        depth = self.get_adjusted_depth()
         assert depth >= 2, """Depth is too low"""
         if depth > 256 and self.get_nodeattr("impl_style") == "rtl":
             warnings.warn(
@@ -211,27 +240,33 @@ class StreamingFIFO(HLSCustomOp):
 
         return normal_ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         return self.get_normal_input_shape()
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         return self.get_nodeattr("folded_shape")
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         return self.get_nodeattr("folded_shape")
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         dtype = DataType[self.get_nodeattr("dataType")]
         folded_shape = self.get_nodeattr("folded_shape")
         in_width = folded_shape[-1] * dtype.bitwidth()
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         dtype = DataType[self.get_nodeattr("dataType")]
         folded_shape = self.get_nodeattr("folded_shape")
         in_width = folded_shape[-1] * dtype.bitwidth()
         return in_width
 
+    def get_input_datatype(self, ind=0):
+        return DataType[self.get_nodeattr("dataType")]
+
+    def get_output_datatype(self, ind=0):
+        return DataType[self.get_nodeattr("dataType")]
+
     def execute_node(self, context, graph):
         mode = self.get_nodeattr("exec_mode")
         node = self.onnx_node
@@ -328,7 +363,7 @@ class StreamingFIFO(HLSCustomOp):
         elif impl_style == "vivado":
             cmd = []
             node_name = self.onnx_node.name
-            depth = self.get_nodeattr("depth")
+            depth = self.get_adjusted_depth()
             ram_style = self.get_nodeattr("ram_style")
             # create a hierarchy for this layer, with the same port names
             clk_name = self.get_verilog_top_module_intf_names()["clk"][0]
@@ -393,7 +428,7 @@ class StreamingFIFO(HLSCustomOp):
         """Calculates resource estimation for BRAM"""
         impl = self.get_nodeattr("impl_style")
         ram_type = self.get_nodeattr("ram_style")
-        depth = self.get_nodeattr("depth")
+        depth = self.get_adjusted_depth()
         W = self.get_instream_width()
 
         if impl == "rtl" or (impl == "vivado" and ram_type != "block"):
@@ -418,7 +453,7 @@ class StreamingFIFO(HLSCustomOp):
 
         impl = self.get_nodeattr("impl_style")
         ram_type = self.get_nodeattr("ram_style")
-        depth = self.get_nodeattr("depth")
+        depth = self.get_adjusted_depth()
         W = self.get_instream_width()
 
         if impl == "rtl" or (impl == "vivado" and ram_type != "ultra"):
@@ -428,7 +463,7 @@ class StreamingFIFO(HLSCustomOp):
             return (math.ceil(depth / 4096)) * (math.ceil(W / 72))
 
     def bram_efficiency_estimation(self):
-        depth = self.get_nodeattr("depth")
+        depth = self.get_adjusted_depth()
         W = self.get_instream_width()
         bram16_est = self.bram_estimation()
         if bram16_est == 0:
@@ -441,7 +476,7 @@ class StreamingFIFO(HLSCustomOp):
         """Calculates resource estimations for LUTs"""
         impl = self.get_nodeattr("impl_style")
         ram_type = self.get_nodeattr("ram_style")
-        depth = self.get_nodeattr("depth")
+        depth = self.get_adjusted_depth()
         W = self.get_instream_width()
 
         address_luts = 2 * math.ceil(math.log(depth, 2))
diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
index 882b40a0aaf542e6dcaf427ca3567ae78394ede5..a0e60931edd8590aaebc0560c4bd28d61d62e8ea 100755
--- a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
@@ -57,11 +57,11 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("dataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("dataType")]
 
@@ -82,13 +82,13 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized()
         return (ifm_dim[0] == 1) and (k[0] == 1)
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim")
         ifm_ch = self.get_nodeattr("NumChannels")
         ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch)
         return ishape
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim")
         ifm_ch = self.get_nodeattr("NumChannels")
         pe = self.get_nodeattr("PE")
@@ -99,7 +99,7 @@ class StreamingMaxPool_Batch(HLSCustomOp):
             folded_ishape = (1, ifm_dim_h, ifm_dim_w, 1, ifm_ch)
         return folded_ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim")
         k_h, k_w = tuple(self.get_nodeattr("PoolDim"))
         ifm_ch = self.get_nodeattr("NumChannels")
@@ -116,7 +116,7 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         oshape = (1, ofm_dim_h, ofm_dim_w, ifm_ch)
         return oshape
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         # even though there is no folding in the current hlslib op,
         # insert a time multiplexing axis to remain compatible with the
         # shapes produced by the rest of the dataflow pipeline
@@ -155,7 +155,7 @@ class StreamingMaxPool_Batch(HLSCustomOp):
             # TODO: adjust inaccurate formula
             return int(ifm_dim[1] * ifm_dim[1] * (1 + 1 / (k[1] * k[1])))
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         dt_bits = self.get_input_datatype().bitwidth()
         pe = self.get_nodeattr("PE")
         ifm_ch = self.get_nodeattr("NumChannels")
@@ -165,7 +165,7 @@ class StreamingMaxPool_Batch(HLSCustomOp):
             in_width = int(dt_bits * ifm_ch)
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         """For streaming maxpool out stream width is the same as in stream width"""
         return self.get_instream_width()
 
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index e73fa9bb2872d4a5023afb0c4e6953b4e6866b8d..c7bbc3f139b64f57943b2b099083a9611951e9c4 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -319,6 +319,7 @@ module $TOPNAME$(
 ap_clk,
 ap_rst_n,
 count,
+maxcount,
 in0_$HLS_SNAME$_TDATA,
 in0_$HLS_SNAME$_TVALID,
 in0_$HLS_SNAME$_TREADY,
@@ -330,6 +331,7 @@ out_$HLS_SNAME$_TREADY
 input   ap_clk;
 input   ap_rst_n;
 output $COUNT_RANGE$ count;
+output $COUNT_RANGE$ maxcount;
 input  $IN_RANGE$ in0_$HLS_SNAME$_TDATA;
 input   in0_$HLS_SNAME$_TVALID;
 output   in0_$HLS_SNAME$_TREADY;
@@ -346,6 +348,7 @@ $LAYER_NAME$
  .clock(ap_clk),
  .reset(!ap_rst_n),
  .count(count),
+ .maxcount(maxcount),
  .i_d(in0_$HLS_SNAME$_TDATA),
  .i_v(in0_$HLS_SNAME$_TVALID),
  .i_r(in0_$HLS_SNAME$_TREADY),
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
index 5383cc1f4bdf9eb88c7d7bd69c25231282f11c6f..d9745acf63c4685b3369ac379abde0a6c5a3f157 100644
--- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py
@@ -75,9 +75,6 @@ class Thresholding_Batch(HLSCustomOp):
             "inputDataType": ("s", True, ""),
             "weightDataType": ("s", True, ""),
             "outputDataType": ("s", True, ""),
-            # input and output FIFO depths
-            "inFIFODepth": ("i", False, 0),
-            "outFIFODepth": ("i", False, 0),
             # number of input vectors, examples:
             # [1] is a single vector (like a FC layer with batch=1)
             # [4] is four vectors (like a FC layer with batch=4)
@@ -185,11 +182,11 @@ class Thresholding_Batch(HLSCustomOp):
         # total cost
         return comparator_cost + lutram_cost
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("outputDataType")]
 
@@ -221,11 +218,11 @@ class Thresholding_Batch(HLSCustomOp):
         self.set_nodeattr("weightDataType", tdt.name)
         return DataType[self.get_nodeattr("weightDataType")]
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         i_bits = self.get_input_datatype().bitwidth()
         return i_bits * self.get_nodeattr("PE")
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         o_bits = self.get_output_datatype().bitwidth()
         return o_bits * self.get_nodeattr("PE")
 
@@ -251,7 +248,7 @@ class Thresholding_Batch(HLSCustomOp):
         weightstream = self.get_weightstream_width()
         return max([weightstream, temp_value])
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         ich = self.get_nodeattr("NumChannels")
         pe = self.get_nodeattr("PE")
         fold = ich // pe
@@ -259,17 +256,17 @@ class Thresholding_Batch(HLSCustomOp):
         folded_input_shape = tuple(vecs + [fold, pe])
         return folded_input_shape
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         # same shape as input
         return self.get_folded_input_shape()
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         ich = self.get_nodeattr("NumChannels")
         vecs = list(self.get_nodeattr("numInputVectors"))
         normal_input_shape = tuple(vecs + [ich])
         return normal_input_shape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         # same shape as input
         return self.get_normal_input_shape()
 
@@ -357,10 +354,12 @@ class Thresholding_Batch(HLSCustomOp):
         run-time reconfig of weights.
 
         Arguments:
+
         * weights : numpy array with weights to be put into the file
         * weight_file_mode : one of {hls_header, decoupled_verilog_dat,
           decoupled_runtime}
         * weight_file_name : filename for the weight file to be generated
+
         """
         threshold_tensor = self.get_hls_compatible_threshold_tensor(weights)
         tdt = self.get_weight_datatype()
@@ -603,13 +602,17 @@ class Thresholding_Batch(HLSCustomOp):
 
     # TODO check and add whatever missing
     def defines(self, var):
+        numReps = 1
         numInputVectors = list(self.get_nodeattr("numInputVectors"))
-        numReps = int(np.prod(numInputVectors))
+        total_spatial_size = int(np.prod(numInputVectors))
+
         self.code_gen_dict["$DEFINES$"] = [
-            """#define NumChannels1 {}\n #define PE1 {}\n #define numReps {}""".format(
+            """#define NumChannels1 {}\n #define PE1 {}\n #define numReps {}\n
+               #define ImgDim1 {}""".format(
                 self.get_nodeattr("NumChannels"),
                 self.get_nodeattr("PE"),
                 numReps,
+                total_spatial_size,
             )
         ]
         if self.get_nodeattr("mem_mode") == "decoupled":
@@ -650,7 +653,7 @@ class Thresholding_Batch(HLSCustomOp):
             npy_in = "%s/thresholds.npy" % code_gen_dir
 
             self.code_gen_dict["$READNPYDATA$"].append(
-                'npy2apintstream<%s, %s, %d, %s>("%s", weights, false, numReps);'
+                'npy2apintstream<%s, %s, %d, %s>("%s", weights, false, ImgDim1);'
                 % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
             )
 
@@ -672,18 +675,13 @@ class Thresholding_Batch(HLSCustomOp):
 
     def docompute(self):
         tmpl_args = self.get_template_param_values()
-        # TODO: why put some template parameters into defines and not others?
-        # should ImgDim be defined or just filled in here like we do now?
         node = self.onnx_node
-        inp_vecs = self.get_nodeattr("numInputVectors")
-        total_spatial_size = int(np.prod(inp_vecs))
         mem_mode = self.get_nodeattr("mem_mode")
         if mem_mode == "const":
             self.code_gen_dict["$DOCOMPUTE$"] = [
-                """{}<{}, NumChannels1, PE1, {}, {}>
+                """{}<ImgDim1, NumChannels1, PE1, {}, {}>
                 (in0, out, threshs, numReps);""".format(
                     node.op_type,
-                    total_spatial_size,
                     tmpl_args["TSrcI"],
                     tmpl_args["TDstI"],
                 )
@@ -693,10 +691,9 @@ class Thresholding_Batch(HLSCustomOp):
             # - for cppsim the repetition comes from the threshold stream reader+input
             # - for synth the unit runs continuously anyway (ap_ctrl_none)
             self.code_gen_dict["$DOCOMPUTE$"] = [
-                """{}<{}, NumChannels1, PE1, {}, {}, ActVal1, ThresType1, NumSteps1>
-                (in0, out, weights, 1);""".format(
+                """{}<ImgDim1, NumChannels1, PE1, {}, {}, ActVal1, ThresType1, NumSteps1>
+                (in0, out, weights, numReps);""".format(
                     "Thresholding_Stream_Batch",
-                    total_spatial_size,
                     tmpl_args["TSrcI"],
                     tmpl_args["TDstI"],
                 )
@@ -960,3 +957,20 @@ class Thresholding_Batch(HLSCustomOp):
         "Return a list of extra tcl directives for HLS synthesis."
 
         return ["config_compile -pipeline_style frp"]
+
+    def derive_characteristic_fxns(self, period):
+        n_inps = np.prod(self.get_folded_input_shape()[:-1])
+        io_dict = {
+            "inputs": {
+                "in0": [0 for i in range(n_inps)],
+            },
+            "outputs": {"out": []},
+        }
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode in ["decoupled", "external"]:
+            n_weight_inps = self.calc_tmem()
+            num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
+            io_dict["inputs"]["weights"] = [
+                0 for i in range(num_w_reps * n_weight_inps)
+            ]
+        super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
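The new `derive_characteristic_fxns` override feeds the base-class rtlsim characterization with a zero-valued stimulus dict. A hedged sketch of the dict it builds, assuming a folded input shape of (1, 49, 8, 8), mem_mode="decoupled", calc_tmem()=8 and numInputVectors=[1, 7, 7] (all values illustrative):

```python
import numpy as np

# Sketch only: mirrors the io_dict constructed above for assumed shapes.
folded_ishape = (1, 49, 8, 8)
n_inps = int(np.prod(folded_ishape[:-1]))   # 392 input transactions
io_dict = {
    "inputs": {"in0": [0] * n_inps},
    "outputs": {"out": []},
}
# decoupled/external weights add a dummy threshold stream: TMEM words
# repeated once per input vector
num_w_reps = int(np.prod([1, 7, 7]))        # assumed numInputVectors
io_dict["inputs"]["weights"] = [0] * (num_w_reps * 8)  # calc_tmem() == 8 assumed
```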
diff --git a/src/finn/custom_op/fpgadataflow/tlastmarker.py b/src/finn/custom_op/fpgadataflow/tlastmarker.py
index 7386aa7e6311754b653e94f8d2e9b2a910a1370b..1bd32442a1986d6a86571e85a09322d6c15d8a78 100644
--- a/src/finn/custom_op/fpgadataflow/tlastmarker.py
+++ b/src/finn/custom_op/fpgadataflow/tlastmarker.py
@@ -218,21 +218,21 @@ class TLastMarker(HLSCustomOp):
     def get_number_output_values(self):
         return self.get_nodeattr("NumIters")
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         stream_width = self.get_nodeattr("StreamWidth")
         elem_width = self.get_nodeattr("ElemWidth")
         n_packed_elems = stream_width // elem_width
         n_iters = self.get_nodeattr("NumIters")
         return (1, n_iters, n_packed_elems)
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         return self.get_folded_input_shape()
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         stream_width = self.get_nodeattr("StreamWidth")
         return stream_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         stream_width = self.get_nodeattr("StreamWidth")
         return stream_width
 
diff --git a/src/finn/custom_op/fpgadataflow/upsampler.py b/src/finn/custom_op/fpgadataflow/upsampler.py
index b62e4f2f6784e8964232efcc9971f0b8bc35ac5d..a018fd35aac4d63b365e97464dab0fd4a5fa13f2 100644
--- a/src/finn/custom_op/fpgadataflow/upsampler.py
+++ b/src/finn/custom_op/fpgadataflow/upsampler.py
@@ -27,7 +27,6 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import numpy as np
-import os
 import warnings
 from qonnx.core.datatype import DataType
 
@@ -57,6 +56,8 @@ class UpsampleNearestNeighbour_Batch(HLSCustomOp):
             "inputDataType": ("s", True, ""),
             # Batch size
             "numInputVectors": ("i", False, 1),
+            # Dimensionality mode: 0 = 2D square, 1 = 1D in H dim
+            "DimMode": ("i", False, 0),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -64,28 +65,41 @@ class UpsampleNearestNeighbour_Batch(HLSCustomOp):
     def get_exp_cycles(self):
         OFMDim = self.get_nodeattr("OFMDim")
         batch_size = self.get_nodeattr("numInputVectors")
-        exp_cycles = OFMDim * OFMDim * batch_size
+        is_2d = self.get_nodeattr("DimMode") == 0
+        reps = 1
+        if is_2d:
+            OFMDim = OFMDim * OFMDim
+            reps = batch_size
+        exp_cycles = OFMDim * reps
         return int(exp_cycles)
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         IFMDim = self.get_nodeattr("IFMDim")
         num_ch = self.get_nodeattr("NumChannels")
         batch = self.get_nodeattr("numInputVectors")
-        ishape = (batch, IFMDim, IFMDim, num_ch)
+        is_2d = self.get_nodeattr("DimMode") == 0
+        if is_2d:
+            ishape = (batch, IFMDim, IFMDim, num_ch)
+        else:
+            ishape = (batch, IFMDim, 1, num_ch)
         return ishape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         OFMDim = self.get_nodeattr("OFMDim")
         num_ch = self.get_nodeattr("NumChannels")
         batch = self.get_nodeattr("numInputVectors")
-        oshape = (batch, OFMDim, OFMDim, num_ch)
+        is_2d = self.get_nodeattr("DimMode") == 0
+        if is_2d:
+            oshape = (batch, OFMDim, OFMDim, num_ch)
+        else:
+            oshape = (batch, OFMDim, 1, num_ch)
         return oshape
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         normal_ishape = list(self.get_normal_input_shape())
         return tuple(normal_ishape)
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         normal_oshape = list(self.get_normal_output_shape())
         return tuple(normal_oshape)
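The new `DimMode` attribute switches the upsampler between 2D square and 1D (H-dimension) operation; a small sketch of the shape and cycle arithmetic above, under assumed values:

```python
# Sketch only: shape selection for the two DimMode settings.
def normal_oshape(ofm_dim, num_ch, batch, dim_mode):
    # DimMode 0: 2D square output; DimMode 1: 1D along H, W pinned to 1
    if dim_mode == 0:
        return (batch, ofm_dim, ofm_dim, num_ch)
    return (batch, ofm_dim, 1, num_ch)

assert normal_oshape(16, 4, 1, 0) == (1, 16, 16, 4)  # 2D, assumed values
assert normal_oshape(16, 4, 1, 1) == (1, 16, 1, 4)   # 1D, assumed values
# exp_cycles mirrors this split: OFMDim*OFMDim*batch in 2D mode, OFMDim in 1D.
```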
 
@@ -115,21 +129,21 @@ class UpsampleNearestNeighbour_Batch(HLSCustomOp):
     def verify_node(self):
         pass
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         ret = DataType[self.get_nodeattr("inputDataType")]
         return ret
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output. (Same as input datatype)"""
         return self.get_input_datatype()
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         ibits = self.get_input_datatype().bitwidth()
         ifm_ch = self.get_nodeattr("NumChannels")
         return ibits * ifm_ch
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         obits = self.get_output_datatype().bitwidth()
         ifm_ch = self.get_nodeattr("NumChannels")
         return obits * ifm_ch
@@ -187,10 +201,19 @@ class UpsampleNearestNeighbour_Batch(HLSCustomOp):
         )
 
     def docompute(self):
-        self.code_gen_dict["$DOCOMPUTE$"] = [
-            """UpsampleNearestNeighbour_Batch<OFMDim, IFMDim, IFMChannels,
-            ap_uint<Input_precision> > (in0, out, numReps);"""
-        ]
+        is_2d = self.get_nodeattr("DimMode") == 0
+        batch = self.get_nodeattr("numInputVectors")
+        if is_2d:
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """UpsampleNearestNeighbour_Batch<OFMDim, IFMDim, IFMChannels,
+                ap_uint<Input_precision> > (in0, out, numReps);"""
+            ]
+        else:
+            assert batch == 1, "1D upsampler currently needs numReps=1"
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """UpsampleNearestNeighbour_1D<OFMDim, IFMDim, IFMChannels,
+                ap_uint<Input_precision> > (in0, out);"""
+            ]
 
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
@@ -246,7 +269,6 @@ class UpsampleNearestNeighbour_Batch(HLSCustomOp):
         node = self.onnx_node
         exp_ishape = self.get_normal_input_shape()
         exp_oshape = self.get_normal_output_shape()
-        folded_ishape = self.get_folded_input_shape()
         folded_oshape = self.get_folded_output_shape()
 
         if mode == "cppsim":
@@ -268,9 +290,7 @@ class UpsampleNearestNeighbour_Batch(HLSCustomOp):
         ), """Input shape doesn't
         match expected shape (numInputVectors, ImgDim, ImgDim, NumChannels)."""
         export_idt = self.get_input_datatype()
-
-        reshaped_input = inp.reshape(folded_ishape)
-        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+        self.dynamic_input_to_npy(context, 1, target_dir=code_gen_dir)
 
         if mode == "cppsim":
             # execute the precompiled model
diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
index 27b23dd32835c265759a8cabfd2a3412844077ca..d5e29ca22acf89440c3c3a66101bec89d4a66d46 100644
--- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
+++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
@@ -29,6 +29,7 @@
 import math
 import numpy as np
 import os
+import textwrap
 import warnings
 from qonnx.core.datatype import DataType
 from qonnx.util.basic import (
@@ -41,6 +42,7 @@ from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
 from finn.util.data_packing import (
     npy_to_rtlsim_input,
     numpy_to_hls_code,
+    pack_innermost_dim_as_hex_string,
     rtlsim_output_to_npy,
 )
 
@@ -67,6 +69,36 @@ class VectorVectorActivation(HLSCustomOp):
             "accDataType": ("s", False, "INT32"),
             # no-activation mode (produce accumulators)
             "noActivation": ("i", False, 0, {0, 1}),
+            # memory mode for the layer weights
+            # const -- embedded weights, default, long compile/synth times
+            # decoupled -- streaming weights with weight streamer packaged inside IP
+            # external -- streaming weights with external streamer
+            "mem_mode": ("s", False, "const", {"const", "decoupled", "external"}),
+            # (mem_mode = decoupled only) whether weights will be writable through
+            # an AXI-lite interface during runtime
+            # 1 for enabled, 0 for disabled.
+            # see finn-rtllib/memstream/doc/README for more about the memory
+            # address map used for writable weights
+            # IMPORTANT: After using AXI lite to either read or write the weights,
+            # always "flush" the accelerator by first passing a dummy input
+            # vector through the accelerator. This will get rid of any old
+            # weight data from the weight FIFOs.
+            "runtime_writeable_weights": ("i", False, 0, {0, 1}),
+            # FPGA resource type for memories in decoupled mode
+            # auto -- let Vivado decide
+            # block -- use BRAM
+            # distributed -- use LUTRAM
+            # ultra -- use UltraRAM (URAM), must have runtime_writeable_weights=1
+            # see also https://www.xilinx.com/support/answers/38070.html
+            "ram_style": (
+                "s",
+                False,
+                "auto",
+                {"auto", "block", "distributed", "ultra"},
+            ),
+            # use xnor-popcount for binary weights/inputs, thus treating them
+            # as bipolar
+            "binaryXnorMode": ("i", False, 0, {0, 1}),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
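These attributes bring the VVAU in line with the MVAU's weight-memory options. A hedged usage sketch, where `vvau_node` stands for a hypothetical ONNX node of this op type:

```python
from qonnx.custom_op.registry import getCustomOp  # import path assumed

vvau = getCustomOp(vvau_node)  # vvau_node: hypothetical VectorVectorActivation node
vvau.set_nodeattr("mem_mode", "decoupled")          # stream weights from memstream IP
vvau.set_nodeattr("ram_style", "ultra")             # URAM for the weight streamer
vvau.set_nodeattr("runtime_writeable_weights", 1)   # mandatory with ram_style="ultra"
```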
@@ -176,7 +208,7 @@ class VectorVectorActivation(HLSCustomOp):
     def verify_node(self):
         pass
 
-    def get_input_datatype(self):
+    def get_input_datatype(self, ind=0):
         """Returns FINN DataType of input."""
         return DataType[self.get_nodeattr("inputDataType")]
 
@@ -184,31 +216,40 @@ class VectorVectorActivation(HLSCustomOp):
         """Returns FINN DataType of weights."""
         return DataType[self.get_nodeattr("weightDataType")]
 
-    def get_output_datatype(self):
+    def get_output_datatype(self, ind=0):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("outputDataType")]
 
-    def get_instream_width(self):
+    def get_instream_width(self, ind=0):
         i_bits = self.get_input_datatype().bitwidth()
         in_width = i_bits * self.get_nodeattr("PE")
         return in_width
 
-    def get_outstream_width(self):
+    def get_outstream_width(self, ind=0):
         o_bits = self.get_output_datatype().bitwidth()
         out_width = o_bits * self.get_nodeattr("PE")
         return out_width
 
-    def get_folded_input_shape(self):
+    def get_folded_input_shape(self, ind=0):
         k_h, k_w = self.get_nodeattr("Kernel")
         sf = k_h * k_w
         dim_h, dim_w = self.get_nodeattr("Dim")
         ch = self.get_nodeattr("Channels")
         pe = self.get_nodeattr("PE")
         nf = ch // pe
-        folded_input_shape = tuple([1, dim_h, dim_w, sf * nf, pe])
+
+        if ind == 0:
+            # calculate shape of input 0
+            folded_input_shape = tuple([1, dim_h, dim_w, sf * nf, pe])
+        elif ind == 1 and self.get_nodeattr("mem_mode") == "external":
+            # calculate shape of input 1 (weights)
+            folded_input_shape = tuple([1, sf * nf, pe])
+        else:
+            raise Exception("Undefined input shape for requested input")
+
         return folded_input_shape
 
-    def get_folded_output_shape(self):
+    def get_folded_output_shape(self, ind=0):
         ch = self.get_nodeattr("Channels")
         pe = self.get_nodeattr("PE")
         nf = ch // pe
@@ -216,14 +257,14 @@ class VectorVectorActivation(HLSCustomOp):
         folded_output_shape = tuple([1, dim_h, dim_w, nf, pe])
         return folded_output_shape
 
-    def get_normal_input_shape(self):
+    def get_normal_input_shape(self, ind=0):
         dim_h, dim_w = self.get_nodeattr("Dim")
         ch = self.get_nodeattr("Channels")
         k_h, k_w = self.get_nodeattr("Kernel")
         normal_input_shape = tuple([1, dim_h, dim_w, k_h * k_w * ch])
         return normal_input_shape
 
-    def get_normal_output_shape(self):
+    def get_normal_output_shape(self, ind=0):
         ch = self.get_nodeattr("Channels")
         dim_h, dim_w = self.get_nodeattr("Dim")
         normal_output_shape = tuple([1, dim_h, dim_w, ch])
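`get_folded_input_shape` is now indexed: `ind=0` returns the activation stream shape, `ind=1` the weight stream shape (external mode only). A sketch with assumed parameters:

```python
# Sketch only: assumed Kernel=(3,3), Dim=(8,8), Channels=32, PE=4,
# mem_mode="external".
k_h, k_w = 3, 3
dim_h, dim_w = 8, 8
ch, pe = 32, 4
sf, nf = k_h * k_w, ch // pe

in0_folded = (1, dim_h, dim_w, sf * nf, pe)  # ind=0: (1, 8, 8, 72, 4)
w_folded = (1, sf * nf, pe)                  # ind=1: (1, 72, 4), external only
# Any other ind, or ind=1 without external mode, raises an Exception above.
```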
@@ -251,13 +292,31 @@ class VectorVectorActivation(HLSCustomOp):
         ret = dict()
         inp_hls_str = self.get_input_datatype().get_hls_datatype_str()
         out_hls_str = self.get_output_datatype().get_hls_datatype_str()
+        inp_is_binary = self.get_input_datatype() == DataType["BINARY"]
+        # out_is_binary = self.get_output_datatype() == DataType["BINARY"]
+        wt_is_binary = self.get_weight_datatype() == DataType["BINARY"]
+        bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1
+        if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode):
+            raise Exception("True binary (non-bipolar) inputs not yet supported")
         inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"]
+        # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"]
         wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"]
+        # reinterpret inp/wt as bipolar if bin_xnor_mode is set
+        inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode)
+        wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode)
         # fill in TSrcI and TWeightI
-        # TODO handle bipolar inputs
-        if inp_is_bipolar or wt_is_bipolar:
-            raise Exception("VVAU node doesn't support bipolar values yet.")
-        else:
+        # TODO double-check template parameter selection for all dtype combos
+        # TODO handle non-bipolar binary inputs
+        if inp_is_bipolar and wt_is_bipolar:
+            ret["TSrcI"] = "Recast<XnorMul>"
+            ret["TWeightI"] = "Identity"
+        elif (not inp_is_bipolar) and wt_is_bipolar:
+            ret["TSrcI"] = "Slice<%s>" % inp_hls_str
+            ret["TWeightI"] = "Recast<Binary>"
+        elif inp_is_bipolar and (not wt_is_bipolar):
+            ret["TSrcI"] = "Recast<Binary>"
+            ret["TWeightI"] = "Identity"
+        elif (not inp_is_bipolar) and (not wt_is_bipolar):
             ret["TSrcI"] = "Slice<%s>" % inp_hls_str
             ret["TWeightI"] = "Identity"
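The four-way branch above picks hlslib template arguments from the (input, weight) bipolarity pair. The same selection condensed into a lookup table, as a sketch for illustration only:

```python
# Sketch only: condenses the TSrcI/TWeightI branch above into a table.
def select_tmpl_args(inp_is_bipolar, wt_is_bipolar, inp_hls_str):
    table = {
        (True, True): ("Recast<XnorMul>", "Identity"),
        (False, True): ("Slice<%s>" % inp_hls_str, "Recast<Binary>"),
        (True, False): ("Recast<Binary>", "Identity"),
        (False, False): ("Slice<%s>" % inp_hls_str, "Identity"),
    }
    tsrc, twt = table[(inp_is_bipolar, wt_is_bipolar)]
    return {"TSrcI": tsrc, "TWeightI": twt}

assert select_tmpl_args(True, True, "ap_int<4>")["TSrcI"] == "Recast<XnorMul>"
```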
 
@@ -286,6 +345,13 @@ class VectorVectorActivation(HLSCustomOp):
         return ret
 
     def get_hls_compatible_threshold_tensor(self, orig_thres_matrix):
+        """Convert the original numpy threshold matrix orig_thres_matrix into
+        a form suitable for passing to the hlslib call:
+        * ensure Channels % PE == 0
+        * for bipolar weights & inputs, ensure thresholds are positive
+        * interleave rows between PEs
+        * reshape into (1, PE, TMEM, n_thres_steps) and return
+        """
         ch = self.get_nodeattr("Channels")
         pe = self.get_nodeattr("PE")
         tmem = self.calc_tmem()
@@ -295,14 +361,33 @@ class VectorVectorActivation(HLSCustomOp):
         ), """Threshold matrix dimension is
         not as expected (2)."""
         n_thres_steps = orig_thres_matrix.shape[1]
+        inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"]
+        wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"]
+        # reinterpret inp/wt as bipolar if bin_xnor_mode is set
+        inp_is_binary = self.get_input_datatype() == DataType["BINARY"]
+        wt_is_binary = self.get_weight_datatype() == DataType["BINARY"]
+        bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1
+        inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode)
+        wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode)
+        if inp_is_bipolar and wt_is_bipolar:
+            # ensure all thresholds are nonnegative
+            assert (orig_thres_matrix >= 0).all()
+            # ensure all thresholds are integer
+            assert (orig_thres_matrix.astype(np.int32) == orig_thres_matrix).all()
         ret = orig_thres_matrix
         # workaround for vivado_hls threshold bug
-        if ret[0][0] == 0:
+        if ret[0][0] == 0 and n_thres_steps == 1:
             ret = np.copy(ret)
             ret[0][0] = 1
             warnings.warn(
                 "Setting 0-valued first threshold to 1 to avoid vivado_hls bug"
             )
+        # ensure one threshold row per channel, duplicating if necessary
+        if ret.shape[0] == 1:
+            ret = np.tile(ret, (ch, 1))
+        assert (
+            ret.shape[0] == ch
+        ), "Channels of threshold matrix are not as expected (ch)"
         # distribute rows between PEs
         ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
         assert (
@@ -319,43 +404,175 @@ class VectorVectorActivation(HLSCustomOp):
         rows between PEs is not as expected (n_thres_steps)"""
         return ret.reshape(1, pe, tmem, n_thres_steps)
 
-    def generate_params(self, model, path):
-        # weights
-        weights = model.get_initializer(self.onnx_node.input[1])
-        # convert weights into hlslib-compatible format
-        weight_tensor = self.get_hls_compatible_weight_tensor(weights)
-        wdt = self.get_weight_datatype()
-        code_gen_dir = path
+    def make_weight_file(self, weights, weight_file_mode, weight_file_name):
+        """Produce a file containing given weights in appropriate format for this
+        layer. This file can be used for either synthesis or run-time reconfig
+        of weights.
 
-        """Saves weights into params.h"""
-        weight_hls_code = numpy_to_hls_code(weight_tensor, wdt, "weights", True, True)
-        # write weights into params.h
-        f_weights = open("{}/params.h".format(code_gen_dir), "w")
+        Arguments:
 
-        if wdt.bitwidth() != 1:
-            f_weights.write(
-                "const FixedPointWeights<1,{},{},{}> weights = ".format(
-                    wdt.get_hls_datatype_str(),
-                    self.get_nodeattr("PE"),
-                    self.calc_wmem(),
+        * weights : numpy array with weights to be put into the file
+        * weight_file_mode : one of {hls_header, decoupled_verilog_dat,
+          decoupled_runtime}
+        * weight_file_name : filename for the weight file to be generated
+
+        """
+        # convert weights into hlslib-compatible format
+        weight_tensor = self.get_hls_compatible_weight_tensor(weights)
+        export_wdt = self.get_weight_datatype()
+        # we have converted bipolar weights to binary for export,
+        # so use it as such for weight generation
+        if self.get_weight_datatype() == DataType["BIPOLAR"]:
+            export_wdt = DataType["BINARY"]
+        if weight_file_mode == "hls_header":
+            weight_hls_code = numpy_to_hls_code(
+                weight_tensor, export_wdt, "weights", True, True
+            )
+            # write weights into C++ header file as dictated by finn-hlslib
+            f_weights = open(weight_file_name, "w")
+            if export_wdt.bitwidth() != 1:
+                f_weights.write(
+                    "const FixedPointWeights<1,{},{},{}> weights = ".format(
+                        export_wdt.get_hls_datatype_str(),
+                        self.get_nodeattr("PE"),
+                        self.calc_wmem(),
+                    )
+                )
+            else:
+                f_weights.write(
+                    "const BinaryWeights<1,{},{}> weights = ".format(
+                        self.get_nodeattr("PE"),
+                        self.calc_wmem(),
+                    )
                 )
+            f_weights.write(weight_hls_code)
+            f_weights.close()
+        elif "decoupled" in weight_file_mode:
+            # create a weight stream for various flavors of decoupled mode:
+            # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD)
+            weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3))
+            # reverse SIMD flip for saving weights in .npy
+            weight_tensor_simd_flipped = np.flip(weight_tensor_unflipped, axis=-1)
+            # PE flip for saving weights in .dat
+            weight_tensor_pe_flipped = np.flip(weight_tensor_unflipped, axis=-2)
+            # reshape weight tensor (simd_flipped and pe_flipped) to desired shape
+            pe = self.get_nodeattr("PE")
+            simd = 1
+            # simd_flipped
+            weight_tensor_simd_flipped = weight_tensor_simd_flipped.reshape(
+                1, -1, pe * simd
             )
+            weight_tensor_simd_flipped = weight_tensor_simd_flipped.copy()
+            # flipped
+            weight_tensor_pe_flipped = weight_tensor_pe_flipped.reshape(
+                1, -1, pe * simd
+            )
+            weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy()
+            if weight_file_mode == "decoupled_npy":
+                # save weight stream into npy for cppsim
+                np.save(weight_file_name, weight_tensor_simd_flipped)
+            elif weight_file_mode == "decoupled_verilog_dat":
+                # convert weight values into hexstring
+                weight_width = self.get_weightstream_width()
+                # pad to nearest 4 bits to get hex strings
+                weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
+                weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
+                    weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
+                )
+                # flatten into a single weight stream, one memory line per entry
+                weight_stream = weight_tensor_pe_flipped.flatten()
+                weight_stream = weight_stream.copy()
+                with open(weight_file_name, "w") as f:
+                    for val in weight_stream:
+                        f.write(val + "\n")
+            elif weight_file_mode == "decoupled_runtime":
+                # memstream axi-lite interface will map each mem line to
+                # one or multiple 32-bit words
+                weight_width = self.get_weightstream_width()
+                words_per_memwidth = 2 ** math.ceil(math.log2(weight_width / 32))
+                if words_per_memwidth < 1:
+                    words_per_memwidth = 1
+                weight_width_padded = words_per_memwidth * 32
+                # first, pack and ensure padding to 32 bits
+                weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
+                    weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
+                )
+                weight_stream = weight_tensor_pe_flipped.flatten()
+                weight_stream = weight_stream.copy()
+                with open(weight_file_name, "w") as f:
+                    for val in weight_stream:
+                        # split into groups of 8 hex digits (= 32 bits)
+                        words_32b = textwrap.wrap(val, 8)
+                        words_32b.reverse()
+                        for word_32b in words_32b:
+                            f.write(word_32b + "\n")
+            else:
+                raise Exception("Unknown weight_file_mode")
+
         else:
-            f_weights.write(
-                "const BinaryWeights<1,{},{}> weights = ".format(
-                    self.get_nodeattr("PE"), self.calc_wmem()
+            raise Exception("Unknown weight_file_mode")
+
+    def generate_params(self, model, path):
+        mem_mode = self.get_nodeattr("mem_mode")
+        code_gen_dir = path
+        # weights, if not external
+        weights = model.get_initializer(self.onnx_node.input[1])
+        if mem_mode == "const":
+            # save hlslib-compatible weights in params.h
+            weight_filename = "{}/params.h".format(code_gen_dir)
+            self.make_weight_file(weights, "hls_header", weight_filename)
+        elif mem_mode == "decoupled" or mem_mode == "external":
+            weight_filename_sim = "{}/weights.npy".format(code_gen_dir)
+            # save decoupled weights for cppsim
+            self.make_weight_file(weights, "decoupled_npy", weight_filename_sim)
+            if mem_mode == "decoupled":
+                # also save weights as Verilog .dat file
+                # note that we provide two different .dat files, one for
+                # simulation and one for synthesis. this is because URAM-based
+                # weights always need zero weights for synthesis, otherwise
+                # they get inferred as BRAM
+                weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format(
+                    code_gen_dir
+                )
+                weight_filename_rtl_sim = "{}/memblock_sim_0.dat".format(code_gen_dir)
+                # sim weights are always the true weights
+                self.make_weight_file(
+                    weights, "decoupled_verilog_dat", weight_filename_rtl_sim
                 )
+                ram_style = self.get_nodeattr("ram_style")
+                if ram_style == "ultra":
+                    # UltraRAM must have no memory initializer, or only zeroes
+                    # otherwise BRAM will be inferred instead of URAM
+                    # as a workaround we provide a zero-weight init here
+                    synth_weights = np.zeros_like(weights, dtype=np.float32)
+                else:
+                    synth_weights = weights
+                self.make_weight_file(
+                    synth_weights, "decoupled_verilog_dat", weight_filename_rtl_synth
+                )
+        else:
+            raise Exception(
+                """Please set mem_mode to "const", "decoupled", or "external",
+                currently no other parameter value is supported!"""
             )
-        f_weights.write(weight_hls_code)
-        f_weights.close()
 
         # save thresholds in thresh.h
         if len(self.onnx_node.input) > 2:
             thresholds = model.get_initializer(self.onnx_node.input[2])
             if thresholds is not None:
                 threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
+                # use UINT32 threshold export for bipolar times bipolar
+                inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"]
+                wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"]
+                # reinterpret inp/wt as bipolar if bin_xnor_mode is set
+                inp_is_binary = self.get_input_datatype() == DataType["BINARY"]
+                wt_is_binary = self.get_weight_datatype() == DataType["BINARY"]
+                bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1
+                inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode)
+                wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode)
                 # get computed threshold datatype from attribute
                 tdt = DataType[self.get_nodeattr("accDataType")]
+
                 assert np.vectorize(tdt.allowed)(
                     threshold_tensor
                 ).all(), "Thresholds in %s can't be expressed with type %s" % (
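For `decoupled_runtime`, the memstream AXI-lite map expects 32-bit words, so `make_weight_file` splits each padded memory line into 8-hex-digit chunks and writes them least-significant word first. A worked sketch under assumed widths:

```python
import math
import textwrap

# Sketch only: PE=8 with 9-bit weights assumed, giving a 72-bit raw stream.
weight_width = 72
words_per_memwidth = max(1, 2 ** math.ceil(math.log2(weight_width / 32)))  # 4
weight_width_padded = words_per_memwidth * 32                              # 128

val = "0000000000000000" "0123456789abcdef"  # one padded line: 32 hex digits
words_32b = textwrap.wrap(val, 8)            # ['00000000', ..., '89abcdef']
words_32b.reverse()                          # least-significant word first
print("\n".join(words_32b))                  # one 32-bit word per .dat line
```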
@@ -368,8 +585,11 @@ class VectorVectorActivation(HLSCustomOp):
                 # write thresholds into thresh.h
                 f_thresh = open("{}/thresh.h".format(code_gen_dir), "w")
                 tdt_hls = tdt.get_hls_datatype_str()
-                odt = self.get_output_datatype()
-                odt_hls = odt.get_hls_datatype_str()
+                # use binary to export bipolar activations
+                export_odt = self.get_output_datatype()
+                if self.get_output_datatype() == DataType["BIPOLAR"]:
+                    export_odt = DataType["BINARY"]
+                odt_hls = export_odt.get_hls_datatype_str()
                 f_thresh.write(
                     "static ThresholdsActivation<{},{},{},{},{},{},{}> threshs \
                     = ".format(
@@ -387,6 +607,7 @@ class VectorVectorActivation(HLSCustomOp):
 
     def execute_node(self, context, graph):
         mode = self.get_nodeattr("exec_mode")
+        mem_mode = self.get_nodeattr("mem_mode")
         node = self.onnx_node
 
         # TODO ensure codegen dir exists
@@ -440,7 +661,28 @@ class VectorVectorActivation(HLSCustomOp):
             inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), idt, nbits)
             super().reset_rtlsim(sim)
             super().toggle_clk(sim)
-            output = self.rtlsim(sim, inp)
+
+            if mem_mode == "external" or mem_mode == "decoupled":
+                wnbits = self.get_weightstream_width()
+                export_wdt = self.get_weight_datatype()
+                # we have converted bipolar weights to binary for export,
+                # so use it as such for weight generation
+                if self.get_weight_datatype() == DataType["BIPOLAR"]:
+                    export_wdt = DataType["BINARY"]
+                wei = npy_to_rtlsim_input(
+                    "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits
+                )
+                dim_h, dim_w = self.get_nodeattr("Dim")
+                num_w_reps = dim_h * dim_w
+
+                io_dict = {
+                    "inputs": {"in0": inp, "weights": wei * num_w_reps},
+                    "outputs": {"out": []},
+                }
+                self.rtlsim_multi_io(sim, io_dict)
+                output = io_dict["outputs"]["out"]
+            else:
+                output = self.rtlsim(sim, inp)
             odt = self.get_output_datatype()
             target_bits = odt.bitwidth()
             packed_bits = self.get_outstream_width()
@@ -466,6 +708,12 @@ class VectorVectorActivation(HLSCustomOp):
     def global_includes(self):
         self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"']
         self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"']
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode not in ["const", "decoupled", "external"]:
+            raise Exception(
+                """Please set mem_mode to "const", "decoupled", or "external",
+                currently no other parameter value is supported!"""
+            )
         if self.calc_tmem() != 0:
             self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"']
 
@@ -474,6 +722,8 @@ class VectorVectorActivation(HLSCustomOp):
         numReps = 1 * dim_h * dim_w
         k_h, k_w = self.get_nodeattr("Kernel")
         innerProdDim = k_h * k_w
+        mem_mode = self.get_nodeattr("mem_mode")
+
         self.code_gen_dict["$DEFINES$"] = [
             """#define Channels1 {}\n #define InnerProdDim {}\n
             #define SIMD1 1\n #define PE1 {}\n #define numReps {}""".format(
@@ -483,6 +733,11 @@ class VectorVectorActivation(HLSCustomOp):
                 numReps,
             )
         ]
+        if mem_mode == "decoupled" or mem_mode == "external":
+            wdt = self.get_weight_datatype()
+            self.code_gen_dict["$DEFINES$"].append(
+                "#define WP1 {}\n".format(wdt.bitwidth())
+            )
 
     def read_npy_data(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
@@ -500,7 +755,23 @@ class VectorVectorActivation(HLSCustomOp):
             % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
         )
 
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "decoupled" or mem_mode == "external":
+            wdt = self.get_weight_datatype()
+            elem_bits = wdt.bitwidth()
+            packed_bits = self.get_weightstream_width()
+            packed_hls_type = "ap_uint<%d>" % packed_bits
+            elem_hls_type = wdt.get_hls_datatype_str()
+            npy_type = "float"
+            npy_in = "%s/weights.npy" % code_gen_dir
+
+            self.code_gen_dict["$READNPYDATA$"].append(
+                'npy2apintstream<%s, %s, %d, %s>("%s", weights, false, numReps);'
+                % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+            )
+
     def strm_decl(self):
+        mem_mode = self.get_nodeattr("mem_mode")
         self.code_gen_dict["$STREAMDECLARATIONS$"] = []
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
             'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
@@ -508,8 +779,15 @@ class VectorVectorActivation(HLSCustomOp):
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
             'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
         )
+        if mem_mode == "decoupled" or mem_mode == "external":
+            self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+                'hls::stream<ap_uint<{}>> weights ("weights");'.format(
+                    self.get_weightstream_width()
+                )
+            )
 
     def docompute(self):
+        mem_mode = self.get_nodeattr("mem_mode")
         map_to_hls_mult_style = {
             "auto": "ap_resource_dflt()",
             "lut": "ap_resource_lut()",
@@ -521,16 +799,42 @@ class VectorVectorActivation(HLSCustomOp):
             threshs = "PassThroughActivation<%s>()" % odtype_hls_str
         else:
             threshs = "threshs"
-        self.code_gen_dict["$DOCOMPUTE$"] = [
-            """Vector_Vector_Activate_Batch<Channels1, InnerProdDim, SIMD1, PE1, 1, {}, {}, {}>
-            (in0, out, weights, {}, numReps, {});""".format(
-                tmpl_args["TSrcI"],
-                tmpl_args["TDstI"],
-                tmpl_args["TWeightI"],
-                threshs,
-                map_to_hls_mult_style[self.get_nodeattr("resType")],
+
+        if mem_mode == "const":
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """Vector_Vector_Activate_Batch<Channels1, InnerProdDim, SIMD1, PE1, 1, {}, {}, {}>
+                (in0, out, weights, {}, numReps, {});""".format(
+                    tmpl_args["TSrcI"],
+                    tmpl_args["TDstI"],
+                    tmpl_args["TWeightI"],
+                    threshs,
+                    map_to_hls_mult_style[self.get_nodeattr("resType")],
+                )
+            ]
+        elif mem_mode == "decoupled" or mem_mode == "external":
+            wdt = self.get_weight_datatype()
+            if wdt == DataType["BIPOLAR"]:
+                export_wdt = DataType["BINARY"]
+            else:
+                export_wdt = wdt
+            wdtype_hls_str = export_wdt.get_hls_datatype_str()
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """{}<Channels1, InnerProdDim, SIMD1, PE1, 1, {}, {}, {}, {}>
+                (in0, out, weights, {}, numReps, {});""".format(
+                    "Vector_Vector_Activate_Stream_Batch",
+                    tmpl_args["TSrcI"],
+                    tmpl_args["TDstI"],
+                    tmpl_args["TWeightI"],
+                    wdtype_hls_str,
+                    threshs,
+                    map_to_hls_mult_style[self.get_nodeattr("resType")],
+                )
+            ]
+        else:
+            raise Exception(
+                """Please set mem_mode to "const", "decoupled", or "external",
+                currently no other parameter value is supported!"""
             )
-        ]
 
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
@@ -561,44 +865,72 @@ class VectorVectorActivation(HLSCustomOp):
         self.code_gen_dict["$SAVEASCNPY$"] = []
 
     def blackboxfunction(self):
-        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
-            """void {}(hls::stream<ap_uint<{}>> &in0,
-            hls::stream<ap_uint<{}>> &out
-            )""".format(
-                self.onnx_node.name,
-                self.get_instream_width(),
-                self.get_outstream_width(),
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "const":
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                """void {}(hls::stream<ap_uint<{}>> &in0,
+                hls::stream<ap_uint<{}>> &out
+                )""".format(
+                    self.onnx_node.name,
+                    self.get_instream_width(),
+                    self.get_outstream_width(),
+                )
+            ]
+        elif mem_mode == "decoupled" or mem_mode == "external":
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                """void {}(
+                    hls::stream<ap_uint<{}>> &in0,
+                    hls::stream<ap_uint<{}>> &weights,
+                    hls::stream<ap_uint<{}>> &out
+                    )""".format(
+                    self.onnx_node.name,
+                    self.get_instream_width(),
+                    self.get_weightstream_width(),
+                    self.get_outstream_width(),
+                )
+            ]
+        else:
+            raise Exception(
+                """Please set mem_mode to "const", "decoupled", or "external",
+                currently no other parameter value is supported!"""
             )
-        ]
 
     def pragmas(self):
+        mem_mode = self.get_nodeattr("mem_mode")
         self.code_gen_dict["$PRAGMAS$"] = [
             "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname()
         ]
         self.code_gen_dict["$PRAGMAS$"].append(
             "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname()
         )
-        in_fifo_depth = self.get_nodeattr("inFIFODepth")
-        out_fifo_depth = self.get_nodeattr("outFIFODepth")
-        # insert depth pragmas only if specified
-        if in_fifo_depth != 0:
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE ap_ctrl_none port=return"
+        )
+
+        if mem_mode == "const":
+            self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"')
+            # the weight tensor is ap_uint<ch*prec> [PE][WMEM]
+            # partition for parallel access along the PE dimension (dim 1)
             self.code_gen_dict["$PRAGMAS$"].append(
-                "#pragma HLS stream depth=%d variable=in0" % in_fifo_depth
+                (
+                    "#pragma HLS ARRAY_PARTITION variable=weights.m_weights "
+                    "complete dim=1"
+                )
             )
-        if out_fifo_depth != 0:
+        elif mem_mode == "decoupled" or mem_mode == "external":
             self.code_gen_dict["$PRAGMAS$"].append(
-                "#pragma HLS stream depth=%d variable=out" % out_fifo_depth
+                "#pragma HLS INTERFACE axis port=weights name=weights_"
+                + self.hls_sname()
+            )
+            self.code_gen_dict["$PRAGMAS$"].append(
+                "#pragma HLS stream depth=8 variable=weights"
+            )
+        else:
+            raise Exception(
+                """Please set mem_mode to "const", "decoupled", or "external",
+                currently no other parameter value is supported!"""
             )
-        self.code_gen_dict["$PRAGMAS$"].append(
-            "#pragma HLS INTERFACE ap_ctrl_none port=return"
-        )
 
-        self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"')
-        # the weight tensor is ap_uint<ch*prec> [PE][WMEM]
-        # partition for parallel access along the PE dimension (dim 1)
-        self.code_gen_dict["$PRAGMAS$"].append(
-            ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1")
-        )
         if self.calc_tmem() != 0:
             # TODO find a better way of checking for no pregenerated thresholds
             self.code_gen_dict["$PRAGMAS$"].append(
@@ -614,6 +946,157 @@ class VectorVectorActivation(HLSCustomOp):
                 )
             )
 
+    def get_verilog_top_module_intf_names(self):
+        intf_names = super().get_verilog_top_module_intf_names()
+        mem_mode = self.get_nodeattr("mem_mode")
+        sname = self.hls_sname()
+        if mem_mode == "external":
+            intf_names["s_axis"].append(
+                ("weights_" + sname, self.get_weightstream_width_padded())
+            )
+        if mem_mode == "decoupled":
+            # only expose axilite interface if attribute is set
+            runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1
+            if runtime_writable:
+                intf_names["axilite"] = ["s_axilite"]
+        return intf_names
+
+    def code_generation_ipi(self):
+        cmd = []
+        # add streamer if needed
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "decoupled":
+            runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1
+            if self.get_nodeattr("ram_style") == "ultra":
+                assert (
+                    runtime_writable == 1
+                ), "Layer with URAM weights must have runtime_writeable_weights=1"
+            node_name = self.onnx_node.name
+            sname = self.hls_sname()
+            # create a hierarchy for this layer, with the same port names
+            clk_name = self.get_verilog_top_module_intf_names()["clk"][0]
+            rst_name = self.get_verilog_top_module_intf_names()["rst"][0]
+            dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0]
+            din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0]
+            cmd.append("create_bd_cell -type hier %s" % node_name)
+            cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name))
+            cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name))
+            cmd.append(
+                "create_bd_intf_pin -mode Master "
+                "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s"
+                % (node_name, dout_name)
+            )
+            cmd.append(
+                "create_bd_intf_pin -mode Slave "
+                "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name)
+            )
+            # instantiate the hls ip
+            cmd.append(
+                "create_bd_cell -type ip -vlnv %s /%s/%s"
+                % (self.get_nodeattr("ip_vlnv"), node_name, node_name)
+            )
+            # instantiate a streamer and connect it to the HLS IP
+            strm_vlnv = "xilinx.com:user:memstream:1.0"
+            strm_inst = node_name + "_wstrm"
+            cmd.append(
+                "create_bd_cell -type ip -vlnv %s /%s/%s"
+                % (strm_vlnv, node_name, strm_inst)
+            )
+            cmd.append(
+                "set_property -dict [list "
+                "CONFIG.NSTREAMS {1} "
+                "CONFIG.MEM_DEPTH {%d} "
+                "CONFIG.MEM_WIDTH {%d} "
+                "CONFIG.MEM_INIT {%s} "
+                "CONFIG.RAM_STYLE {%s} "
+                "CONFIG.STRM0_DEPTH {%d} "
+                "CONFIG.STRM0_WIDTH {%d} "
+                "CONFIG.STRM0_OFFSET {0} "
+                "] [get_bd_cells /%s/%s]"
+                % (
+                    self.calc_wmem(),
+                    self.get_weightstream_width_padded(),
+                    self.get_nodeattr("code_gen_dir_ipgen") + "/",
+                    self.get_nodeattr("ram_style"),
+                    self.calc_wmem(),
+                    self.get_weightstream_width_padded(),
+                    node_name,
+                    strm_inst,
+                )
+            )
+            cmd.append(
+                "connect_bd_intf_net [get_bd_intf_pins %s/%s/m_axis_0] "
+                "[get_bd_intf_pins %s/%s/weights_%s]"
+                % (node_name, strm_inst, node_name, node_name, sname)
+            )
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aresetn]"
+                % (node_name, rst_name, node_name, strm_inst)
+            )
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aclk]"
+                % (node_name, clk_name, node_name, strm_inst)
+            )
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]"
+                % (node_name, rst_name, node_name, node_name, rst_name)
+            )
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]"
+                % (node_name, clk_name, node_name, node_name, clk_name)
+            )
+            cmd.append(
+                "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
+                "[get_bd_intf_pins %s/%s/%s]"
+                % (node_name, din_name, node_name, node_name, din_name)
+            )
+            cmd.append(
+                "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
+                "[get_bd_intf_pins %s/%s/%s]"
+                % (node_name, dout_name, node_name, node_name, dout_name)
+            )
+            if runtime_writable:
+                # expose axi lite interface for writeable weights
+                axilite_name = self.get_verilog_top_module_intf_names()["axilite"][0]
+                cmd.append(
+                    "create_bd_intf_pin -mode Slave "
+                    "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s"
+                    % (node_name, axilite_name)
+                )
+                cmd.append(
+                    "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
+                    "[get_bd_intf_pins %s/%s/%s]"
+                    % (node_name, axilite_name, node_name, strm_inst, axilite_name)
+                )
+                # TODO calculate and pass in segment size here
+                cmd.append("assign_bd_address")
+            cmd.append("save_bd_design")
+        elif mem_mode == "const" or mem_mode == "external":
+            # base class impl sufficient for const/external modes
+            return super().code_generation_ipi()
+        else:
+            raise Exception("Unrecognized mem_mode for VectorVectorActivation")
+        return cmd
+
+    def uram_estimation(self):
+        P = self.get_nodeattr("PE")
+        Q = 1
+        wdt = self.get_weight_datatype()
+        W = wdt.bitwidth()
+        omega = self.calc_wmem()
+        mem_width = Q * W * P
+        mmode = self.get_nodeattr("mem_mode")
+        mstyle = self.get_nodeattr("ram_style")
+        if (
+            (mmode == "decoupled" and mstyle != "ultra")
+            or (mmode == "const" and self.calc_wmem() <= 128)
+            or (mmode == "external")
+        ):
+            return 0
+        width_multiplier = math.ceil(mem_width / 72)
+        depth_multiplier = math.ceil(omega / 4096)
+        return width_multiplier * depth_multiplier
+
     def bram_estimation(self):
         """Calculates resource estimation for BRAM"""
         # TODO add in/out FIFO contributions
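The URAM estimate above treats each URAM72 block as 72 bits wide and 4096 entries deep; a worked example under assumed parameters:

```python
import math

# Sketch only: PE=16, 4-bit weights, calc_wmem()=9000 assumed, with
# mem_mode="decoupled" and ram_style="ultra" so the early-out does not fire.
P, W, omega = 16, 4, 9000
mem_width = 1 * W * P                          # Q=1 for the VVAU -> 64 bits
width_multiplier = math.ceil(mem_width / 72)   # 1 URAM column
depth_multiplier = math.ceil(omega / 4096)     # 3 URAM rows
print(width_multiplier * depth_multiplier)     # 3 URAMs total
```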
@@ -624,7 +1107,13 @@ class VectorVectorActivation(HLSCustomOp):
         # assuming SDP mode RAMB18s (see UG573 Table 1-10)
         # since this is HLS memory, not using the full width of a BRAM
         # assuming memories up to 128 deep get implemented in LUTs
-        if self.calc_wmem() <= 128:
+        mmode = self.get_nodeattr("mem_mode")
+        mstyle = self.get_nodeattr("ram_style")
+        if (
+            (mmode == "decoupled" and mstyle in ["distributed", "ultra"])
+            or (mmode == "const" and self.calc_wmem() <= 128)
+            or (mmode == "external")
+        ):
             return 0
 
         if W == 1:
@@ -671,8 +1160,12 @@ class VectorVectorActivation(HLSCustomOp):
         c0 = 300
         c1 = 1.1
         c2 = 0
-        if self.calc_wmem() <= 128:
-            c2 = P * W * math.ceil(self.calc_wmem() / 64)
+        mmode = self.get_nodeattr("mem_mode")
+        mstyle = self.get_nodeattr("ram_style")
+        if (mmode == "decoupled" and mstyle == "distributed") or (
+            mmode == "const" and self.calc_wmem() <= 128
+        ):
+            c2 = (P * W) * math.ceil(self.calc_wmem() / 64)
 
         # multiplication
         res_type = self.get_nodeattr("resType")
@@ -710,6 +1203,25 @@ class VectorVectorActivation(HLSCustomOp):
             mult_dsp = 0
         return int(mult_dsp)
 
+    def get_weightstream_width(self):
+        """Returns weight stream width. Used only in decoupled or external mode."""
+        if (
+            self.get_nodeattr("mem_mode") == "decoupled"
+            or self.get_nodeattr("mem_mode") == "external"
+        ):
+            pe = self.get_nodeattr("PE")
+            wp = self.get_weight_datatype().bitwidth()
+            w_width = pe * wp
+            return w_width
+        else:
+            return 0
+
+    def get_weightstream_width_padded(self):
+        """Returns weight stream width padded to a multiple of 8. This is required
+        by the AXI Stream spec. Used in decoupled and external mode."""
+        weight_width = self.get_weightstream_width()
+        return roundup_to_integer_multiple(weight_width, 8)
+
     def get_op_and_param_counts(self):
         k_h, k_w = self.get_nodeattr("Kernel")
         fm = self.get_nodeattr("Channels")
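The padded variant rounds the raw weight stream width up to a byte boundary, as the AXI Stream spec requires. A tiny sketch (the helper mirrors the qonnx utility of the same name, behavior assumed):

```python
def roundup_to_integer_multiple(x, factor):
    # assumed behavior of the qonnx helper used above
    return int(-(-x // factor) * factor)

pe, wp = 5, 3                                   # assumed PE and weight bitwidth
w_width = pe * wp                               # 15-bit raw weight stream
print(roundup_to_integer_multiple(w_width, 8))  # 16-bit padded AXI width
```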
@@ -733,3 +1245,20 @@ class VectorVectorActivation(HLSCustomOp):
             thres_count = fm
             ret_dict[thres_param_type] = thres_count
         return ret_dict
+
+    def derive_characteristic_fxns(self, period):
+        n_inps = np.prod(self.get_folded_input_shape()[:-1])
+        io_dict = {
+            "inputs": {
+                "in0": [0 for i in range(n_inps)],
+            },
+            "outputs": {"out": []},
+        }
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode in ["decoupled", "external"]:
+            n_weight_inps = self.calc_wmem()
+            num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
+            io_dict["inputs"]["weights"] = [
+                0 for i in range(num_w_reps * n_weight_inps)
+            ]
+        super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
diff --git a/src/finn/qnn-data/build_dataflow/expected_output.npy b/src/finn/qnn-data/build_dataflow/expected_output.npy
index a8d09384633791b7e3760dc8a2d1ba88a05d526d..98037351bb4ee49985a98631750f18e9b86965b1 100644
Binary files a/src/finn/qnn-data/build_dataflow/expected_output.npy and b/src/finn/qnn-data/build_dataflow/expected_output.npy differ
diff --git a/src/finn/qnn-data/build_dataflow/input.npy b/src/finn/qnn-data/build_dataflow/input.npy
index edd24de05a33a15ebc330cdab31f3d77d2c47196..8bece67b7daf5b7668ff5e7515f15a891146b00b 100644
Binary files a/src/finn/qnn-data/build_dataflow/input.npy and b/src/finn/qnn-data/build_dataflow/input.npy differ
diff --git a/src/finn/qnn-data/cpp/verilator_fifosim.cpp b/src/finn/qnn-data/cpp/verilator_fifosim.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d0aca9efe77806d31192f35a1d751b32116218f8
--- /dev/null
+++ b/src/finn/qnn-data/cpp/verilator_fifosim.cpp
@@ -0,0 +1,197 @@
+/* Copyright (C) 2022, Advanced Micro Devices, Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of FINN nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+#include <iostream>
+#include <fstream>
+#include <cstddef>
+#include <chrono>
+#include "verilated.h"
+#include "verilated_vcd_c.h"
+#include "Vfinn_design_wrapper.h"
+
+#ifdef DEBUG
+#define TRACE(x) x
+#else
+#define TRACE(x) ;
+#endif
+
+using namespace std;
+
+Vfinn_design_wrapper * top;
+
+// code taken from pyverilator_wrapper.cpp generated by PyVerilator
+
+// this is required by verilator for verilog designs using $time
+// main_time is incremented in eval
+double main_time = 0;
+
+double sc_time_stamp() {
+    return main_time;
+}
+// function definitions
+// helper functions for basic verilator tasks
+extern "C" { // open an extern "C" block, closed below
+Vfinn_design_wrapper* construct() {
+    Verilated::commandArgs(0, (const char**) nullptr);
+    TRACE(Verilated::traceEverOn(true));
+    Vfinn_design_wrapper* top = new Vfinn_design_wrapper();
+    return top;
+}
+int eval(Vfinn_design_wrapper* top) {
+    top->eval();
+    main_time++;
+    return 0;
+}
+int destruct(Vfinn_design_wrapper* top) {
+    if (top != nullptr) {
+        delete top;
+        top = nullptr;
+    }
+    return 0;
+}
+
+TRACE(
+VerilatedVcdC* tfp;
+VerilatedVcdC* start_vcd_trace(Vfinn_design_wrapper* top, const char* filename) {
+    VerilatedVcdC* tfp = new VerilatedVcdC;
+    top->trace(tfp, 99);
+    tfp->open(filename);
+    return tfp;
+}
+int add_to_vcd_trace(VerilatedVcdC* tfp, int time) {
+    tfp->dump(time);
+    return 0;
+}
+int flush_vcd_trace(VerilatedVcdC* tfp) {
+    tfp->flush();
+    return 0;
+}
+int stop_vcd_trace(VerilatedVcdC* tfp) {
+    tfp->close();
+    return 0;
+}
+)
+
+}
+
+// end of code taken from pyverilator_wrapper.cpp generated by PyVerilator
+
+inline void toggle_clk() {
+    eval(top);
+    top->ap_clk = 1;
+    TRACE(add_to_vcd_trace(tfp, main_time));
+    eval(top);
+    top->ap_clk = 0;
+    TRACE(add_to_vcd_trace(tfp, main_time));
+}
+
+
+void reset() {
+    top->ap_rst_n = 0;
+    for(unsigned i = 0; i < 10; i++) {
+        toggle_clk();
+    }
+    top->ap_rst_n = 1;
+}
+
+int main(int argc, char *argv[]) {
+    top = construct();
+    TRACE(tfp = start_vcd_trace(top, "trace.vcd"));
+    unsigned n_iters_per_input = @ITERS_PER_INPUT@;
+    unsigned n_iters_per_output = @ITERS_PER_OUTPUT@;
+    unsigned n_inputs = @N_INPUTS@;
+    unsigned max_iters = @MAX_ITERS@;
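+    // note: the @...@ tokens above (and @FIFO_DEPTH_LOGGING@ further below)
+    // are template placeholders, expected to be filled in with concrete
+    // values by the Python driver before this file is handed to Verilator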
+
+    reset();
+
+    top->m_axis_0_tready = 1;
+    top->s_axis_0_tvalid = 1;
+
+    unsigned n_in_txns = 0, n_out_txns = 0, iters = 0, last_output_at = 0;
+    unsigned latency = 0;
+
+    bool exit_criterion = false;
+
+    cout << "Simulation starting" << endl;
+    cout << "Number of inputs to write " << n_iters_per_input * n_inputs << endl;
+    cout << "Number of outputs to expect " << n_iters_per_output * n_inputs << endl;
+    cout << "No-output timeout clock cycles " << max_iters << endl;
+
+    chrono::steady_clock::time_point begin = chrono::steady_clock::now();
+
+    while(!exit_criterion) {
+        toggle_clk();
+        iters++;
+        if(iters % 1000 == 0) {
+            cout << "Elapsed iters " << iters << " inps " << n_in_txns << " outs " << n_out_txns << endl;
+            chrono::steady_clock::time_point end = chrono::steady_clock::now();
+            cout << "Elapsed since last report = " << chrono::duration_cast<chrono::seconds>(end - begin).count() << "[s]" << endl;
+            begin = end;
+        }
+        if(top->s_axis_0_tready == 1 && top->s_axis_0_tvalid == 1) {
+            n_in_txns++;
+            if(n_in_txns == n_iters_per_input * n_inputs) {
+                top->s_axis_0_tvalid = 0;
+                cout << "All inputs written at cycle " << iters << endl;
+            }
+        }
+        if(top->m_axis_0_tvalid == 1) {
+            n_out_txns++;
+            last_output_at = iters;
+            if(n_out_txns == n_iters_per_output) {
+                latency = iters;
+            }
+        }
+
+        exit_criterion = ((n_in_txns >= n_iters_per_input * n_inputs) && (n_out_txns >= n_iters_per_output * n_inputs)) || ((iters-last_output_at) > max_iters);
+    }
+
+    TRACE(flush_vcd_trace(tfp));
+    TRACE(stop_vcd_trace(tfp));
+
+    cout << "Simulation finished" << endl;
+    cout << "Number of inputs consumed " << n_in_txns << endl;
+    cout << "Number of outputs produced " << n_out_txns << endl;
+    cout << "Number of clock cycles " << iters << endl;
+
+    ofstream results_file;
+    results_file.open("results.txt", ios::out | ios::trunc);
+    results_file << "N_IN_TXNS" << "\t" << n_in_txns << endl;
+    results_file << "N_OUT_TXNS" << "\t" << n_out_txns << endl;
+    results_file << "cycles" << "\t" << iters << endl;
+    results_file << "N" << "\t" << n_inputs << endl;
+    results_file << "latency_cycles" << "\t" << latency << endl;
+@FIFO_DEPTH_LOGGING@
+    results_file.close();
+
+    destruct(top);
+
+    return 0;
+}
diff --git a/src/finn/qnn-data/testcase/residual_testcase.onnx b/src/finn/qnn-data/testcase/residual_testcase.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..c96e8c694e3a39cdb9e5d984e1c069ceb55b3f2a
Binary files /dev/null and b/src/finn/qnn-data/testcase/residual_testcase.onnx differ
diff --git a/src/finn/qnn-data/verilog/custom_axis_infrastructure.vh b/src/finn/qnn-data/verilog/custom_axis_infrastructure.vh
new file mode 100644
index 0000000000000000000000000000000000000000..1c8b6403e8628e3647810ca5fca65ca1122eaf9d
--- /dev/null
+++ b/src/finn/qnn-data/verilog/custom_axis_infrastructure.vh
@@ -0,0 +1,346 @@
+//  (c) Copyright 2011-2013 Xilinx, Inc. All rights reserved.
+//
+//  This file contains confidential and proprietary information
+//  of Xilinx, Inc. and is protected under U.S. and
+//  international copyright and other intellectual property
+//  laws.
+//
+//  DISCLAIMER
+//  This disclaimer is not a license and does not grant any
+//  rights to the materials distributed herewith. Except as
+//  otherwise provided in a valid license issued to you by
+//  Xilinx, and to the maximum extent permitted by applicable
+//  law: (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND
+//  WITH ALL FAULTS, AND XILINX HEREBY DISCLAIMS ALL WARRANTIES
+//  AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, INCLUDING
+//  BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON-
+//  INFRINGEMENT, OR FITNESS FOR ANY PARTICULAR PURPOSE; and
+//  (2) Xilinx shall not be liable (whether in contract or tort,
+//  including negligence, or under any other theory of
+//  liability) for any loss or damage of any kind or nature
+//  related to, arising under or in connection with these
+//  materials, including for any direct, or any indirect,
+//  special, incidental, or consequential loss or damage
+//  (including loss of data, profits, goodwill, or any type of
+//  loss or damage suffered as a result of any action brought
+//  by a third party) even if such damage or loss was
+//  reasonably foreseeable or Xilinx had been advised of the
+//  possibility of the same.
+//
+//  CRITICAL APPLICATIONS
+//  Xilinx products are not designed or intended to be fail-
+//  safe, or for use in any application requiring fail-safe
+//  performance, such as life-support or safety devices or
+//  systems, Class III medical devices, nuclear facilities,
+//  applications related to the deployment of airbags, or any
+//  other applications that could lead to death, personal
+//  injury, or severe property or environmental damage
+//  (individually and collectively, "Critical
+//  Applications"). Customer assumes the sole risk and
+//  liability of any use of Xilinx products in Critical
+//  Applications, subject only to applicable laws and
+//  regulations governing limitations on product liability.
+//
+//  THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS
+//  PART OF THIS FILE AT ALL TIMES.
+//-----------------------------------------------------------------------------
+//
+// Generic Functions used by AXIS-Interconnect and Infrastructure Modules
+//
+// Verilog-standard:  Verilog 2001
+//--------------------------------------------------------------------------
+// Global Parameters:
+//
+// Functions:
+//   f_clogb2
+//   f_gcd
+//   f_lcm
+//   f_get_tdata_indx
+//   f_get_tstrb_indx
+//   f_get_tkeep_indx
+//   f_get_tlast_indx
+//   f_get_tid_indx
+//   f_get_tdest_indx
+//   f_get_tuser_indx
+//   f_payload_width
+// Tasks:
+//   t_display_tdata_error
+//--------------------------------------------------------------------------
+///////////////////////////////////////////////////////////////////////////////
+// BEGIN Global Parameters
+///////////////////////////////////////////////////////////////////////////////
+// Define Signal Set indices
+localparam G_INDX_SS_TREADY = 0;
+localparam G_INDX_SS_TDATA  = 1;
+localparam G_INDX_SS_TSTRB  = 2;
+localparam G_INDX_SS_TKEEP  = 3;
+localparam G_INDX_SS_TLAST  = 4;
+localparam G_INDX_SS_TID    = 5;
+localparam G_INDX_SS_TDEST  = 6;
+localparam G_INDX_SS_TUSER  = 7;
+localparam G_MASK_SS_TREADY = 32'h1 << G_INDX_SS_TREADY;
+localparam G_MASK_SS_TDATA  = 32'h1 << G_INDX_SS_TDATA;
+localparam G_MASK_SS_TSTRB  = 32'h1 << G_INDX_SS_TSTRB;
+localparam G_MASK_SS_TKEEP  = 32'h1 << G_INDX_SS_TKEEP;
+localparam G_MASK_SS_TLAST  = 32'h1 << G_INDX_SS_TLAST;
+localparam G_MASK_SS_TID    = 32'h1 << G_INDX_SS_TID  ;
+localparam G_MASK_SS_TDEST  = 32'h1 << G_INDX_SS_TDEST;
+localparam G_MASK_SS_TUSER  = 32'h1 << G_INDX_SS_TUSER;
+
+// Task DRC error levels
+localparam G_TASK_SEVERITY_ERR     = 2;
+localparam G_TASK_SEVERITY_WARNING = 1;
+localparam G_TASK_SEVERITY_INFO    = 0;
+
+///////////////////////////////////////////////////////////////////////////////
+// BEGIN Functions
+///////////////////////////////////////////////////////////////////////////////
+// ceiling logb2
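+// e.g. f_clogb2(8) = 3 and f_clogb2(9) = 4, i.e. the number of bits needed
+// to address 'size' items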
+  function integer f_clogb2 (input integer size);
+    integer s;
+    begin
+      s = size;
+      s = s - 1;
+      for (f_clogb2=1; s>1; f_clogb2=f_clogb2+1)
+            s = s >> 1;
+    end
+  endfunction // clogb2
+
+  // Calculates the Greatest Common Divisor of two integers using the
+  // (subtraction-based) Euclidean algorithm.
+  function automatic integer f_gcd (
+    input integer a,
+    input integer b
+    );
+    begin : main
+      integer A, B, done, swap;
+      A = a;
+      B = b;
+      done = 0;
+      while(!done)
+      begin
+        if (A < B ) begin
+          swap = A;
+          A = B;
+          B = swap;
+        end else if ( B != 0 ) begin
+          A = A - B;
+        end else begin
+          done = 1;
+        end
+      end
+
+      f_gcd = A;
+    end
+  endfunction
+
+
+  // Calculates the Least Common Multiple of two integers, e.g. f_lcm(4, 6) = 12
+  function integer f_lcm (
+    input integer a,
+    input integer b
+    );
+    begin : main
+      f_lcm = ( a / f_gcd(a, b)) * b;
+    end
+  endfunction
+
+  // Returns the index of the TDATA portion of TPAYLOAD, or 0 if the
+  // signal is not enabled.
+  function integer f_get_tdata_indx (
+    input integer DAW,  // TDATA Width
+    input integer IDW,  // TID Width
+    input integer DEW,  // TDEST Width
+    input integer USW,  // TUSER Width
+    input [31:0]  SST   // Signal Set
+    );
+    begin : main
+      f_get_tdata_indx = 0;
+    end
+  endfunction
+
+  // Returns the index of the tstrb portion of TPAYLOAD, or 0 if the
+  // signal is not enabled.
+  function integer f_get_tstrb_indx (
+    input integer DAW,  // TDATA Width
+    input integer IDW,  // TID Width
+    input integer DEW,  // TDEST Width
+    input integer USW,  // TUSER Width
+    input [31:0]  SST   // Signal Set
+    );
+    begin : main
+      integer cur_indx;
+      cur_indx = f_get_tdata_indx(DAW, IDW, DEW, USW, SST);
+      // If TDATA exists, then add its width to its base to get the tstrb index
+      f_get_tstrb_indx = SST[G_INDX_SS_TDATA] ? cur_indx + DAW : cur_indx;
+    end
+  endfunction
+
+  // Returns the index of the tkeep portion of TPAYLOAD, or 0 if the
+  // signal is not enabled.
+  function integer f_get_tkeep_indx (
+    input integer DAW,  // TDATA Width
+    input integer IDW,  // TID Width
+    input integer DEW,  // TDEST Width
+    input integer USW,  // TUSER Width
+    input [31:0]  SST   // Signal Set
+    );
+    begin : main
+      integer cur_indx;
+      cur_indx = f_get_tstrb_indx(DAW, IDW, DEW, USW, SST);
+      f_get_tkeep_indx = SST[G_INDX_SS_TSTRB] ? cur_indx + DAW/8 : cur_indx;
+    end
+  endfunction
+
+  // Returns the index of the tlast portion of TPAYLOAD, or 0 if the
+  // signal is not enabled.
+  function integer f_get_tlast_indx (
+    input integer DAW,  // TDATA Width
+    input integer IDW,  // TID Width
+    input integer DEW,  // TDEST Width
+    input integer USW,  // TUSER Width
+    input [31:0]  SST   // Signal Set
+    );
+    begin : main
+      integer cur_indx;
+      cur_indx = f_get_tkeep_indx(DAW, IDW, DEW, USW, SST);
+      f_get_tlast_indx = SST[G_INDX_SS_TKEEP] ? cur_indx + DAW/8 : cur_indx;
+    end
+  endfunction
+
+  // Returns the index of the tid portion of TPAYLOAD, or 0 if the
+  // signal is not enabled.
+  function integer f_get_tid_indx (
+    input integer DAW,  // TDATA Width
+    input integer IDW,  // TID Width
+    input integer DEW,  // TDEST Width
+    input integer USW,  // TUSER Width
+    input [31:0]  SST   // Signal Set
+    );
+    begin : main
+      integer cur_indx;
+      cur_indx = f_get_tlast_indx(DAW, IDW, DEW, USW, SST);
+      f_get_tid_indx = SST[G_INDX_SS_TLAST] ? cur_indx + 1 : cur_indx;
+    end
+  endfunction
+
+  // Returns the index of the tdest portion of TPAYLOAD, or 0 if the
+  // signal is not enabled.
+  function integer f_get_tdest_indx (
+    input integer DAW,  // TDATA Width
+    input integer IDW,  // TID Width
+    input integer DEW,  // TDEST Width
+    input integer USW,  // TUSER Width
+    input [31:0]  SST   // Signal Set
+    );
+    begin : main
+      integer cur_indx;
+      cur_indx = f_get_tid_indx(DAW, IDW, DEW, USW, SST);
+      f_get_tdest_indx = SST[G_INDX_SS_TID] ? cur_indx + IDW : cur_indx;
+    end
+  endfunction
+
+  // Returns the index of the tuser portion of TPAYLOAD, or 0 if the
+  // signal is not enabled.
+  function integer f_get_tuser_indx (
+    input integer DAW,  // TDATA Width
+    input integer IDW,  // TID Width
+    input integer DEW,  // TDEST Width
+    input integer USW,  // TUSER Width
+    input [31:0]  SST   // Signal Set
+    );
+    begin : main
+      integer cur_indx;
+      cur_indx = f_get_tdest_indx(DAW, IDW, DEW, USW, SST);
+      f_get_tuser_indx = SST[G_INDX_SS_TDEST] ? cur_indx + DEW : cur_indx;
+    end
+  endfunction
+
+  // The payload width is the sum of the widths of all AXIS signals present,
+  // excluding TREADY/TVALID.
+  function integer f_payload_width (
+    input integer DAW,  // TDATA Width
+    input integer IDW,  // TID Width
+    input integer DEW,  // TDEST Width
+    input integer USW,  // TUSER Width
+    input [31:0]  SST   // Signal Set
+    );
+    begin : main
+      integer cur_indx;
+      cur_indx = f_get_tuser_indx(DAW, IDW, DEW, USW, SST);
+      f_payload_width = SST[G_INDX_SS_TUSER] ? cur_indx + USW : cur_indx;
+      // Ensure that the return value is never less than 1
+      f_payload_width = (f_payload_width < 1) ? 1 : f_payload_width;
+    end
+  endfunction
+
+  task t_check_tdata_width(
+    input  integer    data_width,
+    input  [8*80-1:0] var_name,
+    input  [8*80-1:0] inst_name,
+    input  integer    severity_lvl,
+    output integer    ret_val
+  );
+    // Severity levels:
+    // 0 = INFO
+    // 1 = WARNING
+    // 2 = ERROR
+    begin : t_check_tdata_width
+      if (data_width%8 != 0) begin
+        if (severity_lvl >= 2) begin
+          $display("ERROR: %m::%s", inst_name);
+        end else if (severity_lvl == 1) begin
+          $display("WARNING: %m::%s", inst_name);
+        end else begin
+          $display("INFO: %m::%s", inst_name);
+        end
+        $display("       Parameter %s (%2d) must be a multiple of 8.", var_name, data_width);
+        $display("       AXI4-Stream data width is only defined for byte multiples. See the ");
+        $display("       AMBA4 AXI4-Stream Protocol Specification v1.0 Section 2.1 for more");
+        $display("       information.");
+        ret_val = 1;
+      end else begin
+        ret_val = 0;
+      end
+    end
+  endtask
+
+  task t_check_tuser_width(
+    input  integer    tuser_width,
+    input  [8*80-1:0] tuser_name,
+    input  integer    tdata_width,
+    input  [8*80-1:0] tdata_name,
+    input  [8*80-1:0] inst_name,
+    input  integer    severity_lvl,
+    output integer    ret_val
+  );
+    // Severity levels:
+    // 0 = INFO
+    // 1 = WARNING
+    // 2 = ERROR
+    begin : t_check_tuser_width
+      integer tdata_bytes;
+      tdata_bytes = tdata_width/8;
+      if ((tuser_width%tdata_bytes) != 0) begin
+        if (severity_lvl >= 2) begin
+          $display("ERROR: %m::%s", inst_name);
+        end else if (severity_lvl == 1) begin
+          $display("WARNING: %m::%s", inst_name);
+        end else begin
+          $display("INFO: %m::%s", inst_name);
+        end
+        $display("       Parameter %s == %2d is not the recommended value of 'an integer ", tuser_name, tuser_width);
+        $display("       multiple of the width of the interface (%s == %2d) in bytes.'  AXI4-Stream", tdata_name, tdata_width);
+        $display("       TUSER width in this module is only defined when the TUSER is the");
+        $display("       recommended value.  See the AMBA4 AXI4-Stream Protocol Specification v1.0");
+        $display("       Section 2.1, 2.3.3 and 2.8 for more information.  ");
+        ret_val = 1;
+      end else begin
+        ret_val = 0;
+      end
+    end
+  endtask
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index f0bd5fbd0670e5088372383b16690ab67878334d..7b8a1bf6b83175cfda041cfc49a22273fd696d8e 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -48,6 +48,10 @@ from finn.transformation.fpgadataflow.minimize_accumulator_width import (
 class InferConvInpGen(Transformation):
     """Convert Im2Col layers to ConvolutionInputGenerator layers."""
 
+    def __init__(self, use_rtl_variant=False):
+        super().__init__()
+        self.use_rtl_variant = use_rtl_variant
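+        # e.g. model.transform(InferConvInpGen(use_rtl_variant=True)) prefers
+        # the RTL ConvolutionInputGenerator/FMPadding variants where supported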
+
     def apply(self, model):
         graph = model.graph
         node_ind = 0
@@ -113,8 +117,12 @@ class InferConvInpGen(Transformation):
                     ConvInpGen_idim_h = odim_padding_h
                     ConvInpGen_idim_w = odim_padding_w
 
+                    padding_optype = (
+                        "FMPadding_rtl" if self.use_rtl_variant else "FMPadding_Batch"
+                    )
+
                     padding_node = helper.make_node(
-                        "FMPadding_Batch",
+                        padding_optype,
                         [i2c_input],
                         [padding_out],
                         domain="finn.custom_op.fpgadataflow",
@@ -128,105 +136,144 @@ class InferConvInpGen(Transformation):
                     )
                     graph.node.insert(node_ind, padding_node)
 
-                # Ensure that only supported HLS nodes are inserted
+                is_kernel_pointwise = k_h == 1 and k_w == 1
                 is_square_image = ConvInpGen_idim_h == ConvInpGen_idim_w
                 is_square_kernel = k_h == k_w
-                is_kernel_pointwise = k_h == 1 and k_w == 1
                 is_equal_stride = stride_h == stride_w
                 is_1d_convolution = (k_h == 1 and k_w > 1 and ifm_dim_h == 1) or (
                     k_h > 1 and k_w == 1 and ifm_dim_w == 1
                 )
 
-                if (stride_h > 1 or stride_w > 1) and is_kernel_pointwise:
-                    assert is_square_image, (
-                        "%s : DownSampler currently only supports square input images."
-                        % n.name
-                    )
-                    assert is_equal_stride, (
-                        """%s : DownSampler currently only supports equal stride value
-                        along different axes."""
-                        % n.name
-                    )
-                    ConvInpGen_idim = ConvInpGen_idim_h
-                    stride = stride_h
-                    # create DownSampler node
+                # Ensure that RTL variant is not inserted for unsupported configuration
+                is_rtl_variant_compatible = True
+                if is_kernel_pointwise:
+                    is_rtl_variant_compatible = False
+                    if self.use_rtl_variant:
+                        warnings.warn(
+                            """%s : RTL ConvInpGen requested for unsupported
+                                configuration. Falling back to HLS implementation."""
+                            % n.name
+                        )
+
+                if self.use_rtl_variant and is_rtl_variant_compatible:
+
                     ConvInpGen_node = helper.make_node(
-                        "DownSampler",
+                        "ConvolutionInputGenerator_rtl",
                         [ConvInpGen_input],
                         [i2c_output],
                         domain="finn.custom_op.fpgadataflow",
                         backend="fpgadataflow",
-                        ImgDim=ConvInpGen_idim,
-                        NumChannels=ifm_ch,
+                        ConvKernelDim=[k_h, k_w],
+                        IFMChannels=ifm_ch,
+                        IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w],
+                        OFMDim=[ofm_dim_h, ofm_dim_w],
                         SIMD=ifm_ch,
-                        Stride=stride,
+                        M=1,
+                        parallel_window=0,
+                        Stride=[stride_h, stride_w],
+                        Dilation=[dilation_h, dilation_w],
                         inputDataType=dt.name,
-                        name="DownSampler_" + n.name,
+                        outputDataType=dt.name,
+                        depthwise=depthwise,
+                        name="ConvolutionInputGenerator_rtl_" + n.name,
                     )
                     graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node)
                 else:
-                    # create equivalent ConvolutionInputGenerator node
-                    if (
-                        is_square_image and is_square_kernel
-                    ):  # square images and square kernels
-                        assert is_equal_stride, (
-                            """%s: Non-equal strides along different axes is not supported
-                            for (non-)square convolutions"""
-                            % n.name
-                        )
-                        assert dilation_h == 1 and dilation_w == 1, (
-                            """%s: Dilation value != 1 is not supported
-                            for square convolutions"""
-                            % n.name
+                    # Ensure that only supported HLS nodes are inserted
+                    if (stride_h > 1 or stride_w > 1) and is_kernel_pointwise:
+                        downsample_1D = (ifm_dim_h == 1) or (ifm_dim_w == 1)
+                        is1D_unitx = ifm_dim_w == 1
+                        downsample_2D = (
+                            (not downsample_1D) and is_square_image and is_equal_stride
                         )
+                        if not (downsample_1D or downsample_2D):
+                            warnings.warn(
+                                f"Couldn't infer Downsample from {n.name},check config."
+                            )
+                            continue
+                        ConvInpGen_idim = max(ConvInpGen_idim_h, ConvInpGen_idim_w)
+                        stride = max(stride_h, stride_w)
+                        # create DownSampler node
                         ConvInpGen_node = helper.make_node(
-                            "ConvolutionInputGenerator",
+                            "DownSampler",
                             [ConvInpGen_input],
                             [i2c_output],
                             domain="finn.custom_op.fpgadataflow",
                             backend="fpgadataflow",
-                            ConvKernelDim=[k_h, k_w],
-                            IFMChannels=ifm_ch,
-                            IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w],
-                            OFMDim=[ofm_dim_h, ofm_dim_w],
+                            ImgDim=ConvInpGen_idim,
+                            NumChannels=ifm_ch,
                             SIMD=ifm_ch,
-                            Stride=[stride_h, stride_w],
-                            Dilation=[dilation_h, dilation_w],
+                            Stride=stride,
                             inputDataType=dt.name,
-                            outputDataType=dt.name,
-                            depthwise=depthwise,
-                            name="ConvolutionInputGenerator_" + n.name,
+                            name="DownSampler_" + n.name,
+                            is1D=downsample_1D,
+                            is1D_unitx=is1D_unitx,
                         )
-                    else:  # 1D images and/or kernels
-                        assert is_1d_convolution, (
-                            "%s: ConvolutionInputGenerator1D works only for 1D convs"
-                            % n.name
-                        )
-                        if dilation_h > 1 or dilation_w > 1:
-                            assert depthwise == 1, (
-                                """%s: Dilation value > 1 is only supported for
-                                1D depthwise separable convolutions"""
+                        graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node)
+                    else:
+                        # create equivalent ConvolutionInputGenerator node
+                        if (
+                            is_square_image and is_square_kernel
+                        ):  # square images and square kernels
+                            assert is_equal_stride, (
+                                """%s: Non-equal strides along different axes is not supported
+                                for (non-)square convolutions"""
                                 % n.name
                             )
-                        ConvInpGen_node = helper.make_node(
-                            "ConvolutionInputGenerator1D",
-                            [ConvInpGen_input],
-                            [i2c_output],
-                            domain="finn.custom_op.fpgadataflow",
-                            backend="fpgadataflow",
-                            ConvKernelDim=[k_h, k_w],
-                            IFMChannels=ifm_ch,
-                            IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w],
-                            OFMDim=[ofm_dim_h, ofm_dim_w],
-                            SIMD=ifm_ch,
-                            Stride=[stride_h, stride_w],
-                            Dilation=[dilation_h, dilation_w],
-                            inputDataType=dt.name,
-                            outputDataType=dt.name,
-                            depthwise=depthwise,
-                            name="ConvolutionInputGenerator1D_" + n.name,
-                        )
-                    graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node)
+                            assert dilation_h == 1 and dilation_w == 1, (
+                                """%s: Dilation value != 1 is not supported
+                                for square convolutions"""
+                                % n.name
+                            )
+                            ConvInpGen_node = helper.make_node(
+                                "ConvolutionInputGenerator",
+                                [ConvInpGen_input],
+                                [i2c_output],
+                                domain="finn.custom_op.fpgadataflow",
+                                backend="fpgadataflow",
+                                ConvKernelDim=[k_h, k_w],
+                                IFMChannels=ifm_ch,
+                                IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w],
+                                OFMDim=[ofm_dim_h, ofm_dim_w],
+                                SIMD=ifm_ch,
+                                Stride=[stride_h, stride_w],
+                                Dilation=[dilation_h, dilation_w],
+                                inputDataType=dt.name,
+                                outputDataType=dt.name,
+                                depthwise=depthwise,
+                                name="ConvolutionInputGenerator_" + n.name,
+                            )
+                        else:  # 1D images and/or kernels
+                            assert is_1d_convolution, (
+                                """%s: ConvolutionInputGenerator1D works only
+                                for 1D convs"""
+                                % n.name
+                            )
+                            if dilation_h > 1 or dilation_w > 1:
+                                assert depthwise == 1, (
+                                    """%s: Dilation value > 1 is only supported for
+                                    1D depthwise separable convolutions"""
+                                    % n.name
+                                )
+                            ConvInpGen_node = helper.make_node(
+                                "ConvolutionInputGenerator1D",
+                                [ConvInpGen_input],
+                                [i2c_output],
+                                domain="finn.custom_op.fpgadataflow",
+                                backend="fpgadataflow",
+                                ConvKernelDim=[k_h, k_w],
+                                IFMChannels=ifm_ch,
+                                IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w],
+                                OFMDim=[ofm_dim_h, ofm_dim_w],
+                                SIMD=ifm_ch,
+                                Stride=[stride_h, stride_w],
+                                Dilation=[dilation_h, dilation_w],
+                                inputDataType=dt.name,
+                                outputDataType=dt.name,
+                                depthwise=depthwise,
+                                name="ConvolutionInputGenerator1D_" + n.name,
+                            )
+                        graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node)
                 # remove old nodes
                 graph.node.remove(n)
                 graph_modified = True
@@ -285,20 +332,25 @@ class InferUpsample(Transformation):
                 )
 
                 # Assumes nhwc layout for scales and input
-                assert scales[1] == scales[2], (
-                    "%s: Upsampling is only supported for quadratic scales." % n.name
+                is_scale_square_2d = scales[1] == scales[2]
+                is_scale_1d = scales[1] > 1 and scales[2] == 1
+                assert is_scale_square_2d or is_scale_1d, (
+                    "%s: Upsampling only supported for 1D H, or 2D square scaling"
+                    % n.name
                 )
                 assert scales[0] == scales[3] == 1, (
                     n.name + ": Upsampling is only supported for scales with "
-                    "the first and last dimensions being 1."
+                    "the first and last dimensions being 1 in NHWC."
                 )
                 spatial_scale = scales[1]
                 assert spatial_scale == int(spatial_scale), (
                     "%s: Upsampling is only supported for integer scales." % n.name
                 )
+                is_shape_square_2d = in_shape[1] == in_shape[2]
+                is_shape_1d = in_shape[1] > 1 and in_shape[2] == 1
 
-                assert in_shape[1] == in_shape[2], (
-                    "%s: Upsampling is only supported for quadratic input shapes."
+                assert is_shape_square_2d or is_shape_1d, (
+                    "%s: Upsampling is only supported for 1D H or 2D square inputs."
                     % n.name
                 )
 
@@ -308,6 +360,7 @@ class InferUpsample(Transformation):
                 NumChannels = in_shape[-1]
                 numInputVectors = in_shape[0]
                 inputDataType = dt.name
+                dim_mode = 0 if is_shape_square_2d else 1
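+                # DimMode=0 selects square 2D upsampling, DimMode=1 the 1D (H)
+                # variant of the HLS kernel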
 
                 # Insert the HLSCustomOp node
                 Upsample_HLS_node = helper.make_node(
@@ -321,6 +374,7 @@ class InferUpsample(Transformation):
                     NumChannels=NumChannels,
                     inputDataType=inputDataType,
                     numInputVectors=numInputVectors,
+                    DimMode=dim_mode,
                     name="UpsampleNearestNeighbour_Batch_" + n.name,
                 )
 
@@ -863,6 +917,10 @@ class InferVectorVectorActivation(Transformation):
     a depthwise convolution. Any immediately following MultiThreshold
     layers will also be absorbed into the VVAU."""
 
+    def __init__(self, mem_mode="const"):
+        super().__init__()
+        self.mem_mode = mem_mode
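+        # mem_mode (e.g. "const" or "decoupled") is forwarded to each
+        # generated VectorVectorActivation node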
+
     def apply(self, model):
         graph = model.graph
         node_ind = 0
@@ -963,6 +1021,7 @@ class InferVectorVectorActivation(Transformation):
                             ActVal=actval,
                             noActivation=0,
                             name="VectorVectorActivation_" + n.name,
+                            mem_mode=self.mem_mode,
                         )
                         graph.node.insert(node_ind, new_node)
                         # remove old nodes
@@ -1110,10 +1169,16 @@ class InferAddStreamsLayer(Transformation):
                 result = node.output[0]
                 in0_shape = model.get_tensor_shape(in0)
                 in1_shape = model.get_tensor_shape(in1)
+                in0_static = not (model.get_initializer(in0) is None)
+                in1_static = not (model.get_initializer(in1) is None)
 
                 # skip if different shapes on inputs
                 if in0_shape != in1_shape:
                     continue
+                # skip if any of inputs have initializers
+                # (this node is meant for adding two dynamic streams)
+                if in0_static or in1_static:
+                    continue
 
                 idt0 = model.get_tensor_datatype(in0)
                 idt1 = model.get_tensor_datatype(in1)
@@ -1227,6 +1292,7 @@ class InferDuplicateStreamsLayer(Transformation):
                     inputDataType=dt.name,
                     numInputVectors=vecs,
                     NumOutputStreams=n_outputs,
+                    outFIFODepths=[2] * n_outputs,
                     name="DuplicateStreams_Batch_" + node.name,
                 )
 
@@ -1638,6 +1704,10 @@ class InferConcatLayer(Transformation):
                 )
                 if not dt_coherent:
                     continue
+                # skip conversion if any inputs are static
+                all_dynamic = all(
+                    [model.get_initializer(x) is None for x in node.input]
+                )
+                if not all_dynamic:
+                    continue
                 # skip conversion if inputs are not integers
                 if not dt0.is_integer():
                     continue
@@ -1654,6 +1724,7 @@ class InferConcatLayer(Transformation):
                     ElemsPerStream=elems_per_stream,
                     inputDataType=dt0.name,
                     numInputVectors=inp_vec,
+                    inFIFODepths=[2] * len(node.input),
                 )
                 graph.node.insert(node_ind, new_node)
                 # remove old node
@@ -1664,3 +1735,101 @@ class InferConcatLayer(Transformation):
             model = model.transform(InferShapes())
             model = model.transform(InferDataTypes())
         return (model, graph_modified)
+
+
+class InferStreamingEltwise(Transformation):
+    """Convert eltwise Sub or Sub -> Abs to StreamingEltwise layer
+    with SubEltwise or AbsDiffEltwise op."""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for node in graph.node:
+            node_ind += 1
+            if node.op_type == "Sub":
+                in0 = node.input[0]
+                in1 = node.input[1]
+                result = node.output[0]
+                in0_shape = model.get_tensor_shape(in0)
+                in1_shape = model.get_tensor_shape(in1)
+                in0_static = not (model.get_initializer(in0) is None)
+                in1_static = not (model.get_initializer(in1) is None)
+
+                # skip if different shapes on inputs
+                if in0_shape != in1_shape:
+                    continue
+                # skip if any of inputs have initializers
+                # (this node is meant for two dynamic streams)
+                if in0_static or in1_static:
+                    continue
+
+                idt0 = model.get_tensor_datatype(in0)
+                idt1 = model.get_tensor_datatype(in1)
+
+                # skip conversion for layers with float input
+                if not (idt0.is_integer() and idt1.is_integer()):
+                    continue
+
+                eltwiseOp = "Sub"
+                nodes_to_remove = [node]
+                # look for a downstream Abs node
+                res_consumer = model.find_consumer(result)
+                if (res_consumer is not None) and (res_consumer.op_type == "Abs"):
+                    eltwiseOp = "AbsDiff"
+                    result = res_consumer.output[0]
+                    nodes_to_remove.append(res_consumer)
+
+                # check layout and convert if necessary
+                in0_layout = model.get_tensor_layout(in0)
+                in1_layout = model.get_tensor_layout(in1)
+                result_layout = model.get_tensor_layout(result)
+
+                if in0_layout == DataLayout.NCHW:
+                    in0 = nchw_to_nhwc(in0, model, node_ind)
+                    node_ind += 1
+                    in0_shape = model.get_tensor_shape(in0)
+
+                if in1_layout == DataLayout.NCHW:
+                    in1 = nchw_to_nhwc(in1, model, node_ind)
+                    node_ind += 1
+                    in1_shape = model.get_tensor_shape(in1)
+
+                # keep track of where we need to insert the HLS Op
+                # it has to be ahead of the output transform
+                insert_point = node_ind
+
+                if result_layout == DataLayout.NCHW:
+                    result = nchw_to_nhwc(result, model, node_ind, reverse=True)
+                    node_ind += 1
+
+                # now safe to assume num_channels is size of last dimension
+                num_channels = int(in0_shape[-1])
+                # create node with no parallelization first
+                pe = 1
+
+                # create and insert new Eltwise node
+                new_node = helper.make_node(
+                    "StreamingEltwise",
+                    [in0, in1],
+                    [result],
+                    domain="finn.custom_op.fpgadataflow",
+                    backend="fpgadataflow",
+                    NumChannels=num_channels,
+                    PE=pe,
+                    inputDataType0=idt0.name,
+                    inputDataType1=idt1.name,
+                    eltwiseOp=eltwiseOp,
+                    numInputVectors=in0_shape[:-1],
+                    name="StreamingEltwise_" + node.name,
+                )
+                graph.node.insert(insert_point, new_node)
+                # remove old nodes
+                for nd in nodes_to_remove:
+                    graph.node.remove(nd)
+                graph_modified = True
+
+        # if graph_modified:
+        #     model = model.transform(InferShapes())
+        #     model = model.transform(InferDataTypes())
+        return (model, graph_modified)
diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
index 892ab09fdf41947f86e2bf122e057e94585dfa8c..8e2c69bad4b0a6749c605bea9ee21d6408c904c0 100644
--- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py
+++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
@@ -228,6 +228,22 @@ class CreateStitchedIP(Transformation):
             )
             self.s_axis_idx += 1
 
+    def connect_ap_none_external(self, node):
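+        # expose all ap_none (plain wire, non-AXI) ports of this node as
+        # external ports on the stitched block design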
+        inst_name = node.name
+        node_inst = getCustomOp(node)
+        input_intf_names = node_inst.get_verilog_top_module_intf_names()["ap_none"]
+        # make external
+        for i in range(len(input_intf_names)):
+            input_intf_name = input_intf_names[i]
+            self.connect_cmds.append(
+                "make_bd_pins_external [get_bd_pins %s/%s]"
+                % (inst_name, input_intf_name)
+            )
+            self.connect_cmds.append(
+                "set_property name %s [get_bd_ports %s_0]"
+                % (input_intf_name, input_intf_name)
+            )
+
     def insert_signature(self, checksum_count):
         signature_vlnv = "AMD:user:axi_info_top:1.0"
         signature_name = "axi_info_top0"
@@ -275,7 +291,7 @@ class CreateStitchedIP(Transformation):
             "make_bd_intf_pins_external [get_bd_intf_pins %s/s_axi]" % signature_name
         )
         self.connect_cmds.append(
-            "set_property name s_axis_info [get_bd_intf_ports s_axi_0]"
+            "set_property name s_axilite_info [get_bd_intf_ports s_axi_0]"
         )
         self.connect_cmds.append("assign_bd_address")
 
@@ -294,6 +310,14 @@ class CreateStitchedIP(Transformation):
                 behavior. It is strongly recommended to insert FIFOs prior to
                 calling CreateStitchedIP."""
             )
+        if model.graph.node[0].op_type == "StreamingFIFO":
+            firstfifo = getCustomOp(model.graph.node[0])
+            if firstfifo.get_nodeattr("impl_style") == "vivado":
+                warnings.warn(
+                    """First FIFO has impl_style=vivado, which may cause
+                    simulation glitches (e.g. dropping the first input sample
+                    after reset)."""
+                )
         for node in model.graph.node:
             # ensure that all nodes are fpgadataflow, and that IPs are generated
             assert is_fpgadataflow_node(
@@ -305,6 +329,7 @@ class CreateStitchedIP(Transformation):
             ip_dirs += [ip_dir_value]
             self.create_cmds += node_inst.code_generation_ipi()
             self.connect_clk_rst(node)
+            self.connect_ap_none_external(node)
             self.connect_axi(node)
             for i in range(len(node.input)):
                 if not is_external_input(model, node, i):
@@ -387,6 +412,7 @@ class CreateStitchedIP(Transformation):
         wrapper_filename = "%s/hdl/%s_wrapper.v" % (bd_base, block_name)
         tcl.append("add_files -norecurse %s" % wrapper_filename)
         model.set_metadata_prop("wrapper_filename", wrapper_filename)
+        tcl.append("set_property top finn_design_wrapper [current_fileset]")
         # synthesize to DCP and export stub, DCP and constraints
         if self.vitis:
             tcl.append(
@@ -565,6 +591,10 @@ class CreateStitchedIP(Transformation):
             if os.path.isfile(wrapper_filename_alt):
                 model.set_metadata_prop("wrapper_filename", wrapper_filename_alt)
             else:
-                raise Exception("CreateStitchedIP failed, no wrapper HDL found.")
+                raise Exception(
+                    """CreateStitchedIP failed, no wrapper HDL found under %s or %s.
+                    Please check logs under the parent directory."""
+                    % (wrapper_filename, wrapper_filename_alt)
+                )
 
         return (model, False)
diff --git a/src/finn/transformation/fpgadataflow/derive_characteristic.py b/src/finn/transformation/fpgadataflow/derive_characteristic.py
new file mode 100644
index 0000000000000000000000000000000000000000..67eb96995ef3312dff72799c905216b82b7ef8ee
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/derive_characteristic.py
@@ -0,0 +1,199 @@
+# Copyright (c) 2022, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import qonnx.custom_op.registry as registry
+import warnings
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.transformation.base import NodeLocalTransformation
+
+from finn.util.fpgadataflow import is_fpgadataflow_node
+
+
+class DeriveCharacteristic(NodeLocalTransformation):
+    """For each node in the graph, run rtlsim to obtain the i/o
+    characteristic function for FIFO sizing and set the attribute.
+    It is assumed that the PrepareRTLSim transformation was already
+    called on the graph.
+
+    This transformation performs rtlsim for each node, so it will run for
+    some time (minutes to hours depending on configuration).
+
+    * period (int) desired period over which the characteristic function
+      will be derived.
+
+    * num_workers (int or None) number of parallel workers, see documentation in
+      NodeLocalTransformation for more details.
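+
+    Example (hypothetical usage sketch; assumes PrepareIP, HLSSynthIP and
+    PrepareRTLSim were already applied to the model)::
+
+        model = model.transform(DeriveCharacteristic(period=10000))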
+    """
+
+    def __init__(self, period, num_workers=None, manual_bypass=False):
+        super().__init__(num_workers=num_workers)
+        self.period = period
+        self.manual_bypass = manual_bypass
+
+    def applyNodeLocal(self, node):
+        op_type = node.op_type
+        if is_fpgadataflow_node(node) is True:
+            try:
+                # lookup op_type in registry of CustomOps
+                inst = registry.getCustomOp(node)
+                inst.derive_characteristic_fxns(period=self.period)
+            except KeyError:
+                # exception if op_type is not supported
+                raise Exception(
+                    "Custom op_type %s is currently not supported." % op_type
+                )
+        return (node, False)
+
+    def apply(self, model: ModelWrapper):
+        (model, run_again) = super().apply(model)
+        if not self.manual_bypass:
+            return (model, run_again)
+        # apply manual fix for DuplicateStreams and AddStreams for
+        # simple residual reconvergent paths with bypass
+        addstrm_nodes = model.get_nodes_by_op_type("AddStreams_Batch")
+        for addstrm_node in addstrm_nodes:
+            # we currently only support the case where one branch is
+            # a bypass
+            b0 = model.find_producer(addstrm_node.input[0])
+            b1 = model.find_producer(addstrm_node.input[1])
+            if (b0 is None) or (b1 is None):
+                warnings.warn("Found unsupported AddStreams, skipping")
+                return (model, run_again)
+            b0_is_bypass = b0.op_type == "DuplicateStreams_Batch"
+            b1_is_bypass = b1.op_type == "DuplicateStreams_Batch"
+            if (not b0_is_bypass) and (not b1_is_bypass):
+                warnings.warn("Found unsupported AddStreams, skipping")
+                return (model, run_again)
+            ds_node = b0 if b0_is_bypass else b1
+            comp_branch_last = b1 if b0_is_bypass else b0
+
+            ds_comp_bout = ds_node.output[0] if b0_is_bypass else ds_node.output[1]
+            comp_branch_first = model.find_consumer(ds_comp_bout)
+            if comp_branch_first is None or comp_branch_last is None:
+                warnings.warn("Found unsupported DuplicateStreams, skipping")
+                return (model, run_again)
+            comp_branch_last = registry.getCustomOp(comp_branch_last)
+            comp_branch_first = registry.getCustomOp(comp_branch_first)
+            # for DuplicateStreams, use comp_branch_first's input characterization
+            # for AddStreams, use comp_branch_last's output characterization
+            period = comp_branch_first.get_nodeattr("io_chrc_period")
+            comp_branch_first_f = comp_branch_first.get_nodeattr("io_characteristic")[
+                : 2 * period
+            ]
+            comp_branch_last_f = comp_branch_last.get_nodeattr("io_characteristic")[
+                2 * period :
+            ]
+            ds_node_inst = registry.getCustomOp(ds_node)
+            addstrm_node_inst = registry.getCustomOp(addstrm_node)
+            ds_node_inst.set_nodeattr("io_chrc_period", period)
+            ds_node_inst.set_nodeattr("io_characteristic", comp_branch_first_f * 2)
+            addstrm_node_inst.set_nodeattr("io_chrc_period", period)
+            addstrm_node_inst.set_nodeattr("io_characteristic", comp_branch_last_f * 2)
+            warnings.warn(
+                f"Set {ds_node.name} chrc. from {comp_branch_first.onnx_node.name}"
+            )
+            warnings.warn(
+                f"Set {addstrm_node.name} chrc. from {comp_branch_last.onnx_node.name}"
+            )
+        return (model, run_again)
+
+
+class DeriveFIFOSizes(NodeLocalTransformation):
+    """Prerequisite: DeriveCharacteristic already called on graph.
+    For each node in the graph, use the accumulated I/O characteristic function
+    to perform FIFO sizing, setting the in/outFIFODepths attributes of HLSCustomOp
+    nodes.
+
+    * num_workers (int or None) number of parallel workers, see documentation in
+      NodeLocalTransformation for more details.
+    """
+
+    def __init__(self, num_workers=None, io_fifo_depth=32):
+        super().__init__(num_workers=num_workers)
+        self.io_fifo_depth = io_fifo_depth
+
+    def applyNodeLocal(self, node):
+        op_type = node.op_type
+        if is_fpgadataflow_node(node) is True:
+            try:
+                # lookup op_type in registry of CustomOps
+                prod = registry.getCustomOp(node)
+                assert op_type != "StreamingFIFO", "Found existing FIFOs"
+                period = prod.get_nodeattr("io_chrc_period")
+                prod_chrc = prod.get_nodeattr("io_chrc_out")[0]
+                assert (
+                    len(prod_chrc) == 2 * period
+                ), "Found unexpected characterization attribute"
+                if any([x > 2 for x in prod.get_nodeattr("outFIFODepths")]):
+                    # FIFO depth already set, can skip this node
+                    return (node, False)
+
+                # find consumers
+                model = self.ref_input_model
+                out_fifo_depths = []
+                for output_name in node.output:
+                    cons_node = model.find_consumer(output_name)
+                    if cons_node is None:
+                        # could be final node, will be overridden if so
+                        # need an entry in the list anyway
+                        out_fifo_depths.append(self.io_fifo_depth)
+                        continue
+                    cons = registry.getCustomOp(cons_node)
+                    cons_chrc = cons.get_nodeattr("io_chrc_in")[0]
+                    # find minimum phase shift satisfying the constraint
+                    pshift_min = period - 1
+                    for pshift_cand in range(period):
+                        prod_chrc_part = prod_chrc[pshift_cand:period]
+                        cons_chrc_part = cons_chrc[: period - pshift_cand]
+                        if (prod_chrc_part >= cons_chrc_part).all():
+                            pshift_min = pshift_cand
+                            break
+                    prod_chrc_part = prod_chrc[pshift_min : (pshift_min + period)]
+                    cons_chrc_part = cons_chrc[:period]
+                    fifo_depth = int((prod_chrc_part - cons_chrc_part).max())
+                    out_fifo_depths.append(fifo_depth)
+                # set output FIFO depth for this (producing) node
+                # InsertFIFO looks at the max of (outFIFODepths, inFIFODepths)
+                # for each tensor
+                prod.set_nodeattr("outFIFODepths", out_fifo_depths)
+
+                # finally, check node inputs to ensure FIFOs are added to
+                # any top-level inputs (at least self.io_fifo_depth deep)
+                in_fifo_depths = prod.get_nodeattr("inFIFODepths")
+                for (i, input_name) in enumerate(node.input):
+                    if input_name in [x.name for x in model.graph.input]:
+                        in_fifo_depths[i] = max(self.io_fifo_depth, in_fifo_depths[i])
+                prod.set_nodeattr("inFIFODepths", in_fifo_depths)
+
+            except KeyError:
+                # exception if op_type is not supported
+                raise Exception(
+                    "Custom op_type %s is currently not supported." % op_type
+                )
+        return (node, False)
diff --git a/src/finn/transformation/fpgadataflow/floorplan.py b/src/finn/transformation/fpgadataflow/floorplan.py
index 67920172231e685a4f5dd72f037f64fe6baf8449..549b94d9f287721aac26afd4d4d832e48adadb84 100644
--- a/src/finn/transformation/fpgadataflow/floorplan.py
+++ b/src/finn/transformation/fpgadataflow/floorplan.py
@@ -151,6 +151,7 @@ class Floorplan(Transformation):
                 node_inst.set_nodeattr("partition_id", partition_cnt)
                 partition_cnt += 1
                 continue
+
             elif not (
                 node.op_type == "MatrixVectorActivation"
                 and node_inst.get_nodeattr("mem_mode") is not None
@@ -165,9 +166,17 @@ class Floorplan(Transformation):
                 pre_inst = getCustomOp(pre_node)
                 pre_slr = pre_inst.get_nodeattr("slr")
                 if node_slr == pre_slr:
-                    partition_id = pre_inst.get_nodeattr("partition_id")
-                    node_inst.set_nodeattr("partition_id", partition_id)
-                    break
+                    axilite_intf_name = pre_inst.get_verilog_top_module_intf_names()[
+                        "axilite"
+                    ]
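+                    # a predecessor exposing an AXI-lite interface forces this
+                    # node into a fresh partition instead of merging the two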
+                    if len(axilite_intf_name) != 0:
+                        node_inst.set_nodeattr("partition_id", partition_cnt)
+                        partition_cnt += 1
+                    else:
+                        partition_id = pre_inst.get_nodeattr("partition_id")
+                        node_inst.set_nodeattr("partition_id", partition_id)
+                break
+
             else:
                 # no matching, new partition
                 node_inst.set_nodeattr("partition_id", partition_cnt)
diff --git a/src/finn/transformation/fpgadataflow/hlssynth_ip.py b/src/finn/transformation/fpgadataflow/hlssynth_ip.py
index 1fede0667888ee9059cfb2e7f5db00b6bb3f4259..c091dbd5edc675234686b28048c004b26c3fc131 100644
--- a/src/finn/transformation/fpgadataflow/hlssynth_ip.py
+++ b/src/finn/transformation/fpgadataflow/hlssynth_ip.py
@@ -64,7 +64,11 @@ class HLSSynthIP(NodeLocalTransformation):
                 ), """Node
                 attribute "code_gen_dir_ipgen" is empty. Please run
                 transformation PrepareIP first."""
-                if not os.path.isdir(inst.get_nodeattr("ipgen_path")):
+                if not os.path.isdir(
+                    inst.get_nodeattr("ipgen_path")
+                ) or inst.get_nodeattr("code_gen_dir_ipgen") not in inst.get_nodeattr(
+                    "ipgen_path"
+                ):
                     # call the compilation function for this node
                     inst.ipgen_singlenode_code()
                 else:
diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py
index 9817f2e3d2857bd5e59b304fbdaf3bad74a9b037..632d1f813b4d2509407930bc9294f7531d4c90af 100644
--- a/src/finn/transformation/fpgadataflow/insert_dwc.py
+++ b/src/finn/transformation/fpgadataflow/insert_dwc.py
@@ -81,6 +81,15 @@ class InsertDWC(Transformation):
                             dwc_in_width = n0.get_outstream_width()
                             # determine dwc outwidth
                             dwc_out_width = n1.get_instream_width()
+                            larger_width = max(dwc_in_width, dwc_out_width)
+                            smaller_width = min(dwc_in_width, dwc_out_width)
+                            both_8bit_aligned = (larger_width % 8 == 0) and (
+                                smaller_width % 8 == 0
+                            )
+                            if both_8bit_aligned:
+                                impl_style = "vivado"
+                            else:
+                                impl_style = "hls"
 
                             # determine shape for dwc
                             dwc_shape = n0.get_normal_output_shape()
@@ -105,6 +114,7 @@ class InsertDWC(Transformation):
                                 inWidth=dwc_in_width,
                                 outWidth=dwc_out_width,
                                 dataType=str(dtype.name),
+                                impl_style=impl_style,
                             )
                             # insert dwc
                             graph.node.insert(node_ind + 1, dwc_node)
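
# Illustrative sketch (not part of the patch): InsertDWC's new impl_style rule.
# If both stream widths are byte-aligned, the Vivado AXIS width converter can be
# used; otherwise the HLS data width converter is required. Widths below are
# hypothetical examples.
def choose_dwc_impl_style(dwc_in_width, dwc_out_width):
    both_8bit_aligned = (dwc_in_width % 8 == 0) and (dwc_out_width % 8 == 0)
    return "vivado" if both_8bit_aligned else "hls"

assert choose_dwc_impl_style(16, 64) == "vivado"
assert choose_dwc_impl_style(9, 24) == "hls"
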
diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py
index 78200b280960ad53e3e84d44394c10296c432ba5..bfeee95e9bbd2a3a3f7c6eb0a4c7e74d30f76228 100644
--- a/src/finn/transformation/fpgadataflow/insert_fifo.py
+++ b/src/finn/transformation/fpgadataflow/insert_fifo.py
@@ -67,19 +67,31 @@ class InsertFIFO(Transformation):
     between fpgadataflow nodes.
 
     Takes the setting for the depth from the surrounding nodes by extracting
-    node attribute 'outFIFODepth' of the previous and node attribute 'inFIFODepth'
+    node attribute 'outFIFODepths' of the previous and node attribute 'inFIFODepths'
     of the subsequent node. max() of these two values sets the FIFO depth.
 
-    Normally, shallow-depth (<=2) FIFOs won't be created since HLS streaming
-    interfaces already have a degree of buffering. You can set
-    create_shallow_fifos=True to override this default behavior.
+    Constructor arguments:
+
+    :parameter max_qsrl_depth: FIFOs deeper than this will use Vivado IP
+        instead of Verilog FIFOs (Q_srl.v)
+    :parameter vivado_ram_style: the StreamingFIFO.ram_style attribute
+        to be used for large FIFOs implemented by Vivado
+    :parameter create_shallow_fifos: Normally, shallow-depth (<=2) FIFOs
+        won't be created since HLS streaming interfaces
+        already have a degree of buffering.
+        Set to True to create shallow FIFOs anyway.
+
 
     The other node attributes necessary to create a FIFO node are taken from the
     node the FIFO node is inserted after: 'folded_shape' and 'dtype'"""
 
-    def __init__(self, create_shallow_fifos=False):
+    def __init__(
+        self, create_shallow_fifos=False, max_qsrl_depth=None, vivado_ram_style="auto"
+    ):
         super().__init__()
         self.create_shallow_fifos = create_shallow_fifos
+        self.max_qsrl_depth = max_qsrl_depth
+        self.vivado_ram_style = vivado_ram_style
 
     def apply(self, model):
         graph = model.graph
@@ -88,8 +100,8 @@ class InsertFIFO(Transformation):
         for first_node in graph.node:
             node_ind += 1
             if _suitable_node(first_node):
-                for n_output in first_node.output:
-                    consumers = model.find_consumers(n_output)
+                for idx_out, output_name in enumerate(first_node.output):
+                    consumers = model.find_consumers(output_name)
                     if consumers == []:
                         continue
                     if len(consumers) > 1:
@@ -108,11 +120,9 @@ class InsertFIFO(Transformation):
                         # input of the second node is equal
                         n1 = getCustomOp(consumer)
                         for idx, inp in enumerate(consumer.input):
-                            if inp == n_output:
-                                if idx == 0:
-                                    fld_shape_2 = n1.get_folded_input_shape()
-                                else:
-                                    fld_shape_2 = n1.get_folded_input_shape(ind=idx)
+                            if inp == output_name:
+                                fld_shape_2 = n1.get_folded_input_shape(ind=idx)
+                                idx_inp = idx
                         assert _suitable_folded_shapes(
                             fld_shape, fld_shape_2
                         ), """The
@@ -120,14 +130,12 @@ class InsertFIFO(Transformation):
                         folded output shape of the second node. A streaming fifo can't
                         be implemented in between these nodes."""
 
-                        # check if outFIFOdepth attribute of first node
-                        # and inFIFOdepth attribute of consumer node is equal
-                        n0_depth = n0.get_nodeattr("outFIFODepth")
-                        n1_depth = n1.get_nodeattr("inFIFODepth")
-                        if n0_depth == n1_depth:
-                            fifo_depth = n0_depth
-                        elif n0_depth != n1_depth:
-                            fifo_depth = max(n0_depth, n1_depth)
+                        # determine the FIFO depth as the max of the first
+                        # node's outFIFODepths and consumer's inFIFODepths entry
+                        n0_depth = n0.get_nodeattr("outFIFODepths")[idx_out]
+                        n1_depth = n1.get_nodeattr("inFIFODepths")[idx_inp]
+
+                        fifo_depth = max(n0_depth, n1_depth)
 
                         if fifo_depth > 2 or self.create_shallow_fifos:
                             # assumption: HLS streaming components already have
@@ -143,25 +151,35 @@ class InsertFIFO(Transformation):
                             graph.value_info.append(fifo_output_tensor)
                             model.set_tensor_datatype(fifo_output_tensor.name, dtype)
 
+                            if (
+                                self.max_qsrl_depth is None
+                                or fifo_depth <= self.max_qsrl_depth
+                            ):
+                                impl_style = "rtl"
+                            else:
+                                impl_style = "vivado"
+
                             fifo_node = oh.make_node(
                                 "StreamingFIFO",
-                                [n_output],
+                                [output_name],
                                 [fifo_output_tensor.name],
                                 domain="finn.custom_op.fpgadataflow",
                                 backend="fpgadataflow",
                                 depth=fifo_depth,
                                 folded_shape=fld_shape,
                                 dataType=str(dtype.name),
+                                impl_style=impl_style,
+                                ram_style=self.vivado_ram_style,
                             )
                             # insert fifo
                             graph.node.insert(node_ind + 1, fifo_node)
                             # set fifo output tensor as new input tensor of second node
                             for idx, inp in enumerate(consumer.input):
-                                if inp == n_output:
+                                if inp == output_name:
                                     consumer.input[idx] = fifo_output_tensor.name
-                            # ensure created FIFO depth is reflected on both sides
-                            n0.set_nodeattr("outFIFODepth", fifo_depth)
-                            n1.set_nodeattr("inFIFODepth", fifo_depth)
+                            # removed setting of node attributes based on created
+                            # FIFO sizes here, better to preserve original attrs
+                            # as they are.
                             graph_modified = True
 
         if graph_modified is False:
@@ -177,42 +195,48 @@ class InsertFIFO(Transformation):
                     n_input = first_node.input[inp_ind]
                     n0 = getCustomOp(first_node)
                     # determine fifo node attributes
-                    if inp_ind == 0:
-                        fld_shape = n0.get_folded_input_shape()
-                        dtype = n0.get_input_datatype()
+                    fld_shape = n0.get_folded_input_shape(inp_ind)
+                    dtype = n0.get_input_datatype(inp_ind)
+                    fifo_depth = n0.get_nodeattr("inFIFODepths")[inp_ind]
+
+                    if fifo_depth > 2 or self.create_shallow_fifos:
+                        # create fifo node
+                        fifo_output_tensor = oh.make_tensor_value_info(
+                            model.make_new_valueinfo_name(),
+                            TensorProto.FLOAT,
+                            n0.get_normal_input_shape(),
+                        )
+                        graph.value_info.append(fifo_output_tensor)
+                        model.set_tensor_datatype(fifo_output_tensor.name, dtype)
+
+                        # only use rtl-style FIFOs to avoid simulation bug
+                        # (top-level IOs should not have impl_style=vivado)
+                        impl_style = "rtl"
+
+                        fifo_node = oh.make_node(
+                            "StreamingFIFO",
+                            [n_input],
+                            [fifo_output_tensor.name],
+                            domain="finn.custom_op.fpgadataflow",
+                            backend="fpgadataflow",
+                            depth=fifo_depth,
+                            folded_shape=fld_shape,
+                            dataType=str(dtype.name),
+                            impl_style=impl_style,
+                            ram_style=self.vivado_ram_style,
+                        )
+                        # insert fifo
+                        graph.node.insert(0, fifo_node)
+
+                        # set fifo output tensor as new input tensor of first node
+                        first_node.input[inp_ind] = fifo_output_tensor.name
                     else:
-                        fld_shape = n0.get_folded_input_shape(inp_ind)
-                        dtype = n0.get_input_datatype(inp_ind)
-                    fifo_depth = n0.get_nodeattr("inFIFODepth")
-
-                    if fifo_depth <= 2:
-                        warnings.warn("Overriding input FIFO depth to 32")
-                        fifo_depth = 32
-
-                    # create fifo node
-                    fifo_output_tensor = oh.make_tensor_value_info(
-                        model.make_new_valueinfo_name(),
-                        TensorProto.FLOAT,
-                        n0.get_normal_input_shape(),
-                    )
-                    graph.value_info.append(fifo_output_tensor)
-                    model.set_tensor_datatype(fifo_output_tensor.name, dtype)
-
-                    fifo_node = oh.make_node(
-                        "StreamingFIFO",
-                        [n_input],
-                        [fifo_output_tensor.name],
-                        domain="finn.custom_op.fpgadataflow",
-                        backend="fpgadataflow",
-                        depth=fifo_depth,
-                        folded_shape=fld_shape,
-                        dataType=str(dtype.name),
-                    )
-                    # insert fifo
-                    graph.node.insert(0, fifo_node)
-
-                    # set fifo output tensor as new input tensor of second node
-                    first_node.input[inp_ind] = fifo_output_tensor.name
+                        warnings.warn(
+                            """Input FIFO for %s has depth %d and won't
+                        be created. This may cause RTL simulation issues.
+                        """
+                            % (graph_in_name, fifo_depth)
+                        )
 
             # insert FIFO as last node, except when last node is DMA
             graph_out_names = [x.name for x in model.graph.output]
@@ -227,38 +251,49 @@ class InsertFIFO(Transformation):
                     ), """Insert tlast marker should be done
                         after inserting the FIFOs"""
                     n0 = getCustomOp(final_node)
+                    out_ind = list(final_node.output).index(graph_out_name)
                     # determine fifo node attributes
-                    fld_shape = n0.get_folded_output_shape()
-                    dtype = n0.get_output_datatype()
-                    fifo_depth = n0.get_nodeattr("outFIFODepth")
-
-                    if fifo_depth <= 2:
-                        warnings.warn("Overriding output FIFO depth to 32")
-                        fifo_depth = 32
-
-                    # create fifo node
-                    fifo_input_tensor = oh.make_tensor_value_info(
-                        model.make_new_valueinfo_name(),
-                        TensorProto.FLOAT,
-                        n0.get_normal_output_shape(),
-                    )
-                    graph.value_info.append(fifo_input_tensor)
-                    model.set_tensor_datatype(fifo_input_tensor.name, dtype)
-
-                    fifo_node = oh.make_node(
-                        "StreamingFIFO",
-                        [fifo_input_tensor.name],
-                        [graph_out_name],
-                        domain="finn.custom_op.fpgadataflow",
-                        backend="fpgadataflow",
-                        depth=fifo_depth,
-                        folded_shape=fld_shape,
-                        dataType=str(dtype.name),
-                    )
-                    # insert fifo
-                    graph.node.append(fifo_node)
-
-                    # set fifo output tensor as new input tensor of second node
-                    final_node.output[0] = fifo_input_tensor.name
+                    fld_shape = n0.get_folded_output_shape(out_ind)
+                    dtype = n0.get_output_datatype(out_ind)
+                    fifo_depth = n0.get_nodeattr("outFIFODepths")[out_ind]
+
+                    if fifo_depth > 2 or self.create_shallow_fifos:
+                        # create fifo node
+                        fifo_input_tensor = oh.make_tensor_value_info(
+                            model.make_new_valueinfo_name(),
+                            TensorProto.FLOAT,
+                            n0.get_normal_output_shape(),
+                        )
+                        graph.value_info.append(fifo_input_tensor)
+                        model.set_tensor_datatype(fifo_input_tensor.name, dtype)
+
+                        # only use rtl-style FIFOs to avoid simulation bug
+                        # (top-level IOs should not have impl_style=vivado)
+                        impl_style = "rtl"
+
+                        fifo_node = oh.make_node(
+                            "StreamingFIFO",
+                            [fifo_input_tensor.name],
+                            [graph_out_name],
+                            domain="finn.custom_op.fpgadataflow",
+                            backend="fpgadataflow",
+                            depth=fifo_depth,
+                            folded_shape=fld_shape,
+                            dataType=str(dtype.name),
+                            impl_style=impl_style,
+                            ram_style=self.vivado_ram_style,
+                        )
+                        # insert fifo
+                        graph.node.append(fifo_node)
+
+                        # set fifo input tensor as new output tensor of final node
+                        final_node.output[0] = fifo_input_tensor.name
+                    else:
+                        warnings.warn(
+                            """Output FIFO for %s has depth %d and won't
+                        be created. This may cause RTL simulation issues.
+                        """
+                            % (graph_out_name, fifo_depth)
+                        )
 
         return (model, graph_modified)
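
# Illustrative sketch (not part of the patch): how InsertFIFO now sizes a FIFO
# between two nodes and picks its implementation. max_qsrl_depth mirrors the new
# constructor argument; depths <= 2 are skipped unless create_shallow_fifos=True.
def plan_fifo(out_depths, idx_out, in_depths, idx_inp,
              max_qsrl_depth=None, create_shallow_fifos=False):
    depth = max(out_depths[idx_out], in_depths[idx_inp])
    if depth <= 2 and not create_shallow_fifos:
        return None  # HLS streaming interfaces already buffer this much
    if max_qsrl_depth is None or depth <= max_qsrl_depth:
        return depth, "rtl"
    return depth, "vivado"

assert plan_fifo([32], 0, [2], 0) == (32, "rtl")
assert plan_fifo([512], 0, [2], 0, max_qsrl_depth=256) == (512, "vivado")
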
diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py
index 4b4eb6362faf641def057afadfa7b5e019f54698..28bcd9598af34072cc854fdf23778bef778bd985 100644
--- a/src/finn/transformation/fpgadataflow/insert_iodma.py
+++ b/src/finn/transformation/fpgadataflow/insert_iodma.py
@@ -211,7 +211,8 @@ class InsertIODMA(Transformation):
             # attached IODMA
             fc_extw_nodes = list(
                 filter(
-                    lambda x: x.op_type == "MatrixVectorActivation"
+                    lambda x: x.op_type
+                    in ["MatrixVectorActivation", "VectorVectorActivation"]
                     and getCustomOp(x).get_nodeattr("mem_mode") == "external"
                     and model.find_producer(x.input[1]) is None,
                     all_nodes,
@@ -259,6 +260,10 @@ class InsertIODMA(Transformation):
                 )
                 fc_node.input[1] = fc_node_in.name
                 model.graph.node.insert(0, dma_node)
+                # expand inFIFODepths for new second input of node
+                infifo_depth = fc_inst.get_nodeattr("inFIFODepths")
+                infifo_depth.append(8)
+                fc_inst.set_nodeattr("inFIFODepths", infifo_depth)
                 modified = True
         if modified:
             model = model.transform(SortGraph())
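
# Illustrative sketch (not part of the patch): once an IODMA is attached for
# external weights, the consumer node gains a second (weight) stream input, so
# its inFIFODepths list grows by one entry with a small default depth of 8.
infifo_depths = [2]        # hypothetical single activation input before the DMA
infifo_depths.append(8)    # new weight stream fed by the inserted IODMA
assert infifo_depths == [2, 8]
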
diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
index a589cb039c825ff97c11df7ffa57109df27f3fd0..f48566326e576f4d39d81359fe7f28a12645a635 100644
--- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py
+++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
@@ -45,7 +45,7 @@ from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
 from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
 from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.util.basic import make_build_dir, pynq_part_map
+from finn.util.basic import make_build_dir, pynq_native_port_width, pynq_part_map
 
 from . import templates
 
@@ -320,6 +320,7 @@ class ZynqBuild(Transformation):
     ):
         super().__init__()
         self.fpga_part = pynq_part_map[platform]
+        self.axi_port_width = pynq_native_port_width[platform]
         self.period_ns = period_ns
         self.platform = platform
         self.enable_debug = enable_debug
@@ -330,7 +331,7 @@ class ZynqBuild(Transformation):
         model = model.transform(InferDataLayouts())
         # prepare at global level, then break up into kernels
         prep_transforms = [
-            InsertIODMA(64),
+            InsertIODMA(self.axi_port_width),
             InsertDWC(),
             Floorplan(),
             CreateDataflowPartition(partition_model_dir=self.partition_model_dir),
diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
index 0139c71666fdfa4b60cb356ceb65ce2c5b831c13..35e7b9e6c929587d00038650742edb5dcb922130 100644
--- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py
+++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
@@ -29,10 +29,16 @@
 import math
 import numpy as np
 import warnings
+from onnx import TensorProto, helper
 from pyverilator.util.axi_utils import reset_rtlsim, toggle_clk
+from qonnx.core.datatype import DataType
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.base import Transformation
-from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
+from qonnx.transformation.general import (
+    GiveReadableTensorNames,
+    GiveUniqueNodeNames,
+    SortGraph,
+)
 
 from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
 from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles
@@ -42,7 +48,7 @@ from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
 from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.util.fpgadataflow import is_fpgadataflow_node
-from finn.util.pyverilator import pyverilate_stitched_ip
+from finn.util.pyverilator import pyverilate_stitched_ip, verilator_fifosim
 
 
 def reset_implementation(node):
@@ -72,8 +78,9 @@ def optimize_depth(depth):
         # Q_srl FIFOs do not benefit from size < 32
         # add some slack
         return 32
-    # round to nearest power of two for Vivado IP FIFO implementation
-    return int(2 ** math.ceil(math.log2(depth)))
+    # otherwise leave as is
+    # will be rounded to nearest power of two for Vivado-style FIFO
+    return int(depth)
 
 
 class RemoveShallowFIFOs(Transformation):
@@ -125,14 +132,17 @@ class CapConvolutionFIFODepths(Transformation):
     constructor flag is set.
 
     Constructor arguments:
-    - max_qsrl_depth : FIFOs deeper than this will use Vivado IP instead of
-                       Verilog FIFOs (Q_srl.v)
+
+    :parameter max_qsrl_depth: FIFOs deeper than this will use Vivado IP
+        instead of Verilog FIFOs (Q_srl.v)
 
     Assumed input graph properties:
+
     - all nodes are fpgadataflow nodes
     - FIFOs inserted with InsertAndSetFIFODepths
 
     Output:
+
     - graph with smaller-depth FIFOs for convolutions
 
     Background:
@@ -188,21 +198,25 @@ class InsertAndSetFIFODepths(Transformation):
     throughput in the created accelerator.
 
     Constructor arguments:
-    - clk_ns : clock period (used for IP preparation)
-    - max_qsrl_depth : FIFOs deeper than this will use Vivado IP instead of
-                       Verilog FIFOs (Q_srl.v)
-    - max_depth : how deep the "max"-sized FIFOs initially inserted will be
-    - swg_exception : call CapConvolutionFIFODepths to make convolution FIFOs
-                        smaller where appropriate
-    - vivado_ram_style : the StreamingFIFO.ram_style attribute to be used for
-                          large FIFOs implemented by Vivado
+
+    :parameter clk_ns: clock period (used for IP preparation)
+    :parameter max_qsrl_depth: FIFOs deeper than this will use Vivado IP
+        instead of Verilog FIFOs (Q_srl.v)
+    :parameter max_depth: how deep the "max"-sized FIFOs initially inserted
+        will be. If set to None, use the tensor size as the depth
+    :parameter swg_exception: call CapConvolutionFIFODepths to make convolution FIFOs
+        smaller where appropriate
+    :parameter vivado_ram_style: the StreamingFIFO.ram_style attribute to be used
+        for large FIFOs implemented by Vivado afterwards
 
     Assumed input graph properties:
+
     - all nodes are fpgadataflow nodes
     - no FIFOs inserted,
-    - (inFIFODepth/outFIFODepth attrs will be ignored)
+    - (inFIFODepths/outFIFODepths attrs will be ignored)
 
     Output:
+
     - graph with appropriate-depth FIFOs inserted
 
     Background:
@@ -210,12 +224,14 @@ class InsertAndSetFIFODepths(Transformation):
     necessary to insert FIFOs between them to prevent stalls due to bursty
     behavior. The sizes of those FIFOs are hard to predict analytically, so
     we do the following:
-    - insert very deep (default 16k deep) FIFOs between all fpgadataflow nodes
+
+    - insert deep (=tensor size) FIFOs between all fpgadataflow nodes
     - create stitched design
     - run through rtlsim with stream of multiple random input images (to fill pipeline)
     - keep track of observed maximum occupancy for each FIFO during rtlsim
     - when sim finished, update each FIFO depth to maximum observed occupancy
-      and set inFIFODepth/outFIFODepth attrs to 0 on relevant nodes
+      and set inFIFODepths/outFIFODepths attrs to that depth as well
+
     """
 
     def __init__(
@@ -223,9 +239,10 @@ class InsertAndSetFIFODepths(Transformation):
         fpgapart,
         clk_ns=10.0,
         max_qsrl_depth=256,
-        max_depth=2**14,
+        max_depth=None,
         swg_exception=True,
         vivado_ram_style="auto",
+        force_python_sim=False,
     ):
         super().__init__()
         self.fpgapart = fpgapart
@@ -234,8 +251,12 @@ class InsertAndSetFIFODepths(Transformation):
         self.max_depth = max_depth
         self.swg_exception = swg_exception
         self.vivado_ram_style = vivado_ram_style
+        self.force_python_sim = force_python_sim
 
     def apply(self, model):
+        # these optypes may potentially use external weights
+        # we'll temporarily change them to use decoupled mode for FIFO sizing
+        extw_optypes = ["MatrixVectorActivation", "VectorVectorActivation"]
         # change external to decoupled and warn user
         # this way we are sure we have exactly one input/output
         modified_fc_nodes = []
@@ -246,9 +267,22 @@ class InsertAndSetFIFODepths(Transformation):
             )
             assert node.op_type != "StreamingFIFO", "Found existing StreamingFIFO node"
             node = getCustomOp(node)
-            node.set_nodeattr("inFIFODepth", self.max_depth)
-            node.set_nodeattr("outFIFODepth", self.max_depth)
-            if node.onnx_node.op_type == "MatrixVectorActivation":
+            ifd = node.get_nodeattr("inFIFODepths")
+            ofd = node.get_nodeattr("outFIFODepths")
+            if self.max_depth is not None:
+                ifd = [self.max_depth] * len(ifd)
+                ofd = [self.max_depth] * len(ofd)
+            else:
+                # set each FIFO to its tensor size
+                # (excluding the innermost stream-width dim, hence the [:-1])
+                for i in range(len(ifd)):
+                    ifd[i] = np.prod(node.get_folded_input_shape(i)[:-1])
+                for o in range(len(ofd)):
+                    ofd[o] = np.prod(node.get_folded_output_shape(o)[:-1])
+            node.set_nodeattr("inFIFODepths", ifd)
+            node.set_nodeattr("outFIFODepths", ofd)
+
+            if node.onnx_node.op_type in extw_optypes:
                 mmode = node.get_nodeattr("mem_mode")
                 if mmode == "external":
                     modified_fc_nodes.append(node.onnx_node.name)
@@ -261,19 +295,23 @@ class InsertAndSetFIFODepths(Transformation):
 
         # insert stream infrastructure (DWC/FIFO)
         model = model.transform(InsertDWC())
-        model = model.transform(InsertFIFO())
+        model = model.transform(InsertFIFO(create_shallow_fifos=True))
         model = model.transform(GiveUniqueNodeNames())
         model = model.transform(GiveReadableTensorNames())
 
         # gather FIFO names, check they are of expected depth
         fifos = {}
-        for node in model.graph.node:
-            if node.op_type == "StreamingFIFO":
-                fifos[node.name] = 0
-                node = getCustomOp(node)
-                # check depths and fix as necessary
-                if node.get_nodeattr("depth") != self.max_depth:
-                    node.set_nodeattr("depth", self.max_depth)
+        fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO")
+        for node in fifo_nodes:
+            fifos[node.name] = 0
+            node = getCustomOp(node)
+            node.set_nodeattr("depth_monitor", 1)
+            node.set_nodeattr("impl_style", "rtl")
+            # check depths and fix as necessary
+            if (self.max_depth is not None) and (
+                node.get_nodeattr("depth") != self.max_depth
+            ):
+                node.set_nodeattr("depth", self.max_depth)
 
         # insert FIFOs and do all transformations for RTLsim
         model = model.transform(AnnotateCycles())
@@ -285,75 +323,84 @@ class InsertAndSetFIFODepths(Transformation):
         model = model.transform(CreateStitchedIP(self.fpgapart, self.clk_ns))
         model.set_metadata_prop("exec_mode", "rtlsim")
 
-        # calculate input frequency (number of cycles for each input word)
-        first_node = getCustomOp(model.graph.node[0])
-        ncycles_per_input = max(
-            1,
-            int(
-                math.ceil(
-                    perf["max_cycles"]
-                    / (
-                        np.prod(first_node.get_folded_input_shape())
-                        / first_node.get_folded_input_shape()[-1]
+        if self.force_python_sim:
+            # do rtlsim in Python for FIFO sizing
+            # calculate input frequency (number of cycles for each input word)
+            first_node = getCustomOp(model.graph.node[0])
+            ncycles_per_input = max(
+                1,
+                int(
+                    math.ceil(
+                        perf["max_cycles"]
+                        / (
+                            np.prod(first_node.get_folded_input_shape())
+                            / first_node.get_folded_input_shape()[-1]
+                        )
                     )
-                )
-            ),
-        )
+                ),
+            )
 
-        # set sufficiently large threshold for 1 image to  fully execute and exit
-        ncycles = int(latency + max_cycles)
+            # set sufficiently large threshold for 1 image to fully execute and exit
+            ncycles = int(latency + max_cycles)
 
-        # prepare pyverilator model
-        sim = pyverilate_stitched_ip(model)
+            # prepare pyverilator model
+            sim = pyverilate_stitched_ip(model)
 
-        reset_rtlsim(sim)
-        toggle_clk(sim)
+            reset_rtlsim(sim)
+            toggle_clk(sim)
 
-        # set all input valids to 0 and output readies to 1
-        # set input data to some constant
-        set_signal(sim, "tvalid", 0)
-        set_signal(sim, "tready", 1)
-        set_signal(sim, "tdata", 0)
+            # set all input valids to 0 and output readies to 1
+            # set input data to some constant
+            set_signal(sim, "tvalid", 0)
+            set_signal(sim, "tready", 1)
+            set_signal(sim, "tdata", 0)
+
+            output_detected = False
+            while ncycles > 0:
+                toggle_clk(sim)
+                # set/unset valids
+                if ncycles % ncycles_per_input == 0:
+                    set_signal(sim, "tvalid", 1)
+                else:
+                    set_signal(sim, "tvalid", 0)
 
-        output_detected = False
-        while ncycles > 0:
-            toggle_clk(sim)
-            # set/unset valids
-            if ncycles % ncycles_per_input == 0:
-                set_signal(sim, "tvalid", 1)
-            else:
-                set_signal(sim, "tvalid", 0)
-
-            # check/update all fifo counts
-            for key in fifos:
-                current_state = sim.internals["finn_design_i"][key]["inst"][
-                    key + "_" + key
-                ]["state"]
-                current_addr = sim.internals["finn_design_i"][key]["inst"][
-                    key + "_" + key
-                ]["addr"]
-                if current_state == 2:
-                    current_count = current_addr + 2
+                # since latency estimation is very pessimistic, detect first output
+                # and fast-forward the sim
+                if get_signal(sim, "tvalid") != 0 and not output_detected:
+                    ncycles = max_cycles
+                    output_detected = True
                 else:
-                    current_count = current_state
-                if current_count > fifos[key]:
-                    fifos[key] = current_count
-
-            # since latency estimation is very pessimistic, detect first output
-            # and fast-forward the sim
-            if get_signal(sim, "tvalid") != 0 and not output_detected:
-                ncycles = max_cycles
-                output_detected = True
+                    ncycles = ncycles - 1
+
+            if not output_detected:
+                warnings.warn(
+                    "No output detected, calculated FIFO depths may not be correct"
+                )
+        else:
+            # do rtlsim in C++ for FIFO sizing
+            # determine # inputs for FIFO sizing according to topology type
+            swg_nodes = [
+                x for x in model.graph.node if "ConvolutionInputGenerator" in x.op_type
+            ]
+            if len(swg_nodes) == 0:
+                # MLP, no layer overlap
+                # assuming half the nodes are now FIFOs, use half the number
+                # of nodes as the number of inputs to drive the simulation
+                n_inputs = int(len(model.graph.node) / 2)
             else:
-                ncycles = ncycles - 1
+                # convnet, two inputs are typically enough to fill entire
+                # layer pipeline due to overlaps
+                n_inputs = 2
+            sim = verilator_fifosim(model, n_inputs)
 
-        if not output_detected:
-            warnings.warn(
-                "No output detected, calculated FIFO depths may not be correct"
-            )
+        for ind, node in enumerate(fifo_nodes):
+            maxcount_name = "maxcount_%d" % ind
+            if ind == 0:
+                maxcount_name = "maxcount"
+            fifos[node.name] = sim[maxcount_name]
 
         # Apply depths back into the model;
-        # also set in/outFIFODepth to zero for non-FIFO
+        # also set in/outFIFODepths to zero for non-FIFO
         # nodes, preventing further FIFO insertion
         for node in model.graph.node:
             # set FIFO depth, reset FIFO implementation,
@@ -364,8 +411,14 @@ class InsertAndSetFIFODepths(Transformation):
                 depth = optimize_depth(fifos[node.name])
                 node_inst = getCustomOp(node)
                 node_inst.set_nodeattr("depth", depth)
+                node_inst.set_nodeattr("depth_monitor", 0)
+                # exception for top-level IO FIFOs which cause a bug in simulation
+                # (top-level IOs should not have impl_style=vivado)
+                toplevel_in = node.input[0] in [x.name for x in model.graph.input]
+                toplevel_out = node.output[0] in [x.name for x in model.graph.output]
+                toplevel_style_exception = toplevel_in or toplevel_out
                 # Set FIFO implementation/ram styles
-                if depth > self.max_qsrl_depth:
+                if (depth > self.max_qsrl_depth) and (not toplevel_style_exception):
                     node_inst.set_nodeattr("impl_style", "vivado")
                     node_inst.set_nodeattr("ram_style", self.vivado_ram_style)
                 else:
@@ -374,11 +427,10 @@ class InsertAndSetFIFODepths(Transformation):
                 reset_implementation(node_inst)
                 del fifos[node.name]
             else:
-                getCustomOp(node).set_nodeattr("inFIFODepth", 0)
-                getCustomOp(node).set_nodeattr("outFIFODepth", 0)
-                # for every FC node we changed from external to decoupled,
+                # (removed setting of node FIFO size attributes to 0 here)
+                # for every extw node we changed from external to decoupled,
                 # change back and reset implementation
-                if node.op_type == "MatrixVectorActivation":
+                if node.op_type in extw_optypes:
                     if node.name in modified_fc_nodes:
                         node_inst = getCustomOp(node)
                         node_inst.set_nodeattr("mem_mode", "external")
@@ -397,4 +449,172 @@ class InsertAndSetFIFODepths(Transformation):
         # remove shallow FIFOs
         model = model.transform(RemoveShallowFIFOs())
 
+        # reflect final values in attributes
+        for node in model.graph.node:
+            if node.op_type != "StreamingFIFO":
+                node_inst = getCustomOp(node)
+                fifodepth_in = []
+                for node_inp in node.input:
+                    prod = model.find_producer(node_inp)
+                    if prod is None:
+                        # no producer for this input
+                        if node_inp in [x.name for x in model.graph.input]:
+                            # top-level input with no FIFO
+                            fifodepth_in.append(0)
+                        else:
+                            # static (initializer) input, no FIFO depth to record
+                            pass
+                    else:
+                        # there is a producer for this input
+                        if prod.op_type == "StreamingFIFO":
+                            prod_inst = getCustomOp(prod)
+                            fifodepth_in.append(prod_inst.get_nodeattr("depth"))
+                        else:
+                            # explicitly no FIFO on this dynamic input
+                            fifodepth_in.append(0)
+                fifodepth_out = []
+                for node_out in node.output:
+                    cons = model.find_consumer(node_out)
+                    if cons is None:
+                        # no consumer for this output
+                        if node_out in [x.name for x in model.graph.output]:
+                            # top-level output with no FIFO
+                            fifodepth_out.append(0)
+                        else:
+                            # unconsumed output, no FIFO depth to record
+                            pass
+                    else:
+                        # there is a consumer for this input
+                        if cons.op_type == "StreamingFIFO":
+                            cons_inst = getCustomOp(cons)
+                            fifodepth_out.append(cons_inst.get_nodeattr("depth"))
+                        else:
+                            # explicitly no FIFO on this dynamic output
+                            fifodepth_out.append(0)
+                node_inst.set_nodeattr("inFIFODepths", fifodepth_in)
+                node_inst.set_nodeattr("outFIFODepths", fifodepth_out)
+
+        return (model, False)
+
+
+def get_fifo_split_configs(depth, max_qsrl_depth=256, max_vivado_depth=32768):
+    """Break non-power-of-2 sized FIFO depths into several ones"""
+
+    def floor_pow2(x):
+        if (x & (x - 1) == 0) and x != 0:
+            return x
+        else:
+            return 1 << ((x - 1).bit_length() - 1)
+
+    def decompose_pow2(x):
+        if x <= max_qsrl_depth:
+            return [x]
+        else:
+            r = floor_pow2(x)
+            if x == r:
+                return [x]
+            else:
+                return [r, *decompose_pow2(x - r)]
+
+    ret = []
+    # trivial case: for small FIFOs, return as-is with rtl style
+    if depth <= max_qsrl_depth:
+        return [(depth, "rtl")]
+    # first pass: ensure max depth is respected
+    # (restricted by Vivado AXIS infra IP)
+    remainder = depth
+    while remainder != 0:
+        if remainder > max_vivado_depth:
+            ret.append(max_vivado_depth)
+            remainder -= max_vivado_depth
+        else:
+            ret.append(remainder)
+            remainder = 0
+    # second pass: break non-power-of-2 sized FIFOs
+    # into several ones
+
+    ret_pass2 = list(map(decompose_pow2, ret))
+    # unpack list of lists
+    ret_pass2 = [x for dec_list in ret_pass2 for x in dec_list]
+
+    # finally, add impl_style to each split FIFO
+    ret_final = []
+    for cand_depth in ret_pass2:
+        if cand_depth <= max_qsrl_depth:
+            ret_final.append((cand_depth, "rtl"))
+        else:
+            ret_final.append((cand_depth, "vivado"))
+
+    return ret_final
+
+
+class SplitLargeFIFOs(Transformation):
+    """Split large FIFOs before implementation, for two reasons:
+
+    - impl_style="vivado" supports a max depth of 32k. Any larger
+      FIFOs must be implemented as a sequence of smaller FIFOs.
+    - impl_style="vivado" requires power-of-two depths, which is
+      normally handled by rounding up to the nearest power-of-two.
+      So a FIFO of size 8196 normally gets rounded-up to a depth of
+      16384 and wastes a lot of resources. Here, instead, we split
+      this up into two FIFOs of depth 8192 + 4.
+
+    """
+
+    def __init__(self, max_qsrl_depth=256, max_vivado_depth=32768):
+        super().__init__()
+        self.max_qsrl_depth = max_qsrl_depth
+        self.max_vivado_depth = max_vivado_depth
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for node in graph.node:
+            node_ind += 1
+            if node.op_type == "StreamingFIFO":
+                n_inst = getCustomOp(node)
+                depth = n_inst.get_nodeattr("depth")
+                cfgs = get_fifo_split_configs(
+                    depth, self.max_qsrl_depth, self.max_vivado_depth
+                )
+                if len(cfgs) > 1:
+                    fld_shape = n_inst.get_folded_output_shape()
+                    dtype = n_inst.get_nodeattr("dataType")
+                    ram_style = n_inst.get_nodeattr("ram_style")
+                    shape = model.get_tensor_shape(node.input[0])
+                    for i, (fifo_depth, impl_style) in enumerate(cfgs):
+                        if i == 0:
+                            inp = node.input[0]
+                        else:
+                            inp = node.name + "_" + str(i - 1) + "_out"
+                        if i == len(cfgs) - 1:
+                            outp = node.output[0]
+                        else:
+                            outp = node.name + "_" + str(i) + "_out"
+                            out_tensor = helper.make_tensor_value_info(
+                                outp, TensorProto.FLOAT, shape
+                            )
+                            graph.value_info.append(out_tensor)
+                            model.set_tensor_datatype(out_tensor.name, DataType[dtype])
+                        fifo_node = helper.make_node(
+                            "StreamingFIFO",
+                            [inp],
+                            [outp],
+                            domain="finn.custom_op.fpgadataflow",
+                            backend="fpgadataflow",
+                            depth=fifo_depth,
+                            folded_shape=fld_shape,
+                            dataType=dtype,
+                            impl_style=impl_style,
+                            ram_style=ram_style,
+                            name=node.name + "_" + str(i),
+                        )
+                        graph.node.insert(node_ind + i, fifo_node)
+
+                    graph.node.remove(node)
+                    graph_modified = True
+        if graph_modified:
+            model = model.transform(SortGraph())
+            model = model.transform(GiveReadableTensorNames())
         return (model, False)
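
# Illustrative usage (not part of the patch) of get_fifo_split_configs from the
# hunk above: a depth of 8196 becomes an 8192-deep Vivado FIFO chained with a
# 4-deep rtl FIFO, instead of a single FIFO rounded up to 16384.
assert get_fifo_split_configs(8196) == [(8192, "vivado"), (4, "rtl")]
# small depths remain a single rtl-style FIFO
assert get_fifo_split_configs(64) == [(64, "rtl")]
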
diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py
index 23943084ab99d6ab880a69975e0b4a49756905a7..2301fccdd4fff6310340ffe1dd8de7732a4f9bd4 100644
--- a/src/finn/transformation/fpgadataflow/set_folding.py
+++ b/src/finn/transformation/fpgadataflow/set_folding.py
@@ -62,17 +62,20 @@ class SetFolding(Transformation):
 
     Notable exceptions and special behavior:
 
-    * When folding dense convolution/FC compute engines ("MVAU"/MatrixVectorActivation),
+    When folding dense convolution/FC compute engines ("MVAU"/MatrixVectorActivation),
     which have two attributes (PE and SIMD):
-        * first increases SIMD while weight stream width per PE is <= mvau_wwidth_max
-          (configurable in the SetFolding initializer, defaults to 36)
-        * then increases PE until the target is met or max PE reached
 
-    * When folding depthwise convolutions ("VVAU"/VectorVectorActivation)
+    * first increases SIMD while weight stream width per PE is <= mvau_wwidth_max
+      (configurable in the SetFolding initializer, defaults to 36)
+    * then increases PE until the target is met or max PE reached
+
+    When folding depthwise convolutions ("VVAU"/VectorVectorActivation)
     or spatial reduction ops (Pool_Batch):
-        * the producer of the node is expected to be a ConvolutionInputGenerator
-        with depthwise=1, whose SIMD value will be set equal to the PE value of
-        its consumer node
+
+    * the producer of the node is expected to be a ConvolutionInputGenerator
+      with depthwise=1, whose SIMD value will be set equal to the PE value of
+      its consumer node
+
     """
 
     def __init__(
@@ -109,6 +112,7 @@ class SetFolding(Transformation):
             "FMPadding_Batch",
             "ConvolutionInputGenerator",
             "ConvolutionInputGenerator1D",
+            "ConvolutionInputGenerator_rtl",
         ]
         # these ops are preceded by depthwise SWG and have special behavior,
         # as explained in the SetFolding docstring
@@ -171,10 +175,7 @@ class SetFolding(Transformation):
                             "Expected SWU on DW op input, found " + swu_node.op_type
                         )
             elif op_type in simd_ops:
-                if op_type in [
-                    "ConvolutionInputGenerator",
-                    "ConvolutionInputGenerator1D",
-                ]:
+                if op_type.startswith("ConvolutionInputGenerator"):
                     depthwise = node_inst.get_nodeattr("depthwise")
                     if depthwise == 0:
                         max_simd = node_inst.get_nodeattr("IFMChannels")
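
# Illustrative sketch (not part of the patch): the MVAU folding order from the
# SetFolding docstring above. The cycle model and divisor walk are deliberately
# simplified; wbits (weight bitwidth) and target_cycles are hypothetical inputs,
# while the real transform walks divisors of MW/MH via node attributes.
def divisors(n):
    return [d for d in range(1, n + 1) if n % d == 0]

def fold_mvau(mw, mh, wbits, target_cycles, mvau_wwidth_max=36):
    simd, pe = 1, 1
    # 1) raise SIMD while the weight stream width per PE stays within bounds
    for cand in divisors(mw):
        if cand * wbits <= mvau_wwidth_max and (mw // simd) * (mh // pe) > target_cycles:
            simd = cand
    # 2) then raise PE until the cycle target is met or PE reaches MH
    for cand in divisors(mh):
        if (mw // simd) * (mh // pe) > target_cycles:
            pe = cand
    return simd, pe
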
diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py
index 78bcdea0d701f97e9f80d7c7c489aa01bc93fa52..f52bad0ffb35ae4714acc24aef368d01967db426 100644
--- a/src/finn/transformation/fpgadataflow/templates.py
+++ b/src/finn/transformation/fpgadataflow/templates.py
@@ -126,6 +126,9 @@ if {$BOARD == "ZCU104"} {
 } elseif {$BOARD == "Pynq-Z1"} {
     set ZYNQ_TYPE "zynq_7000"
     set_property board_part www.digilentinc.com:pynq-z1:part0:1.0 [current_project]
+} elseif {$BOARD == "KV260_SOM"} {
+    set ZYNQ_TYPE "zynq_us+"
+    set_property board_part xilinx.com:kv260_som:part0:1.3 [current_project]
 } else {
     puts "Unrecognized board"
 }
diff --git a/src/finn/transformation/fpgadataflow/vitis_build.py b/src/finn/transformation/fpgadataflow/vitis_build.py
index 855b30fe9573c534a13c961277ae4ab84507d619..e0a5666000fc2aa9599bb7475c1b8dd37489afac 100644
--- a/src/finn/transformation/fpgadataflow/vitis_build.py
+++ b/src/finn/transformation/fpgadataflow/vitis_build.py
@@ -358,16 +358,16 @@ class VitisBuild(Transformation):
     """Best-effort attempt at building the accelerator with Vitis.
     It assumes the model has only fpgadataflow nodes
 
-    fpga_part: string identifying the target FPGA
-    period_ns: target clock period
-    platform: target Alveo platform, one of ["U50", "U200", "U250", "U280"]
-    strategy: Vitis optimization strategy
-    enable_debug: add Chipscope to all AXI interfaces
-    floorplan_file: path to a JSON containing a dictionary with SLR assignments
-                    for each node in the ONNX graph. Must be parse-able by
-                    the ApplyConfig transform.
-    enable_link: enable linking kernels (.xo files), otherwise just synthesize
-                    them independently.
+    :parameter fpga_part: string identifying the target FPGA
+    :parameter period_ns: target clock period
+    :parameter platform: target Alveo platform, one of ["U50", "U200", "U250", "U280"]
+    :parameter strategy: Vitis optimization strategy
+    :parameter enable_debug: add Chipscope to all AXI interfaces
+    :parameter floorplan_file: path to a JSON containing a dictionary with
+        SLR assignments for each node in the ONNX graph.
+        Must be parse-able by the ApplyConfig transform.
+    :parameter enable_link: enable linking kernels (.xo files),
+        otherwise just synthesize them independently.
     """
 
     def __init__(
@@ -411,12 +411,13 @@ class VitisBuild(Transformation):
         # Build each kernel individually
         sdp_nodes = model.get_nodes_by_op_type("StreamingDataflowPartition")
         for sdp_node in sdp_nodes:
+            prefix = sdp_node.name + "_"
             sdp_node = getCustomOp(sdp_node)
             dataflow_model_filename = sdp_node.get_nodeattr("model")
             kernel_model = ModelWrapper(dataflow_model_filename)
             kernel_model = kernel_model.transform(InsertFIFO())
             kernel_model = kernel_model.transform(RemoveUnusedTensors())
-            kernel_model = kernel_model.transform(GiveUniqueNodeNames())
+            kernel_model = kernel_model.transform(GiveUniqueNodeNames(prefix))
             kernel_model.save(dataflow_model_filename)
             kernel_model = kernel_model.transform(
                 PrepareIP(self.fpga_part, self.period_ns)
diff --git a/src/finn/transformation/qonnx/convert_qonnx_to_finn.py b/src/finn/transformation/qonnx/convert_qonnx_to_finn.py
index 967a1276365e4af1a6d617c081b9c04b4710da97..34f11d1e95e6bc3f6a36ce6d878ed493108b3ba6 100644
--- a/src/finn/transformation/qonnx/convert_qonnx_to_finn.py
+++ b/src/finn/transformation/qonnx/convert_qonnx_to_finn.py
@@ -56,12 +56,12 @@ class ConvertQONNXtoFINN(Transformation):
     is not converted to a MultiThreshold node.
 
     :param filter_function: Each candidate Quant and BinaryQant node is first evaluated
-    by this function. If the function returns False,
-    then the node is not converted to a MultiTrheshold node.
-    The function is given the model and candidate node as parameters.
-    Per default a filter function is inserted, which disables the conversion of
-    Quant nodes, which have a bit width of larger than 8.
-    Defaults to: default_filter_function_generator(max_multithreshold_bit_width=8)
+        by this function. If the function returns False,
+        then the node is not converted to a MultiThreshold node.
+        The function is given the model and candidate node as parameters.
+        By default, a filter function is inserted which disables the conversion
+        of Quant nodes with a bit width larger than 8.
+        Defaults to: default_filter_function_generator(max_multithreshold_bit_width=8)
     """
 
     def __init__(
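
# Illustrative sketch (not part of the patch): a custom filter_function as
# described in the docstring above. It receives (model, node) and returns False
# to veto conversion of that node. This hypothetical example mimics the default
# by rejecting Quant nodes whose bit width (assumed to be the node's 4th input)
# exceeds 8, while letting 1-bit BipolarQuant nodes through.
def my_filter_function(model, q_node):
    if q_node.op_type == "Quant":
        bit_width = model.get_initializer(q_node.input[3])
        return bit_width is not None and bit_width <= 8
    return True

# hypothetical usage:
# model = model.transform(ConvertQONNXtoFINN(filter_function=my_filter_function))
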
diff --git a/src/finn/transformation/qonnx/fold_quant_weights.py b/src/finn/transformation/qonnx/fold_quant_weights.py
index 80b6042d03ea11a45493011288133ed3a6f57c8d..e8339ae24472fa238e5c5da176b1316611218a54 100644
--- a/src/finn/transformation/qonnx/fold_quant_weights.py
+++ b/src/finn/transformation/qonnx/fold_quant_weights.py
@@ -126,10 +126,20 @@ class FoldQuantWeights(Transformation):
                         model.set_tensor_datatype(node_out, new_dtype)
 
                         # Reshape scale for Conv if required
+                        target_output_shape = model.get_tensor_shape(
+                            target_node.output[0]
+                        )
                         if target_node.op_type == "Conv" and len(scale.shape) > 0:
-                            bias_shape = [1] * len(scale.shape)
-                            bias_shape[1] = -1
-                            scale = scale.reshape(bias_shape)
+                            conv_out_shape = [1] * len(target_output_shape)
+                            # only support per-output channel scaling
+                            # (i.e. all scale shape elems besides 0th must be 1s)
+                            if len(scale.shape) > 1:
+                                assert (
+                                    np.prod(scale.shape[1:]) == 1
+                                ), "Can't fold scale beyond per-out-channel granularity"
+                            # collect all scaling in channels dim (since we constrain)
+                            conv_out_shape[1] = -1
+                            scale = scale.reshape(conv_out_shape)
 
                         if scale.shape == (1,):
                             scale = scale[0]
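
# Illustrative sketch (not part of the patch): the Conv scale reshape performed
# above, on hypothetical shapes. A per-output-channel scale is broadcast over an
# NCHW conv output by collecting all scaling in the channels dim.
import numpy as np

conv_out_shape = [1, 4, 8, 8]                  # hypothetical NCHW conv output
scale = np.arange(1.0, 5.0).reshape(4, 1, 1)   # per-out-channel scale
assert np.prod(scale.shape[1:]) == 1           # only per-out-channel granularity
bcast_shape = [1] * len(conv_out_shape)
bcast_shape[1] = -1                            # collect all scaling in channels dim
scale = scale.reshape(bcast_shape)
assert scale.shape == (1, 4, 1, 1)
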
diff --git a/src/finn/transformation/qonnx/qonnx_activation_handlers.py b/src/finn/transformation/qonnx/qonnx_activation_handlers.py
index a50a5850779cadf7ab21b9c1c4dfdbb36232af42..9819086d826a51d1df5240d88c4fda8513cc9ba6 100644
--- a/src/finn/transformation/qonnx/qonnx_activation_handlers.py
+++ b/src/finn/transformation/qonnx/qonnx_activation_handlers.py
@@ -52,9 +52,7 @@ class QuantActBaseHandler(ABC):
         self._q_node = quant_node
         self._q_index = quant_node_index
 
-    @property
     @classmethod
-    @abstractmethod
     def valid_predecessor_op_types(self):
         """Defines which op types the preceding node is allowed to have for
         this type of activation.
@@ -284,9 +282,11 @@ class QuantReluHandler(QuantActBaseHandler):
     """Class for converting a quantized relu operation expressed in the QONNX
     dialect to the FINN ONNX dialect."""
 
-    valid_predecessor_op_types = [
-        "Relu",
-    ]
+    @classmethod
+    def valid_predecessor_op_types(self):
+        return [
+            "Relu",
+        ]
 
     def _check_compatibility(self):
         if self._q_node.op_type == "Quant":
@@ -391,15 +391,17 @@ class QuantIdentityHandler(QuantActBaseHandler):
     these are equivalent to quantized identity activations.
     """
 
-    valid_predecessor_op_types = [
-        "BatchNormalization",
-        "Sub",
-        "Add",
-        "Mul",
-        "Div",
-        "DebugMarker",
-        None,
-    ]
+    @classmethod
+    def valid_predecessor_op_types(self):
+        return [
+            "BatchNormalization",
+            "Sub",
+            "Add",
+            "Mul",
+            "Div",
+            "DebugMarker",
+            None,
+        ]
 
     def _check_compatibility(self):
         # Gather parameters to check
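
# Illustrative sketch (not part of the patch): with valid_predecessor_op_types
# now a classmethod, a new handler only needs to return its supported
# predecessor op types from that method to be picked up uniformly via
# QuantActBaseHandler.__subclasses__(). This handler and its op list are
# invented for illustration; the other required handler methods are omitted.
from finn.transformation.qonnx.qonnx_activation_handlers import QuantActBaseHandler

class HypotheticalTanhHandler(QuantActBaseHandler):
    @classmethod
    def valid_predecessor_op_types(cls):
        return ["Tanh"]
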
diff --git a/src/finn/transformation/qonnx/quant_act_to_multithreshold.py b/src/finn/transformation/qonnx/quant_act_to_multithreshold.py
index c52d69b0f09d306c5b076bb6ef1775f38977241a..48dda3820deb051bd8a291188f02fe7d1dd2cc0b 100644
--- a/src/finn/transformation/qonnx/quant_act_to_multithreshold.py
+++ b/src/finn/transformation/qonnx/quant_act_to_multithreshold.py
@@ -30,7 +30,10 @@
 import warnings
 from qonnx.transformation.base import Transformation
 
-from finn.transformation.qonnx.qonnx_activation_handlers import QuantActBaseHandler
+from finn.transformation.qonnx.qonnx_activation_handlers import (
+    QuantActBaseHandler,
+    QuantIdentityHandler,
+)
 
 
 def default_filter_function_generator(max_multithreshold_bit_width=8):
@@ -66,8 +69,7 @@ def default_filter_function_generator(max_multithreshold_bit_width=8):
 
 
 class ConvertQuantActToMultiThreshold(Transformation):
-    """
-    Converts Quant nodes in the activation path to MultiThreshold nodes.
+    """Converts Quant nodes in the activation path to MultiThreshold nodes.
 
     The optional keyword argument `filter_function`
     presents a way to control which Quant and BipolarQuant nodes in the activation path
@@ -75,12 +77,12 @@ class ConvertQuantActToMultiThreshold(Transformation):
     is not converted to a MultiThreshold node.
 
     :param filter_function: Each candidate Quant and BinaryQant node is first evaluated
-    by this function. If the function returns False,
-    then the node is not converted to a MultiTrheshold node.
-    The function is given the model and candidate node as parameters.
-    Per default a filter function is inserted, which disables the conversion of
-    Quant nodes, which have a bit width of larger than 8.
-    Defaults to: default_filter_function_generator(max_multithreshold_bit_width=8)
+        by this function. If the function returns False,
+        then the node is not converted to a MultiThreshold node.
+        The function is given the model and candidate node as parameters.
+        By default, a filter function is inserted which disables the conversion
+        of Quant nodes with a bit width larger than 8.
+        Defaults to: default_filter_function_generator(max_multithreshold_bit_width=8)
     """
 
     def __init__(
@@ -110,11 +112,6 @@ class ConvertQuantActToMultiThreshold(Transformation):
                     predecessor_op_type = predecessor[0].op_type
                 else:
                     predecessor_op_type = predecessor
-                if model.is_fork_node(n):
-                    raise ValueError(
-                        "Forking Quant/BipolarQuant nodes are currently "
-                        "not supported by FINN."
-                    )
                 if n.op_type == "Quant" and not model.get_initializer(n.input[2]) == 0:
                     raise ValueError(
                         "Only Quant nodes with zero-point == 0 are currently supported."
@@ -132,7 +129,7 @@ class ConvertQuantActToMultiThreshold(Transformation):
                 # Check for possible ambiguity in handler selection
                 valid_predecessors = []
                 for cls in QuantActBaseHandler.__subclasses__():
-                    valid_predecessors.extend(cls.valid_predecessor_op_types)
+                    valid_predecessors.extend(cls.valid_predecessor_op_types())
                 if len(valid_predecessors) != len(set(valid_predecessors)):
                     raise RuntimeError(
                         "Two or more activation handlers declare the same "
@@ -143,16 +140,15 @@ class ConvertQuantActToMultiThreshold(Transformation):
 
                 # Try to find a fitting handler for this Quant activation node
                 for handler_cls in QuantActBaseHandler.__subclasses__():
-                    if predecessor_op_type in handler_cls.valid_predecessor_op_types:
+                    if predecessor_op_type in handler_cls.valid_predecessor_op_types():
                         handler = handler_cls(model, n, node_ind)
                         break
                 else:
-                    raise ValueError(
-                        f"Quant nodes in the activation path and with predecessor "
-                        f"nodes of type {predecessor_op_type} are currently not "
-                        f"supported by FINN and can not be converted to "
-                        f"MultiThreshold nodes."
-                    )
+                    # fall back to QuantIdentityHandler here;
+                    # it may still fail due to its particular restrictions,
+                    # but that is better than erroring out without trying
+                    handler = QuantIdentityHandler(model, n, node_ind)
+
                 model = handler.replace_quant_node()
                 graph_modified = True
                 return (model, graph_modified)
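The for/else dispatch above, together with the new fallback, reduces to the following standalone pattern; the handler names and op types below are hypothetical stand-ins for illustration, not FINN classes:

    class BaseHandler:
        @classmethod
        def valid_predecessor_op_types(cls):
            return []

    class ReluHandler(BaseHandler):
        @classmethod
        def valid_predecessor_op_types(cls):
            return ["Relu"]

    class IdentityHandler(BaseHandler):
        @classmethod
        def valid_predecessor_op_types(cls):
            return [None]

    def pick_handler(predecessor_op_type):
        # first matching registered handler wins; otherwise fall back
        for cls in BaseHandler.__subclasses__():
            if predecessor_op_type in cls.valid_predecessor_op_types():
                return cls
        return IdentityHandler

    assert pick_handler("Relu") is ReluHandler
    assert pick_handler("Conv") is IdentityHandler  # unknown op: fallback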
diff --git a/src/finn/transformation/streamline/absorb.py b/src/finn/transformation/streamline/absorb.py
index 0299c4f4d89d1fdd94434db77c77a0e529c86d26..73df52f890d227137ea076804d161206e66653dc 100644
--- a/src/finn/transformation/streamline/absorb.py
+++ b/src/finn/transformation/streamline/absorb.py
@@ -473,7 +473,7 @@ class AbsorbConsecutiveTransposes(Transformation):
     """Remove (Transpose -> Transpose) patterns when the input and output
     of the pattern have the same layout."""
 
-    def Are_opposite_permutations(self, perms1, perms2):
+    def are_opposite_permutations(self, perms1, perms2):
         if len(perms1) != len(perms2):
             return False
         assert 0 <= max(perms2) < len(perms2), "invalid permutation"
@@ -488,72 +488,42 @@ class AbsorbConsecutiveTransposes(Transformation):
     def apply(self, model):
         graph = model.graph
         graph_modified = False
-        for n in graph.node:
-            if n.op_type == "Transpose":
-                if model.is_fork_node(n):
-                    next_nodes = model.find_direct_successors(n)
-                    perms1 = list(get_by_name(n.attribute, "perm").ints)
-
-                    # check if all nodes after fork are opposite transposes
-                    all_opposite_transposes = True
-                    for next_node in next_nodes:
-                        if next_node is not None and next_node.op_type == "Transpose":
-                            perms2 = list(get_by_name(next_node.attribute, "perm").ints)
-                            if not self.Are_opposite_permutations(perms1, perms2):
-                                all_opposite_transposes = False
-                                break
-                        else:
-                            all_opposite_transposes = False
-                            break
-
-                    if not all_opposite_transposes:
-                        continue
-
-                    prod = model.find_producer(n.input[0])
-                    for next_node in next_nodes:
-                        # connect next_node's consumer input to n's producer output
-                        # TODO implement this to allow for forks as producers and
-                        # joins as consumers
-                        cons = model.find_consumer(next_node.output[0])
-                        cons.input[0] = prod.output[0]
-
-                        # remove consumer transpose
-                        graph.node.remove(next_node)
-
-                    # remove producer transpose
-                    graph.node.remove(n)
-                    graph_modified = True
-
-                else:
-                    next_node = model.find_consumer(n.output[0])
+        for node in graph.node:
+            if node.op_type == "Transpose":
+                next_nodes = model.find_consumers(node.output[0])
+                perms1 = list(get_by_name(node.attribute, "perm").ints)
+                if len(next_nodes) == 0:
+                    continue
+                # check that every consumer is a Transpose opposite to this one
+                all_opposite_transposes = True
+                for next_node in next_nodes:
                     if next_node is not None and next_node.op_type == "Transpose":
-                        perms1 = list(get_by_name(n.attribute, "perm").ints)
                         perms2 = list(get_by_name(next_node.attribute, "perm").ints)
-                        if self.Are_opposite_permutations(perms1, perms2):
-
-                            # connect next_node's consumer input to n's producer output
-                            # TODO implement this to allow for forks as producers
-                            consumers = model.find_direct_successors(next_node)
-                            prod = model.find_producer(n.input[0])
-                            if prod is not None:
-                                for cons in consumers:
-                                    for cons_in in cons.input:
-                                        if cons_in == next_node.output[0]:
-                                            prod.output[0] = cons_in
-                                            break
-                            else:
-                                # n.input[0] is top-level graph input
-                                # wire consumers directly to that
-                                for cons in consumers:
-                                    for i, iname in enumerate(cons.input):
-                                        if iname == next_node.output[0]:
-                                            cons.input[i] = n.input[0]
-
-                            # remove both transposes
-                            graph.node.remove(n)
-                            graph.node.remove(next_node)
+                        if not self.are_opposite_permutations(perms1, perms2):
+                            all_opposite_transposes = False
+                            break
+                    else:
+                        all_opposite_transposes = False
+                        break
+                if not all_opposite_transposes:
+                    continue
+                source_tensor = node.input[0]
+                for next_node in next_nodes:
+                    # connect next_node's consumers' matching inputs to node's input
+                    # TODO how to handle top-level outputs if any?
+                    nextnode_out = next_node.output[0]
+                    assert nextnode_out not in [x.name for x in model.graph.output]
+                    consumers = model.find_consumers(nextnode_out)
+                    for cons in consumers:
+                        for i, iname in enumerate(cons.input):
+                            if iname == nextnode_out:
+                                cons.input[i] = source_tensor
+                    # remove consumer transpose
+                    graph.node.remove(next_node)
+                # remove producer transpose
+                graph.node.remove(node)
+                graph_modified = True
 
-                            graph_modified = True
         if graph_modified:
             model = model.transform(InferDataTypes())
         return (model, graph_modified)
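Two Transpose nodes cancel exactly when their permutations are inverses of each other, which is what are_opposite_permutations tests; a minimal standalone version of the check:

    def are_opposite(perm1, perm2):
        # perm2 undoes perm1 iff perm2[perm1[i]] == i for all i
        return len(perm1) == len(perm2) and all(
            perm2[p] == i for i, p in enumerate(perm1)
        )

    # NCHW -> NHWC is perm [0, 2, 3, 1]; NHWC -> NCHW is [0, 3, 1, 2]
    assert are_opposite([0, 2, 3, 1], [0, 3, 1, 2])
    assert not are_opposite([0, 2, 3, 1], [0, 2, 3, 1])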
@@ -612,7 +582,6 @@ class AbsorbTransposeIntoResize(Transformation):
                             trans_input = mt_cand.output[0]
                             trans_output = new_tensor_name
                         # fix tensor shapes for Resize and Transpose
-                        # n, c, h, w = model.get_tensor_shape(mt_cand.input[0])
                         n, c, hx, wx = model.get_tensor_shape(mt_cand.output[0])
                         model.set_tensor_shape(trans_input, (n, hx, wx, c))
                         model.set_tensor_shape(trans_output, (n, c, hx, wx))
@@ -623,13 +592,13 @@ class AbsorbTransposeIntoResize(Transformation):
                             [trans_output],
                             perm=[0, 3, 1, 2],
                         )
-                        graph.node.insert(node_ind + 1, new_transpose)
                         # rewire nodes
                         final_t_cands = model.find_consumers(mt_cand.output[0])
                         # rewire next nodes' inputs
                         for final_t_cand in final_t_cands:
                             final_t_cand.input[0] = trans_output
                         mt_cand.output[0] = trans_input
+                        graph.node.insert(node_ind + 1, new_transpose)
                         graph_modified = True
         if graph_modified:
             model = model.transform(InferDataTypes())
diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py
index 9ff8a2173ce81e2a19c56bbd20a326759c3b9df2..29eefacc32370598ddcd39283d022f5eb61f3f0c 100644
--- a/src/finn/transformation/streamline/reorder.py
+++ b/src/finn/transformation/streamline/reorder.py
@@ -553,6 +553,8 @@ class MoveLinearPastEltwiseAdd(Transformation):
                 # Other transform should handle that
                 if prod0 is None or prod1 is None or (prod0 == prod1):
                     continue
+                if len(prod0.input) < 2 or len(prod1.input) < 2:
+                    continue
                 init0 = model.get_initializer(prod0.input[1])
                 init1 = model.get_initializer(prod1.input[1])
                 # if either initializer is None, skip
@@ -723,14 +725,86 @@ class MakeMaxPoolNHWC(Transformation):
         return (model, graph_modified)
 
 
+class MakeScaleResizeNHWC(Transformation):
+    """
+    Converts the inputs and outputs of all scales-driven Resize and Upsample
+    nodes from NCHW to NHWC.
+    """
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        for n in graph.node:
+            node_ind += 1
+            if n.op_type == "Upsample" or n.op_type == "Resize":
+                if model.get_tensor_layout(n.input[0]) != DataLayout.NCHW:
+                    warnings.warn(
+                        "%s: Input not NCHW. Can't operate transformation on node."
+                        % n.name
+                    )
+                    continue
+                consumer = model.find_consumer(n.output[0])
+                producer = model.find_producer(n.input[0])
+                if n.op_type == "Upsample":
+                    scales_ind = 1
+                else:
+                    scales_ind = 2
+                if producer is not None and producer.op_type == "Transpose":
+                    perms = list(get_by_name(producer.attribute, "perm").ints)
+                    if perms == [0, 3, 1, 2]:
+                        old_value = model.get_initializer(n.input[scales_ind])
+                        new_value = np.array(
+                            [old_value[idx] for idx in (0, 2, 3, 1)],
+                            dtype=np.dtype("float32"),
+                        )
+                        model.set_initializer(n.input[scales_ind], new_value)
+                        start_name = producer.input[0]
+                        mid_name = n.input[0]
+                        end_name = n.output[0]
+                        (b, hi, wi, c) = model.get_tensor_shape(start_name)
+                        (b, c, ho, wo) = model.get_tensor_shape(end_name)
+                        producer.input[0] = mid_name
+                        producer.output[0] = end_name
+                        n.input[0] = start_name
+                        n.output[0] = mid_name
+                        model.set_tensor_shape(mid_name, (b, ho, wo, c))
+                        model.set_tensor_shape(end_name, (b, c, ho, wo))
+                        graph.node.remove(producer)
+                        graph.node.insert(node_ind, producer)
+                elif consumer is not None and consumer.op_type == "Transpose":
+                    perms = list(get_by_name(consumer.attribute, "perm").ints)
+                    if perms == [0, 2, 3, 1]:
+                        old_value = model.get_initializer(n.input[scales_ind])
+                        new_value = np.array(
+                            [old_value[idx] for idx in (0, 2, 3, 1)],
+                            dtype=np.dtype("float32"),
+                        )
+                        model.set_initializer(n.input[scales_ind], new_value)
+                        start_name = n.input[0]
+                        mid_name = consumer.input[0]
+                        end_name = consumer.output[0]
+                        (b, c, hi, wi) = model.get_tensor_shape(start_name)
+                        (b, c, ho, wo) = model.get_tensor_shape(mid_name)
+                        consumer.input[0] = start_name
+                        consumer.output[0] = mid_name
+                        n.input[0] = mid_name
+                        n.output[0] = end_name
+                        model.set_tensor_shape(mid_name, (b, hi, wi, c))
+                        model.set_tensor_shape(end_name, (b, ho, wo, c))
+                        graph.node.remove(consumer)
+                        graph.node.insert(node_ind - 1, consumer)
+        return (model, False)
+
+
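The scales initializer is permuted with the same (0, 2, 3, 1) index shuffle that turns an NCHW layout into NHWC; for example, with a hypothetical 2x upsampling scales vector:

    import numpy as np

    scales_nchw = np.array([1.0, 1.0, 2.0, 2.0], dtype=np.float32)  # (N, C, H, W)
    scales_nhwc = np.array(
        [scales_nchw[idx] for idx in (0, 2, 3, 1)], dtype=np.float32
    )
    print(scales_nhwc)  # [1. 2. 2. 1.], i.e. (N, H, W, C)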
 class MoveOpPastFork(Transformation):
     """Move node operations past graph forks. Used when a node before a fork
     can be merged with nodes in the branches
     """
 
-    def __init__(self, op_name_list):
+    def __init__(self, op_name_list, get_attrs_fxn=lambda x: {}):
         super().__init__()
         self.ops_to_move = op_name_list
+        self.get_attrs_fxn = get_attrs_fxn
 
     def apply(self, model):
         graph = model.graph
@@ -747,9 +821,10 @@ class MoveOpPastFork(Transformation):
 
                 # Restrict this transform to operations with constant parameters
                 # Assuming parameters is in input 1
-                op_init_param = model.get_initializer(n.input[1])
-                if op_init_param is None:
-                    continue
+                if len(n.input) > 1:
+                    op_init_param = model.get_initializer(n.input[1])
+                else:
+                    op_init_param = None
 
                 # Check case when branches are empty and go
                 # to the same node
@@ -766,16 +841,20 @@ class MoveOpPastFork(Transformation):
 
                 for consumer_node in consumers[1:]:
                     # create new node
-                    new_param_name = model.make_new_valueinfo_name()
                     new_output_tensor_name = model.make_new_valueinfo_name()
+                    if op_init_param is None:
+                        new_inp_list = [n.input[0]]
+                    else:
+                        new_param_name = model.make_new_valueinfo_name()
+                        new_inp_list = [n.input[0], new_param_name]
+                        model.set_initializer(new_param_name, op_init_param)
+                    attrs = self.get_attrs_fxn(n)
+                    # TODO use copy of original node instead to get attrs?
                     new_node = oh.make_node(
-                        n.op_type,
-                        [n.input[0], new_param_name],
-                        [new_output_tensor_name],
+                        n.op_type, new_inp_list, [new_output_tensor_name], **attrs
                     )
                     graph.node.insert(node_ind, new_node)
                     node_ind += 1
-                    model.set_initializer(new_param_name, op_init_param)
 
                     # change consumer input tensor
                     graph.node.remove(consumer_node)
@@ -811,6 +890,13 @@ class MoveLinearPastFork(MoveOpPastFork):
         super().__init__(["Add", "Mul"])
 
 
+class MoveTransposePastFork(MoveOpPastFork):
+    def __init__(self):
+        super().__init__(
+            ["Transpose"], lambda x: {"perm": get_by_name(x.attribute, "perm").ints}
+        )
+
+
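A sketch of how the new attribute hook is intended to be used, assuming model is a loaded ModelWrapper; the explicit form mirrors what MoveTransposePastFork does above:

    from qonnx.util.basic import get_by_name

    from finn.transformation.streamline.reorder import (
        MoveOpPastFork,
        MoveTransposePastFork,
    )

    # duplicate a pre-fork Transpose into each branch, keeping its perm attribute
    model = model.transform(MoveTransposePastFork())
    # equivalent explicit form using the get_attrs_fxn hook
    model = model.transform(
        MoveOpPastFork(
            ["Transpose"], lambda n: {"perm": get_by_name(n.attribute, "perm").ints}
        )
    )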
 class MoveMaxPoolPastMultiThreshold(Transformation):
     """Move MaxPool nodes past MultiThreshold nodes on linear segments of the graph."""
 
diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py
index 4aba87216c8999612f748e989a945ceff33da167..3bc5b803db2072f4d0ed3829adab93b4fbd3b98e 100644
--- a/src/finn/util/basic.py
+++ b/src/finn/util/basic.py
@@ -40,6 +40,8 @@ pynq_part_map["ZCU102"] = "xczu9eg-ffvb1156-2-e"
 pynq_part_map["ZCU104"] = "xczu7ev-ffvc1156-2-e"
 pynq_part_map["ZCU111"] = "xczu28dr-ffvg1517-2-e"
 pynq_part_map["RFSoC2x2"] = "xczu28dr-ffvg1517-2-e"
+pynq_part_map["KV260_SOM"] = "xck26-sfvc784-2LV-c"
+
 
 # native AXI HP port width (in bits) for PYNQ boards
 pynq_native_port_width = dict()
@@ -50,6 +52,7 @@ pynq_native_port_width["ZCU102"] = 128
 pynq_native_port_width["ZCU104"] = 128
 pynq_native_port_width["ZCU111"] = 128
 pynq_native_port_width["RFSoC2x2"] = 128
+pynq_native_port_width["KV260_SOM"] = 128
 
 # Alveo device and platform mappings
 alveo_part_map = dict()
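The new entry is looked up the same way as the existing boards, e.g.:

    from finn.util.basic import pynq_native_port_width, pynq_part_map

    part = pynq_part_map["KV260_SOM"]               # "xck26-sfvc784-2LV-c"
    hp_width = pynq_native_port_width["KV260_SOM"]  # 128-bit AXI HP ports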
diff --git a/src/finn/util/create.py b/src/finn/util/create.py
index a8c2e67b385b797905cd4c5a196091069898b583..ed3e1a843eca47d2e20e9ca1c9df0d2d6f5a8a13 100644
--- a/src/finn/util/create.py
+++ b/src/finn/util/create.py
@@ -30,7 +30,11 @@ import numpy as np
 from onnx import TensorProto, helper
 from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
-from qonnx.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor
+from qonnx.util.basic import (
+    calculate_signed_dot_prod_range,
+    gen_finn_dt_tensor,
+    qonnx_make_model,
+)
 
 
 def hls_random_mlp_maker(layer_spec):
@@ -84,7 +88,7 @@ def hls_mlp_maker(layer_spec):
 
     graph = helper.make_graph(nodes=[], name="mlp", inputs=[], outputs=[])
 
-    model = helper.make_model(graph, producer_name="finn")
+    model = qonnx_make_model(graph, producer_name="finn")
     model = ModelWrapper(model)
 
     for lyr in layer_spec:
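qonnx_make_model, substituted for helper.make_model throughout this changeset, is a thin QONNX wrapper around the same call; as far as I can tell it exists to pin the opset/IR versions QONNX expects rather than inheriting whatever the installed onnx package defaults to. The call pattern is unchanged:

    from onnx import helper
    from qonnx.core.modelwrapper import ModelWrapper
    from qonnx.util.basic import qonnx_make_model

    graph = helper.make_graph(nodes=[], name="empty", inputs=[], outputs=[])
    model = ModelWrapper(qonnx_make_model(graph, producer_name="finn"))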
diff --git a/src/finn/util/data_packing.py b/src/finn/util/data_packing.py
index 65478d2540b53443d3f74b44a22fde3defd8ca93..797dad32a2cfeb3e00e224f264d91b5ee0e9247b 100644
--- a/src/finn/util/data_packing.py
+++ b/src/finn/util/data_packing.py
@@ -265,7 +265,7 @@ def numpy_to_hls_code(
     # define a function to convert a single element into a C++ init string
     # a single element can be a hex string if we are using packing
     def elem2str(x):
-        if type(x) == str or type(x) == np.str_ or type(x) == np.str:
+        if type(x) == str or type(x) == np.str_:
             return '%s("%s", 16)' % (hls_dtype, x)
         elif type(x) == np.float32:
             if dtype.is_integer():
diff --git a/src/finn/util/pyverilator.py b/src/finn/util/pyverilator.py
index f6a51da8e44ea60ae5693cdd033b39bdf51376ac..8d188585694c172d97d73fa6b5820edb7b48a948 100644
--- a/src/finn/util/pyverilator.py
+++ b/src/finn/util/pyverilator.py
@@ -26,32 +26,43 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import pkg_resources as pk
+
+import numpy as np
 import os
+import shutil
 from pyverilator import PyVerilator
+from qonnx.custom_op.registry import getCustomOp
 
-from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
+from finn.util.basic import (
+    get_rtlsim_trace_depth,
+    launch_process_helper,
+    make_build_dir,
+)
 
 
-def pyverilate_stitched_ip(
-    model,
-    read_internal_signals=True,
-    disable_common_warnings=True,
-    extra_verilator_args=[],
-):
-    """Given a model with stitched IP, return a PyVerilator sim object.
-    Trace depth is also controllable, see get_rtlsim_trace_depth()
+def make_single_source_file(filtered_verilog_files, target_file):
+    """Dump all Verilog code used by stitched IP into a single file.
+    This is because large models with many files would require a Verilator
+    command line too long for bash on most systems."""
 
-    :param read_internal_signals  If set, it will be possible to examine the
-        internal (not only port) signals of the Verilog module, but this may
-        slow down compilation and emulation.
+    # concatenate all verilog code into a single file
+    with open(target_file, "w") as wf:
+        for vfile in filtered_verilog_files:
+            with open(vfile) as rf:
+                wf.write("//Added from " + vfile + "\n\n")
+                lines = rf.read()
+                for line in lines.split("\n"):
+                    # break down too-long lines, Verilator complains otherwise
+                    if len(line) > 20000:
+                        line = line.replace("&", "\n&")
+                    wf.write("\n" + line)
 
-    :param disable_common_warnings If set, disable the set of warnings that
-        Vivado-HLS-generated Verilog typically triggers in Verilator
-        (which can be very verbose otherwise)
 
-    """
-    if PyVerilator is None:
-        raise ImportError("Installation of PyVerilator is required.")
+def prepare_stitched_ip_for_verilator(model):
+    """Prepare sources from given stitched IP for verilator simulation, including
+    generating a single source file and replacing certain Vivado infrastructure
+    headers with Verilator-compatible ones."""
 
     vivado_stitch_proj_dir = model.get_metadata_prop("vivado_stitch_proj")
     with open(vivado_stitch_proj_dir + "/all_verilog_srcs.txt", "r") as f:
@@ -64,8 +75,6 @@ def pyverilate_stitched_ip(
         return os.path.basename(os.path.realpath(x))
 
     top_module_file_name = file_to_basename(model.get_metadata_prop("wrapper_filename"))
-    top_module_name = top_module_file_name.strip(".v")
-    build_dir = make_build_dir("pyverilator_ipstitched_")
 
     # dump all Verilog code to a single file
     # this is because large models with many files require
@@ -74,10 +83,33 @@ def pyverilate_stitched_ip(
     # are identical but in multiple directories (regslice_core.v)
 
     # remove duplicates from list by doing list -> set -> list
+    src_exts = [".v", ".sv"]
+
     all_verilog_files = list(
-        set(filter(lambda x: x.endswith(".v") or x.endswith(".sv"), all_verilog_srcs))
+        set(
+            filter(
+                lambda x: any(map(lambda y: x.endswith(y), src_exts)), all_verilog_srcs
+            )
+        )
     )
 
+    verilog_header_dir = vivado_stitch_proj_dir + "/pyverilator_vh"
+    os.makedirs(verilog_header_dir, exist_ok=True)
+
+    # use custom version of axis infrastructure vh
+    # to enable Verilator to simulate AMD/Xilinx components (e.g. DWC)
+    custom_vh = pk.resource_filename(
+        "finn.qnn-data", "verilog/custom_axis_infrastructure.vh"
+    )
+    shutil.copy(custom_vh, verilog_header_dir + "/axis_infrastructure_v1_1_0.vh")
+    for fn in all_verilog_srcs:
+        if fn.endswith(".vh"):
+            if "axis_infrastructure_v1_1_0.vh" in fn:
+                # skip, we use a custom version for this file without recursive gcd
+                continue
+            else:
+                shutil.copy(fn, verilog_header_dir)
+
     # remove all but one instances of regslice_core.v
     filtered_verilog_files = []
     remove_entry = False
@@ -89,12 +121,176 @@ def pyverilate_stitched_ip(
         else:
             filtered_verilog_files.append(vfile)
 
-    # concatenate all verilog code into a single file
-    with open(vivado_stitch_proj_dir + "/" + top_module_file_name, "w") as wf:
-        for vfile in filtered_verilog_files:
-            with open(vfile) as rf:
-                wf.write("//Added from " + vfile + "\n\n")
-                wf.write(rf.read())
+    target_file = vivado_stitch_proj_dir + "/" + top_module_file_name
+    make_single_source_file(filtered_verilog_files, target_file)
+
+    return vivado_stitch_proj_dir
+
+
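Hedged usage sketch for the helpers above: given a model that already carries the vivado_stitch_proj metadata (i.e. CreateStitchedIP has run), a single call prepares everything Verilator needs.

    proj_dir = prepare_stitched_ip_for_verilator(model)
    # all Verilog now lives in one <proj_dir>/<wrapper>.v file and
    # Verilator-friendly headers are under <proj_dir>/pyverilator_vh

    # the concatenation step can also be used on its own:
    make_single_source_file(["a.v", "b.v"], "combined.v")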
+def verilator_fifosim(model, n_inputs, max_iters=100000000):
+    """Create a Verilator model of stitched IP and use a simple C++
+    driver to drive the input stream. Useful for FIFO sizing, latency
+    and throughput measurement."""
+
+    vivado_stitch_proj_dir = prepare_stitched_ip_for_verilator(model)
+    verilog_header_dir = vivado_stitch_proj_dir + "/pyverilator_vh"
+    build_dir = make_build_dir("verilator_fifosim_")
+    fifosim_cpp_fname = pk.resource_filename(
+        "finn.qnn-data", "cpp/verilator_fifosim.cpp"
+    )
+    with open(fifosim_cpp_fname, "r") as f:
+        fifosim_cpp_template = f.read()
+    assert len(model.graph.input) == 1, "Only a single input stream is supported"
+    assert len(model.graph.output) == 1, "Only a single output stream is supported"
+    iname = model.graph.input[0].name
+    first_node = model.find_consumer(iname)
+    oname = model.graph.output[0].name
+    last_node = model.find_producer(oname)
+    assert (first_node is not None) and (
+        last_node is not None
+    ), "Failed to find first/last nodes"
+    fnode_inst = getCustomOp(first_node)
+    lnode_inst = getCustomOp(last_node)
+    ishape_folded = fnode_inst.get_folded_input_shape()
+    oshape_folded = lnode_inst.get_folded_output_shape()
+
+    fifo_log = []
+    fifo_log_templ = '    results_file << "maxcount%s" << "\\t" '
+    fifo_log_templ += "<< to_string(top->maxcount%s) << endl;"
+    fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO")
+    fifo_ind = 0
+    for fifo_node in fifo_nodes:
+        fifo_node = getCustomOp(fifo_node)
+        if fifo_node.get_nodeattr("depth_monitor") == 1:
+            suffix = "" if fifo_ind == 0 else "_%d" % fifo_ind
+            fifo_log.append(fifo_log_templ % (suffix, suffix))
+            fifo_ind += 1
+    fifo_log = "\n".join(fifo_log)
+
+    template_dict = {
+        "ITERS_PER_INPUT": np.prod(ishape_folded[:-1]),
+        "ITERS_PER_OUTPUT": np.prod(oshape_folded[:-1]),
+        "N_INPUTS": n_inputs,
+        "MAX_ITERS": max_iters,
+        "FIFO_DEPTH_LOGGING": fifo_log,
+    }
+
+    for (key, val) in template_dict.items():
+        fifosim_cpp_template = fifosim_cpp_template.replace(f"@{key}@", str(val))
+
+    with open(build_dir + "/verilator_fifosim.cpp", "w") as f:
+        f.write(fifosim_cpp_template)
+
+    which_verilator = shutil.which("verilator")
+    if which_verilator is None:
+        raise Exception("'verilator' executable not found")
+
+    # add defines to make certain XPM src files work with Verilator
+    xpm_args = []
+    xpm_args.append("-DDISABLE_XPM_ASSERTIONS")
+    xpm_args.append("-DOBSOLETE")
+    xpm_args.append("-DONESPIN")
+    xpm_args.append("--bbox-unsup")
+    vivado_path = os.environ["VIVADO_PATH"]
+    # additional SystemVerilog modules to make XPMs work with Verilator
+    xpm_memory = f"{vivado_path}/data/ip/xpm/xpm_memory/hdl/xpm_memory.sv"
+    xpm_cdc = f"{vivado_path}/data/ip/xpm/xpm_cdc/hdl/xpm_cdc.sv"
+    xpm_fifo = f"{vivado_path}/data/ip/xpm/xpm_fifo/hdl/xpm_fifo.sv"
+    verilog_file_arg = ["finn_design_wrapper.v", xpm_memory, xpm_cdc, xpm_fifo]
+
+    verilator_args = [
+        "perl",
+        which_verilator,
+        "-Wno-fatal",
+        "-Mdir",
+        build_dir,
+        "-y",
+        vivado_stitch_proj_dir,
+        "-y",
+        verilog_header_dir,
+        "--CFLAGS",
+        "--std=c++11",
+        "-O3",
+        "--x-assign",
+        "fast",
+        "--x-initial",
+        "fast",
+        "--noassert",
+        "--cc",
+        *verilog_file_arg,
+        "--top-module",
+        "finn_design_wrapper",
+        "--exe",
+        "verilator_fifosim.cpp",
+        "--threads",
+        "4",
+        *xpm_args,
+    ]
+
+    proc_env = os.environ.copy()
+    gcc_args = "-O3 -march=native"
+    proc_env["OPT_FAST"] = gcc_args
+    make_args = [
+        "make",
+        "-j4",
+        "-C",
+        build_dir,
+        "-f",
+        "Vfinn_design_wrapper.mk",
+        "Vfinn_design_wrapper",
+    ]
+
+    with open(build_dir + "/compile.sh", "w") as f:
+        f.write("#!/bin/bash" + "\n")
+        f.write("export OPT_FAST='%s'\n" % gcc_args)
+        f.write(" ".join(verilator_args) + "\n")
+        f.write(" ".join(make_args) + "\n")
+
+    launch_process_helper(verilator_args, cwd=build_dir)
+    launch_process_helper(make_args, proc_env=proc_env, cwd=build_dir)
+
+    sim_launch_args = ["./Vfinn_design_wrapper"]
+    launch_process_helper(sim_launch_args, cwd=build_dir)
+
+    with open(build_dir + "/results.txt", "r") as f:
+        results = f.read().strip().split("\n")
+    ret_dict = {}
+    for result_line in results:
+        key, val = result_line.split("\t")
+        ret_dict[key] = int(val)
+    return ret_dict
+
+
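A sketch of consuming the returned dictionary; apart from the maxcount FIFO occupancy entries logged above, the exact keys come from the C++ template and are not spelled out in this diff.

    ret = verilator_fifosim(model, n_inputs=10)
    for key, val in ret.items():
        # maxcount* entries are the observed FIFO occupancy high-water marks
        print(key, val)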
+def pyverilate_stitched_ip(
+    model,
+    read_internal_signals=True,
+    disable_common_warnings=True,
+    extra_verilator_args=[],
+):
+    """Given a model with stitched IP, return a PyVerilator sim object.
+    Trace depth is also controllable, see get_rtlsim_trace_depth()
+
+    :param read_internal_signals  If set, it will be possible to examine the
+        internal (not only port) signals of the Verilog module, but this may
+        slow down compilation and emulation.
+
+    :param disable_common_warnings If set, disable the set of warnings that
+        Vivado-HLS-generated Verilog typically triggers in Verilator
+        (which can be very verbose otherwise)
+
+    """
+    if PyVerilator is None:
+        raise ImportError("Installation of PyVerilator is required.")
+
+    vivado_stitch_proj_dir = prepare_stitched_ip_for_verilator(model)
+    verilog_header_dir = vivado_stitch_proj_dir + "/pyverilator_vh"
+
+    def file_to_basename(x):
+        return os.path.basename(os.path.realpath(x))
+
+    top_module_file_name = file_to_basename(model.get_metadata_prop("wrapper_filename"))
+    top_module_name = top_module_file_name.strip(".v")
+    build_dir = make_build_dir("pyverilator_ipstitched_")
 
     verilator_args = []
     # disable common verilator warnings that should be harmless but commonly occur
@@ -108,10 +304,20 @@ def pyverilate_stitched_ip(
     # force inlining of all submodules to ensure we can read internal signals properly
     if read_internal_signals:
         verilator_args += ["--inline-mult", "0"]
+    # add defines to make certain XPM src files work with Verilator
+    verilator_args.append("-DDISABLE_XPM_ASSERTIONS")
+    verilator_args.append("-DOBSOLETE")
+    verilator_args.append("-DONESPIN")
+    verilator_args.append("--bbox-unsup")
+    vivado_path = os.environ["VIVADO_PATH"]
+    # additional SystemVerilog modules to make XPMs work with Verilator
+    xpm_memory = f"{vivado_path}/data/ip/xpm/xpm_memory/hdl/xpm_memory.sv"
+    xpm_cdc = f"{vivado_path}/data/ip/xpm/xpm_cdc/hdl/xpm_cdc.sv"
+    xpm_fifo = f"{vivado_path}/data/ip/xpm/xpm_fifo/hdl/xpm_fifo.sv"
 
     sim = PyVerilator.build(
-        top_module_file_name,
-        verilog_path=[vivado_stitch_proj_dir],
+        [top_module_file_name, xpm_fifo, xpm_memory, xpm_cdc],
+        verilog_path=[vivado_stitch_proj_dir, verilog_header_dir],
         build_dir=build_dir,
         trace_depth=get_rtlsim_trace_depth(),
         top_module_name=top_module_name,
diff --git a/src/finn/util/test.py b/src/finn/util/test.py
index f5d3b1c30b8b7b439eae1c684ad84b33a3401c7c..bd8bde2820fa87ed972d699cae905d7f6cc310ff 100644
--- a/src/finn/util/test.py
+++ b/src/finn/util/test.py
@@ -91,8 +91,8 @@ def soft_verify_topk(invec, idxvec, k):
     """Check that the topK indices provided actually point to the topK largest
     values in the input vector"""
     np_topk = np.flip(invec.flatten().argsort())[:k]
-    soft_expected = invec.flatten()[np_topk.astype(np.int).flatten()]
-    soft_produced = invec.flatten()[idxvec.astype(np.int).flatten()]
+    soft_expected = invec.flatten()[np_topk.astype(np.int_).flatten()]
+    soft_produced = invec.flatten()[idxvec.astype(np.int_).flatten()]
     return (soft_expected == soft_produced).all()
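The np.int -> np.int_ switch is needed because np.int was only ever an alias for the builtin int; it was deprecated in NumPy 1.20 and removed in 1.24, while np.int_ remains a proper scalar type:

    import numpy as np

    idx = np.array([2.0, 0.0, 1.0]).astype(np.int_)  # works on all NumPy versions
    # np.array([...]).astype(np.int) raises AttributeError on NumPy >= 1.24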
 
 
@@ -180,6 +180,7 @@ def execute_parent(parent_path, child_path, input_tensor_npy, return_full_ctx=Fa
     sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
     sdp_node = getCustomOp(sdp_node)
     sdp_node.set_nodeattr("model", child_path)
+    sdp_node.set_nodeattr("return_full_exec_context", 1 if return_full_ctx else 0)
     ret = execute_onnx(parent_model, {iname: input_tensor_npy}, True)
     if return_full_ctx:
         return ret
diff --git a/src/finn/util/vcd.py b/src/finn/util/vcd.py
index aaeb3ab920d1d8fae79c1173582d18cf81d03063..1f77276d5a72e5f886d5f94af8d35121ccadd486 100644
--- a/src/finn/util/vcd.py
+++ b/src/finn/util/vcd.py
@@ -101,19 +101,21 @@ def get_stream_if_stats(vcd_file, if_base_name):
     <stream_state>: (<num_samples>, <fraction_of_time>),
 
     where <stream_state> is the combination of (V)alid/(R)eady values,
-    <num_samples> is the approximate number of rising clock edges spent in <state>
-    , and <fraction_of_time> is the fraction of <num_samples> to total
+    <num_samples> is the approximate number of rising clock edges spent in <state>,
+    and <fraction_of_time> is the fraction of <num_samples> to total
     amount of time recorded by the trace.
 
     Example:
-    {"{'V': 0, 'R': 0}": (5, 0.0006060606060606061),
-     "{'V': 1, 'R': 0}": (0, 0.0),
-     "{'V': 0, 'R': 1}": (7605, 0.9218181818181819),
-     "{'V': 1, 'R': 1}": (640, 0.07757575757575758)}
-
+    {
+    "{'V': 0, 'R': 0}": (5, 0.0006060606060606061),
+    "{'V': 1, 'R': 0}": (0, 0.0),
+    "{'V': 0, 'R': 1}": (7605, 0.9218181818181819),
+    "{'V': 1, 'R': 1}": (640, 0.07757575757575758)
+    }
     Here we can see the stream was transmitting values 7.7% of the time,
     and 92.2% of the time there was no incoming data (valid 0, ready 1)
     """
+
     if_valid = if_base_name + vname
     if_ready = if_base_name + rname
     v = VCDVCD(vcd_file, signals=[if_valid], store_tvs=True)
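Possible usage of the stats helper on a simulation trace; the file name and interface prefix here are assumptions for illustration:

    from finn.util.vcd import get_stream_if_stats

    stats = get_stream_if_stats("trace.vcd", "finn_design_wrapper.m_axis_0_")
    # e.g. stats["{'V': 1, 'R': 1}"] gives (num_samples, fraction) for
    # cycles where data was actually transferred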
diff --git a/tests/brevitas/test_brevitas_relu_act_export.py b/tests/brevitas/test_brevitas_relu_act_export.py
index b0c3d6088c27291f1f49dd2f1ee746b65ca0a737..3dc46ec31e49d7115b19b3373d54be6ddc29bb80 100644
--- a/tests/brevitas/test_brevitas_relu_act_export.py
+++ b/tests/brevitas/test_brevitas_relu_act_export.py
@@ -41,6 +41,7 @@ from brevitas.nn import QuantReLU
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.transformation.infer_shapes import InferShapes
 from qonnx.util.cleanup import cleanup as qonnx_cleanup
+from torch import nn
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN
@@ -179,3 +180,83 @@ scaling_impl.learned_value": rand_tensor.type(
 
     assert np.isclose(produced, expected, atol=1e-3).all()
     os.remove(export_onnx_path)
+
+
+class PyTorchTestModel(nn.Module):
+    def __init__(self, abits):
+        super(PyTorchTestModel, self).__init__()
+        out_channels = 32
+        self.b_act = QuantReLU(
+            bit_width=abits,
+            quant_type=QuantType.INT,
+            scaling_impl_type=ScalingImplType.PARAMETER,
+            scaling_per_channel=True,
+            restrict_scaling_type=RestrictValueType.LOG_FP,
+            scaling_min_val=2e-16,
+            max_val=6.0,
+            return_quant_tensor=False,
+            per_channel_broadcastable_shape=(1, out_channels, 1, 1),
+        )
+
+    def forward(self, x):
+        act_out = self.b_act(x)
+        y0 = act_out * 2.0
+        y1 = act_out * -1.0
+        y = y0 + y1
+        return y
+
+
+@pytest.mark.brevitas_export
+@pytest.mark.parametrize("abits", [2, 4, 8])
+@pytest.mark.parametrize("max_val", [1.0, 1.5, 1 - 2 ** (-7)])
+@pytest.mark.parametrize("scaling_per_channel", [True])
+@pytest.mark.parametrize("QONNX_export", [True])
+def test_brevitas_act_export_relu_forking(
+    abits, max_val, scaling_per_channel, QONNX_export
+):
+    out_channels = 32
+    ishape = (1, out_channels, 1, 1)
+    min_val = -1.0
+    model_pyt = PyTorchTestModel(abits)
+
+    rand_tensor = 2 * torch.rand((1, out_channels, 1, 1))
+
+    checkpoint = {
+        "b_act.act_quant_proxy.fused_activation_quant_proxy."
+        "tensor_quant.scaling_impl.learned_value": rand_tensor.type(torch.FloatTensor)
+    }
+    model_pyt.load_state_dict(checkpoint)
+
+    if QONNX_export:
+        m_path = export_onnx_path
+        BrevitasONNXManager.export(model_pyt, ishape, m_path)
+        qonnx_cleanup(m_path, out_file=m_path)
+        model = ModelWrapper(m_path)
+        model = model.transform(ConvertQONNXtoFINN())
+        model.save(m_path)
+
+    model = ModelWrapper(export_onnx_path)
+    model = model.transform(InferShapes())
+    inp_tensor = np.random.uniform(low=min_val, high=max_val, size=ishape).astype(
+        np.float32
+    )
+    idict = {model.graph.input[0].name: inp_tensor}
+    odict = oxe.execute_onnx(model, idict, True)
+    produced = odict[model.graph.output[0].name]
+    inp_tensor = torch.from_numpy(inp_tensor).float()
+    model_pyt.eval()
+    expected = model_pyt.forward(inp_tensor).detach().numpy()
+    if not np.isclose(produced, expected, atol=1e-3).all():
+        print(abits, max_val)
+        print("scale: ", model_pyt.quant_act_scale().type(torch.FloatTensor).detach())
+        if abits < 5:
+            print(
+                "thres:",
+                ", ".join(["{:8.4f}".format(x) for x in model_pyt.export_thres[0]]),
+            )
+        print("input:", ", ".join(["{:8.4f}".format(x) for x in inp_tensor[0]]))
+        print("prod :", ", ".join(["{:8.4f}".format(x) for x in produced[0]]))
+        print("expec:", ", ".join(["{:8.4f}".format(x) for x in expected[0]]))
+
+    assert np.isclose(produced, expected, atol=1e-3).all()
+    os.remove(export_onnx_path)
diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py
index 103f18b514c23c4e1ad35a85d020dc0481aa9c47..858363d6d31c7c17803bffdb87e7b168dec4b76d 100644
--- a/tests/end2end/test_end2end_bnn_pynq.py
+++ b/tests/end2end/test_end2end_bnn_pynq.py
@@ -80,7 +80,6 @@ from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ
 from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths
 from finn.transformation.move_reshape import RemoveCNVtoFCFlatten
@@ -103,7 +102,7 @@ from finn.util.test import (
 )
 
 build_dir = os.environ["FINN_BUILD_DIR"]
-target_clk_ns = 10
+target_clk_ns = 20
 mem_mode = "decoupled"
 rtlsim_trace = False
 
@@ -565,12 +564,6 @@ class TestEnd2End:
         model = model.transform(InsertAndSetFIFODepths(test_fpga_part, target_clk_ns))
         fifo_layers = model.get_nodes_by_op_type("StreamingFIFO")
         assert len(fifo_layers) > 0
-        hls_layers = model.get_finn_nodes()
-        for node in hls_layers:
-            if node.op_type != "StreamingFIFO":
-                op_inst = getCustomOp(node)
-                assert op_inst.get_nodeattr("inFIFODepth") == 0
-                assert op_inst.get_nodeattr("outFIFODepth") == 0
         model.save(
             get_checkpoint_name(
                 topology, wbits, abits, QONNX_export, "fifodepth_" + kind
@@ -597,7 +590,6 @@ class TestEnd2End:
         model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
         model = model.transform(HLSSynthIP())
         model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
-        model = model.transform(PrepareRTLSim())
         model.set_metadata_prop("exec_mode", "rtlsim")
         os.environ["LIVENESS_THRESHOLD"] = str(int(latency * 1.1))
         if rtlsim_trace:
diff --git a/tests/end2end/test_end2end_cybsec_mlp.py b/tests/end2end/test_end2end_cybsec_mlp.py
index b6482dc96c4d866618d19d810fa9385b20aa0222..290afc308498490cbee2fc75c30e22bb474eb96a 100644
--- a/tests/end2end/test_end2end_cybsec_mlp.py
+++ b/tests/end2end/test_end2end_cybsec_mlp.py
@@ -229,6 +229,7 @@ def test_end2end_cybsec_mlp_build(QONNX_export):
 
 
 @pytest.mark.end2end
+@pytest.mark.xfail
 @pytest.mark.parametrize("QONNX_export", [False, True])
 def test_end2end_cybsec_mlp_run_on_hw(QONNX_export):
     build_env = get_build_env(build_kind, target_clk_ns)
diff --git a/tests/end2end/test_ext_weights.py b/tests/end2end/test_ext_weights.py
index 9483ccf0b27ebc385ed017d0a0b316ab189a1f96..0a92c74a38d64ade37d576f3830f3a5628c94d88 100644
--- a/tests/end2end/test_ext_weights.py
+++ b/tests/end2end/test_ext_weights.py
@@ -90,6 +90,7 @@ def test_end2end_ext_weights_build():
     output_dir = make_build_dir("test_end2end_ext_weights_build")
     cfg = build.DataflowBuildConfig(
         output_dir=output_dir,
+        verbose=True,
         folding_config_file=folding_config_file,
         synth_clk_period_ns=target_clk_ns,
         board=build_env["board"],
@@ -113,6 +114,7 @@ def test_end2end_ext_weights_build():
 
 @pytest.mark.board
 @pytest.mark.end2end
+@pytest.mark.xfail
 def test_end2end_ext_weights_dataset():
     # make sure we have local copies of mnist dataset files
     subprocess.check_output(["mkdir", "-p", mnist_local])
@@ -129,6 +131,7 @@ def test_end2end_ext_weights_dataset():
 
 
 @pytest.mark.end2end
+@pytest.mark.xfail
 def test_end2end_ext_weights_run_on_hw():
     build_env = get_build_env(build_kind, target_clk_ns)
     deploy_dir = get_checkpoint_name("build")
diff --git a/tests/fpgadataflow/test_code_gen_trafo.py b/tests/fpgadataflow/test_code_gen_trafo.py
index 49ee32c71ee941ff7435d4c12ccadae3f8e55c5e..f5edabbd4ba029899239cc2f40dd6a94d178eafd 100644
--- a/tests/fpgadataflow/test_code_gen_trafo.py
+++ b/tests/fpgadataflow/test_code_gen_trafo.py
@@ -32,7 +32,7 @@ import os
 from onnx import TensorProto, helper
 from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
-from qonnx.util.basic import gen_finn_dt_tensor, get_by_name
+from qonnx.util.basic import gen_finn_dt_tensor, get_by_name, qonnx_make_model
 
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 
@@ -70,7 +70,7 @@ def test_code_gen_trafo():
         nodes=[FCLayer_node], name="fclayer_graph", inputs=[inp], outputs=[outp]
     )
 
-    model = helper.make_model(graph, producer_name="fclayer-model")
+    model = qonnx_make_model(graph, producer_name="fclayer-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
diff --git a/tests/fpgadataflow/test_compilation_trafo.py b/tests/fpgadataflow/test_compilation_trafo.py
index 9bafb101cedabc99d97356069c883cab4ed8a87f..d04b68a56ba7fc5f01e1eef57075636954f86843 100644
--- a/tests/fpgadataflow/test_compilation_trafo.py
+++ b/tests/fpgadataflow/test_compilation_trafo.py
@@ -32,7 +32,7 @@ import os
 from onnx import TensorProto, helper
 from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
-from qonnx.util.basic import gen_finn_dt_tensor, get_by_name
+from qonnx.util.basic import gen_finn_dt_tensor, get_by_name, qonnx_make_model
 
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
@@ -71,7 +71,7 @@ def test_compilation_trafo():
         nodes=[FCLayer_node], name="fclayer_graph", inputs=[inp], outputs=[outp]
     )
 
-    model = helper.make_model(graph, producer_name="fclayer-model")
+    model = qonnx_make_model(graph, producer_name="fclayer-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
diff --git a/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py
index 5bbaefac2d3e5f800fbb9471df6469235271c2f3..98a7c76ee4de0332586772ba7c1007ee55979a51 100644
--- a/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py
+++ b/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py
@@ -38,7 +38,7 @@ from qonnx.transformation.general import GiveUniqueNodeNames
 from qonnx.transformation.infer_datatypes import InferDataTypes
 from qonnx.transformation.infer_shapes import InferShapes
 from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
@@ -66,11 +66,12 @@ from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
     ],
 )
 @pytest.mark.parametrize("depthwise", [False, True])
+@pytest.mark.parametrize("use_rtl_swg", [False, True])
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
 @pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, exec_mode):
+def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mode):
     pad, kernel_size, stride, dilation = conv_config
     np.random.seed(0)
     idt = DataType["UINT4"]
@@ -84,6 +85,9 @@ def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, exec_mode):
     pad_h = pad[0] + pad[2]
     pad_w = pad[1] + pad[3]
 
+    if use_rtl_swg and exec_mode == "cppsim":
+        pytest.skip("cppsim not supported for RTL SWG")
+
     if depthwise is True:
         group = out_chn = in_chn
         conv_param_shape = [out_chn, 1, k_h, k_w]
@@ -117,7 +121,7 @@ def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, exec_mode):
         helper.make_tensor_value_info("p1", TensorProto.FLOAT, conv_param_shape)
     ]
 
-    modelproto = helper.make_model(
+    modelproto = qonnx_make_model(
         helper.make_graph(
             name="conv_test",
             inputs=[top_in],
@@ -139,7 +143,7 @@ def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, exec_mode):
     model = model.transform(InferDataTypes())
 
     new_model = model.transform(LowerConvsToMatMul())
-    new_model = new_model.transform(to_hls.InferConvInpGen())
+    new_model = new_model.transform(to_hls.InferConvInpGen(use_rtl_variant=use_rtl_swg))
     if depthwise is True:
         new_model = new_model.transform(to_hls.InferVectorVectorActivation())
     else:
diff --git a/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py b/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py
index 0f19b6d79ab0ed77981022f286fabd430094d69f..089d1ae420f4fab744fcda5950d88b13216b4c93 100644
--- a/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py
+++ b/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py
@@ -35,7 +35,7 @@ from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.transformation.general import GiveUniqueNodeNames
 from qonnx.transformation.infer_data_layouts import InferDataLayouts
 from qonnx.transformation.infer_shapes import InferShapes
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
@@ -57,7 +57,7 @@ def make_single_maxpool_modelwrapper(onnx_op_name, ishape, idt, pdt, pshape):
     outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, ishape)
     p0 = helper.make_tensor_value_info("p0", TensorProto.FLOAT, pshape)
 
-    model = helper.make_model(
+    model = qonnx_make_model(
         helper.make_graph(
             name="test",
             inputs=[inp],
diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py b/tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py
index 0760ff9b37487f4a1ac06853055d2e47b7269f9e..3512c39cb3fab04e4e4225728c9495b546b7c655 100755
--- a/tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py
+++ b/tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py
@@ -39,7 +39,7 @@ from qonnx.transformation.infer_data_layouts import InferDataLayouts
 from qonnx.transformation.infer_datatypes import InferDataTypes
 from qonnx.transformation.infer_shapes import InferShapes
 from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
@@ -149,7 +149,7 @@ def test_convert_to_hls_conv_fc_transition(conv_config, depthwise, use_reshape):
             "Flatten", ["thres1_out"], ["flatten_out"], axis=1
         )
 
-    modelproto = helper.make_model(
+    modelproto = qonnx_make_model(
         helper.make_graph(
             name="test",
             inputs=[global_in],
diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
index 55dc77cafb898ead28a7cbb9641e0b40db276919..de31ef0f125cb96ea82f953eadb9d5ccf7aab16c 100644
--- a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
+++ b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
@@ -38,7 +38,7 @@ from qonnx.transformation.general import GiveUniqueNodeNames
 from qonnx.transformation.infer_datatypes import InferDataTypes
 from qonnx.transformation.infer_shapes import InferShapes
 from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
@@ -57,11 +57,12 @@ from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
     "conv_config", [(1, 2, 0), (1, 3, 0), (3, 2, 1), (3, 1, 0), (3, 1, 1), (5, 2, 1)]
 )
 @pytest.mark.parametrize("depthwise", [False, True])
+@pytest.mark.parametrize("use_rtl_swg", [False, True])
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
 @pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode):
+def test_convert_to_hls_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mode):
     kernel_size, stride, pad = conv_config
     np.random.seed(0)
     idt = DataType["UINT4"]
@@ -69,6 +70,12 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode):
     in_feature_dim = 7
     in_chn = 16
 
+    if use_rtl_swg and exec_mode == "cppsim":
+        pytest.skip("cppsim not supported for RTL SWG")
+
+    if use_rtl_swg and kernel_size == 1:
+        pytest.skip("1x1 kernel not supported by current RTL SWG")
+
     if depthwise is True:
         group = out_chn = in_chn
         conv_param_shape = [out_chn, 1, kernel_size, kernel_size]
@@ -100,7 +107,7 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode):
         helper.make_tensor_value_info("p1", TensorProto.FLOAT, conv_param_shape)
     ]
 
-    modelproto = helper.make_model(
+    modelproto = qonnx_make_model(
         helper.make_graph(
             name="conv_test",
             inputs=[top_in],
@@ -122,7 +129,7 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode):
     model = model.transform(InferDataTypes())
 
     new_model = model.transform(LowerConvsToMatMul())
-    new_model = new_model.transform(to_hls.InferConvInpGen())
+    new_model = new_model.transform(to_hls.InferConvInpGen(use_rtl_variant=use_rtl_swg))
     if depthwise is True:
         new_model = new_model.transform(to_hls.InferVectorVectorActivation())
     else:
@@ -156,6 +163,7 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode):
     x = gen_finn_dt_tensor(idt, input_shape)
     inp_dict = {model.graph.input[0].name: x}
     assert oxe.compare_execution(model, new_model, inp_dict)
+
     if kernel_size == 1 and stride > 1 and pad == 0:
         assert new_model.graph.node[1].op_type == "DownSampler"
         if exec_mode == "rtlsim":
@@ -167,8 +175,11 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode):
             assert np.isclose(exp_cycles, cycles_rtlsim, atol=11)
             assert exp_cycles != 0
 
-    if pad == 1:
-        padding_node = new_model.get_nodes_by_op_type("FMPadding_Batch")[0]
+    if pad:
+        if use_rtl_swg:
+            padding_node = new_model.get_nodes_by_op_type("FMPadding_rtl")[0]
+        else:
+            padding_node = new_model.get_nodes_by_op_type("FMPadding_Batch")[0]
         padding_inst = getCustomOp(padding_node)
         assert padding_inst.get_nodeattr("SIMD") == in_chn
 
diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py b/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py
index 79a48793e0c4f062654e43aadcaf09ebf6d7da5b..c837a46a7ca7dcab6628cbf16373161b7b9ab9c2 100644
--- a/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py
+++ b/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py
@@ -43,7 +43,7 @@ from qonnx.transformation.infer_data_layouts import InferDataLayouts
 from qonnx.transformation.infer_datatypes import InferDataTypes
 from qonnx.transformation.infer_shapes import InferShapes
 from qonnx.transformation.insert_topk import InsertTopK
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
@@ -123,7 +123,7 @@ def make_model(ch, ifmdim):
         outputs=[outp],
     )
 
-    model = helper.make_model(graph, producer_name="add-model")
+    model = qonnx_make_model(graph, producer_name="add-model")
     model = ModelWrapper(model)
 
     # set initializers for scalar add/mul nodes
diff --git a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py
index ef9bd7a13dcecf7aa61ecb982ac6393d7813a4d5..6d628c9e53828fef88028bdc115bd64b0292dfed 100644
--- a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py
+++ b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py
@@ -35,7 +35,7 @@ from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
 from qonnx.transformation.infer_shapes import InferShapes
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
@@ -78,7 +78,7 @@ def make_single_maxpool_modelwrapper(
         nodes=[mp_node], name="mp_graph", inputs=[inp], outputs=[outp]
     )
 
-    model = helper.make_model(graph, producer_name="mp-model")
+    model = qonnx_make_model(graph, producer_name="mp-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
@@ -112,7 +112,7 @@ def make_single_quantavpool_modelwrapper(k, stride, ifm_ch, ifm_dim, ofm_dim, id
         nodes=[mp_node], name="mp_graph", inputs=[inp], outputs=[outp]
     )
 
-    model = helper.make_model(graph, producer_name="mp-model")
+    model = qonnx_make_model(graph, producer_name="mp-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
diff --git a/tests/fpgadataflow/test_depthwise_convolution.py b/tests/fpgadataflow/test_depthwise_convolution.py
index 5228ade3d0f4db3bd99f5fcccb7aee41f57ed73b..8ab22bcfdcb0312bd49677f0e00d8e97cdcad3c1 100644
--- a/tests/fpgadataflow/test_depthwise_convolution.py
+++ b/tests/fpgadataflow/test_depthwise_convolution.py
@@ -37,7 +37,11 @@ from qonnx.custom_op.general.im2col import compute_conv_output_dim
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
 from qonnx.transformation.infer_shapes import InferShapes
-from qonnx.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor
+from qonnx.util.basic import (
+    calculate_signed_dot_prod_range,
+    gen_finn_dt_tensor,
+    qonnx_make_model,
+)
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
@@ -123,7 +127,7 @@ def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding):
         outputs=[global_out],
         value_info=value_info,
     )
-    model = oh.make_model(graph, producer_name="lowered_dw_cnv-model")
+    model = qonnx_make_model(graph, producer_name="lowered_dw_cnv-model")
     model = ModelWrapper(model)
 
     # initialize model
diff --git a/tests/fpgadataflow/test_fifosizing.py b/tests/fpgadataflow/test_fifosizing.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4f2b8dbfff0d720ec4eb901704581b096c0ea40
--- /dev/null
+++ b/tests/fpgadataflow/test_fifosizing.py
@@ -0,0 +1,118 @@
+# Copyright (c) 2022 Xilinx, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of Xilinx nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import pytest
+
+import json
+import shutil
+from brevitas.export.onnx.generic.manager import BrevitasONNXManager
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.registry import getCustomOp
+
+import finn.builder.build_dataflow as build
+import finn.builder.build_dataflow_config as build_cfg
+from finn.util.basic import make_build_dir
+from finn.util.test import get_trained_network_and_ishape
+
+
+def fetch_test_model(topology, wbits=2, abits=2):
+    tmp_output_dir = make_build_dir("build_fifosizing_%s_" % topology)
+    (model, ishape) = get_trained_network_and_ishape(topology, wbits, abits)
+    chkpt_name = tmp_output_dir + "/model.onnx"
+    BrevitasONNXManager.export(model, ishape, chkpt_name)
+    return tmp_output_dir
+
+
+@pytest.mark.slow
+@pytest.mark.vivado
+@pytest.mark.fpgadataflow
+@pytest.mark.parametrize(
+    "method", ["largefifo_rtlsim_python", "largefifo_rtlsim_cpp", "characterize"]
+)
+@pytest.mark.parametrize("topology", ["tfc"])
+def test_fifosizing_linear(method, topology):
+    force_python_rtlsim = "python" in method
+    method_key = "largefifo_rtlsim" if "largefifo_rtlsim" in method else "characterize"
+    tmp_output_dir = fetch_test_model(topology)
+    cfg = build_cfg.DataflowBuildConfig(
+        output_dir=tmp_output_dir,
+        auto_fifo_depths=True,
+        auto_fifo_strategy=method_key,
+        target_fps=10000 if topology == "tfc" else 1000,
+        force_python_rtlsim=force_python_rtlsim,
+        synth_clk_period_ns=10.0,
+        board="Pynq-Z1",
+        rtlsim_batch_size=100,
+        shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ,
+        generate_outputs=[
+            build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
+            build_cfg.DataflowOutputType.STITCHED_IP,
+            build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE,
+        ],
+        default_mem_mode=build_cfg.ComputeEngineMemMode.DECOUPLED,
+    )
+    build.build_dataflow_cfg(tmp_output_dir + "/model.onnx", cfg)
+    with open(tmp_output_dir + "/report/estimate_network_performance.json") as f:
+        est_data = json.load(f)
+    with open(tmp_output_dir + "/report/rtlsim_performance.json") as f:
+        sim_data = json.load(f)
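+    # rtlsim throughput should land within ~10% of the analytical estimate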
+    assert (
+        float(sim_data["throughput[images/s]"])
+        / float(est_data["estimated_throughput_fps"])
+        > 0.9
+    )
+    # now run the same build using the generated folding and FIFO config
+    tmp_output_dir_cmp = fetch_test_model(topology)
+    cfg_cmp = cfg
+    cfg_cmp.output_dir = tmp_output_dir_cmp
+    cfg_cmp.auto_fifo_depths = False
+    cfg_cmp.target_fps = None
+    cfg_cmp.generate_outputs = [build_cfg.DataflowOutputType.STITCHED_IP]
+    cfg_cmp.folding_config_file = tmp_output_dir + "/final_hw_config.json"
+    build.build_dataflow_cfg(tmp_output_dir_cmp + "/model.onnx", cfg_cmp)
+
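+    # both builds should have produced identical graphs with matching
+    # StreamingFIFO depths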
+    model0 = ModelWrapper(
+        tmp_output_dir + "/intermediate_models/step_create_stitched_ip.onnx"
+    )
+    model1 = ModelWrapper(
+        tmp_output_dir_cmp + "/intermediate_models/step_create_stitched_ip.onnx"
+    )
+
+    assert len(model0.graph.node) == len(model1.graph.node)
+    for i in range(len(model0.graph.node)):
+        node0 = model0.graph.node[i]
+        node1 = model1.graph.node[i]
+        assert node0.op_type == node1.op_type
+        if node0.op_type == "StreamingFIFO":
+            node0_inst = getCustomOp(node0)
+            node1_inst = getCustomOp(node1)
+            assert node0_inst.get_nodeattr("depth") == node1_inst.get_nodeattr("depth")
+
+    shutil.rmtree(tmp_output_dir)
+    shutil.rmtree(tmp_output_dir_cmp)
diff --git a/tests/fpgadataflow/test_fpgadataflow_addstreams.py b/tests/fpgadataflow/test_fpgadataflow_addstreams.py
index 6d881f45b60384d9a78b5d9f9705581a10b48e6c..1ad2c26610c99c46bde4c05ed156a81b122aba53 100644
--- a/tests/fpgadataflow/test_fpgadataflow_addstreams.py
+++ b/tests/fpgadataflow/test_fpgadataflow_addstreams.py
@@ -34,7 +34,7 @@ from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
@@ -68,7 +68,7 @@ def make_addstreams_modelwrapper(ch, pe, idt):
         outputs=[outp],
     )
 
-    model = helper.make_model(graph, producer_name="addstreams-model")
+    model = qonnx_make_model(graph, producer_name="addstreams-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp1", idt)
diff --git a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py
index ceafda90e54004c7aea8786d003b6adf1defab35..13fab9a47f15999c184680b9db04494787889881 100644
--- a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py
+++ b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py
@@ -34,7 +34,7 @@ from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
@@ -73,7 +73,7 @@ def make_modelwrapper(C, pe, idt, odt, pdt, func, vecs):
     )
     graph = helper.make_graph(nodes=[node], name="graph", inputs=[inp], outputs=[outp])
 
-    model = helper.make_model(graph, producer_name="model")
+    model = qonnx_make_model(graph, producer_name="model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
diff --git a/tests/fpgadataflow/test_fpgadataflow_checksum.py b/tests/fpgadataflow/test_fpgadataflow_checksum.py
index 5e79ea2dad2aa4200f998fd8953672b9f49b2b86..cd404f5a6332d77f17ec69c47b53c8c893f28607 100644
--- a/tests/fpgadataflow/test_fpgadataflow_checksum.py
+++ b/tests/fpgadataflow/test_fpgadataflow_checksum.py
@@ -36,7 +36,7 @@ from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
 from qonnx.transformation.infer_shapes import InferShapes
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.core.rtlsim_exec import rtlsim_exec
@@ -115,7 +115,7 @@ def create_two_fc_model():
         value_info=[mid],
     )
 
-    model = helper.make_model(graph, producer_name="fclayer-model")
+    model = qonnx_make_model(graph, producer_name="fclayer-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
@@ -133,6 +133,7 @@ def create_two_fc_model():
     return model
 
 
+@pytest.mark.vivado
 @pytest.mark.fpgadataflow
 def test_fpgadataflow_checksum():
     # use a graph consisting of two fc layers to test
diff --git a/tests/fpgadataflow/test_fpgadataflow_concat.py b/tests/fpgadataflow/test_fpgadataflow_concat.py
index dddc470ec2ed88faf078f19bd0d2a7a4a6b5b6cd..5fff286e54e64b71481a3c2801850a37613fd694 100644
--- a/tests/fpgadataflow/test_fpgadataflow_concat.py
+++ b/tests/fpgadataflow/test_fpgadataflow_concat.py
@@ -72,6 +72,7 @@ def make_concat_model(i_shapes, idt):
 
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
 @pytest.mark.parametrize("idt", [DataType["INT4"]])
+@pytest.mark.fpgadataflow
 @pytest.mark.vivado
 @pytest.mark.slow
 def test_fpgadataflow_concat(exec_mode, idt):
@@ -107,6 +108,7 @@ def test_fpgadataflow_concat(exec_mode, idt):
     assert (exp_out == ret_sim[oname]).all()
 
 
+@pytest.mark.fpgadataflow
 @pytest.mark.vivado
 @pytest.mark.slow
 def test_fpgadataflow_concat_stitchedip():
@@ -144,6 +146,5 @@ def test_fpgadataflow_concat_stitchedip():
     )
     model.set_metadata_prop("exec_mode", "rtlsim")
     model.set_metadata_prop("rtlsim_trace", "trace.vcd")
-    model.save("dbg.onnx")
     ret_sim = execute_onnx(model, inp_dict)
     assert (exp_out == ret_sim[oname]).all()
diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
index a196ecbb61b74843ddc8efa4ac3c5ab8197e64fe..3cfff9ac34ae47bdc072bca9f6ca0fffeea756c5 100644
--- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
@@ -34,7 +34,7 @@ from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
@@ -73,7 +73,7 @@ def make_single_im2col_modelwrapper(
         nodes=[im2col_node], name="im2col_graph", inputs=[inp], outputs=[outp]
     )
 
-    model = helper.make_model(graph, producer_name="im2col-model")
+    model = qonnx_make_model(graph, producer_name="im2col-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
@@ -117,7 +117,7 @@ def make_single_slidingwindow_modelwrapper(
         outputs=[outp],
     )
 
-    model = helper.make_model(graph, producer_name="slidingwindow-model")
+    model = qonnx_make_model(graph, producer_name="slidingwindow-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py
index 0fc3ca82cfa919079a324160e4876377ac4dc3b4..f467f37618bbee6359bb7b7dfa963e3d8785d0c9 100644
--- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py
@@ -35,7 +35,7 @@ from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.general.im2col import compute_conv_output_dim
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
@@ -82,7 +82,7 @@ def make_single_im2col_modelwrapper(
         nodes=[im2col_node], name="im2col_graph", inputs=[inp], outputs=[outp]
     )
 
-    model = helper.make_model(graph, producer_name="im2col-model")
+    model = qonnx_make_model(graph, producer_name="im2col-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
@@ -133,7 +133,7 @@ def make_single_slidingwindow_modelwrapper(
         outputs=[outp],
     )
 
-    model = helper.make_model(graph, producer_name="slidingwindow-model")
+    model = qonnx_make_model(graph, producer_name="slidingwindow-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py
new file mode 100755
index 0000000000000000000000000000000000000000..58fc5ec04cc471b0e8f201e235ac9bd033e3f5c4
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py
@@ -0,0 +1,260 @@
+# Copyright (C) 2022, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+from onnx import TensorProto, helper
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.general.im2col import compute_conv_output_dim
+from qonnx.transformation.general import GiveUniqueNodeNames
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
+
+import finn.core.onnx_exec as oxe
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+
+
+def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt):
+    k_h, k_w = k
+    ifm_dim_h, ifm_dim_w = ifm_dim
+    stride_h, stride_w = stride
+    dilation_h, dilation_w = dilation
+    ofm_dim_h, ofm_dim_w = ofm_dim
+
+    odt = idt
+    inp = helper.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]
+    )
+    outp = helper.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch]
+    )
+
+    im2col_node = helper.make_node(
+        "Im2Col",
+        ["inp"],
+        ["outp"],
+        domain="finn.custom_op.general",
+        stride=[stride_h, stride_w],
+        kernel_size=[k_h, k_w],
+        input_shape=str((1, ifm_dim_h, ifm_dim_w, ifm_ch)),
+        dilations=[dilation_h, dilation_w],
+        pad_amount=[0, 0, 0, 0],
+        pad_value=0,
+    )
+    graph = helper.make_graph(
+        nodes=[im2col_node], name="im2col_graph", inputs=[inp], outputs=[outp]
+    )
+
+    model = qonnx_make_model(graph, producer_name="im2col-model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", odt)
+
+    return model
+
+
+def make_single_slidingwindow_modelwrapper(
+    k, ifm_ch, ifm_dim, ofm_dim, simd, m, parallel_window, stride, dilation, idt, dw=0
+):
+    k_h, k_w = k
+    ifm_dim_h, ifm_dim_w = ifm_dim
+    stride_h, stride_w = stride
+    dilation_h, dilation_w = dilation
+    ofm_dim_h, ofm_dim_w = ofm_dim
+
+    odt = idt
+    inp = helper.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]
+    )
+    outp = helper.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch]
+    )
+
+    SlidingWindow_node = helper.make_node(
+        "ConvolutionInputGenerator_rtl",
+        ["inp"],
+        ["outp"],
+        domain="finn.custom_op.fpgadataflow",
+        backend="fpgadataflow",
+        ConvKernelDim=[k_h, k_w],
+        IFMChannels=ifm_ch,
+        IFMDim=[ifm_dim_h, ifm_dim_w],
+        OFMDim=[ofm_dim_h, ofm_dim_w],
+        SIMD=simd,
+        M=m,
+        parallel_window=parallel_window,
+        Stride=[stride_h, stride_w],
+        Dilation=[dilation_h, dilation_w],
+        inputDataType=idt.name,
+        outputDataType=odt.name,
+        depthwise=dw,
+    )
+    graph = helper.make_graph(
+        nodes=[SlidingWindow_node],
+        name="slidingwindow_graph",
+        inputs=[inp],
+        outputs=[outp],
+    )
+
+    model = qonnx_make_model(graph, producer_name="slidingwindow-model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", odt)
+
+    return model
+
+
+def prepare_inputs(input_tensor):
+    return {"inp": input_tensor}
+
+
+# input datatype
+@pytest.mark.parametrize("idt", [DataType["UINT4"]])
+# kernel size
+@pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 3]])
+# input dimension
+@pytest.mark.parametrize("ifm_dim", [[24, 24], [15, 6], [13, 13], [1, 14]])
+# input channels
+@pytest.mark.parametrize("ifm_ch", [6])
+# Stride
+@pytest.mark.parametrize("stride", [[1, 1], [2, 2]])
+# Dilation
+@pytest.mark.parametrize("dilation", [[1, 1], [2, 2]])
+# depthwise
+@pytest.mark.parametrize("dw", [0, 1])
+# input channel parallelism ("SIMD")
+@pytest.mark.parametrize("simd", [1, 2, 3, 6])
+# parallel_window enable (MMV_out = M*K)
+@pytest.mark.parametrize("parallel_window", [0])
+# in/out MMV ("M")
+@pytest.mark.parametrize("m", [1])
+# Flip dimensions
+@pytest.mark.parametrize("flip", [False])
+@pytest.mark.slow
+@pytest.mark.vivado
+@pytest.mark.fpgadataflow
+def test_fpgadataflow_slidingwindow_rtl(
+    idt, k, ifm_dim, ifm_ch, stride, dilation, dw, simd, m, parallel_window, flip
+):
+    if flip:
+        if (
+            ifm_dim[0] == ifm_dim[1]
+            and k[0] == k[1]
+            and stride[0] == stride[1]
+            and dilation[0] == dilation[1]
+        ):
+            pytest.skip("Dimension flip would have no effect")
+        k = k[::-1]
+        ifm_dim = ifm_dim[::-1]
+        stride = stride[::-1]
+        dilation = dilation[::-1]
+
+    k_h, k_w = k
+    ifm_dim_h, ifm_dim_w = ifm_dim
+    stride_h, stride_w = stride
+    dilation_h, dilation_w = dilation
+
+    kernel_width = (k_w - 1) * dilation_w + 1  # incl. dilation
+    kernel_height = (k_h - 1) * dilation_h + 1  # incl. dilation
+
+    if simd > ifm_ch:
+        pytest.skip("SIMD cannot be larger than number of input channels")
+    if ifm_ch % simd != 0:
+        pytest.skip("SIMD must divide number of input channels")
+    if kernel_height > ifm_dim_h or stride_h > ifm_dim_h:
+        pytest.skip(
+            "Illegal convolution configuration: kernel or stride > FM dimension"
+        )
+    if kernel_width > ifm_dim_w or stride_w > ifm_dim_w:
+        pytest.skip(
+            "Illegal convolution configuration: kernel or stride > FM dimension"
+        )
+    if (k_h == 1 and (stride_h != 1 or dilation_h != 1)) or (
+        k_w == 1 and (stride_w != 1 or dilation_w != 1)
+    ):
+        pytest.skip(
+            """Illegal convolution configuration:
+            stride or dilation defined for unitary kernel dim"""
+        )
+    if k_h == 1 and k_w == 1 and simd != ifm_ch:
+        pytest.skip("1x1 Kernel only supported in parallel mode (SIMD=C)")
+    if parallel_window and simd != ifm_ch:
+        pytest.skip("Parallel window requires SIMD=C")
+
+    ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h)
+    ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w)
+    ofm_dim = [ofm_dim_h, ofm_dim_w]
+
+    x = gen_finn_dt_tensor(idt, (1, ifm_dim_h, ifm_dim_w, ifm_ch))
+    model = make_single_slidingwindow_modelwrapper(
+        k=k,
+        ifm_ch=ifm_ch,
+        ifm_dim=ifm_dim,
+        ofm_dim=ofm_dim,
+        simd=simd,
+        m=m,
+        parallel_window=parallel_window,
+        stride=stride,
+        dilation=dilation,
+        idt=idt,
+        dw=dw,
+    )
+
+    model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(PrepareIP("xc7z020clg400-1", 5))
+    model = model.transform(PrepareRTLSim())
+
+    # prepare input data
+    input_dict = prepare_inputs(x)
+    # execute model
+    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
+    golden = make_single_im2col_modelwrapper(
+        k=k,
+        ifm_ch=ifm_ch,
+        ifm_dim=ifm_dim,
+        ofm_dim=ofm_dim,
+        stride=stride,
+        dilation=dilation,
+        idt=idt,
+    )
+    y_expected = oxe.execute_onnx(golden, input_dict)["outp"]
+
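+    # In depthwise mode the golden im2col output is reordered so that
+    # channel groups (of size SIMD) become the outer dimension over the
+    # k_h*k_w window positions, matching the depthwise SWG output layout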
+    if dw == 0:
+        assert (y_produced == y_expected).all()
+    else:
+        y_expected = y_expected.reshape(
+            1, ofm_dim_h, ofm_dim_w, k_h * k_w, ifm_ch // simd, simd
+        )
+        y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5)
+        y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, ifm_ch * k_h * k_w)
+        assert (y_produced == y_expected).all()
diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f7bf649a9284e7716aec5adfb91957fdabb55d5
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py
@@ -0,0 +1,617 @@
+# Copyright (c) 2022, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+import copy
+import numpy as np
+import onnx.parser as oprs
+import os
+from onnx import TensorProto, helper
+from pyverilator.util.axi_utils import axilite_write, reset_rtlsim
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.general.im2col import compute_conv_output_dim
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
+from qonnx.transformation.infer_datatypes import InferDataTypes
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.transformation.lower_convs_to_matmul import (
+    LowerConvsToMatMul,
+    _auto_pad_to_explicit_padding,
+)
+from qonnx.util.basic import gen_finn_dt_tensor, get_by_name, qonnx_make_model
+
+import finn.core.onnx_exec as oxe
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+import finn.transformation.streamline.absorb as absorb
+from finn.core.onnx_exec import execute_onnx
+from finn.core.rtlsim_exec import rtlsim_exec
+from finn.transformation.fpgadataflow.create_dataflow_partition import (
+    CreateDataflowPartition,
+)
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
+from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.util.basic import pyverilate_get_liveness_threshold_cycles
+
+
+def create_conv_model(
+    idim_h, idim_w, ifm, k, stride, ofm, idt, wdt, pad_mode, depthwise
+):
+    np.random.seed(0)
+    group = ifm if depthwise else 1
+    group_str = str(group)
+    ishp = (1, ifm, idim_h, idim_w)
+    pad_0 = _auto_pad_to_explicit_padding(
+        pad_mode, idim_h, idim_w, k, k, stride, stride, 2
+    )
+    int_dim_h = compute_conv_output_dim(
+        idim_h, k, stride, total_pad=pad_0[0] + pad_0[2]
+    )
+    int_dim_w = compute_conv_output_dim(
+        idim_w, k, stride, total_pad=pad_0[1] + pad_0[3]
+    )
+
+    pad_1 = _auto_pad_to_explicit_padding(
+        pad_mode, int_dim_h, int_dim_w, k, k, stride, stride, 2
+    )
+    odim_h = compute_conv_output_dim(
+        int_dim_h, k, stride, total_pad=pad_1[0] + pad_1[2]
+    )
+    odim_w = compute_conv_output_dim(
+        int_dim_w, k, stride, total_pad=pad_1[1] + pad_1[3]
+    )
+    oshp = (1, ifm, odim_h, odim_w) if depthwise else (1, ofm, odim_h, odim_w)
+    wshp = (ifm, 1, k, k) if depthwise else (ofm, ifm, k, k)
+    wshp_1 = (ifm, 1, k, k) if depthwise else (ofm, ofm, k, k)
+    ishp_str = str(list(ishp))
+    oshp_str = str(list(oshp))
+    wshp_str = str(list(wshp))
+    wshp_1_str = str(list(wshp_1))
+    kshp_str = str([k, k])
+    pad_0_str = str(list(pad_0))
+    pad_1_str = str(list(pad_1))
+    stride_str = str([stride, stride])
+    dil_str = str([1, 1])
+
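+    # Two chained Conv layers expressed in ONNX's textual IR; shape, padding
+    # and stride attributes are spliced in via the f-string placeholders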
+    input = f"""
+    <
+        ir_version: 7,
+        opset_import: ["" : 9]
+    >
+    agraph (float{ishp_str} in0) => (float{oshp_str} out0)
+    <
+        float{wshp_str} param_c0_weight,
+        float{wshp_1_str} param_c1_weight
+    >
+    {{
+        conv0 = Conv<
+                dilations={dil_str},group={group_str},kernel_shape={kshp_str},pads={pad_0_str},
+                strides={stride_str}
+            >(in0, param_c0_weight)
+        out0 = Conv<
+                dilations={dil_str},group={group_str},kernel_shape={kshp_str},pads={pad_1_str},
+                strides={stride_str}
+            >(conv0, param_c1_weight)
+    }}
+    """
+    model = oprs.parse_model(input)
+    model = ModelWrapper(model)
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    model.set_tensor_datatype("in0", idt)
+    model.set_tensor_datatype("param_c0_weight", wdt)
+    model.set_tensor_datatype("param_c1_weight", wdt)
+    model.set_initializer("param_c0_weight", gen_finn_dt_tensor(wdt, wshp))
+    model.set_initializer("param_c1_weight", gen_finn_dt_tensor(wdt, wshp_1))
+    return model
+
+
+def update_conv_model_dims(model, idim_new_h, idim_new_w):
+    cnode = model.get_nodes_by_op_type("Conv")[0]
+    k, _ = get_by_name(cnode.attribute, "kernel_shape").ints
+    stride, _ = get_by_name(cnode.attribute, "strides").ints
+    ishp = model.get_tensor_shape("in0")
+    n, ci, _, _ = ishp
+    n, co, _, _ = model.get_tensor_shape("out0")
+    int_dim_h = compute_conv_output_dim(idim_new_h, k, stride)
+    int_dim_w = compute_conv_output_dim(idim_new_w, k, stride)
+    odim_h = compute_conv_output_dim(int_dim_h, k, stride)
+    odim_w = compute_conv_output_dim(int_dim_w, k, stride)
+    model.set_tensor_shape("in0", (n, ci, idim_new_h, idim_new_w))
+    model.set_tensor_shape("out0", (n, co, odim_h, odim_w))
+    # remove all existing shapes
+    del model.graph.value_info[:]
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    return model
+
+
+# Helper function to update tensor dimensions manually, since shape inference
+# does not work on FINN custom nodes (they assume well-defined tensor shapes).
+def update_tensor_dim(model, tensor_name, new_hw):
+    shape = model.get_tensor_shape(tensor_name)
+    shape[1] = new_hw[0]
+    shape[2] = new_hw[1]
+    model.set_tensor_shape(tensor_name, shape)
+
+
+# Helper function that returns the pre-rtlsim hook to program the SWG via AXI-Lite
+def config_hook(configs):
+    if configs is None:
+        return None
+
+    def write_swg_config(sim):
+        reset_rtlsim(sim)
+        for axi_name, config in configs:
+            # Write config registers to the SWG/FMPadding node; each config
+            # dict entry defines an (addr, value) tuple
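+            # (e.g. a hypothetical entry (0x10, 32) would write value 32
+            # to the register at AXI-lite offset 0x10)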
+            for config_entry in config.values():
+                axilite_write(sim, config_entry[0], config_entry[1], basename=axi_name)
+        reset_rtlsim(sim)
+
+    return write_swg_config
+
+
+cfg0 = {
+    "idims": [(32, 32), (8, 8)],
+    "ifm": 64,
+    "k": 3,
+    "stride": 1,
+    "ofm": 64,
+    "depthwise": True,
+    "pad_mode": "SAME_UPPER",
+}
+cfg1 = {
+    "idims": [(32, 16), (16, 8)],
+    "ifm": 4,
+    "k": 4,
+    "stride": 1,
+    "ofm": 8,
+    "depthwise": False,
+    "pad_mode": "SAME_UPPER",
+}
+cfg2 = {
+    "idims": [(64, 128), (2, 4)],
+    "ifm": 64,
+    "k": 3,
+    "stride": 1,
+    "ofm": 64,
+    "depthwise": True,
+    "pad_mode": "SAME_UPPER",
+}
+
+
+@pytest.mark.parametrize("cfg", [cfg0, cfg1, cfg2])
+@pytest.mark.slow
+@pytest.mark.vivado
+@pytest.mark.fpgadataflow
+def test_fpgadataflow_conv_dynamic(cfg):
+    pad_mode = cfg["pad_mode"]
+    depthwise = cfg["depthwise"]
+    idims = cfg["idims"]
+    ifm = cfg["ifm"]
+    k = cfg["k"]
+    stride = cfg["stride"]
+    ofm = cfg["ofm"]
+    idt = DataType["UINT4"]
+    wdt = DataType["INT2"]
+    exp_cfgs = []
+    largest_model = None
+    for idim in idims:
+        idim_h, idim_w = idim
+        ishp = (1, ifm, idim_h, idim_w)
+        np.random.seed(0)
+        inp = gen_finn_dt_tensor(idt, ishp)
+        model = create_conv_model(
+            idim_h, idim_w, ifm, k, stride, ofm, idt, wdt, pad_mode, depthwise
+        )
+        _, _, int_dim_h, int_dim_w = model.get_tensor_shape("conv0")
+        _, _, odim_h, odim_w = model.get_tensor_shape("out0")
+        pad0 = get_by_name(model.graph.node[0].attribute, "pads").ints
+        pad1 = get_by_name(model.graph.node[1].attribute, "pads").ints
+        if idim == max(idims):
+            # use largest model for hardware conversion
+            largest_model = copy.deepcopy(model)
+        golden = execute_onnx(model, {"in0": inp})["out0"]
+        exp_cfg = (
+            (idim_h, idim_w),
+            (int_dim_h, int_dim_w),
+            (odim_h, odim_w),
+            pad0,
+            pad1,
+            inp,
+            golden,
+        )
+        exp_cfgs.append(exp_cfg)
+
+    # convert to hardware and prepare simulation
+    model = largest_model.transform(LowerConvsToMatMul())
+    model = model.transform(to_hls.InferConvInpGen(use_rtl_variant=True))
+    model = model.transform(
+        to_hls.InferQuantizedMatrixVectorActivation(mem_mode="decoupled")
+    )
+    model = model.transform(to_hls.InferVectorVectorActivation())
+    model = model.transform(absorb.AbsorbConsecutiveTransposes())
+    parent_model = model.transform(CreateDataflowPartition())
+    sdp_inst = getCustomOp(
+        parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+    )
+    model = ModelWrapper(sdp_inst.get_nodeattr("model"))
+    assert len(model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")) == 2
+    if pad_mode == "VALID":
+        assert len(model.get_nodes_by_op_type("FMPadding_rtl")) == 0
+    else:
+        assert len(model.get_nodes_by_op_type("FMPadding_rtl")) == 2
+    dyn_nodes = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")
+    dyn_nodes += model.get_nodes_by_op_type("FMPadding_rtl")
+    for swg_node in dyn_nodes:
+        getCustomOp(swg_node).set_nodeattr("SIMD", 4)
+        getCustomOp(swg_node).set_nodeattr("dynamic_mode", 1)
+        getCustomOp(swg_node).set_nodeattr("inFIFODepths", [16])
+        getCustomOp(swg_node).set_nodeattr("outFIFODepths", [16])
+    comp_nodes = model.get_nodes_by_op_type("MatrixVectorActivation")
+    comp_nodes += model.get_nodes_by_op_type("VectorVectorActivation")
+    for comp_node in comp_nodes:
+        if depthwise:
+            getCustomOp(comp_node).set_nodeattr("PE", 4)
+        else:
+            getCustomOp(comp_node).set_nodeattr("SIMD", 4)
+            getCustomOp(comp_node).set_nodeattr("PE", 4)
+    model = model.transform(InsertDWC())
+    model = model.transform(InsertFIFO(create_shallow_fifos=True))
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+    model = model.transform(PrepareIP("xc7z020clg400-1", 5))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(CreateStitchedIP("xc7z020clg400-1", 5))
+    model.set_metadata_prop("exec_mode", "rtlsim")
+
+    # loop through experiment configurations
+    for exp_cfg in exp_cfgs:
+        (
+            (idim_h, idim_w),
+            (int_dim_h, int_dim_w),
+            (odim_h, odim_w),
+            pad0,
+            pad1,
+            inp,
+            golden,
+        ) = exp_cfg
+        conv0_idim_h = idim_h + pad0[0] + pad0[2]
+        conv0_idim_w = idim_w + pad0[1] + pad0[3]
+        conv1_idim_h = int_dim_h + pad1[0] + pad1[2]
+        conv1_idim_w = int_dim_w + pad1[1] + pad1[3]
+        # get config for the new dimensions
+        swg_nodes = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")
+        swg0 = getCustomOp(swg_nodes[0])
+        update_tensor_dim(model, swg0.onnx_node.input[0], (conv0_idim_h, conv0_idim_w))
+        update_tensor_dim(model, swg0.onnx_node.output[0], (int_dim_h, int_dim_w))
+        swg_config0 = swg0.get_dynamic_config((conv0_idim_h, conv0_idim_w))
+        swg1 = getCustomOp(swg_nodes[1])
+        update_tensor_dim(model, swg1.onnx_node.input[0], (conv1_idim_h, conv1_idim_w))
+        update_tensor_dim(model, swg1.onnx_node.output[0], (odim_h, odim_w))
+        swg_config1 = swg1.get_dynamic_config((conv1_idim_h, conv1_idim_w))
+        if pad_mode != "VALID":
+            pad_nodes = model.get_nodes_by_op_type("FMPadding_rtl")
+            padder0 = getCustomOp(pad_nodes[0])
+            update_tensor_dim(model, padder0.onnx_node.input[0], (idim_h, idim_w))
+            update_tensor_dim(
+                model, padder0.onnx_node.output[0], (conv0_idim_h, conv0_idim_w)
+            )
+            pad_config0 = padder0.get_dynamic_config((idim_h, idim_w), pad0)
+            padder1 = getCustomOp(pad_nodes[1])
+            update_tensor_dim(model, padder1.onnx_node.input[0], (int_dim_h, int_dim_w))
+            update_tensor_dim(
+                model, padder1.onnx_node.output[0], (conv1_idim_h, conv1_idim_w)
+            )
+            pad_config1 = padder1.get_dynamic_config((int_dim_h, int_dim_w), pad1)
+            configs = [
+                ("s_axilite_0_", pad_config0),
+                ("s_axilite_1_", swg_config0),
+                ("s_axilite_2_", pad_config1),
+                ("s_axilite_3_", swg_config1),
+            ]
+        else:
+            configs = [("s_axilite_0_", swg_config0), ("s_axilite_1_", swg_config1)]
+        # adjust folded shapes for I/O FIFOs
+        # (since rtlsim_exec uses folded shape info to fold global i/o tensors)
+        first_node = getCustomOp(model.graph.node[0])
+        first_node_shp = list(first_node.get_folded_input_shape())
+        first_node_shp[1] = idim_h
+        first_node_shp[2] = idim_w
+        first_node.set_nodeattr("folded_shape", first_node_shp)
+        update_tensor_dim(model, first_node.onnx_node.input[0], (idim_h, idim_w))
+        last_node = getCustomOp(model.graph.node[-1])
+        last_node_shp = list(last_node.get_folded_output_shape())
+        last_node_shp[1] = odim_h
+        last_node_shp[2] = odim_w
+        update_tensor_dim(model, last_node.onnx_node.output[0], (odim_h, odim_w))
+        last_node.set_nodeattr("folded_shape", last_node_shp)
+        ctx = {"global_in": inp.transpose(0, 2, 3, 1)}
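+        # temporarily raise the rtlsim liveness threshold so the larger
+        # feature maps do not hit the simulation timeout; restored below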
+        liveness_prev = pyverilate_get_liveness_threshold_cycles()
+        os.environ["LIVENESS_THRESHOLD"] = "100000"
+        rtlsim_exec(model, ctx, pre_hook=config_hook(configs))
+        os.environ["LIVENESS_THRESHOLD"] = str(liveness_prev)
+        ret = ctx["global_out"].transpose(0, 3, 1, 2)
+        assert np.isclose(golden, ret).all()
+
+
+def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt):
+    k_h, k_w = k
+    ifm_dim_h, ifm_dim_w = ifm_dim
+    stride_h, stride_w = stride
+    dilation_h, dilation_w = dilation
+    ofm_dim_h, ofm_dim_w = ofm_dim
+
+    odt = idt
+    inp = helper.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]
+    )
+    outp = helper.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch]
+    )
+
+    im2col_node = helper.make_node(
+        "Im2Col",
+        ["inp"],
+        ["outp"],
+        domain="finn.custom_op.general",
+        stride=[stride_h, stride_w],
+        kernel_size=[k_h, k_w],
+        input_shape=str((1, ifm_dim_h, ifm_dim_w, ifm_ch)),
+        dilations=[dilation_h, dilation_w],
+        pad_amount=[0, 0, 0, 0],
+        pad_value=0,
+    )
+    graph = helper.make_graph(
+        nodes=[im2col_node], name="im2col_graph", inputs=[inp], outputs=[outp]
+    )
+
+    model = qonnx_make_model(graph, producer_name="im2col-model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", odt)
+
+    return model
+
+
+def make_single_slidingwindow_modelwrapper(
+    k, ifm_ch, ifm_dim, ofm_dim, simd, m, parallel_window, stride, dilation, idt, dw=0
+):
+    k_h, k_w = k
+    ifm_dim_h, ifm_dim_w = ifm_dim
+    stride_h, stride_w = stride
+    dilation_h, dilation_w = dilation
+    ofm_dim_h, ofm_dim_w = ofm_dim
+
+    odt = idt
+    inp = helper.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]
+    )
+    outp = helper.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch]
+    )
+
+    SlidingWindow_node = helper.make_node(
+        "ConvolutionInputGenerator_rtl",
+        ["inp"],
+        ["outp"],
+        domain="finn.custom_op.fpgadataflow",
+        backend="fpgadataflow",
+        ConvKernelDim=[k_h, k_w],
+        IFMChannels=ifm_ch,
+        IFMDim=[ifm_dim_h, ifm_dim_w],
+        OFMDim=[ofm_dim_h, ofm_dim_w],
+        SIMD=simd,
+        M=m,
+        parallel_window=parallel_window,
+        Stride=[stride_h, stride_w],
+        Dilation=[dilation_h, dilation_w],
+        inputDataType=idt.name,
+        outputDataType=odt.name,
+        depthwise=dw,
+        dynamic_mode=1,
+    )
+    graph = helper.make_graph(
+        nodes=[SlidingWindow_node],
+        name="slidingwindow_graph",
+        inputs=[inp],
+        outputs=[outp],
+    )
+
+    model = qonnx_make_model(graph, producer_name="slidingwindow-model")
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", odt)
+
+    return model
+
+
+def prepare_inputs(input_tensor):
+    return {"inp": input_tensor}
+
+
+# input datatype
+@pytest.mark.parametrize("idt", [DataType["UINT4"]])
+# kernel size
+@pytest.mark.parametrize("k", [[3, 3]])
+# input dimension
+@pytest.mark.parametrize("ifm_dim_series", [[[32, 32], [16, 16], [8, 8]]])
+# input channels
+@pytest.mark.parametrize("ifm_ch", [6])
+# Stride
+@pytest.mark.parametrize("stride", [[1, 1]])
+# Dilation
+@pytest.mark.parametrize("dilation", [[1, 1]])
+# depthwise
+@pytest.mark.parametrize("dw", [0, 1])
+# input channel parallelism ("SIMD")
+@pytest.mark.parametrize("simd", [2, 6])
+# parallel_window enable (MMV_out = M*K)
+@pytest.mark.parametrize("parallel_window", [0])
+# in/out MMV ("M")
+@pytest.mark.parametrize("m", [1])
+@pytest.mark.slow
+@pytest.mark.vivado
+@pytest.mark.fpgadataflow
+def test_fpgadataflow_slidingwindow_rtl_dynamic(
+    idt, k, ifm_dim_series, ifm_ch, stride, dilation, dw, simd, m, parallel_window
+):
+    # Begin test by generating RTL SWG normally for the first FM of the series.
+    # The following FM dimensions must be equal to or smaller than the
+    # initial dimensions (in terms of required buffer depth).
+    ifm_dim = ifm_dim_series[0]
+
+    k_h, k_w = k
+    ifm_dim_h, ifm_dim_w = ifm_dim
+    stride_h, stride_w = stride
+    dilation_h, dilation_w = dilation
+    ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h)
+    ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w)
+    ofm_dim = [ofm_dim_h, ofm_dim_w]
+    kernel_width = (k_w - 1) * dilation_w + 1  # incl. dilation
+    kernel_height = (k_h - 1) * dilation_h + 1  # incl. dilation
+
+    if simd > ifm_ch:
+        pytest.skip("SIMD cannot be larger than number of input channels")
+    if ifm_ch % simd != 0:
+        pytest.skip("SIMD must divide number of input channels")
+    if kernel_height > ifm_dim_h or stride_h > ifm_dim_h:
+        pytest.skip(
+            "Illegal convolution configuration: kernel or stride > FM dimension"
+        )
+    if kernel_width > ifm_dim_w or stride_w > ifm_dim_w:
+        pytest.skip(
+            "Illegal convolution configuration: kernel or stride > FM dimension"
+        )
+    if (k_h == 1 and (stride_h != 1 or dilation_h != 1)) or (
+        k_w == 1 and (stride_w != 1 or dilation_w != 1)
+    ):
+        pytest.skip(
+            """Illegal convolution configuration:
+            stride or dilation defined for unitary kernel dim"""
+        )
+    if k_h == 1 and k_w == 1 and simd != ifm_ch:
+        pytest.skip("1x1 Kernel only supported in parallel mode (SIMD=C)")
+    if parallel_window and simd != ifm_ch:
+        pytest.skip("Parallel window requires SIMD=C")
+
+    model = make_single_slidingwindow_modelwrapper(
+        k=k,
+        ifm_ch=ifm_ch,
+        ifm_dim=ifm_dim,
+        ofm_dim=ofm_dim,
+        simd=simd,
+        m=m,
+        parallel_window=parallel_window,
+        stride=stride,
+        dilation=dilation,
+        idt=idt,
+        dw=dw,
+    )
+
+    # Simulate using stitched-ip-rtlsim so we can use existing infrastructure
+    # that supports hook functions to re-program configuration before rtlsim
+    model = model.transform(InsertFIFO(True))  # required for proper simulation
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(PrepareIP("xc7z020clg400-1", 5))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(CreateStitchedIP("xc7z020clg400-1", 5))
+    model.set_metadata_prop("exec_mode", "rtlsim")
+
+    # Simulate 1 FM for each dimension in the series
+    for i, ifm_dim in enumerate(ifm_dim_series):
+        ifm_dim_h, ifm_dim_w = ifm_dim
+        ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h)
+        ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w)
+        ofm_dim = [ofm_dim_h, ofm_dim_w]
+
+        configs = None
+        if i > 0:  # skip re-programming for initial FM dimension
+            # Necessary update of node and tensor attributes to make rtlsim work:
+            swg_node = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")[0]
+            swg_inst = getCustomOp(swg_node)
+            update_tensor_dim(model, swg_node.input[0], ifm_dim)
+            update_tensor_dim(model, swg_node.output[0], ofm_dim)
+
+            # Generate config, also overwrites IFMDim/OFMDim attributes:
+            config = swg_inst.get_dynamic_config(ifm_dim)
+            configs = [("s_axilite_0_", config)]
+
+            # Also update FIFO nodes and corresponding tensors
+            fifo_node = model.get_nodes_by_op_type("StreamingFIFO")[0]
+            fifo_inst = getCustomOp(fifo_node)
+            shape = fifo_inst.get_nodeattr("folded_shape")
+            shape[1] = ifm_dim_h
+            shape[2] = ifm_dim_w
+            fifo_inst.set_nodeattr("folded_shape", shape)
+            update_tensor_dim(model, fifo_node.input[0], ifm_dim)
+
+            fifo_node = model.get_nodes_by_op_type("StreamingFIFO")[1]
+            fifo_inst = getCustomOp(fifo_node)
+            shape = fifo_inst.get_nodeattr("folded_shape")
+            shape[1] = ofm_dim_h
+            shape[2] = ofm_dim_w
+            fifo_inst.set_nodeattr("folded_shape", shape)
+            update_tensor_dim(model, fifo_node.output[0], ofm_dim)
+
+        # Run rtlsim on stitched-ip
+        x = gen_finn_dt_tensor(idt, (1, ifm_dim_h, ifm_dim_w, ifm_ch))
+        context = prepare_inputs(x)
+        rtlsim_exec(model, context, pre_hook=config_hook(configs))
+        y_produced = context["outp"]
+
+        # Generate golden result
+        golden = make_single_im2col_modelwrapper(
+            k=k,
+            ifm_ch=ifm_ch,
+            ifm_dim=ifm_dim,
+            ofm_dim=ofm_dim,
+            stride=stride,
+            dilation=dilation,
+            idt=idt,
+        )
+        input_dict = prepare_inputs(x)
+        y_expected = oxe.execute_onnx(golden, input_dict)["outp"]
+
+        # Check result
+        if dw == 0:
+            assert (y_produced == y_expected).all()
+        else:
+            y_expected = y_expected.reshape(
+                1, ofm_dim_h, ofm_dim_w, k_h * k_w, ifm_ch // simd, simd
+            )
+            y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5)
+            y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, ifm_ch * k_h * k_w)
+            assert (y_produced == y_expected).all()
diff --git a/tests/fpgadataflow/test_fpgadataflow_downsampler.py b/tests/fpgadataflow/test_fpgadataflow_downsampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..64da0a2368a69d6037c681d88391eef2844dae2c
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_downsampler.py
@@ -0,0 +1,160 @@
+# Copyright (c) 2022, Xilinx, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+import numpy as np
+import onnx.parser as oprs
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.general.im2col import compute_conv_output_dim
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.general import GiveUniqueNodeNames
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul
+from qonnx.util.basic import gen_finn_dt_tensor
+
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
+from finn.core.onnx_exec import execute_onnx
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+
+
+def build_model(is_1d, in_dim, k, stride, dt_in, dt_w, pad_half=0, flip_1d=False):
+    np.random.seed(0)
+    out_dim = compute_conv_output_dim(in_dim, k, stride, 2 * pad_half)
+    ifm = 8
+    ofm = 16
+    if is_1d:
+        if flip_1d:
+            shape_in = [1, ifm, 1, in_dim]
+            shape_out = [1, ofm, 1, out_dim]
+            shape_k = [1, k]
+            shape_s = [1, stride]
+            shape_p = [0, pad_half, 0, pad_half]
+        else:
+            shape_in = [1, ifm, in_dim, 1]
+            shape_out = [1, ofm, out_dim, 1]
+            shape_k = [k, 1]
+            shape_s = [stride, 1]
+            shape_p = [pad_half, 0, pad_half, 0]
+    else:
+        shape_in = [1, ifm, in_dim, in_dim]
+        shape_out = [1, ofm, out_dim, out_dim]
+        shape_k = [k, k]
+        shape_s = [stride, stride]
+        shape_p = [pad_half, pad_half, pad_half, pad_half]
+    shape_w = [ofm, ifm] + shape_k
+
+    sstr_in = str(shape_in)
+    sstr_out = str(shape_out)
+    sstr_k = str(shape_k)
+    sstr_s = str(shape_s)
+    sstr_p = str(shape_p)
+    sstr_w = str(shape_w)
+
+    input = f"""
+    <
+        ir_version: 7,
+        opset_import: ["" : 9]
+    >
+    agraph (float{sstr_in} in0) => (float{sstr_out} out0)
+    <
+        float{sstr_w} param_w_conv0
+    >
+    {{
+        out0 = Conv<kernel_shape={sstr_k}, group=1, pads={sstr_p},
+                    strides={sstr_s}>(in0, param_w_conv0)
+    }}
+    """
+    model = oprs.parse_model(input)
+    model = ModelWrapper(model)
+    model.set_tensor_datatype("in0", dt_in)
+    model.set_tensor_datatype("param_w_conv0", dt_w)
+    model.set_initializer("param_w_conv0", gen_finn_dt_tensor(dt_w, shape_w))
+    model = model.transform(InferShapes())
+    model = model.transform(LowerConvsToMatMul())
+    model = model.transform(InferShapes())
+    return model
+
+
+@pytest.mark.parametrize("is_1d", [True, False])
+@pytest.mark.parametrize("flip_1d", [True, False])
+@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+@pytest.mark.slow
+@pytest.mark.vivado
+@pytest.mark.fpgadataflow
+def test_fpgadataflow_downsampler(is_1d, flip_1d, exec_mode):
+    if flip_1d and not is_1d:
+        pytest.skip("flip_1d only applicable for is_1d")
+    in_dim = 32
+    k = 1
+    stride = 2
+    dt_in = DataType["UINT8"]
+    dt_w = DataType["INT2"]
+    model = build_model(
+        is_1d, in_dim, k, stride, dt_in, dt_w, pad_half=0, flip_1d=flip_1d
+    )
+    inp = gen_finn_dt_tensor(dt_in, model.get_tensor_shape("in0"))
+    idict = {"in0": inp}
+    y_expected = execute_onnx(model, idict)["out0"]
+    model = model.transform(to_hls.InferConvInpGen())
+    assert len(model.get_nodes_by_op_type("DownSampler")) == 1
+    if exec_mode == "cppsim":
+        model = model.transform(SetExecMode("cppsim"))
+        model = model.transform(PrepareCppSim())
+        model = model.transform(CompileCppSim())
+    elif exec_mode == "rtlsim":
+        model = model.transform(SetExecMode("rtlsim"))
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(PrepareIP("xc7z020clg400-1", 5))
+        model = model.transform(HLSSynthIP())
+        model = model.transform(PrepareRTLSim())
+    else:
+        raise Exception("Unknown exec_mode")
+    y_produced = execute_onnx(model, idict)["out0"]
+    assert (y_produced == y_expected).all()
+    if exec_mode == "rtlsim":
+        node = model.get_nodes_by_op_type("DownSampler")[0]
+        inst = getCustomOp(node)
+        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
+        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
+        exp_cycles = exp_cycles_dict[node.name]
+        # small adjustment for the 2D testcase due to how rtlsim works:
+        # the output finishes before all pixels are read, since the last
+        # row is dropped (rtlsim terminates based on the number of
+        # expected output pixels)
+        if not is_1d:
+            exp_cycles = exp_cycles - in_dim
+        assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
+        assert exp_cycles != 0
diff --git a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py
index 7ec254405d23f0a972de7f9d02d2ab021ed3d959..441bbce50a8a218185f93a7968767abe2541ed15 100644
--- a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py
+++ b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py
@@ -36,7 +36,7 @@ from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
 from qonnx.transformation.infer_datatypes import InferDataTypes
 from qonnx.transformation.infer_shapes import InferShapes
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
@@ -76,7 +76,7 @@ def make_dupstreams_modelwrapper(ch, pe, idim, idt, n_dupl):
         nodes=[dupstrm_node], name="graph", inputs=[inp], outputs=out_vi
     )
 
-    model = helper.make_model(graph, producer_name="addstreams-model")
+    model = qonnx_make_model(graph, producer_name="addstreams-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
diff --git a/tests/fpgadataflow/test_fpgadataflow_dwc.py b/tests/fpgadataflow/test_fpgadataflow_dwc.py
index bcf2a1fe3d304ac27a06b544825a84f5757830c9..2bde148a1499e4c7065ab1e151e3c4198e1e96da 100644
--- a/tests/fpgadataflow/test_fpgadataflow_dwc.py
+++ b/tests/fpgadataflow/test_fpgadataflow_dwc.py
@@ -32,19 +32,19 @@ from onnx import TensorProto, helper
 from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.transformation.general import GiveUniqueNodeNames
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
-from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
-from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 
 
-def make_single_dwc_modelwrapper(Shape, INWidth, OUTWidth, finn_dtype):
+def make_single_dwc_modelwrapper(shape, inWidth, outWidth, finn_dtype, impl_style):
 
-    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, Shape)
-    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, Shape)
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape)
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, shape)
 
     DWC_node = helper.make_node(
         "StreamingDataWidthConverter_Batch",
@@ -52,17 +52,18 @@ def make_single_dwc_modelwrapper(Shape, INWidth, OUTWidth, finn_dtype):
         ["outp"],
         domain="finn.custom_op.fpgadataflow",
         backend="fpgadataflow",
-        shape=Shape,
-        inWidth=INWidth,
-        outWidth=OUTWidth,
+        shape=shape,
+        inWidth=inWidth,
+        outWidth=outWidth,
         dataType=str(finn_dtype.name),
+        impl_style=impl_style,
     )
 
     graph = helper.make_graph(
         nodes=[DWC_node], name="dwc_graph", inputs=[inp], outputs=[outp]
     )
 
-    model = helper.make_model(graph, producer_name="dwc-model")
+    model = qonnx_make_model(graph, producer_name="dwc-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", finn_dtype)
@@ -75,34 +76,42 @@ def prepare_inputs(input_tensor, dt):
     return {"inp": input_tensor}
 
 
-# shape
-@pytest.mark.parametrize("Shape", [[1, 4], [1, 2, 8]])
-# inWidth
-@pytest.mark.parametrize("INWidth", [2, 4])
-# outWidth
-@pytest.mark.parametrize("OUTWidth", [2, 4])
-# finn_dtype
-@pytest.mark.parametrize("finn_dtype", [DataType["BIPOLAR"], DataType["INT2"]])
+@pytest.mark.parametrize(
+    "config",
+    [
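+        # each entry: (shape, inWidth, outWidth, finn_dtype, impl_style)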
+        ([1, 24], 6, 4, DataType["INT2"], "hls"),
+        ([1, 24], 4, 6, DataType["INT2"], "hls"),
+        ([1, 4], 2, 4, DataType["BIPOLAR"], "hls"),
+        ([1, 2, 8], 2, 4, DataType["BIPOLAR"], "hls"),
+        ([1, 4], 4, 2, DataType["INT2"], "hls"),
+        ([1, 2, 8], 4, 4, DataType["INT2"], "hls"),
+        ([1, 2, 8], 8, 16, DataType["INT2"], "vivado"),
+    ],
+)
 @pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_fpgadataflow_dwc_rtlsim(Shape, INWidth, OUTWidth, finn_dtype):
-
+def test_fpgadataflow_dwc_rtlsim(config):
+    shape, inWidth, outWidth, finn_dtype, impl_style = config
+    test_fpga_part = "xc7z020clg400-1"
+    target_clk_ns = 10.0
     # generate input data
-    x = gen_finn_dt_tensor(finn_dtype, Shape)
+    x = gen_finn_dt_tensor(finn_dtype, shape)
     input_dict = prepare_inputs(x, finn_dtype)
 
-    model = make_single_dwc_modelwrapper(Shape, INWidth, OUTWidth, finn_dtype)
-
-    model = model.transform(SetExecMode("rtlsim"))
+    model = make_single_dwc_modelwrapper(
+        shape, inWidth, outWidth, finn_dtype, impl_style
+    )
+    model = model.transform(InsertFIFO(create_shallow_fifos=True))
     model = model.transform(GiveUniqueNodeNames())
-    model = model.transform(PrepareIP("xc7z020clg400-1", 5))
+    model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
     model = model.transform(HLSSynthIP())
-    model = model.transform(PrepareRTLSim())
+    model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
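+    # stitched-IP rtlsim is enabled via model metadata instead of per-node SetExecMode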
+    model.set_metadata_prop("exec_mode", "rtlsim")
     y = oxe.execute_onnx(model, input_dict)["outp"]
 
     assert (
         y == x
     ).all(), """The output values are not the same as the
         input values anymore."""
-    assert y.shape == tuple(Shape), """The output shape is incorrect."""
+    assert y.shape == tuple(shape), """The output shape is incorrect."""
diff --git a/tests/fpgadataflow/test_fpgadataflow_eltwise.py b/tests/fpgadataflow/test_fpgadataflow_eltwise.py
new file mode 100644
index 0000000000000000000000000000000000000000..6028a9b9f0fb4a04d0f53fd8c4fae3aac3ae686e
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_eltwise.py
@@ -0,0 +1,133 @@
+# Copyright (c) 2022, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+import numpy as np
+import onnx.parser as oprs
+import qonnx.core.data_layout as dl
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.general import GiveUniqueNodeNames
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import gen_finn_dt_tensor
+
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
+from finn.core.onnx_exec import execute_onnx
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+
+
+def build_model(shp, dt0, dt1, do_abs):
+    np.random.seed(0)
+    shp_str = str(shp)
+    if do_abs:
+        graph = """
+        sub_out = Sub(in0, in1)
+        out0 = Abs(sub_out)
+        """
+    else:
+        graph = "out0 = Sub(in0, in1)"
+
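+    # assemble a textual ONNX model and parse it into a ModelProto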
+    input = f"""
+    <
+        ir_version: 7,
+        opset_import: ["" : 9]
+    >
+    agraph (float{shp_str} in0, float{shp_str} in1) => (float{shp_str} out0)
+    {{
+        {graph}
+    }}
+    """
+    model = oprs.parse_model(input)
+    model = ModelWrapper(model)
+    model.set_tensor_datatype("in0", dt0)
+    model.set_tensor_datatype("in1", dt1)
+    model.set_tensor_layout("in0", dl.NHWC)
+    model.set_tensor_layout("in1", dl.NHWC)
+    model = model.transform(InferShapes())
+    return model
+
+
+# input datatype for one operand
+@pytest.mark.parametrize("dt0", [DataType["UINT4"], DataType["UINT7"]])
+# channels
+@pytest.mark.parametrize("ch", [1, 64])
+# folding
+@pytest.mark.parametrize("fold", [-1, 2, 1])
+# include Abs output node or not
+@pytest.mark.parametrize("do_abs", [True, False])
+# execution mode
+@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+@pytest.mark.fpgadataflow
+@pytest.mark.vivado
+def test_fpgadataflow_eltwise(dt0, ch, fold, do_abs, exec_mode):
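+    # fold == -1 selects maximal folding (PE = 1), otherwise PE = max(1, ch // fold)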
+    if fold == -1:
+        pe = 1
+    else:
+        pe = max(1, ch // fold)
+    assert ch % pe == 0
+    dt1 = DataType["UINT8"]
+    shp = [1, 4, 2, ch]
+    model = build_model(shp, dt0, dt1, do_abs)
+    in0 = gen_finn_dt_tensor(dt0, shp)
+    in1 = gen_finn_dt_tensor(dt1, shp)
+    idict = {"in0": in0, "in1": in1}
+    y_expected = execute_onnx(model, idict)["out0"]
+    model = model.transform(to_hls.InferStreamingEltwise())
+    assert len(model.graph.node) == 1
+    assert model.graph.node[0].op_type == "StreamingEltwise"
+    getCustomOp(model.graph.node[0]).set_nodeattr("PE", pe)
+    if exec_mode == "cppsim":
+        model = model.transform(PrepareCppSim())
+        model = model.transform(CompileCppSim())
+        model = model.transform(SetExecMode("cppsim"))
+    elif exec_mode == "rtlsim":
+        model = model.transform(SetExecMode("rtlsim"))
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(PrepareIP("xc7z020clg400-1", 5))
+        model = model.transform(HLSSynthIP())
+        model = model.transform(PrepareRTLSim())
+    else:
+        raise Exception("Unknown exec_mode")
+    y_produced = execute_onnx(model, idict)["out0"]
+    assert (y_produced == y_expected).all(), exec_mode + " failed"
+    if exec_mode == "rtlsim":
+        node = model.get_nodes_by_op_type("StreamingEltwise")[0]
+        inst = getCustomOp(node)
+        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
+        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
+        exp_cycles = exp_cycles_dict[node.name]
+        assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
+        assert exp_cycles != 0
diff --git a/tests/fpgadataflow/test_fpgadataflow_fifo.py b/tests/fpgadataflow/test_fpgadataflow_fifo.py
index b9c74185d9f104e15355a5dd6021d7e74dac641e..efdb3bf6aaab23fec67055ae28b2e285f1a32b6a 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fifo.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fifo.py
@@ -33,7 +33,7 @@ from onnx import TensorProto, helper
 from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.transformation.general import GiveUniqueNodeNames
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
@@ -66,7 +66,7 @@ def make_single_fifo_modelwrapper(Shape, Depth, fld_shape, finn_dtype):
         nodes=[FIFO_node], name="fifo_graph", inputs=[inp], outputs=[outp]
     )
 
-    model = helper.make_model(graph, producer_name="fifo-model")
+    model = qonnx_make_model(graph, producer_name="fifo-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", finn_dtype)
diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
index 2e2da0da7a217091d76d0a59a2a36a8e6a28af8e..b95409fda87718f30a74bad88697c3dbad0bf98f 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
@@ -36,7 +36,7 @@ from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
 from qonnx.transformation.infer_shapes import InferShapes
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
@@ -53,12 +53,11 @@ test_fpga_part = pynq_part_map[test_pynq_board]
 target_clk_ns = 10
 
 
-def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_style):
+def make_single_fmpadding_modelwrapper(optype, idim, padding, num_ch, simd, idt):
     pad_h = padding[0] + padding[2]
     pad_w = padding[1] + padding[3]
     idim_h, idim_w = idim
 
-    assert pad_style == 2, "only pad_style == 2 supported in hlslib"
     assert pad_h > 0 or pad_w > 0, "Output dim should be greater than input dim"
     odim_h = idim_h + pad_h
     odim_w = idim_w + pad_w
@@ -71,7 +70,7 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_sty
     )
 
     FMPadding = helper.make_node(
-        "FMPadding_Batch",
+        optype,
         ["inp"],
         ["outp"],
         domain="finn.custom_op.fpgadataflow",
@@ -80,7 +79,6 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_sty
         Padding=padding,
         NumChannels=num_ch,
         inputDataType=str(idt.name),
-        PaddingStyle=pad_style,
         numInputVectors=1,
         SIMD=simd,
     )
@@ -89,7 +87,7 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_sty
         nodes=[FMPadding], name="fmpadding_graph", inputs=[inp], outputs=[outp]
     )
 
-    model = helper.make_model(graph, producer_name="fmpadding-model")
+    model = qonnx_make_model(graph, producer_name="fmpadding-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
@@ -101,21 +99,25 @@ def make_single_fmpadding_modelwrapper(idim, padding, num_ch, simd, idt, pad_sty
 # input image dimension
 @pytest.mark.parametrize("idim", [[8, 8], [10, 8]])
 # number of rows and number of cols to add
-@pytest.mark.parametrize("pad", [[1, 1, 1, 1], [1, 1, 2, 2], [1, 3, 2, 3]])
+@pytest.mark.parametrize(
+    "pad", [[1, 1, 1, 1], [1, 1, 2, 2], [1, 3, 2, 3], [7, 0, 8, 0]]
+)
 # number of channels
 @pytest.mark.parametrize("num_ch", [2, 4])
 # Input parallelism
 @pytest.mark.parametrize("simd", [1, 2])
-# PaddingStyle: selects behavior when (odim-idim)%2 != 0
-@pytest.mark.parametrize("pad_style", [2])
 # FINN input datatype
 @pytest.mark.parametrize("idt", [DataType["INT2"], DataType["INT4"]])
 # execution mode
 @pytest.mark.parametrize("mode", ["cppsim", "rtlsim"])
+# implementation style
+@pytest.mark.parametrize("impl_style", ["rtl", "hls"])
 @pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, pad_style, idt, mode):
+def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode, impl_style):
+    if impl_style == "rtl" and mode == "cppsim":
+        pytest.skip("rtl implstyle has no cppsim, skipping")
     if num_ch % simd != 0:
         pytest.skip(" num_ch % simd != 0, skipping")
 
@@ -123,19 +125,15 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, pad_style, idt, mode):
     pad_h = pad[0] + pad[2]
     pad_w = pad[1] + pad[3]
 
-    if idim_h == idim_w and pad_h != pad_w:
-        pytest.skip(
-            """Only equal padding along the dimensions for square images
-            is supported, skipping"""
-        )
-
     # generate input data
     x = gen_finn_dt_tensor(idt, [1, idim_h, idim_w, num_ch])
     input_dict = {"inp": x}
     odim_h = idim_h + pad_h
     odim_w = idim_w + pad_w
 
-    model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, simd, idt, pad_style)
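+    # select the custom op corresponding to the chosen implementation style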
+    optype = {"hls": "FMPadding_Batch", "rtl": "FMPadding_rtl"}[impl_style]
+
+    model = make_single_fmpadding_modelwrapper(optype, idim, pad, num_ch, simd, idt)
     model = model.transform(InferShapes())
     model = model.transform(SetExecMode(mode))
     model = model.transform(GiveUniqueNodeNames())
@@ -146,36 +144,19 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, pad_style, idt, mode):
         model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
         model = model.transform(HLSSynthIP())
         model = model.transform(PrepareRTLSim())
+
     y_produced = oxe.execute_onnx(model, input_dict)["outp"]
     expected_oshape = (1, odim_h, odim_w, num_ch)
     assert y_produced.shape == expected_oshape
 
-    # calculate reference
-    # calculate correct pad according to parameters
-    if pad_style == 2:
-        if pad_h % 2 == 0:
-            pad_up = pad_h // 2
-        else:
-            pad_up = pad_h // 2 + 1
-        if pad_w % 2 == 0:
-            pad_left = pad_w // 2
-        else:
-            pad_left = pad_w // 2 + 1
-    else:
-        pad_up = pad_h // 2
-        pad_left = pad_w // 2
-
-    pad_down = pad_h - pad_up
-    pad_right = pad_w - pad_left
-
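+    # pad parameter is interpreted as [top, left, bottom, right]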
     y_expected = np.pad(
-        x, ((0, 0), (pad_up, pad_down), (pad_left, pad_right), (0, 0)), "constant"
+        x, ((0, 0), (pad[0], pad[2]), (pad[1], pad[3]), (0, 0)), "constant"
     )
 
     assert (y_produced == y_expected).all()
 
     if mode == "rtlsim":
-        node = model.get_nodes_by_op_type("FMPadding_Batch")[0]
+        node = model.get_nodes_by_op_type(optype)[0]
         inst = getCustomOp(node)
         cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
         exp_cycles_dict = model.analysis(exp_cycles_per_layer)
diff --git a/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py
index a37e6e3271a9f7e033e6beaa6dbed01271365101..a2c3d09a55f81dc5e9d5ae1819cd8ea6b7df1e27 100644
--- a/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py
+++ b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py
@@ -34,7 +34,7 @@ from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
@@ -65,7 +65,7 @@ def make_accpool_modelwrapper(ch, pe, idim, idt):
         nodes=[accpool_node], name="graph", inputs=[inp], outputs=[outp]
     )
 
-    model = helper.make_model(graph, producer_name="thresholding-model")
+    model = qonnx_make_model(graph, producer_name="thresholding-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
diff --git a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
index 80f2d724ad7ccbf563c23076155313bad1ecb336..b220338e6919e8eeaeef0f6e5343fed9b1dfca10 100644
--- a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
+++ b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py
@@ -36,7 +36,7 @@ from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
 from qonnx.transformation.infer_data_layouts import InferDataLayouts
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 from finn.core.onnx_exec import execute_onnx
 from finn.transformation.fpgadataflow.create_dataflow_partition import (
@@ -100,7 +100,7 @@ def create_one_fc_model(mem_mode="const"):
         nodes=[fc0], name="fclayer_graph", inputs=[inp], outputs=[outp]
     )
 
-    model = helper.make_model(graph, producer_name="fclayer-model")
+    model = qonnx_make_model(graph, producer_name="fclayer-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
@@ -177,7 +177,7 @@ def create_two_fc_model(mem_mode="decoupled"):
         value_info=[mid],
     )
 
-    model = helper.make_model(graph, producer_name="fclayer-model")
+    model = qonnx_make_model(graph, producer_name="fclayer-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
@@ -348,6 +348,7 @@ def test_fpgadataflow_ipstitch_vitis_end2end(board, period_ns, extw):
         model = load_test_checkpoint_or_skip(sdp_node.get_nodeattr("model"))
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(PrepareIP(fpga_part, period_ns))
+    model = model.transform(HLSSynthIP())
     model = model.transform(VitisBuild(fpga_part, period_ns, platform))
     model.save(ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_vitis.onnx")
     assert model.get_metadata_prop("platform") == "alveo"
diff --git a/tests/fpgadataflow/test_fpgadataflow_labelselect.py b/tests/fpgadataflow/test_fpgadataflow_labelselect.py
index a9b98ecaf80b4c86fc1e9ccec23e6d97c5982f55..553f263ba2e004233011db90feabea057d88026a 100644
--- a/tests/fpgadataflow/test_fpgadataflow_labelselect.py
+++ b/tests/fpgadataflow/test_fpgadataflow_labelselect.py
@@ -33,7 +33,7 @@ from onnx import TensorProto, helper
 from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.transformation.general import GiveUniqueNodeNames
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
@@ -67,7 +67,7 @@ def make_labelselect_modelwrapper(labels, pe, k, idt):
         outputs=[outp],
     )
 
-    model = helper.make_model(graph, producer_name="thresholding-model")
+    model = qonnx_make_model(graph, producer_name="thresholding-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py
index d1895a12675dce69070d280381a9982060e20c21..b80ef76a19e487a93b23ae7db17350e85fb66822 100644
--- a/tests/fpgadataflow/test_fpgadataflow_mvau.py
+++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py
@@ -36,12 +36,17 @@ from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.general.multithreshold import multithreshold
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
-from qonnx.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor
+from qonnx.util.basic import (
+    calculate_signed_dot_prod_range,
+    gen_finn_dt_tensor,
+    qonnx_make_model,
+)
 
 import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.derive_characteristic import DeriveCharacteristic
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
@@ -105,7 +110,7 @@ def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=Non
         nodes=[FCLayer_node], name="fclayer_graph", inputs=[inp], outputs=[outp]
     )
 
-    model = helper.make_model(graph, producer_name="fclayer-model")
+    model = qonnx_make_model(graph, producer_name="fclayer-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
@@ -417,3 +422,67 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim(
     exp_cycles = exp_cycles_dict[node.name]
     assert np.isclose(exp_cycles, cycles_rtlsim, atol=15)
     assert exp_cycles != 0
+
+
+# mem_mode: const or decoupled
+@pytest.mark.parametrize("mem_mode", ["decoupled", "const"])
+# activation: None or DataType
+@pytest.mark.parametrize("act", [DataType["INT4"]])
+# weight datatype
+@pytest.mark.parametrize("wdt", [DataType["INT4"]])
+# input datatype
+@pytest.mark.parametrize("idt", [DataType["INT4"]])
+# neuron folding, -1 is maximum possible
+@pytest.mark.parametrize("nf", [8])
+# synapse folding, -1 is maximum possible
+@pytest.mark.parametrize("sf", [8])
+# HLS matrix width (input features)
+@pytest.mark.parametrize("mw", [32])
+# HLS matrix height (output features)
+@pytest.mark.parametrize("mh", [32])
+@pytest.mark.fpgadataflow
+@pytest.mark.vivado
+def test_fclayer_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
+    if nf == -1:
+        nf = mh
+    if sf == -1:
+        sf = mw
+    pe = mh // nf
+    simd = mw // sf
+    assert mh % pe == 0
+    assert mw % sf == 0
+    # generate weights
+    W = gen_finn_dt_tensor(wdt, (mw, mh))
+
+    # no activation, produce accumulators
+    T = None
+    tdt = None
+    if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+        odt = DataType["UINT32"]
+    else:
+        odt = DataType["INT32"]
+
+    model = make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T, tdt)
+    for node in model.graph.node:
+        # lookup op_type in registry of CustomOps
+        inst = getCustomOp(node)
+        inst.set_nodeattr("mem_mode", mem_mode)
+    total_fold = nf * sf
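+    # fully folded layer needs nf * sf cycles per inference, plus a small latency margin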
+    exp_total_cycles = total_fold + 10
+    model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(PrepareIP("xc7z020clg400-1", 5))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(PrepareRTLSim())
+    model = model.transform(DeriveCharacteristic(exp_total_cycles))
+    node_inst = getCustomOp(model.graph.node[0])
+    period_attr = node_inst.get_nodeattr("io_chrc_period")
+    assert period_attr == exp_total_cycles
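+    # io_chrc_in/out hold cumulative token counts per cycle over a 2 * period window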
+    chrc_in = node_inst.get_nodeattr("io_chrc_in")
+    chrc_out = node_inst.get_nodeattr("io_chrc_out")
+    assert chrc_in.shape == (1, 2 * exp_total_cycles)
+    assert chrc_out.shape == (1, 2 * exp_total_cycles)
+    # first sf cycles should read input continuously
+    assert (chrc_in[0, :sf] == range(1, sf + 1)).all()
+    # all outputs should be produced within the expected number of cycles
+    assert chrc_out[0, exp_total_cycles] == nf
diff --git a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py
index e3c79fa44fb57718d359b58d1a8716746f6668fb..b3cf7b4229c39f27c7f3689ef51fb7d22c7aa0f2 100644
--- a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py
+++ b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py
@@ -32,6 +32,7 @@ from onnx import TensorProto, helper
 from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.transformation.general import GiveUniqueNodeNames
+from qonnx.util.basic import qonnx_make_model
 
 from finn.analysis.fpgadataflow.res_estimation import (
     res_estimation,
@@ -87,7 +88,7 @@ def test_res_estimate():
         nodes=[FCLayer_node], name="fclayer_graph", inputs=[inp], outputs=[outp]
     )
 
-    model = helper.make_model(graph, producer_name="fclayer-model")
+    model = qonnx_make_model(graph, producer_name="fclayer-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
diff --git a/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py b/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py
index a3968cf79704092ffb5ec53c887842372b625f4d..628721b429abadf198126a2f5801178f2f710033 100644
--- a/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py
+++ b/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py
@@ -35,7 +35,7 @@ from qonnx.custom_op.general.maxpoolnhwc import compute_pool_output_dim
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
 from qonnx.transformation.infer_shapes import InferShapes
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
@@ -74,7 +74,7 @@ def make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt, ceil_
         nodes=[mp_node], name="mp_graph", inputs=[inp], outputs=[outp]
     )
 
-    model = helper.make_model(graph, producer_name="mp-model")
+    model = qonnx_make_model(graph, producer_name="mp-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
index 706679b6809844d0b2924411440088ea892ba7a9..96cd69c3453793c1634f132cb159f0cc8a94a28c 100644
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
@@ -37,7 +37,7 @@ from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.general.multithreshold import multithreshold
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
@@ -93,7 +93,7 @@ def make_single_thresholding_modelwrapper(
         outputs=[outp],
     )
 
-    model = helper.make_model(graph, producer_name="thresholding-model")
+    model = qonnx_make_model(graph, producer_name="thresholding-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
diff --git a/tests/fpgadataflow/test_fpgadataflow_upsampler.py b/tests/fpgadataflow/test_fpgadataflow_upsampler.py
index d1ef0b890a66524b7cbd055a413561961ebcb4a7..a08d31f7b05184a4d5c84ef927a05fe1fd6e43c3 100644
--- a/tests/fpgadataflow/test_fpgadataflow_upsampler.py
+++ b/tests/fpgadataflow/test_fpgadataflow_upsampler.py
@@ -30,6 +30,7 @@ import pytest
 
 import numpy as np
 import os
+import shutil
 import torch
 from brevitas.export import FINNManager
 from qonnx.core.datatype import DataType
@@ -51,6 +52,7 @@ from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.util.basic import make_build_dir
 
 tmpdir = os.environ["FINN_BUILD_DIR"]
 
@@ -117,7 +119,7 @@ class PyTorchTestModel(nn.Module):
 
 # param datatype
 @pytest.mark.parametrize("dt", [DataType["INT8"]])
-# Width/height of square input feature map
+# spatial dimension of the input feature map
 @pytest.mark.parametrize("IFMDim", [3, 5])
 # upscaling factor
 @pytest.mark.parametrize("scale", [2, 3])
@@ -125,14 +127,22 @@ class PyTorchTestModel(nn.Module):
 @pytest.mark.parametrize("NumChannels", [4])
 # execution mode
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+# whether to test a 1D or a 2D (square) input feature map
+@pytest.mark.parametrize("is_1d", [False, True])
 @pytest.mark.fpgadataflow
 @pytest.mark.vivado
 @pytest.mark.slow
-def test_fpgadataflow_upsampler(dt, IFMDim, scale, NumChannels, exec_mode):
+def test_fpgadataflow_upsampler(dt, IFMDim, scale, NumChannels, exec_mode, is_1d):
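+    # use a fresh per-test build dir, cleaned up at the end of the test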
+    tmpdir = make_build_dir("upsample_export_")
     atol = 1e-3
+    if is_1d:
+        input_shape = (1, NumChannels, IFMDim, 1)
+        upscale_factor = (scale, 1)
+    else:
+        input_shape = (1, NumChannels, IFMDim, IFMDim)
+        upscale_factor = (scale, scale)
     # Create the test model and inputs for it
-    torch_model = PyTorchTestModel(upscale_factor=scale)
-    input_shape = (1, NumChannels, IFMDim, IFMDim)
+    torch_model = PyTorchTestModel(upscale_factor=upscale_factor)
     test_in = torch.arange(0, np.prod(np.asarray(input_shape)))
     # Limit the input to values valid for the given datatype
     test_in %= dt.max() - dt.min() + 1
@@ -200,3 +210,4 @@ def test_fpgadataflow_upsampler(dt, IFMDim, scale, NumChannels, exec_mode):
         assert output_matches, "Cppsim output doesn't match ONNX/PyTorch."
     elif exec_mode == "rtlsim":
         assert output_matches, "Rtlsim output doesn't match ONNX/PyTorch."
+    shutil.rmtree(tmpdir, ignore_errors=True)
diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau.py b/tests/fpgadataflow/test_fpgadataflow_vvau.py
index c48448787d8a3bb926c1e94850be6e99e8c106d3..abf8ba0b9efde67c77711abc8451475887430cae 100644
--- a/tests/fpgadataflow/test_fpgadataflow_vvau.py
+++ b/tests/fpgadataflow/test_fpgadataflow_vvau.py
@@ -35,7 +35,7 @@ from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.general.multithreshold import multithreshold
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
@@ -75,7 +75,19 @@ def _calculate_dot_prod_range(dt_a, dt_b, len):
 
 
 def _make_single_vvau_modelwrapper(
-    W, pe, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T=None, tdt=None
+    W,
+    pe,
+    k_h,
+    k_w,
+    channels,
+    dim_h,
+    dim_w,
+    wdt,
+    idt,
+    odt,
+    T=None,
+    tdt=None,
+    mem_mode="const",
 ):
     in_shape = [1, dim_h, dim_w, k_h * k_w * channels]  # [N, H, W, K*K*CH]
     out_shape = [
@@ -113,13 +125,14 @@ def _make_single_vvau_modelwrapper(
         weightDataType=wdt.name,
         outputDataType=odt.name,
         noActivation=no_act,
+        mem_mode=mem_mode,
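+        # "const" bakes weights into the IP, "decoupled" streams them from a separate memory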
     )
 
     graph = helper.make_graph(
         nodes=[VVAU_node], name="vvau_graph", inputs=[inp], outputs=[outp]
     )
 
-    model = helper.make_model(graph, producer_name="vvau-model")
+    model = qonnx_make_model(graph, producer_name="vvau-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
@@ -140,7 +153,7 @@ def prepare_inputs(input_tensor):
     return {"inp": input_tensor}
 
 
-# mem_mode: const or decoupled
+# input datatype
 @pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]])
 # weight datatype
 @pytest.mark.parametrize("wdt", [DataType["INT4"]])
@@ -156,13 +169,15 @@ def prepare_inputs(input_tensor):
 @pytest.mark.parametrize("k_w", [3, 1])
 # Number of input and output channels
 @pytest.mark.parametrize("channels", [3, 4])
+# memory mode
+@pytest.mark.parametrize("mem_mode", ["const", "decoupled"])
 # execution mode
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
 @pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
 def test_fpgadataflow_vvau(
-    idt, wdt, act, pe, dim_h, dim_w, k_h, k_w, channels, exec_mode
+    idt, wdt, act, pe, dim_h, dim_w, k_h, k_w, channels, mem_mode, exec_mode
 ):
     if pe == "channels":
         pe = channels
@@ -198,7 +213,7 @@ def test_fpgadataflow_vvau(
         tdt = DataType["INT32"]
 
     model = _make_single_vvau_modelwrapper(
-        W, pe, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T, tdt
+        W, pe, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T, tdt, mem_mode
     )
 
     if exec_mode == "cppsim":
diff --git a/tests/fpgadataflow/test_set_folding.py b/tests/fpgadataflow/test_set_folding.py
index 8ea0e18f2cace10b6fefae50ce1e28845ab24050..5355dd7044343d9dbb077225b5b8786eb7fdfe32 100644
--- a/tests/fpgadataflow/test_set_folding.py
+++ b/tests/fpgadataflow/test_set_folding.py
@@ -34,6 +34,7 @@ from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.general import GiveUniqueNodeNames
+from qonnx.util.basic import qonnx_make_model
 
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 from finn.transformation.fpgadataflow.create_dataflow_partition import (
@@ -91,7 +92,7 @@ def make_multi_fclayer_model(ch, wdt, adt, tdt, nnodes):
         outputs=[tensors[-1]],
     )
 
-    model = helper.make_model(graph, producer_name="fclayer-model")
+    model = qonnx_make_model(graph, producer_name="fclayer-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", adt)
diff --git a/tests/fpgadataflow/test_split_large_fifos.py b/tests/fpgadataflow/test_split_large_fifos.py
new file mode 100644
index 0000000000000000000000000000000000000000..85b4a2bfa8dc0de3cbdd0ca34ec5b1ee68f37acf
--- /dev/null
+++ b/tests/fpgadataflow/test_split_large_fifos.py
@@ -0,0 +1,128 @@
+# Copyright (C) 2022, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of Xilinx nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import pytest
+
+import json
+import shutil
+from brevitas.export.onnx.generic.manager import BrevitasONNXManager
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.registry import getCustomOp
+
+import finn.builder.build_dataflow as build
+import finn.builder.build_dataflow_config as build_cfg
+from finn.transformation.fpgadataflow.set_fifo_depths import get_fifo_split_configs
+from finn.util.basic import make_build_dir
+from finn.util.test import get_trained_network_and_ishape
+
+
+def fetch_test_model(topology, wbits=2, abits=2):
+    tmp_output_dir = make_build_dir("build_fifosizing_%s_" % topology)
+    (model, ishape) = get_trained_network_and_ishape(topology, wbits, abits)
+    chkpt_name = tmp_output_dir + "/model.onnx"
+    BrevitasONNXManager.export(model, ishape, chkpt_name)
+    return tmp_output_dir
+
+
+def get_folding_cfg(depth=65536):
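+    # assign the same depth to the first three StreamingFIFO nodes of the test network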
+    cfg = dict()
+    cfg["Defaults"] = dict()
+    for i in range(3):
+        key = "StreamingFIFO_" + str(i)
+        cfg[key] = {"depth": depth, "ram_style": "auto", "impl_style": "vivado"}
+    return cfg
+
+
+@pytest.mark.slow
+@pytest.mark.vivado
+@pytest.mark.fpgadataflow
+@pytest.mark.parametrize("depth", [16384, 65536, 45000])
+@pytest.mark.parametrize("force_python_rtlsim", ["True", "False"])
+def test_split_large_fifos(depth, force_python_rtlsim):
+    tmp_output_dir = fetch_test_model("tfc")
+    folding_cfg = get_folding_cfg(depth)
+    with open(tmp_output_dir + "/folding_config.json", "w") as f:
+        json.dump(folding_cfg, f, indent=2)
+    cfg = build_cfg.DataflowBuildConfig(
+        output_dir=tmp_output_dir,
+        auto_fifo_depths=False,
+        split_large_fifos=True,
+        folding_config_file=tmp_output_dir + "/folding_config.json",
+        target_fps=10000,
+        force_python_rtlsim=force_python_rtlsim,
+        synth_clk_period_ns=10.0,
+        board="Pynq-Z1",
+        rtlsim_batch_size=100,
+        shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ,
+        generate_outputs=[
+            build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
+            build_cfg.DataflowOutputType.STITCHED_IP,
+            build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE,
+        ],
+        default_mem_mode=build_cfg.ComputeEngineMemMode.DECOUPLED,
+    )
+    build.build_dataflow_cfg(tmp_output_dir + "/model.onnx", cfg)
+    with open(tmp_output_dir + "/report/estimate_network_performance.json") as f:
+        est_data = json.load(f)
+    with open(tmp_output_dir + "/report/rtlsim_performance.json") as f:
+        sim_data = json.load(f)
+    assert (
+        float(sim_data["throughput[images/s]"])
+        / float(est_data["estimated_throughput_fps"])
+        > 0.9
+    )
+    model = ModelWrapper(
+        tmp_output_dir + "/intermediate_models/step_set_fifo_depths.onnx"
+    )
+    # exclude final FIFO node (output FIFO, not part of test)
+    fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO")[:-1]
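+    # expected (depth, impl_style) pairs that the requested depth should split into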
+    golden_cfg = get_fifo_split_configs(depth, 256, 32768)
+    for i, fifo_node in enumerate(fifo_nodes):
+        inst = getCustomOp(fifo_node)
+        fifo_depth = inst.get_nodeattr("depth")
+        assert fifo_depth == golden_cfg[i % len(golden_cfg)][0]
+
+    shutil.rmtree(tmp_output_dir)
+
+
+def test_split_large_fifo_configs():
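+    # depths split into power-of-two "vivado" chunks plus "rtl" chunks of at most 256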
+    ret0 = get_fifo_split_configs(513, 256, 32768)
+    assert ret0 == [(512, "vivado"), (1, "rtl")]
+    ret1 = get_fifo_split_configs(1200, 256, 32768)
+    assert ret1 == [(1024, "vivado"), (176, "rtl")]
+    ret2 = get_fifo_split_configs(45000, 256, 32768)
+    assert ret2 == [
+        (32768, "vivado"),
+        (8192, "vivado"),
+        (2048, "vivado"),
+        (1024, "vivado"),
+        (512, "vivado"),
+        (256, "rtl"),
+        (200, "rtl"),
+    ]
diff --git a/tests/transformation/streamline/test_absorb_mul_into_topk.py b/tests/transformation/streamline/test_absorb_mul_into_topk.py
index a6dff788dc58dba45536a280c7fe5f5c53edc4e1..89ef74e0b3f83fc092268ad2582c533e47eab618 100644
--- a/tests/transformation/streamline/test_absorb_mul_into_topk.py
+++ b/tests/transformation/streamline/test_absorb_mul_into_topk.py
@@ -34,6 +34,7 @@ from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNode
 from qonnx.transformation.infer_datatypes import InferDataTypes
 from qonnx.transformation.infer_shapes import InferShapes
 from qonnx.transformation.insert_topk import InsertTopK
+from qonnx.util.basic import qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.streamline.absorb import AbsorbScalarMulAddIntoTopK
@@ -65,7 +66,7 @@ def test_absorb_mul_into_topk(mul_positive, scalar):
         value_info=[a0, b0, c0],
     )
 
-    model = helper.make_model(mul_graph, producer_name="mul_model")
+    model = qonnx_make_model(mul_graph, producer_name="mul_model")
     model = ModelWrapper(model)
     # initialize values
     # for mul
diff --git a/tests/transformation/streamline/test_absorb_opposite_transposes.py b/tests/transformation/streamline/test_absorb_opposite_transposes.py
index 51ea5edfc420bf935de3e196df1b150934782a91..6d8d2b9f0cd4ad28c3ea0922d69b9b963a0deb08 100644
--- a/tests/transformation/streamline/test_absorb_opposite_transposes.py
+++ b/tests/transformation/streamline/test_absorb_opposite_transposes.py
@@ -29,8 +29,7 @@
 import pytest
 
 import numpy as np
-import onnx.helper as oh
-from onnx import TensorProto
+import onnx.parser as oprs
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.transformation.infer_shapes import InferShapes
 
@@ -41,39 +40,42 @@ from finn.transformation.streamline.absorb import AbsorbConsecutiveTransposes
 @pytest.mark.streamline
 def test_absorb_opposite_transposes():
     np.random.seed(0)
-    input_shape = [1, 3, 4, 2]
-    top_in = oh.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape)
-    top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, input_shape)
-    value_info = [oh.make_tensor_value_info("add_param_0", TensorProto.FLOAT, [1])]
-    value_info += [oh.make_tensor_value_info("add_param_1", TensorProto.FLOAT, [1])]
-    value_info += [oh.make_tensor_value_info("mul_param_0", TensorProto.FLOAT, [1])]
-    modelproto = oh.make_model(
-        oh.make_graph(
-            name="test",
-            inputs=[top_in],
-            outputs=[top_out],
-            value_info=value_info,
-            nodes=[
-                oh.make_node("Add", ["top_in", "add_param_0"], ["t0"]),
-                oh.make_node("Transpose", ["t0"], ["t1"], perm=[0, 2, 3, 1]),
-                oh.make_node("Transpose", ["t1"], ["t2"], perm=[0, 3, 1, 2]),
-                oh.make_node("Add", ["t2", "add_param_1"], ["t3"]),
-                oh.make_node("Transpose", ["t3"], ["t4"], perm=[0, 2, 3, 1]),
-                oh.make_node("Transpose", ["t4"], ["t5"], perm=[0, 3, 1, 2]),
-                oh.make_node("Add", ["t5", "t2"], ["t6"]),
-                oh.make_node("Mul", ["t6", "mul_param_0"], ["top_out"]),
-            ],
-        )
-    )
-    model = ModelWrapper(modelproto)
+    shp = [1, 3, 4, 2]
+    shp_str = str(shp)
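+    # textual ONNX graph with several opposite transpose pairs, including a forked one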
+    input = f"""
+    <
+        ir_version: 7,
+        opset_import: ["" : 9]
+    >
+    agraph (float{shp_str} in0) => (float{shp_str} out0)
+    <
+        float[1] add0_param = {{1.0}},
+        float[1] add1_param = {{3.0}},
+        float[1] mul0_param = {{2.0}}
+    >
+    {{
+        add0_out = Add(in0, add0_param)
+        t0_out = Transpose<perm=[0,2,3,1]>(add0_out)
+        t1_out = Transpose<perm=[0,3,1,2]>(t0_out)
+        add1_out = Add(t1_out, add1_param)
+        t2_out = Transpose<perm=[0,2,3,1]>(add1_out)
+        t3_out = Transpose<perm=[0,3,1,2]>(t2_out)
+        add2_out = Add(t1_out, t3_out)
+        t4_out = Transpose<perm=[0,2,3,1]>(add2_out)
+        t5_out = Transpose<perm=[0,3,1,2]>(t4_out)
+        t6_out = Transpose<perm=[0,3,1,2]>(t4_out)
+        m0_out = Mul(t5_out, mul0_param)
+        m1_out = Mul(t6_out, mul0_param)
+        out0 = Mul(m0_out, m1_out)
+    }}
+    """
+    model = oprs.parse_model(input)
+    model = ModelWrapper(model)
     model = model.transform(InferShapes())
-    model.set_initializer("add_param_0", np.asarray([1], dtype=np.float32))
-    model.set_initializer("add_param_1", np.asarray([3], dtype=np.float32))
-    model.set_initializer("mul_param_0", np.asarray([2], dtype=np.float32))
     new_model = model.transform(AbsorbConsecutiveTransposes())
     new_model = new_model.transform(InferShapes())
-    inp_dict = {"top_in": np.random.rand(*input_shape).astype(np.float32)}
+    inp_dict = {"top_in": np.random.rand(*shp).astype(np.float32)}
     assert ox.compare_execution(model, model, inp_dict)
-    assert len(new_model.graph.node) == 4
+    assert len(new_model.graph.node) == 6
     for n in new_model.graph.node:
         assert new_model.graph.node[0].op_type != "Transpose"
diff --git a/tests/transformation/streamline/test_absorb_transp_into_flatten.py b/tests/transformation/streamline/test_absorb_transp_into_flatten.py
index 1358d468c04c3edf08b11e7e9b858dda58965368..44b0c1d7e04447f13043cb326047a7b8d69469dd 100644
--- a/tests/transformation/streamline/test_absorb_transp_into_flatten.py
+++ b/tests/transformation/streamline/test_absorb_transp_into_flatten.py
@@ -8,6 +8,7 @@ from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNode
 from qonnx.transformation.infer_data_layouts import InferDataLayouts
 from qonnx.transformation.infer_datatypes import InferDataTypes
 from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.streamline.absorb import AbsorbTransposeIntoFlatten
@@ -45,7 +46,7 @@ def test_absorb_transp_into_flatten(perm, shape, ishape, data_layout):
         outputs=[outp],
     )
 
-    model = helper.make_model(graph, producer_name="absorb_transpose_model")
+    model = qonnx_make_model(graph, producer_name="absorb_transpose_model")
     model = ModelWrapper(model)
     if shape is not None:
         model.graph.value_info.append(shape0)
diff --git a/tests/transformation/streamline/test_collapse_repeated_op.py b/tests/transformation/streamline/test_collapse_repeated_op.py
index 268e0ffc5c5cb342634ff51ac8fe02157ae8c7c6..c1d3ee00883b84ec2a8c18d093b1756a4d6aea36 100644
--- a/tests/transformation/streamline/test_collapse_repeated_op.py
+++ b/tests/transformation/streamline/test_collapse_repeated_op.py
@@ -33,6 +33,7 @@ import onnx.helper as oh
 from onnx import TensorProto
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import qonnx_make_model
 
 import finn.core.onnx_exec as ox
 from finn.transformation.streamline import CollapseRepeatedAdd, CollapseRepeatedMul
@@ -46,7 +47,7 @@ def test_collapse_repeated_op():
     add_param_1 = oh.make_tensor_value_info("add_param_1", TensorProto.FLOAT, [2])
     mul_param_1 = oh.make_tensor_value_info("mul_param_1", TensorProto.FLOAT, [2])
     top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, [2])
-    modelproto = oh.make_model(
+    modelproto = qonnx_make_model(
         oh.make_graph(
             name="test",
             inputs=[top_in],
@@ -96,7 +97,7 @@ def test_collapse_repeated_only_if_linear(test_args):
     value_info += [oh.make_tensor_value_info("p4", TensorProto.FLOAT, [1])]
     value_info += [oh.make_tensor_value_info("p5", TensorProto.FLOAT, [1])]
 
-    modelproto = oh.make_model(
+    modelproto = qonnx_make_model(
         oh.make_graph(
             name="test",
             inputs=[top_in],
diff --git a/tests/transformation/streamline/test_factor_out_mul_sign_magnitude.py b/tests/transformation/streamline/test_factor_out_mul_sign_magnitude.py
index 04ab9bf0b9c092bdf2c2a6c6268974fd78020eee..89596a1c0f4af4b95e19f3b6aba19e7f459aa7df 100644
--- a/tests/transformation/streamline/test_factor_out_mul_sign_magnitude.py
+++ b/tests/transformation/streamline/test_factor_out_mul_sign_magnitude.py
@@ -33,6 +33,7 @@ import onnx.helper as oh
 from onnx import TensorProto
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import qonnx_make_model
 
 import finn.core.onnx_exec as ox
 from finn.transformation.streamline import FactorOutMulSignMagnitude
@@ -43,7 +44,7 @@ def test_factor_out_mul_sign_magnitude():
     top_in = oh.make_tensor_value_info("top_in", TensorProto.FLOAT, [1, 2])
     mul_param = oh.make_tensor_value_info("mul_param", TensorProto.FLOAT, [1, 2])
     top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, [1, 2])
-    modelproto = oh.make_model(
+    modelproto = qonnx_make_model(
         oh.make_graph(
             name="test",
             inputs=[top_in],
diff --git a/tests/transformation/streamline/test_linear_past_eltwise.py b/tests/transformation/streamline/test_linear_past_eltwise.py
index 12633d750bb405757efca0c028dece92b289b472..4e5dcd63862b61f5575d8adf2cbb69912ee726d7 100644
--- a/tests/transformation/streamline/test_linear_past_eltwise.py
+++ b/tests/transformation/streamline/test_linear_past_eltwise.py
@@ -35,6 +35,7 @@ from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.transformation.fold_constants import FoldConstants
 from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
 from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.streamline.reorder import MoveLinearPastEltwiseAdd
@@ -78,7 +79,7 @@ def make_model(shape):
         outputs=[outp],
     )
 
-    model = helper.make_model(graph, producer_name="add-model")
+    model = qonnx_make_model(graph, producer_name="add-model")
     model = ModelWrapper(model)
 
     # set initializers for scalar add/mul nodes
@@ -156,7 +157,7 @@ def test_linear_past_eltwise_add_multiple_forks(ch, ifmdim):
             helper.make_tensor_value_info("p" + str(i), TensorProto.FLOAT, input_shape)
         ]
 
-    modelproto = helper.make_model(
+    modelproto = qonnx_make_model(
         helper.make_graph(
             name="test",
             inputs=[top_in],
diff --git a/tests/transformation/streamline/test_maxpool_nhwc.py b/tests/transformation/streamline/test_maxpool_nhwc.py
index aa77b5cf1a6e77d67ff8351ca5f544a63eb47f29..d61eedaaf5d1f10e64712d5282190b67f56acb49 100644
--- a/tests/transformation/streamline/test_maxpool_nhwc.py
+++ b/tests/transformation/streamline/test_maxpool_nhwc.py
@@ -7,7 +7,7 @@ from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.general.maxpoolnhwc import compute_pool_output_dim
 from qonnx.transformation.infer_shapes import InferShapes
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.streamline.reorder import MakeMaxPoolNHWC
@@ -56,7 +56,7 @@ def create_maxpool(ifm_dim, ifm_ch, kernel_shape, pads, strides, ceil_mode, idt)
         value_info=[outp_mp],
     )
 
-    model = oh.make_model(graph, producer_name="maxpool_model")
+    model = qonnx_make_model(graph, producer_name="maxpool_model")
     model = ModelWrapper(model)
     model.set_tensor_datatype("inp", idt)
     model.set_tensor_datatype("outp", idt)
diff --git a/tests/transformation/streamline/test_move_add_past_mul.py b/tests/transformation/streamline/test_move_add_past_mul.py
index 0fb4dd9f7a116d0d52578d7222421f251ac17ec1..ea9c2a954d2bd7b4a4be421c1869d4a8dd8f0cf1 100644
--- a/tests/transformation/streamline/test_move_add_past_mul.py
+++ b/tests/transformation/streamline/test_move_add_past_mul.py
@@ -33,6 +33,7 @@ import onnx.helper as oh
 from onnx import TensorProto
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import qonnx_make_model
 
 import finn.core.onnx_exec as ox
 from finn.transformation.streamline import MoveAddPastMul
@@ -44,7 +45,7 @@ def test_move_add_past_mul_single():
     add_param = oh.make_tensor_value_info("add_param", TensorProto.FLOAT, [2])
     mul_param = oh.make_tensor_value_info("mul_param", TensorProto.FLOAT, [2])
     top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, [2])
-    modelproto = oh.make_model(
+    modelproto = qonnx_make_model(
         oh.make_graph(
             name="test",
             inputs=[top_in],
@@ -76,7 +77,7 @@ def test_move_add_past_mul_multi():
     add_param_1 = oh.make_tensor_value_info("add_param_1", TensorProto.FLOAT, [2])
     mul_param_1 = oh.make_tensor_value_info("mul_param_1", TensorProto.FLOAT, [2])
     top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, [2])
-    modelproto = oh.make_model(
+    modelproto = qonnx_make_model(
         oh.make_graph(
             name="test",
             inputs=[top_in],
@@ -116,7 +117,7 @@ def test_move_add_past_mul_only_if_linear():
     value_info += [oh.make_tensor_value_info("mul1_param", TensorProto.FLOAT, [1])]
     value_info += [oh.make_tensor_value_info("mul2_param", TensorProto.FLOAT, [1])]
     value_info += [oh.make_tensor_value_info("mul3_param", TensorProto.FLOAT, [1])]
-    modelproto = oh.make_model(
+    modelproto = qonnx_make_model(
         oh.make_graph(
             name="test",
             inputs=[top_in],
diff --git a/tests/transformation/streamline/test_move_chw_add_past_conv.py b/tests/transformation/streamline/test_move_chw_add_past_conv.py
index 7eb7f9f1af67efa1a6934157b9c2b3f8a6a814c2..e1b324a798a23b5f4a6878f5e2b27434a61fe8f8 100644
--- a/tests/transformation/streamline/test_move_chw_add_past_conv.py
+++ b/tests/transformation/streamline/test_move_chw_add_past_conv.py
@@ -33,6 +33,7 @@ from onnx import TensorProto, helper
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.general.im2col import compute_conv_output_dim
 from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.streamline.reorder import MoveAddPastConv
@@ -72,7 +73,7 @@ def test_move_chw_add_past_conv(idim, k, s, ich, och):
     add_node = helper.make_node("Add", ["inp", "a0"], ["add_out"])
     conv_node = helper.make_node("Conv", ["add_out", "a1"], ["outp"], **conv_config)
 
-    model = helper.make_model(
+    model = qonnx_make_model(
         helper.make_graph(
             nodes=[add_node, conv_node],
             name="move-add-graph",
diff --git a/tests/transformation/streamline/test_move_flatten_past_affine.py b/tests/transformation/streamline/test_move_flatten_past_affine.py
index 8c3f71d1f35de1b03fb33e53e41599fae7e02304..22c5e19fac700e147a36f74f10dad10614d47992 100644
--- a/tests/transformation/streamline/test_move_flatten_past_affine.py
+++ b/tests/transformation/streamline/test_move_flatten_past_affine.py
@@ -36,7 +36,7 @@ from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNode
 from qonnx.transformation.infer_data_layouts import InferDataLayouts
 from qonnx.transformation.infer_datatypes import InferDataTypes
 from qonnx.transformation.infer_shapes import InferShapes
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.streamline.reorder import MoveFlattenPastAffine
@@ -74,7 +74,7 @@ def test_move_flatten_past_affine(data_layout, batch_size):
         value_info=[a0, a1, a2],
     )
 
-    model = helper.make_model(graph, producer_name="move_reshape_model")
+    model = qonnx_make_model(graph, producer_name="move_reshape_model")
     model = ModelWrapper(model)
 
     # initialize values
diff --git a/tests/transformation/streamline/test_move_flatten_past_topk.py b/tests/transformation/streamline/test_move_flatten_past_topk.py
index 83d7a28c05fbd95834e5d84ab7537ae82c285d17..82336cd3e69d865e4c36536e7e0b16f092a7033d 100644
--- a/tests/transformation/streamline/test_move_flatten_past_topk.py
+++ b/tests/transformation/streamline/test_move_flatten_past_topk.py
@@ -36,7 +36,7 @@ from qonnx.transformation.infer_data_layouts import InferDataLayouts
 from qonnx.transformation.infer_datatypes import InferDataTypes
 from qonnx.transformation.infer_shapes import InferShapes
 from qonnx.transformation.insert_topk import InsertTopK
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.streamline.reorder import MoveFlattenPastTopK
@@ -47,7 +47,7 @@ from finn.transformation.streamline.reorder import MoveFlattenPastTopK
 @pytest.mark.parametrize("data_layout", [DataLayout.NHWC, DataLayout.NCHW])
 # batch size
 @pytest.mark.parametrize("batch_size", [1, 2])
-def test_move_flatten_past_affine(data_layout, batch_size):
+def test_move_flatten_past_topk(data_layout, batch_size):
     if data_layout == DataLayout.NHWC:
         ishape = [batch_size, 1, 1, 1024]
         oshape = [batch_size, 1024]
@@ -67,7 +67,7 @@ def test_move_flatten_past_affine(data_layout, batch_size):
         outputs=[outp],
     )
 
-    model = helper.make_model(graph, producer_name="move_flatten_model")
+    model = qonnx_make_model(graph, producer_name="move_flatten_model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", DataType["INT2"])
diff --git a/tests/transformation/streamline/test_move_identical_op_past_join_op.py b/tests/transformation/streamline/test_move_identical_op_past_join_op.py
index 4986363ff4dba0b0126babdbd1f393faa2df5de3..7be97631625354297c322267792520628454c4f9 100644
--- a/tests/transformation/streamline/test_move_identical_op_past_join_op.py
+++ b/tests/transformation/streamline/test_move_identical_op_past_join_op.py
@@ -30,7 +30,7 @@ import pytest
 from onnx import TensorProto
 from onnx import helper as oh
 from qonnx.core.modelwrapper import ModelWrapper
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.streamline.reorder import MoveTransposePastJoinAdd
@@ -81,7 +81,7 @@ def create_model(perm):
         ],
     )
 
-    onnx_model = oh.make_model(graph, producer_name="test_model")
+    onnx_model = qonnx_make_model(graph, producer_name="test_model")
     model = ModelWrapper(onnx_model)
 
     return model
diff --git a/tests/transformation/streamline/test_move_maxpool_past_multithreshold.py b/tests/transformation/streamline/test_move_maxpool_past_multithreshold.py
index bf25eee9e685d2536faf5bd25bc7b1aa36700463..6126acd9e388869c34cd0c73bb64f4b6c56b4c06 100644
--- a/tests/transformation/streamline/test_move_maxpool_past_multithreshold.py
+++ b/tests/transformation/streamline/test_move_maxpool_past_multithreshold.py
@@ -32,6 +32,7 @@ from onnx import TensorProto, helper
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.transformation.infer_datatypes import InferDataTypes
 from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.streamline.reorder import MoveMaxPoolPastMultiThreshold
@@ -99,7 +100,7 @@ def test_move_maxpool_past_multithreshold():
         )
     ]
 
-    modelproto = helper.make_model(
+    modelproto = qonnx_make_model(
         helper.make_graph(
             name="test",
             inputs=[top_in],
diff --git a/tests/transformation/streamline/test_move_mul_past_dw_conv.py b/tests/transformation/streamline/test_move_mul_past_dw_conv.py
index 401631a728412e7676fa804626601cfc58b5a5e3..72a6650ec4e6b853b79c93941af84dd15a7e5c47 100644
--- a/tests/transformation/streamline/test_move_mul_past_dw_conv.py
+++ b/tests/transformation/streamline/test_move_mul_past_dw_conv.py
@@ -33,7 +33,7 @@ from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.general.im2col import compute_conv_output_dim
 from qonnx.transformation.infer_datatypes import InferDataTypes
 from qonnx.transformation.infer_shapes import InferShapes
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.streamline.reorder import MoveMulPastDWConv
@@ -94,7 +94,7 @@ def test_move_mul_past_dw_conv(ifm_dim, ifm_ch, k, stride, pad_amt, dw):
         value_info=[mul, W],
     )
 
-    model = helper.make_model(graph, producer_name="mulpastconv-model")
+    model = qonnx_make_model(graph, producer_name="mulpastconv-model")
     model = ModelWrapper(model)
     inp_values = gen_finn_dt_tensor(DataType["INT2"], [1, ifm_ch, ifm_dim, ifm_dim])
     mul_values = gen_finn_dt_tensor(DataType["INT2"], [1, ifm_ch, 1, 1])
diff --git a/tests/transformation/streamline/test_move_mul_past_maxpool.py b/tests/transformation/streamline/test_move_mul_past_maxpool.py
index fcc1b6513230c548bdcc04a40aad793b64c6faf2..3bae2905a064b8372b520a7a8083905284343429 100755
--- a/tests/transformation/streamline/test_move_mul_past_maxpool.py
+++ b/tests/transformation/streamline/test_move_mul_past_maxpool.py
@@ -34,7 +34,7 @@ from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.general.maxpoolnhwc import compute_pool_output_dim
 from qonnx.transformation.infer_datatypes import InferDataTypes
 from qonnx.transformation.infer_shapes import InferShapes
-from qonnx.util.basic import gen_finn_dt_tensor
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.streamline.reorder import MoveMulPastMaxPool
@@ -92,7 +92,7 @@ def test_move_mul_past_maxpool(ifm_dim, ifm_ch, k, stride, pad, cw, negative):
         value_info=[mul],
     )
 
-    model = helper.make_model(graph, producer_name="mulpastmaxpool-model")
+    model = qonnx_make_model(graph, producer_name="mulpastmaxpool-model")
     model = ModelWrapper(model)
     inp_values = gen_finn_dt_tensor(DataType["INT2"], [1, ifm_ch, ifm_dim, ifm_dim])
     mul_values = np.random.random_sample(mul_shape).astype(np.float32)
diff --git a/tests/transformation/streamline/test_move_past_fork.py b/tests/transformation/streamline/test_move_past_fork.py
index 5064fa3fca869a245c87cf0c1680d1357e5de60b..7e77d7f9b3502429f08c40558e330b6261d0dbad 100644
--- a/tests/transformation/streamline/test_move_past_fork.py
+++ b/tests/transformation/streamline/test_move_past_fork.py
@@ -28,80 +28,119 @@
 import pytest
 
 import numpy as np
-from onnx import TensorProto, helper
+import onnx.parser as oprs
 from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.transformation.general import GiveUniqueNodeNames
 from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import get_by_name
 
 import finn.core.onnx_exec as oxe
-from finn.transformation.streamline.reorder import MoveLinearPastFork
+from finn.transformation.streamline.reorder import (
+    MoveLinearPastFork,
+    MoveTransposePastFork,
+)
+
+
+@pytest.mark.streamline
+def test_move_past_fork_transpose():
+    shp = [1, 3, 32, 32]
+    shp_str = str(shp)
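+    # graph under test: t0_out forks into two inverse Transposes that are
+    # joined again by an Add; MoveTransposePastFork should replicate t0
+    # per branch (4 -> 5 nodes) so its output is no longer a fork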
+    input = f"""
+    <
+        ir_version: 7,
+        opset_import: ["" : 9]
+    >
+    agraph (float{shp_str} in0) => (float{shp_str} out0)
+    {{
+        t0_out = Transpose<perm=[0,2,3,1]>(in0)
+        t1_out = Transpose<perm=[0,3,1,2]>(t0_out)
+        t2_out = Transpose<perm=[0,3,1,2]>(t0_out)
+        out0 = Add(t1_out, t2_out)
+    }}
+    """
+    model = oprs.parse_model(model_txt)
+    model = ModelWrapper(model)
+    model = model.transform(InferShapes())
+    new_model = model.transform(MoveTransposePastFork())
+    new_model = new_model.transform(GiveUniqueNodeNames())
+    nodes = new_model.graph.node
+    assert oxe.compare_execution(
+        model, new_model, {"in0": np.random.rand(*shp).astype(np.float32)}
+    )
+    assert len(nodes) == 5
+    assert not new_model.is_fork_node(get_by_name(nodes, "Transpose_0"))
 
 
 @pytest.mark.streamline
 @pytest.mark.parametrize("ch", [64, 1])
 # ifmdim
 @pytest.mark.parametrize("ifmdim", [-1, 7])
-def test_move_past_fork(ch, ifmdim):
-    # generate test vectors of correct shape
+def test_move_past_fork_linear(ch, ifmdim):
     if ifmdim == -1:
-        input_shape = (1, ch)
+        shp = [1, ch]
     else:
-        input_shape = (1, ch, ifmdim, ifmdim)
+        shp = [1, ch, ifmdim, ifmdim]
+    shp_str = str(shp)
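+    # graph under test: add0_out and mul2_out start out as fork tensors;
+    # MoveLinearPastFork should replicate movable Add/Mul producers per
+    # branch until no linear fork remains (11 -> 14 nodes)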
+    input = f"""
+    <
+        ir_version: 7,
+        opset_import: ["" : 9]
+    >
+    agraph (float{shp_str} in0) => (float{shp_str} out0)
+    <
+        float{shp_str} add0_param,
+        float{shp_str} mul_shared_param,
+        float{shp_str} add2_param,
+        float{shp_str} mul2_param,
+        float{shp_str} add3_param,
+        float{shp_str} add4_param,
+        float{shp_str} mul3_param,
+        float{shp_str} add6_param
+    >
+    {{
 
-    top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape)
-    top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, input_shape)
-
-    num_of_params = 8
-    value_info = []
-    for i in range(num_of_params):
-        value_info += [
-            helper.make_tensor_value_info("p" + str(i), TensorProto.FLOAT, input_shape)
-        ]
-
-    add_1_to_move = helper.make_node("Add", ["top_in", "p0"], ["fork1"])
-    mul_1_to_move = helper.make_node("Mul", ["t5", "p4"], ["fork2"])
-    add_2_to_move = helper.make_node("Add", ["fork2", "p5"], ["t6"])
-    mul_1_not_to_move = helper.make_node("Mul", ["t8", "p7"], ["fork3"])
-    modelproto = helper.make_model(
-        helper.make_graph(
-            name="test",
-            inputs=[top_in],
-            outputs=[top_out],
-            value_info=value_info,
-            nodes=[
-                # fork1
-                add_1_to_move,
-                helper.make_node("Mul", ["fork1", "p1"], ["t2"]),
-                helper.make_node("Mul", ["fork1", "p2"], ["t3"]),
-                helper.make_node("Add", ["t2", "t3"], ["t4"]),
-                helper.make_node("Add", ["t4", "p3"], ["t5"]),
-                # fork2
-                mul_1_to_move,
-                add_2_to_move,
-                helper.make_node("Add", ["fork2", "p6"], ["t7"]),
-                helper.make_node("Add", ["t6", "t7"], ["t8"]),
-                # empty branches: do nothing
-                mul_1_not_to_move,
-                helper.make_node("Add", ["fork3", "fork3"], ["top_out"]),
-            ],
-        )
-    )
-    model = ModelWrapper(modelproto)
+        add0_out = Add(in0, add0_param)
+        mul0_out = Mul(add0_out, mul_shared_param)
+        mul1_out = Mul(add0_out, mul_shared_param)
+        add1_out = Add(mul0_out, mul1_out)
+        add2_out = Add(add1_out, add2_param)
+        mul2_out = Mul(add2_out, mul2_param)
+        add3_out = Add(mul2_out, add3_param)
+        add4_out = Add(mul2_out, add4_param)
+        add5_out = Add(add3_out, add4_out)
+        mul3_out = Mul(add5_out, mul3_param)
+        out0 = Add(mul3_out, add6_param)
+    }}
+    """
+    model = oprs.parse_model(model_txt)
+    model = ModelWrapper(model)
     model = model.transform(InferShapes())
 
     np.random.seed(0)
-    for i in range(num_of_params):
-        model.set_initializer(
-            "p" + str(i), np.random.rand(*input_shape).astype(np.float32)
-        )
-
+    for tensor_name in model.get_all_tensor_names():
+        if tensor_name.endswith("_param"):
+            pshape = model.get_tensor_shape(tensor_name)
+            model.set_initializer(
+                tensor_name, np.random.rand(*pshape).astype(np.float32)
+            )
+    model = model.transform(GiveUniqueNodeNames())
     # Transform
     new_model = model.transform(MoveLinearPastFork())
-    inp_dict = {"top_in": np.random.rand(*input_shape).astype(np.float32)}
-
+    new_model = new_model.transform(GiveUniqueNodeNames())
+    inp_dict = {"top_in": np.random.rand(*shp).astype(np.float32)}
     # Test
     assert oxe.compare_execution(model, new_model, inp_dict)
-    assert not new_model.is_fork_node(add_1_to_move)
-    assert not new_model.is_fork_node(mul_1_to_move)
-    assert not new_model.is_fork_node(add_2_to_move)
-    assert new_model.is_fork_node(mul_1_not_to_move)
+    nodes = new_model.graph.node
+    assert len(new_model.get_nodes_by_op_type("Add")) == 9
+    assert len(new_model.get_nodes_by_op_type("Mul")) == 5
+    assert not new_model.is_fork_node(get_by_name(nodes, "Add_0"))
+    assert new_model.is_join_node(get_by_name(nodes, "Add_2"))
+    assert not new_model.is_fork_node(get_by_name(nodes, "Mul_2"))
+    assert not new_model.is_join_node(get_by_name(nodes, "Add_5"))
+    assert len(nodes) == 14
diff --git a/tests/transformation/streamline/test_move_scalar_past_conv.py b/tests/transformation/streamline/test_move_scalar_past_conv.py
index 59b8b8f8b2fee99bbb77c6d354620406a108cb54..bb99fd1d8f7d48ab9ad7038d78f5352f26f2ad06 100644
--- a/tests/transformation/streamline/test_move_scalar_past_conv.py
+++ b/tests/transformation/streamline/test_move_scalar_past_conv.py
@@ -32,6 +32,7 @@ import onnx.helper as oh
 from onnx import TensorProto
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import qonnx_make_model
 
 import finn.core.onnx_exec as ox
 from finn.transformation.streamline import MoveAddPastConv, MoveScalarMulPastConv
@@ -79,7 +80,7 @@ def test_move_scalar_past_conv(test_args, padding):
     value_info += [oh.make_tensor_value_info("p2", TensorProto.FLOAT, conv_param_shape)]
     value_info += [oh.make_tensor_value_info("p3", TensorProto.FLOAT, conv_param_shape)]
 
-    modelproto = oh.make_model(
+    modelproto = qonnx_make_model(
         oh.make_graph(
             name="test",
             inputs=[top_in],
@@ -158,7 +159,7 @@ def test_move_scalar_past_conv_only_if_linear(test_args):
     value_info += [oh.make_tensor_value_info("p4", TensorProto.FLOAT, conv_param_shape)]
     value_info += [oh.make_tensor_value_info("p5", TensorProto.FLOAT, conv_param_shape)]
 
-    modelproto = oh.make_model(
+    modelproto = qonnx_make_model(
         oh.make_graph(
             name="test",
             inputs=[top_in],
diff --git a/tests/transformation/streamline/test_move_scalar_past_matmul.py b/tests/transformation/streamline/test_move_scalar_past_matmul.py
index 6fdaaadfaea5862b566fd3a8d060ac28acadf1cd..6c788294bc739332c0b9bd0e98081bcb83330b53 100644
--- a/tests/transformation/streamline/test_move_scalar_past_matmul.py
+++ b/tests/transformation/streamline/test_move_scalar_past_matmul.py
@@ -33,6 +33,7 @@ import onnx.helper as oh
 from onnx import TensorProto
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import qonnx_make_model
 
 import finn.core.onnx_exec as ox
 from finn.transformation.streamline import (
@@ -47,7 +48,7 @@ def test_move_scalar_mul_past_matmul():
     mul_param = oh.make_tensor_value_info("mul_param", TensorProto.FLOAT, [1, 1])
     matmul_param = oh.make_tensor_value_info("matmul_param", TensorProto.FLOAT, [2, 2])
     top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, [1, 2])
-    modelproto = oh.make_model(
+    modelproto = qonnx_make_model(
         oh.make_graph(
             name="test",
             inputs=[top_in],
@@ -79,7 +80,7 @@ def test_move_scalar_add_past_matmul():
     add_param = oh.make_tensor_value_info("add_param", TensorProto.FLOAT, [1, 1])
     matmul_param = oh.make_tensor_value_info("matmul_param", TensorProto.FLOAT, [2, 2])
     top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, [1, 2])
-    modelproto = oh.make_model(
+    modelproto = qonnx_make_model(
         oh.make_graph(
             name="test",
             inputs=[top_in],
@@ -122,7 +123,7 @@ def test_move_scalar_past_matmul_only_if_linear(test_args):
     p2 = oh.make_tensor_value_info("p2", TensorProto.FLOAT, matmul_shape)
     p3 = oh.make_tensor_value_info("p3", TensorProto.FLOAT, matmul_shape)
     p4 = oh.make_tensor_value_info("p4", TensorProto.FLOAT, matmul_shape)
-    modelproto = oh.make_model(
+    modelproto = qonnx_make_model(
         oh.make_graph(
             name="test",
             inputs=[top_in],
diff --git a/tests/transformation/streamline/test_move_transpose_past_scalar_mul.py b/tests/transformation/streamline/test_move_transpose_past_scalar_mul.py
index 9662ba8a908e9bb793e0c0c2b078cf26adb5cef3..6bf72961ac06331c8ce972c8ca78dea99fb3c0a0 100644
--- a/tests/transformation/streamline/test_move_transpose_past_scalar_mul.py
+++ b/tests/transformation/streamline/test_move_transpose_past_scalar_mul.py
@@ -36,6 +36,7 @@ from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNode
 from qonnx.transformation.infer_data_layouts import InferDataLayouts
 from qonnx.transformation.infer_datatypes import InferDataTypes
 from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.streamline.reorder import MoveTransposePastScalarMul
@@ -71,7 +72,7 @@ def test_move_transpose_past_scalar_mul(perm, scalar, data_layout):
         value_info=[a0],
     )
 
-    model = helper.make_model(graph, producer_name="mv_transpose_model")
+    model = qonnx_make_model(graph, producer_name="mv_transpose_model")
     model = ModelWrapper(model)
 
     # initialize values
diff --git a/tests/transformation/streamline/test_round_thresholds.py b/tests/transformation/streamline/test_round_thresholds.py
index 1ec5f02e878a540a89cc37179b2e6dd76ede882c..85c60b37d5193de7ed2f38b9da6eb2e9b35b0150 100644
--- a/tests/transformation/streamline/test_round_thresholds.py
+++ b/tests/transformation/streamline/test_round_thresholds.py
@@ -32,6 +32,7 @@ import numpy as np
 from onnx import TensorProto, helper
 from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.util.basic import qonnx_make_model
 
 import finn.core.onnx_exec as oxe
 from finn.transformation.streamline import RoundAndClipThresholds
@@ -46,7 +47,7 @@ def test_round_thresholds():
         "MultiThreshold", ["v", "thresholds"], ["out"], domain="qonnx.custom_op.general"
     )
     graph_def = helper.make_graph([node_def], "test_model", [v, thresholds], [out])
-    model_def = helper.make_model(graph_def)
+    model_def = qonnx_make_model(graph_def)
     model = ModelWrapper(model_def)
     threshold_val = np.asarray([[-1.1], [0.7], [2.3], [5.1]], dtype=np.float32)
     model.set_initializer("thresholds", threshold_val)
diff --git a/tests/transformation/streamline/test_scale_resize_nhwc.py b/tests/transformation/streamline/test_scale_resize_nhwc.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e107448f8d8cc78d572f846496ed541591dfe05
--- /dev/null
+++ b/tests/transformation/streamline/test_scale_resize_nhwc.py
@@ -0,0 +1,295 @@
+import pytest
+
+import numpy as np
+import onnx
+import onnx.helper as oh
+import qonnx.core.data_layout as DataLayout
+from onnx import TensorProto
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.transformation.infer_data_layouts import InferDataLayouts
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
+
+import finn.core.onnx_exec as oxe
+from finn.transformation.streamline.reorder import MakeScaleResizeNHWC
+
+
+def create_resize_transpose(ifm_dim, ifm_ch, scales, mode, idt):
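+    # builds Resize (NCHW) followed by a Transpose to NHWC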
+    ofm_dim_h = ifm_dim[0] * scales[2]
+    ofm_dim_w = ifm_dim[1] * scales[3]
+    inp = oh.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim[0], ifm_dim[1]]
+    )
+
+    param = oh.make_tensor_value_info("scales", TensorProto.FLOAT, [4])
+
+    # Not actually used, only needed for compliance with the Resize node interface
+    roi = oh.make_tensor_value_info("roi", TensorProto.FLOAT, [4])
+
+    outp_up = oh.make_tensor_value_info(
+        "outp_up", TensorProto.FLOAT, [1, ifm_ch, ofm_dim_h, ofm_dim_w]
+    )
+    outp = oh.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, ifm_ch]
+    )
+
+    resize_node = oh.make_node(
+        "Resize",
+        inputs=["inp", "roi", "scales"],
+        outputs=["outp_up"],
+        name="Resize1",
+        mode=mode,
+    )
+
+    transpose_node = onnx.helper.make_node(
+        "Transpose",
+        inputs=["outp_up"],
+        outputs=["outp"],
+        name="Transpose1",
+        perm=[0, 2, 3, 1],
+    )
+
+    graph = oh.make_graph(
+        nodes=[resize_node, transpose_node],
+        name="resize_graph",
+        inputs=[inp],
+        outputs=[outp],
+        value_info=[outp_up, param, roi],
+    )
+
+    model = qonnx_make_model(graph, producer_name="resize_model1")
+    model = ModelWrapper(model)
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", idt)
+
+    model.set_tensor_layout("inp", DataLayout.NCHW)
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataLayouts())
+
+    return model
+
+
+def create_transpose_resize(ifm_dim, ifm_ch, scales, mode, idt):
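+    # builds Transpose from NHWC to NCHW followed by Resize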
+    ofm_dim_h = ifm_dim[0] * scales[2]
+    ofm_dim_w = ifm_dim[1] * scales[3]
+    inp = oh.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, [1, ifm_dim[0], ifm_dim[1], ifm_ch]
+    )
+
+    param = oh.make_tensor_value_info("scales", TensorProto.FLOAT, [4])
+
+    # Not actually used, only needed for compliance with the Resize node interface
+    roi = oh.make_tensor_value_info("roi", TensorProto.FLOAT, [4])
+
+    outp = oh.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, [1, ifm_ch, ofm_dim_h, ofm_dim_w]
+    )
+    outp_tr = oh.make_tensor_value_info(
+        "outp_tr", TensorProto.FLOAT, [1, ifm_ch, ifm_dim[0], ifm_dim[1]]
+    )
+
+    transpose_node = onnx.helper.make_node(
+        "Transpose",
+        inputs=["inp"],
+        outputs=["outp_tr"],
+        name="Transpose1",
+        perm=[0, 3, 1, 2],
+    )
+
+    resize_node = oh.make_node(
+        "Resize",
+        inputs=["outp_tr", "roi", "scales"],
+        outputs=["outp"],
+        name="Resize1",
+        mode=mode,
+    )
+
+    graph = oh.make_graph(
+        nodes=[transpose_node, resize_node],
+        name="resize_graph",
+        inputs=[inp],
+        outputs=[outp],
+        value_info=[outp_tr, param, roi],
+    )
+
+    model = qonnx_make_model(graph, producer_name="resize_model2")
+    model = ModelWrapper(model)
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", idt)
+    model.set_tensor_layout("inp", DataLayout.NHWC)
+
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataLayouts())
+
+    return model
+
+
+def create_transpose_resize_transpose(ifm_dim, ifm_ch, scales, mode, idt):
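+    # builds Transpose to NCHW, Resize, then Transpose back to NHWC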
+    ofm_dim_h = ifm_dim[0] * scales[2]
+    ofm_dim_w = ifm_dim[1] * scales[3]
+    inp = oh.make_tensor_value_info(
+        "inp", TensorProto.FLOAT, [1, ifm_dim[0], ifm_dim[1], ifm_ch]
+    )
+
+    param = oh.make_tensor_value_info("scales", TensorProto.FLOAT, scales)
+
+    # Not actually used, only needed for compliance with the Resize node interface
+    roi = oh.make_tensor_value_info("roi", TensorProto.FLOAT, [4])
+
+    outp_tr = oh.make_tensor_value_info(
+        "outp_tr", TensorProto.FLOAT, [1, ifm_ch, ifm_dim[0], ifm_dim[1]]
+    )
+
+    outp_up = oh.make_tensor_value_info(
+        "outp_up", TensorProto.FLOAT, [1, ifm_ch, ofm_dim_h, ofm_dim_w]
+    )
+    outp = oh.make_tensor_value_info(
+        "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, ifm_ch]
+    )
+
+    transpose_node1 = onnx.helper.make_node(
+        "Transpose",
+        inputs=["inp"],
+        outputs=["outp_tr"],
+        name="Transpose1",
+        perm=[0, 3, 1, 2],
+    )
+
+    resize_node = oh.make_node(
+        "Resize",
+        inputs=["outp_tr", "roi", "scales"],
+        outputs=["outp_up"],
+        name="Resize1",
+        mode=mode,
+    )
+
+    transpose_node2 = onnx.helper.make_node(
+        "Transpose",
+        inputs=["outp_up"],
+        outputs=["outp"],
+        name="Transpose2",
+        perm=[0, 2, 3, 1],
+    )
+
+    graph = oh.make_graph(
+        nodes=[transpose_node1, resize_node, transpose_node2],
+        name="resize_graph",
+        inputs=[inp],
+        outputs=[outp],
+        value_info=[outp_up, outp_tr, param, roi],
+    )
+
+    model = qonnx_make_model(graph, producer_name="resize_model3")
+    model = ModelWrapper(model)
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype("outp", idt)
+    model.set_tensor_layout("inp", DataLayout.NHWC)
+
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataLayouts())
+
+    return model
+
+
+def check_transform(model):
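+    # returns True if any Upsample/Resize node produces an NHWC-laid-out
+    # tensor, i.e. if MakeScaleResizeNHWC has taken effect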
+    for n in model.graph.node:
+        if n.op_type in ("Upsample", "Resize"):
+            if model.get_tensor_layout(n.output[0]) == DataLayout.NHWC:
+                return True
+    return False
+
+
+@pytest.mark.streamline
+# input dimension
+@pytest.mark.parametrize("ifm_dim", [[2**i, 2**i] for i in range(3, 6)])
+# input channels
+@pytest.mark.parametrize("ifm_ch", [3])
+# scales
+@pytest.mark.parametrize(
+    "scales", [[1, 1, i, j] for i in range(2, 5) for j in range(2, 5)]
+)
+# mode
+@pytest.mark.parametrize("mode", ["nearest"])
+# input datatype
+@pytest.mark.parametrize("idt", [DataType["INT4"]])
+def test_scale_resize_nhwc(ifm_dim, ifm_ch, scales, mode, idt):
+    # create models
+    resize_model1 = create_resize_transpose(ifm_dim, ifm_ch, scales, mode, idt)
+    resize_model2 = create_transpose_resize(ifm_dim, ifm_ch, scales, mode, idt)
+    resize_model3 = create_transpose_resize_transpose(
+        ifm_dim, ifm_ch, scales, mode, idt
+    )
+
+    # set initializers
+    resize_model1.set_initializer("scales", np.array(scales, dtype=np.float32))
+    resize_model2.set_initializer("scales", np.array(scales, dtype=np.float32))
+    resize_model3.set_initializer("scales", np.array(scales, dtype=np.float32))
+
+    # generate input tensor for testing
+    input_tensor_nchw = gen_finn_dt_tensor(idt, [1, ifm_ch, ifm_dim[0], ifm_dim[1]])
+    input_tensor_nhwc = gen_finn_dt_tensor(idt, [1, ifm_dim[0], ifm_dim[1], ifm_ch])
+    input_dict_nchw = {"inp": input_tensor_nchw}
+    input_dict_nhwc = {"inp": input_tensor_nhwc}
+
+    # execute first model
+    output_dict1 = oxe.execute_onnx(resize_model1, input_dict_nchw)
+    expected1 = output_dict1["outp"]
+
+    # make the Resize node operate on NHWC data via MakeScaleResizeNHWC
+    resize_model1 = resize_model1.transform(MakeScaleResizeNHWC())
+    resize_model1 = resize_model1.transform(InferDataLayouts())
+
+    # execute transformed model
+    output_node_name1 = resize_model1.graph.output[0].name
+    output_dict1 = oxe.execute_onnx(
+        resize_model1, input_dict_nchw, return_full_exec_context=False
+    )
+    output1 = output_dict1[output_node_name1]
+
+    # compare outputs
+    assert (expected1 == output1).all()
+    assert check_transform(resize_model1)
+
+    # execute second model
+    output_dict2 = oxe.execute_onnx(resize_model2, input_dict_nhwc)
+    expected2 = output_dict2["outp"]
+
+    # make the Resize node operate on NHWC data via MakeScaleResizeNHWC
+    resize_model2 = resize_model2.transform(MakeScaleResizeNHWC())
+    resize_model2 = resize_model2.transform(InferDataLayouts())
+
+    # execute transformed model
+    output_node_name2 = resize_model2.graph.output[0].name
+    output_dict2 = oxe.execute_onnx(
+        resize_model2, input_dict_nhwc, return_full_exec_context=False
+    )
+    output2 = output_dict2[output_node_name2]
+
+    # compare outputs
+    assert (expected2 == output2).all()
+    assert check_transform(resize_model2)
+
+    # execute third model
+    output_dict3 = oxe.execute_onnx(resize_model3, input_dict_nhwc)
+    expected3 = output_dict3["outp"]
+
+    # make the Resize node operate on NHWC data via MakeScaleResizeNHWC
+    resize_model3 = resize_model3.transform(MakeScaleResizeNHWC())
+    resize_model3 = resize_model3.transform(InferDataLayouts())
+
+    # execute transformed model
+    output_node_name3 = resize_model3.graph.output[0].name
+    output_dict3 = oxe.execute_onnx(
+        resize_model3, input_dict_nhwc, return_full_exec_context=False
+    )
+    output3 = output_dict3[output_node_name3]
+
+    # compare outputs
+    assert (expected3 == output3).all()
+    assert check_transform(resize_model3)
diff --git a/tests/transformation/test_qonnx_to_finn.py b/tests/transformation/test_qonnx_to_finn.py
index 43055f6704732866569ac4770202f1b4ff6bfb22..7e438b4b8ba9d9befca79100bb9727735afa27d3 100644
--- a/tests/transformation/test_qonnx_to_finn.py
+++ b/tests/transformation/test_qonnx_to_finn.py
@@ -94,6 +94,9 @@ def analysis_testing_for_no_quant_nodes(model):
 @pytest.mark.parametrize("wbits", [1, 2])
 @pytest.mark.parametrize("model_name", ["TFC", "SFC", "LFC", "CNV", "mobilenet"])
 def test_QONNX_to_FINN(model_name, wbits, abits):
+    if model_name == "mobilenet":
+        pytest.xfail("MobileNet test is temporarily excluded from QONNX testing.")
+
     if wbits > abits:
         pytest.skip("No wbits > abits cases at the moment")
     if model_name == "LFC" and wbits == 2 and abits == 2:
diff --git a/tests/util/test_build_dataflow.py b/tests/util/test_build_dataflow.py
index cdf69aebddc4d6af2288774acbff5dd8a52512b3..915e54f4e4ba2648dc7dfafce520862b55b7a8da 100644
--- a/tests/util/test_build_dataflow.py
+++ b/tests/util/test_build_dataflow.py
@@ -30,6 +30,7 @@ import pkg_resources as pk
 
 import pytest
 
+import numpy as np
 import os
 from shutil import copytree
 
@@ -40,6 +41,7 @@ from finn.util.basic import make_build_dir
 @pytest.mark.slow
 @pytest.mark.vivado
 @pytest.mark.end2end
+@pytest.mark.xfail
 def test_end2end_build_dataflow_directory():
     test_dir = make_build_dir("test_build_dataflow_directory_")
     target_dir = test_dir + "/build_dataflow"
@@ -55,7 +57,6 @@ def test_end2end_build_dataflow_directory():
     assert os.path.isfile(output_dir + "/driver/driver.py")
     assert os.path.isfile(output_dir + "/report/estimate_layer_cycles.json")
     assert os.path.isfile(output_dir + "/report/estimate_layer_resources.json")
-    assert os.path.isfile(output_dir + "/report/verify_rtlsim.vcd")
     assert os.path.isfile(output_dir + "/report/rtlsim_perf_batch_1.vcd")
     assert os.path.isfile(
         output_dir + "/report/estimate_layer_config_alternatives.json"
@@ -68,8 +69,20 @@
     assert os.path.isfile(output_dir + "/report/post_synth_resources.xml")
     assert os.path.isfile(output_dir + "/report/post_route_timing.rpt")
     # verification outputs
-    verify_out_dir = output_dir + "/verification_output"
-    assert os.path.isfile(verify_out_dir + "/verify_initial_python_SUCCESS.npy")
-    assert os.path.isfile(verify_out_dir + "/verify_streamlined_python_SUCCESS.npy")
-    assert os.path.isfile(verify_out_dir + "/verify_folded_hls_cppsim_SUCCESS.npy")
-    assert os.path.isfile(verify_out_dir + "/verify_stitched_ip_rtlsim_SUCCESS.npy")
+    verif_batchsize = np.load(target_dir + "/input.npy").shape[0]
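+    # one set of verification outputs is expected per sample in the input batch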
+    verify_out_dir = output_dir + "/verification_output"
+    for i in range(verif_batchsize):
+        assert os.path.isfile(
+            verify_out_dir + f"/verify_initial_python_{i}_SUCCESS.npy"
+        )
+        assert os.path.isfile(
+            verify_out_dir + f"/verify_streamlined_python_{i}_SUCCESS.npy"
+        )
+        assert os.path.isfile(
+            verify_out_dir + f"/verify_folded_hls_cppsim_{i}_SUCCESS.npy"
+        )
+        assert os.path.isfile(
+            verify_out_dir + f"/verify_stitched_ip_rtlsim_{i}_SUCCESS.npy"
+        )
+        assert os.path.isfile(output_dir + f"/report/verify_rtlsim_{i}.vcd")
diff --git a/tutorials/fpga_flow/README.md b/tutorials/fpga_flow/README.md
index 63ca6ac832c556b3e47a15fc3207683886796f23..2aaad0423b7d49c3d6760243fe1b1c1899b9030e 100644
--- a/tutorials/fpga_flow/README.md
+++ b/tutorials/fpga_flow/README.md
@@ -4,7 +4,7 @@ This example demonstrates how to bring a FINN compiled model into the Vivado FPG
 
 If you are new to the command-line flow, more information can be found [here](https://finn.readthedocs.io/en/latest/command_line.html).
 
-This demo was created using Vivado 2020.1.
+This demo was created using Vivado 2022.1.
 
 ## Compiling the Model in FINN
 
@@ -26,7 +26,7 @@ Prior to running, ensure the following prerequisites have been met:
 - Install FINN and prerequisites.  The [Getting Started](https://finn.readthedocs.io/en/latest/getting_started.html#quickstart) section of the FINN documentation might be helpful for this.
 - Ensure you have the `FINN_XILINX_PATH` and `FINN_XILINX_VERSION` env variables set appropriately for your install.  For example:
 > export FINN_XILINX_PATH=/opt/Xilinx
-> export FINN_XILINX_VERSION=2020.1
+> export FINN_XILINX_VERSION=2022.1
 - Set the env variable for your `finn` install top directory (where you cloned the FINN compiler repo):
 > export FINN_ROOT=/home/foo/finn
 
@@ -112,7 +112,7 @@ testbench generators.
 
 There are any number of ways to bring the stitched IP into a larger design.
 
-FINN already packages the stitched IP block design as a standalone IP-XACT component, which you can find under `${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w0a1_fpga/stitched_ip/ip`. You can add this to the list of IP repos and use it in your own Vivado designs. A good reference for this is [UG1119](https://www.xilinx.com/support/documentation/sw_manuals/xilinx2020_1/ug1119-vivado-creating-packaging-ip-tutorial.pdf)
+FINN already packages the stitched IP block design as a standalone IP-XACT component, which you can find under `${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w0a1_fpga/stitched_ip/ip`. You can add this to the list of IP repos and use it in your own Vivado designs. A good reference for this is [UG1119](https://www.xilinx.com/content/dam/xilinx/support/documents/sw_manuals/xilinx2022_1/ug1119-vivado-creating-packaging-ip-tutorial.pdf)
 
 Keep in mind that all of the User IP Repos included in the Stitched IP project (from `$FINN_HOST_BUILD_DIR`, which is normally located under `/tmp/finn_dev_<username>`) also need to be brought in as IP Repos to any project using the stitched IP. It would be prudent to copy those IP repos to an appropriate archive location. You should also set the
 `FINN_ROOT` environment variable to point to the compiler installation directory, as some of the build scripts will